73 changes: 64 additions & 9 deletions examples/structured_data/classification_with_grn_and_vsn.py
@@ -2,7 +2,7 @@
Title: Classification with Gated Residual and Variable Selection Networks
Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
Date created: 2021/02/10
-Last modified: 2021/02/10
Last modified: 2025/01/03
Description: Using Gated Residual and Variable Selection Networks for income level prediction.
Accelerator: GPU
"""
@@ -46,6 +46,8 @@
"""

import os
import subprocess
import tarfile

# Only the TensorFlow backend supports string inputs.
os.environ["KERAS_BACKEND"] = "tensorflow"
@@ -108,13 +110,37 @@
"income_level",
]

data_url = "https://archive.ics.uci.edu/static/public/20/census+income.zip"
data_url = "https://archive.ics.uci.edu/static/public/117/census+income+kdd.zip"
keras.utils.get_file(origin=data_url, extract=True)

"""
determine the downloaded .tar.gz file path and
extract the files from the downloaded .tar.gz file
"""

extracted_path = os.path.join(
    os.path.expanduser("~"), ".keras", "datasets", "census+income+kdd.zip"
)
for root, dirs, files in os.walk(extracted_path):
    for file in files:
        if file.endswith(".tar.gz"):
            tar_gz_path = os.path.join(root, file)
            with tarfile.open(tar_gz_path, "r:gz") as tar:
                tar.extractall(path=root)
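
For comparison, a more compact `pathlib` sketch of the same extraction step (same assumption as the loop above: the UCI zip unpacks to a directory that contains a nested `.tar.gz`):

```python
import tarfile
from pathlib import Path

extracted = Path.home() / ".keras" / "datasets" / "census+income+kdd.zip"
# Unpack every nested .tar.gz next to where it was found,
# mirroring the os.walk loop above.
for archive in extracted.rglob("*.tar.gz"):
    with tarfile.open(archive, "r:gz") as tar:
        tar.extractall(path=archive.parent)
```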

train_data_path = os.path.join(
-    os.path.expanduser("~"), ".keras", "datasets", "adult.data"
    os.path.expanduser("~"),
    ".keras",
    "datasets",
    "census+income+kdd.zip",
    "census-income.data",
)
test_data_path = os.path.join(
-    os.path.expanduser("~"), ".keras", "datasets", "adult.test"
    os.path.expanduser("~"),
    ".keras",
    "datasets",
    "census+income+kdd.zip",
    "census-income.test",
)

data = pd.read_csv(train_data_path, header=None, names=CSV_HEADER)
@@ -157,6 +183,21 @@
valid_data.to_csv(valid_data_file, index=False, header=False)
test_data.to_csv(test_data_file, index=False, header=False)


"""
clean the directory for the downloaded files except the .tar.gz file and
also remove the empty directories
"""

subprocess.run(
    f'find {extracted_path} -type f ! -name "*.tar.gz" -exec rm -f {{}} +',
    shell=True,
    check=True,
)
subprocess.run(
    f"find {extracted_path} -type d -empty -exec rmdir {{}} +", shell=True, check=True
)
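
Note that the two `find` invocations assume a Unix-like shell, so this step will not run on Windows. A minimal portable sketch of the same cleanup using only the standard library (reusing `extracted_path` from above):

```python
import os

# Walk bottom-up so directories are emptied of their files before we
# consider removing them.
for root, dirs, files in os.walk(extracted_path, topdown=False):
    for name in files:
        if not name.endswith(".tar.gz"):
            os.remove(os.path.join(root, name))
    for name in dirs:
        dir_path = os.path.join(root, name)
        if not os.listdir(dir_path):  # remove only empty directories
            os.rmdir(dir_path)
```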

"""
## Define dataset metadata

@@ -219,10 +260,10 @@ def process(features, target):
            features[feature_name] = keras.ops.cast(features[feature_name], "string")
    # Get the instance weight.
    weight = features.pop(WEIGHT_COLUMN_NAME)
-    return features, target, weight
    return dict(features), target, weight
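
Returning a 3-tuple here matters: Keras treats `(features, target, weight)` batches coming from a `tf.data.Dataset` as `(inputs, targets, sample_weights)`, so the census instance weight scales each example's contribution to the loss during training.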


-def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
def get_dataset_from_csv(csv_file_path, batch_size, shuffle=False):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
@@ -277,7 +318,7 @@ def encode_inputs(inputs, encoding_size):
            # Since we are not using a mask token, we set mask_token to None. We
            # reserve a single out-of-vocabulary (oov) index so that values
            # missing from the vocabulary map to it instead of raising an error.
            index = layers.StringLookup(
-                vocabulary=vocabulary, mask_token=None, num_oov_indices=0
                vocabulary=vocabulary, mask_token=None, num_oov_indices=1
            )
            # Convert the string input values into integer indices.
            value_index = index(inputs[feature_name])
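
A quick sketch of what `num_oov_indices=1` buys (toy vocabulary, not from the dataset): with one out-of-vocabulary slot reserved, values absent from the vocabulary map to index 0 instead of raising an error at lookup time.

```python
from keras import layers

lookup = layers.StringLookup(
    vocabulary=["cat", "dog"], mask_token=None, num_oov_indices=1
)
# Known values shift up by one to make room for the OOV index...
print(lookup([["cat"], ["dog"]]))  # [[1], [2]]
# ...and an unseen value maps to OOV index 0 instead of failing.
print(lookup([["bird"]]))  # [[0]]
```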
@@ -312,6 +353,10 @@ def __init__(self, units):
    def call(self, inputs):
        return self.linear(inputs) * self.sigmoid(inputs)

    # Mark the layer as built to silence Keras build warnings.
    def build(self):
        self.built = True
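
For intuition, the gate is an elementwise sigmoid in `[0, 1]` that scales a linear projection. A self-contained sketch of the same computation (assuming the two `Dense` projections that the class's `__init__`, hidden in this diff, creates):

```python
import numpy as np
from keras import layers

units = 4
linear = layers.Dense(units)
gate = layers.Dense(units, activation="sigmoid")

x = np.random.rand(2, 8).astype("float32")
y = linear(x) * gate(x)  # each output unit is scaled by its own gate
print(y.shape)  # (2, 4)
```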


"""
## Implement the Gated Residual Network
@@ -347,6 +392,10 @@ def call(self, inputs):
        x = self.layer_norm(x)
        return x

    # Mark the layer as built to silence Keras build warnings.
    def build(self):
        self.built = True


"""
## Implement the Variable Selection Network
@@ -388,6 +437,10 @@ def call(self, inputs):
        outputs = keras.ops.squeeze(tf.matmul(v, x, transpose_a=True), axis=1)
        return outputs

    # Mark the layer as built to silence Keras build warnings.
    def build(self):
        self.built = True
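
The `tf.matmul(..., transpose_a=True)` line computes a weighted sum over features. A shape sketch with assumed names (`v` holds the softmax selection weights, `x` the stacked per-feature GRN outputs, as in the full example):

```python
import tensorflow as tf

batch, num_features, units = 2, 5, 16
v = tf.random.uniform((batch, num_features, 1))      # selection weights
x = tf.random.uniform((batch, num_features, units))  # per-feature outputs
# (batch, 1, num_features) @ (batch, num_features, units) -> (batch, 1, units)
out = tf.squeeze(tf.matmul(v, x, transpose_a=True), axis=1)
print(out.shape)  # (2, 16): one weighted combination per example
```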


"""
## Create Gated Residual and Variable Selection Networks model
@@ -415,7 +468,7 @@ def create_model(encoding_size):
learning_rate = 0.001
dropout_rate = 0.15
batch_size = 265
-num_epochs = 20
num_epochs = 1
encoding_size = 16

model = create_model(encoding_size)
@@ -433,7 +486,9 @@ def create_model(encoding_size):

print("Start training the model...")
train_dataset = get_dataset_from_csv(
-    train_data_file, shuffle=True, batch_size=batch_size
    train_data_file,
    batch_size=batch_size,
    shuffle=True,
)
valid_dataset = get_dataset_from_csv(valid_data_file, batch_size=batch_size)
model.fit(
120 changes: 106 additions & 14 deletions examples/structured_data/ipynb/classification_with_grn_and_vsn.ipynb
@@ -10,7 +10,7 @@
"\n",
"**Author:** [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)<br>\n",
"**Date created:** 2021/02/10<br>\n",
"**Last modified:** 2021/02/10<br>\n",
"**Last modified:** 2025/01/03<br>\n",
"**Description:** Using Gated Residual and Variable Selection Networks for income level prediction."
]
},
@@ -76,6 +76,8 @@
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"import tarfile\n",
"\n",
"# Only the TensorFlow backend supports string inputs.\n",
"os.environ[\"KERAS_BACKEND\"] = \"tensorflow\"\n",
@@ -152,11 +154,55 @@
" \"income_level\",\n",
"]\n",
"\n",
"data_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.data.gz\"\n",
"data = pd.read_csv(data_url, header=None, names=CSV_HEADER)\n",
"data_url = \"https://archive.ics.uci.edu/static/public/117/census+income+kdd.zip\"\n",
"keras.utils.get_file(origin=data_url, extract=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text"
},
"source": [
"determine the downloaded .tar.gz file path and\n",
"extract the files from the downloaded .tar.gz file"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab_type": "code"
},
"outputs": [],
"source": [
"extracted_path = os.path.join(\n",
" os.path.expanduser(\"~\"), \".keras\", \"datasets\", \"census+income+kdd.zip\"\n",
")\n",
"for root, dirs, files in os.walk(extracted_path):\n",
" for file in files:\n",
" if file.endswith(\".tar.gz\"):\n",
" tar_gz_path = os.path.join(root, file)\n",
" with tarfile.open(tar_gz_path, \"r:gz\") as tar:\n",
" tar.extractall(path=root)\n",
"\n",
"train_data_path = os.path.join(\n",
" os.path.expanduser(\"~\"),\n",
" \".keras\",\n",
" \"datasets\",\n",
" \"census+income+kdd.zip\",\n",
" \"census-income.data\",\n",
")\n",
"test_data_path = os.path.join(\n",
" os.path.expanduser(\"~\"),\n",
" \".keras\",\n",
" \"datasets\",\n",
" \"census+income+kdd.zip\",\n",
" \"census-income.test\",\n",
")\n",
"\n",
"test_data_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.test.gz\"\n",
"test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)\n",
"data = pd.read_csv(train_data_path, header=None, names=CSV_HEADER)\n",
"test_data = pd.read_csv(test_data_path, header=None, names=CSV_HEADER)\n",
"\n",
"print(f\"Data shape: {data.shape}\")\n",
"print(f\"Test data shape: {test_data.shape}\")\n",
@@ -235,7 +281,36 @@
"\n",
"train_data.to_csv(train_data_file, index=False, header=False)\n",
"valid_data.to_csv(valid_data_file, index=False, header=False)\n",
"test_data.to_csv(test_data_file, index=False, header=False)"
"test_data.to_csv(test_data_file, index=False, header=False)\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text"
},
"source": [
"clean the directory for the downloaded files except the .tar.gz file and\n",
"also remove the empty directories"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab_type": "code"
},
"outputs": [],
"source": [
"subprocess.run(\n",
" f'find {extracted_path} -type f ! -name \"*.tar.gz\" -exec rm -f {{}} +',\n",
" shell=True,\n",
" check=True,\n",
")\n",
"subprocess.run(\n",
" f\"find {extracted_path} -type d -empty -exec rmdir {{}} +\", shell=True, check=True\n",
")"
]
},
{
@@ -288,9 +363,12 @@
")\n",
"# Feature default values.\n",
"COLUMN_DEFAULTS = [\n",
" [0.0]\n",
" if feature_name in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME, WEIGHT_COLUMN_NAME]\n",
" else [\"NA\"]\n",
" (\n",
" [0.0]\n",
" if feature_name\n",
" in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME, WEIGHT_COLUMN_NAME]\n",
" else [\"NA\"]\n",
" )\n",
" for feature_name in CSV_HEADER\n",
"]"
]
@@ -324,10 +402,10 @@
" features[feature_name] = keras.ops.cast(features[feature_name], \"string\")\n",
" # Get the instance weight.\n",
" weight = features.pop(WEIGHT_COLUMN_NAME)\n",
" return features, target, weight\n",
" return dict(features), target, weight\n",
"\n",
"\n",
"def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):\n",
"def get_dataset_from_csv(csv_file_path, batch_size, shuffle=False):\n",
" dataset = tf.data.experimental.make_csv_dataset(\n",
" csv_file_path,\n",
" batch_size=batch_size,\n",
@@ -409,7 +487,7 @@
" # Since we are not using a mask token nor expecting any out of vocabulary\n",
" # (oov) token, we set mask_token to None and num_oov_indices to 0.\n",
" index = layers.StringLookup(\n",
" vocabulary=vocabulary, mask_token=None, num_oov_indices=0\n",
" vocabulary=vocabulary, mask_token=None, num_oov_indices=1\n",
" )\n",
" # Convert the string input values into integer indices.\n",
" value_index = index(inputs[feature_name])\n",
@@ -457,6 +535,10 @@
"\n",
" def call(self, inputs):\n",
" return self.linear(inputs) * self.sigmoid(inputs)\n",
"\n",
" # to remove the build warnings\n",
" def build(self):\n",
" self.built = True\n",
""
]
},
@@ -506,6 +588,10 @@
" x = inputs + self.gated_linear_unit(x)\n",
" x = self.layer_norm(x)\n",
" return x\n",
"\n",
" # to remove the build warnings\n",
" def build(self):\n",
" self.build = True\n",
""
]
},
@@ -561,6 +647,10 @@
"\n",
" outputs = keras.ops.squeeze(tf.matmul(v, x, transpose_a=True), axis=1)\n",
" return outputs\n",
"\n",
" # to remove the build warnings\n",
" def build(self):\n",
" self.built = True\n",
""
]
},
@@ -617,7 +707,7 @@
"learning_rate = 0.001\n",
"dropout_rate = 0.15\n",
"batch_size = 265\n",
"num_epochs = 20\n",
"num_epochs = 1\n",
"encoding_size = 16\n",
"\n",
"model = create_model(encoding_size)\n",
@@ -635,7 +725,9 @@
"\n",
"print(\"Start training the model...\")\n",
"train_dataset = get_dataset_from_csv(\n",
" train_data_file, shuffle=True, batch_size=batch_size\n",
" train_data_file,\n",
" batch_size=batch_size,\n",
" shuffle=True,\n",
")\n",
"valid_dataset = get_dataset_from_csv(valid_data_file, batch_size=batch_size)\n",
"model.fit(\n",