73 changes: 64 additions & 9 deletions examples/structured_data/classification_with_grn_and_vsn.py
@@ -2,7 +2,7 @@
Title: Classification with Gated Residual and Variable Selection Networks
Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)
Date created: 2021/02/10
-Last modified: 2021/02/10
Last modified: 2025/01/03
Description: Using Gated Residual and Variable Selection Networks for income level prediction.
Accelerator: GPU
"""
@@ -46,6 +46,8 @@
"""

import os
import subprocess
import tarfile

# Only the TensorFlow backend supports string inputs.
os.environ["KERAS_BACKEND"] = "tensorflow"
@@ -108,13 +110,37 @@
"income_level",
]

data_url = "https://archive.ics.uci.edu/static/public/20/census+income.zip"
data_url = "https://archive.ics.uci.edu/static/public/117/census+income+kdd.zip"
keras.utils.get_file(origin=data_url, extract=True)

"""
determine the downloaded .tar.gz file path and
extract the files from the downloaded .tar.gz file
"""

extracted_path = os.path.join(
    os.path.expanduser("~"), ".keras", "datasets", "census+income+kdd.zip"
)
for root, dirs, files in os.walk(extracted_path):
    for file in files:
        if file.endswith(".tar.gz"):
            tar_gz_path = os.path.join(root, file)
            with tarfile.open(tar_gz_path, "r:gz") as tar:
                tar.extractall(path=root)
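
For comparison, a more compact `pathlib` sketch of the same extraction step (same assumption as the loop above: the UCI zip unpacks to a directory that contains a nested `.tar.gz`):

```python
import tarfile
from pathlib import Path

extracted = Path.home() / ".keras" / "datasets" / "census+income+kdd.zip"
# Unpack every nested .tar.gz next to where it was found,
# mirroring the os.walk loop above.
for archive in extracted.rglob("*.tar.gz"):
    with tarfile.open(archive, "r:gz") as tar:
        tar.extractall(path=archive.parent)
```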

train_data_path = os.path.join(
-    os.path.expanduser("~"), ".keras", "datasets", "adult.data"
    os.path.expanduser("~"),
    ".keras",
    "datasets",
    "census+income+kdd.zip",
    "census-income.data",
)
test_data_path = os.path.join(
-    os.path.expanduser("~"), ".keras", "datasets", "adult.test"
    os.path.expanduser("~"),
    ".keras",
    "datasets",
    "census+income+kdd.zip",
    "census-income.test",
)

data = pd.read_csv(train_data_path, header=None, names=CSV_HEADER)
@@ -157,6 +183,21 @@
valid_data.to_csv(valid_data_file, index=False, header=False)
test_data.to_csv(test_data_file, index=False, header=False)


"""
clean the directory for the downloaded files except the .tar.gz file and
also remove the empty directories
"""

subprocess.run(
    f'find {extracted_path} -type f ! -name "*.tar.gz" -exec rm -f {{}} +',
    shell=True,
    check=True,
)
subprocess.run(
    f"find {extracted_path} -type d -empty -exec rmdir {{}} +", shell=True, check=True
)
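
Note that the two `find` invocations assume a Unix-like shell, so this step will not run on Windows. A minimal portable sketch of the same cleanup using only the standard library (reusing `extracted_path` from above):

```python
import os

# Walk bottom-up so directories are emptied of their files before we
# consider removing them.
for root, dirs, files in os.walk(extracted_path, topdown=False):
    for name in files:
        if not name.endswith(".tar.gz"):
            os.remove(os.path.join(root, name))
    for name in dirs:
        dir_path = os.path.join(root, name)
        if not os.listdir(dir_path):  # remove only empty directories
            os.rmdir(dir_path)
```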

"""
## Define dataset metadata

@@ -219,10 +260,10 @@ def process(features, target):
            features[feature_name] = keras.ops.cast(features[feature_name], "string")
    # Get the instance weight.
    weight = features.pop(WEIGHT_COLUMN_NAME)
-    return features, target, weight
    return dict(features), target, weight
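
Returning a 3-tuple here matters: Keras treats `(features, target, weight)` batches coming from a `tf.data.Dataset` as `(inputs, targets, sample_weights)`, so the census instance weight scales each example's contribution to the loss during training.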


-def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
def get_dataset_from_csv(csv_file_path, batch_size, shuffle=False):
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
@@ -277,7 +318,7 @@ def encode_inputs(inputs, encoding_size):
            # Since we are not using a mask token, we set mask_token to None. We
            # reserve a single out-of-vocabulary (oov) index so that values
            # missing from the vocabulary map to it instead of raising an error.
            index = layers.StringLookup(
-                vocabulary=vocabulary, mask_token=None, num_oov_indices=0
                vocabulary=vocabulary, mask_token=None, num_oov_indices=1
            )
            # Convert the string input values into integer indices.
            value_index = index(inputs[feature_name])
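
A quick sketch of what `num_oov_indices=1` buys (toy vocabulary, not from the dataset): with one out-of-vocabulary slot reserved, values absent from the vocabulary map to index 0 instead of raising an error at lookup time.

```python
from keras import layers

lookup = layers.StringLookup(
    vocabulary=["cat", "dog"], mask_token=None, num_oov_indices=1
)
# Known values shift up by one to make room for the OOV index...
print(lookup([["cat"], ["dog"]]))  # [[1], [2]]
# ...and an unseen value maps to OOV index 0 instead of failing.
print(lookup([["bird"]]))  # [[0]]
```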
@@ -312,6 +353,10 @@ def __init__(self, units):
    def call(self, inputs):
        return self.linear(inputs) * self.sigmoid(inputs)

    # Mark the layer as built to silence Keras build warnings.
    def build(self):
        self.built = True
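
For intuition, the gate is an elementwise sigmoid in `[0, 1]` that scales a linear projection. A self-contained sketch of the same computation (assuming the two `Dense` projections that the class's `__init__`, hidden in this diff, creates):

```python
import numpy as np
from keras import layers

units = 4
linear = layers.Dense(units)
gate = layers.Dense(units, activation="sigmoid")

x = np.random.rand(2, 8).astype("float32")
y = linear(x) * gate(x)  # each output unit is scaled by its own gate
print(y.shape)  # (2, 4)
```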


"""
## Implement the Gated Residual Network
@@ -347,6 +392,10 @@ def call(self, inputs):
        x = self.layer_norm(x)
        return x

    # Mark the layer as built to silence Keras build warnings.
    def build(self):
        self.built = True


"""
## Implement the Variable Selection Network
@@ -388,6 +437,10 @@ def call(self, inputs):
        outputs = keras.ops.squeeze(tf.matmul(v, x, transpose_a=True), axis=1)
        return outputs

    # Mark the layer as built to silence Keras build warnings.
    def build(self):
        self.built = True
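
The `tf.matmul(..., transpose_a=True)` line computes a weighted sum over features. A shape sketch with assumed names (`v` holds the softmax selection weights, `x` the stacked per-feature GRN outputs, as in the full example):

```python
import tensorflow as tf

batch, num_features, units = 2, 5, 16
v = tf.random.uniform((batch, num_features, 1))      # selection weights
x = tf.random.uniform((batch, num_features, units))  # per-feature outputs
# (batch, 1, num_features) @ (batch, num_features, units) -> (batch, 1, units)
out = tf.squeeze(tf.matmul(v, x, transpose_a=True), axis=1)
print(out.shape)  # (2, 16): one weighted combination per example
```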


"""
## Create Gated Residual and Variable Selection Networks model
@@ -415,7 +468,7 @@ def create_model(encoding_size):
learning_rate = 0.001
dropout_rate = 0.15
batch_size = 265
-num_epochs = 20
num_epochs = 1
encoding_size = 16

model = create_model(encoding_size)
@@ -433,7 +486,9 @@ def create_model(encoding_size):

print("Start training the model...")
train_dataset = get_dataset_from_csv(
-    train_data_file, shuffle=True, batch_size=batch_size
    train_data_file,
    batch_size=batch_size,
    shuffle=True,
)
valid_dataset = get_dataset_from_csv(valid_data_file, batch_size=batch_size)
model.fit(
120 changes: 106 additions & 14 deletions examples/structured_data/ipynb/classification_with_grn_and_vsn.ipynb
@@ -10,7 +10,7 @@
"\n",
"**Author:** [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/)<br>\n",
"**Date created:** 2021/02/10<br>\n",
"**Last modified:** 2021/02/10<br>\n",
"**Last modified:** 2025/01/03<br>\n",
"**Description:** Using Gated Residual and Variable Selection Networks for income level prediction."
]
},
@@ -76,6 +76,8 @@
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"import tarfile\n",
"\n",
"# Only the TensorFlow backend supports string inputs.\n",
"os.environ[\"KERAS_BACKEND\"] = \"tensorflow\"\n",
@@ -152,11 +154,55 @@
" \"income_level\",\n",
"]\n",
"\n",
"data_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.data.gz\"\n",
"data = pd.read_csv(data_url, header=None, names=CSV_HEADER)\n",
"data_url = \"https://archive.ics.uci.edu/static/public/117/census+income+kdd.zip\"\n",
"keras.utils.get_file(origin=data_url, extract=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text"
},
"source": [
"determine the downloaded .tar.gz file path and\n",
"extract the files from the downloaded .tar.gz file"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab_type": "code"
},
"outputs": [],
"source": [
"extracted_path = os.path.join(\n",
" os.path.expanduser(\"~\"), \".keras\", \"datasets\", \"census+income+kdd.zip\"\n",
")\n",
"for root, dirs, files in os.walk(extracted_path):\n",
" for file in files:\n",
" if file.endswith(\".tar.gz\"):\n",
" tar_gz_path = os.path.join(root, file)\n",
" with tarfile.open(tar_gz_path, \"r:gz\") as tar:\n",
" tar.extractall(path=root)\n",
"\n",
"train_data_path = os.path.join(\n",
" os.path.expanduser(\"~\"),\n",
" \".keras\",\n",
" \"datasets\",\n",
" \"census+income+kdd.zip\",\n",
" \"census-income.data\",\n",
")\n",
"test_data_path = os.path.join(\n",
" os.path.expanduser(\"~\"),\n",
" \".keras\",\n",
" \"datasets\",\n",
" \"census+income+kdd.zip\",\n",
" \"census-income.test\",\n",
")\n",
"\n",
"test_data_url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/census-income-mld/census-income.test.gz\"\n",
"test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)\n",
"data = pd.read_csv(train_data_path, header=None, names=CSV_HEADER)\n",
"test_data = pd.read_csv(test_data_path, header=None, names=CSV_HEADER)\n",
"\n",
"print(f\"Data shape: {data.shape}\")\n",
"print(f\"Test data shape: {test_data.shape}\")\n",
@@ -235,7 +281,36 @@
"\n",
"train_data.to_csv(train_data_file, index=False, header=False)\n",
"valid_data.to_csv(valid_data_file, index=False, header=False)\n",
"test_data.to_csv(test_data_file, index=False, header=False)"
"test_data.to_csv(test_data_file, index=False, header=False)\n",
""
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text"
},
"source": [
"clean the directory for the downloaded files except the .tar.gz file and\n",
"also remove the empty directories"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab_type": "code"
},
"outputs": [],
"source": [
"subprocess.run(\n",
" f'find {extracted_path} -type f ! -name \"*.tar.gz\" -exec rm -f {{}} +',\n",
" shell=True,\n",
" check=True,\n",
")\n",
"subprocess.run(\n",
" f\"find {extracted_path} -type d -empty -exec rmdir {{}} +\", shell=True, check=True\n",
")"
]
},
{
@@ -288,9 +363,12 @@
")\n",
"# Feature default values.\n",
"COLUMN_DEFAULTS = [\n",
" [0.0]\n",
" if feature_name in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME, WEIGHT_COLUMN_NAME]\n",
" else [\"NA\"]\n",
" (\n",
" [0.0]\n",
" if feature_name\n",
" in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME, WEIGHT_COLUMN_NAME]\n",
" else [\"NA\"]\n",
" )\n",
" for feature_name in CSV_HEADER\n",
"]"
]
@@ -324,10 +402,10 @@
" features[feature_name] = keras.ops.cast(features[feature_name], \"string\")\n",
" # Get the instance weight.\n",
" weight = features.pop(WEIGHT_COLUMN_NAME)\n",
" return features, target, weight\n",
" return dict(features), target, weight\n",
"\n",
"\n",
"def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):\n",
"def get_dataset_from_csv(csv_file_path, batch_size, shuffle=False):\n",
" dataset = tf.data.experimental.make_csv_dataset(\n",
" csv_file_path,\n",
" batch_size=batch_size,\n",
@@ -409,7 +487,7 @@
" # Since we are not using a mask token nor expecting any out of vocabulary\n",
" # (oov) token, we set mask_token to None and num_oov_indices to 0.\n",
" index = layers.StringLookup(\n",
" vocabulary=vocabulary, mask_token=None, num_oov_indices=0\n",
" vocabulary=vocabulary, mask_token=None, num_oov_indices=1\n",
" )\n",
" # Convert the string input values into integer indices.\n",
" value_index = index(inputs[feature_name])\n",
@@ -457,6 +535,10 @@
"\n",
" def call(self, inputs):\n",
" return self.linear(inputs) * self.sigmoid(inputs)\n",
"\n",
" # to remove the build warnings\n",
" def build(self):\n",
" self.built = True\n",
""
]
},
@@ -506,6 +588,10 @@
" x = inputs + self.gated_linear_unit(x)\n",
" x = self.layer_norm(x)\n",
" return x\n",
"\n",
" # to remove the build warnings\n",
" def build(self):\n",
" self.build = True\n",
""
]
},
@@ -561,6 +647,10 @@
"\n",
" outputs = keras.ops.squeeze(tf.matmul(v, x, transpose_a=True), axis=1)\n",
" return outputs\n",
"\n",
" # to remove the build warnings\n",
" def build(self):\n",
" self.built = True\n",
""
]
},
@@ -617,7 +707,7 @@
"learning_rate = 0.001\n",
"dropout_rate = 0.15\n",
"batch_size = 265\n",
"num_epochs = 20\n",
"num_epochs = 1\n",
"encoding_size = 16\n",
"\n",
"model = create_model(encoding_size)\n",
@@ -635,7 +725,9 @@
"\n",
"print(\"Start training the model...\")\n",
"train_dataset = get_dataset_from_csv(\n",
" train_data_file, shuffle=True, batch_size=batch_size\n",
" train_data_file,\n",
" batch_size=batch_size,\n",
" shuffle=True,\n",
")\n",
"valid_dataset = get_dataset_from_csv(valid_data_file, batch_size=batch_size)\n",
"model.fit(\n",