In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow_decision_forests as tfdf
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
"""
train_iteration: ith model developed
num_trials: number of trials the model goes through during auto hyperparameter tuning
"""

# Index of the model being trained; interpolated into the results file names.
train_iteration = 1
# Number of trials handed to the random-search hyperparameter tuner.
num_trials = 20

In [None]:
"""
Read in the train data
"""
# Load the training split; the first line of the file supplies the column names.
df = pd.read_csv(
    '../data/adult.train.csv',
    header=0,
)

df

In [None]:
"""
Remove missing values
"""
# Turn the ' ?' placeholder into a real missing value, then discard every row
# that contains one.
df.replace(to_replace=' ?', value=pd.NA, inplace=True)
df.dropna(axis=0, how='any', inplace=True)

In [None]:
"""
Remove duplicates
"""
# Keep the first occurrence of each fully identical row.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
"""
Print out the unique values
"""
# One summary row per column: its distinct values and how many there are.
df_unique_values = pd.DataFrame(
    columns=['unique_values', 'count'],
)
for col in df:  # iterating a DataFrame yields its column labels
    df_unique_values.loc[col] = [df[col].unique(), df[col].nunique()]

df_unique_values

In [None]:
"""
Categorize the column values to integers
"""
def categorize_column(df, col):
    """Replace the values of a categorical column with integer codes, in place.

    Codes are assigned in sorted order of the column's distinct values, so the
    mapping does not depend on row order: two frames that hold the same set of
    categories (for example a train and a test split) receive identical codes.
    Numeric columns are returned untouched, because re-coding them would throw
    away the magnitude and ordering of their values.

    Limitation: a category that is present in only one of two frames shifts
    the codes of the categories sorted after it, so frames with different
    category sets are still encoded differently.

    Args:
        df: DataFrame to modify; column `col` is overwritten.
        col: Label of the column to encode.

    Returns:
        The same DataFrame object, so `df = categorize_column(df, col)` works.
    """
    if pd.api.types.is_numeric_dtype(df[col]):
        return df
    # Missing values are kept out of the mapping, so map() leaves them missing.
    distinct_values = sorted(df[col].dropna().unique())
    categories = {value: code for code, value in enumerate(distinct_values)}
    df[col] = df[col].map(categories)
    return df

# Pass every column of the training frame through categorize_column in turn.
for col in df:
    df = categorize_column(df, col)

df

In [None]:
"""
One-hot encoding for categorical features
"""
# df = pd.get_dummies(df, columns=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'])
# df.head()

In [None]:
"""
Verify the encoding by checking the minimum and maximum value of each column
"""
# One summary row per column: its smallest and largest value.
df_min_max = pd.DataFrame(
    columns=['min', 'max'],
)
for col in df:  # iterating a DataFrame yields its column labels
    df_min_max.loc[col] = [df[col].min(), df[col].max()]

df_min_max

In [None]:
"""
Initialize the features, drop the target column and the education column because we already have education-num
"""
# Features: everything except the label and the 'education' column.
X = df.drop(columns=['salary', 'education'])
# Label column.
y = df['salary']

In [None]:
"""
Read in the test data
"""
# Load the test split; the first line of the file supplies the column names.
df_test = pd.read_csv(
    '../data/adult.test.csv',
    header=0,
)

df_test

In [None]:
"""
Remove missing values from test data
"""
# Turn the ' ?' placeholder into a real missing value, then discard every row
# that contains one.
df_test.replace(to_replace=' ?', value=pd.NA, inplace=True)
df_test.dropna(axis=0, how='any', inplace=True)

In [None]:
"""
Drop duplicates from test data
"""
# Keep the first occurrence of each fully identical row.
df_test.drop_duplicates(keep='first', inplace=True)

In [None]:
"""
Categorize the test data
"""
# Pass every column of the test frame through categorize_column in turn.
# NOTE(review): the integer codes are derived from df_test itself, not from the
# training frame, so they are comparable to the training codes only if both
# frames produce the same mapping — verify before trusting the test metrics.
for col in df_test:
    df_test = categorize_column(df_test, col)

In [None]:
"""
Initialize the test features, drop the target column and the education column because we already have education-num
"""
# Test features: everything except the label and the 'education' column.
X_test = df_test.drop(columns=['salary', 'education'])
# Test label column.
y_test = df_test['salary']

In [None]:
"""
Combine X and y into a single dataframe for training data and convert to a TensorFlow dataset
"""
# Put features and label back side by side, then wrap them as a TF dataset.
df_train = pd.concat((X, y), axis='columns')
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    df_train,
    label='salary',
)

"""
Combine X_test and y_test into a single dataframe for testing data and convert to a TensorFlow dataset
"""
# df_test is rebound here to the reduced frame (features + label only).
df_test = pd.concat((X_test, y_test), axis='columns')
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    df_test,
    label='salary',
)

In [None]:
"""
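Compute the class weights to address the class imbalance problem
(currently disabled; re-enabling it also requires
`from sklearn.utils.class_weight import compute_class_weight`)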
"""

# class_weights = compute_class_weight('balanced', classes=[0, 1], y=y)

# class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

# class_weights_dict

In [None]:
# Open a TF1-compatibility session configured to log device placement.
sess = tf.compat.v1.Session(
    config=tf.compat.v1.ConfigProto(log_device_placement=True)
)

print(sess)

# Only the final expression of a cell is displayed, so the device list below is
# computed but not shown.
tf.config.list_physical_devices('GPU')

tf.test.gpu_device_name()

In [None]:
# Random-search tuner limited to `num_trials` trials.
tuner = tfdf.tuner.RandomSearch(
    num_trials=num_trials,
    use_predefined_hps=True,
)

# NOTE(review): the TF-DF documentation lists GPU/TPU training as unsupported,
# so this device scope presumably has no effect on training — confirm.
with tf.device('/GPU:0'):
    tuned_model = tfdf.keras.GradientBoostedTreesModel(tuner=tuner)
    # tuned_model.fit(train_ds, verbose=2, class_weight=class_weights_dict) # Run the model with class weights
    tuned_model.fit(train_ds, verbose=2)

In [None]:
# Fetch the tuning log, order it by descending score and keep a copy on disk.
tuning_logs = tuned_model.make_inspector().tuning_logs()
sorted_logs = tuning_logs.sort_values(by='score', ascending=False)
sorted_logs.to_csv(
    f'../results/gbtm-auto-auto/gbtm-auto-auto-labeled-unweighted-{train_iteration}-LOGS.csv'
)

In [None]:
# Show the tuning log sorted by descending score.
sorted_logs

In [None]:
# Score of each tuning trial together with the running maximum.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(tuning_logs["score"], label="current trial")
ax.plot(tuning_logs["score"].cummax(), label="best trial")
ax.set_title(f"gbtm-auto-auto-{train_iteration}")
ax.set_xlabel("Tuning step")
ax.set_ylabel("Tuning score")
ax.legend()
fig.savefig(f'../results/gbtm-auto-auto/gbtm-auto-auto-labeled-unweighted-{train_iteration}-LOGS.png')
plt.show()

In [None]:
# Raw model outputs for the test set.
predictions = tuned_model.predict(test_ds)
print(predictions)

# Threshold the outputs at 0.5 to obtain binary labels.
predictions = [
    1 if score > 0.5 else 0
    for score in predictions
]

# Attach the labels to the test frame as a 'predictions' column and save as csv.
df_test['predictions'] = predictions
df_test.to_csv(
    f'../results/gbtm-auto-auto/gbtm-auto-auto-labeled-unweighted-{train_iteration}-PREDICTIONS.csv'
)

In [None]:
# Open the predictions csv written for the current training iteration. The
# iteration number is interpolated so that this cell reads the same file the
# prediction cell wrote, rather than always the one from iteration 1.
df_predictions = pd.read_csv(f'../results/gbtm-auto-auto/gbtm-auto-auto-labeled-unweighted-{train_iteration}-PREDICTIONS.csv', header=0)

# Pull the predicted and the actual labels out of the csv file.
predictions = df_predictions['predictions'].tolist()
actual = df_predictions['salary'].tolist()

In [None]:
# Accuracy = share of positions where the prediction equals the actual label.
correct = sum(
    1 for predicted, expected in zip(predictions, actual) if predicted == expected
)
incorrect = len(predictions) - correct
accuracy = correct / len(predictions)

print("Accuracy: ", accuracy, "=", correct, "/", str(len(predictions)))

In [None]:
# Display confusion matrix
# The result is stored under its own name: assigning it to `confusion_matrix`
# would replace the imported scikit-learn function with an array, and the cell
# would then fail with a TypeError the next time it is run.
conf_matrix = confusion_matrix(actual, predictions)
print(conf_matrix)

In [None]:
# Build and print the scikit-learn classification report for the test labels.
class_report = classification_report(
    y_true=actual,
    y_pred=predictions,
)
print(class_report)