In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow_decision_forests as tfdf
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

2023-12-09 17:51:28.852859: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-09 17:51:28.852905: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-09 17:51:28.852919: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-09 17:51:28.856791: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
"""
train_iteration: ith model developed
num_trials: number of trials the model goes through during auto hyperparameter tuning
"""

# Run index; it is interpolated into the results filenames and the plot title further down.
train_iteration = 1
# Passed as num_trials to the random-search tuner when the model is built.
num_trials = 20

In [3]:
"""
Read in the train data
"""
# Load the training split; the first CSV row supplies the column names.
train_csv_path = '../data/adult.train.csv'
df = pd.read_csv(train_csv_path, header=0)

# Bare last expression -> rich preview of the raw frame.
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
"""
Remove missing values
"""
# ' ?' (note the leading space) is the missing-value marker: convert it to pd.NA,
# then keep only the rows that have no missing entry.
df = df.replace(' ?', pd.NA).dropna()

In [5]:
"""
Remove duplicates
"""
# Keep a single copy of every fully identical row.
df = df.drop_duplicates()

In [6]:
"""
Print out the unique values
"""
# One summary row per column: its distinct values and how many of them there are.
df_unique_values = pd.DataFrame(columns=['unique_values', 'count'])
for column_name, column_values in df.items():
    df_unique_values.loc[column_name] = [column_values.unique(), column_values.nunique()]

df_unique_values

Unnamed: 0,unique_values,count
age,"[39, 50, 38, 53, 28, 37, 49, 52, 31, 42, 30, 2...",72
workclass,"[ State-gov, Self-emp-not-inc, Private, Fed...",7
fnlwgt,"[77516, 83311, 215646, 234721, 338409, 284582,...",20263
education,"[ Bachelors, HS-grad, 11th, Masters, 9th, ...",16
education-num,"[13, 9, 7, 14, 5, 10, 12, 4, 16, 11, 15, 3, 6,...",16
marital-status,"[ Never-married, Married-civ-spouse, Divorce...",7
occupation,"[ Adm-clerical, Exec-managerial, Handlers-cl...",14
relationship,"[ Not-in-family, Husband, Wife, Own-child, ...",6
race,"[ White, Black, Asian-Pac-Islander, Amer-In...",5
sex,"[ Male, Female]",2


In [7]:
"""
Categorize the column values to integers
"""
def categorize_column(df, col, categories=None):
    """
    Replace the values of one column with integer codes.

    By default the codes follow the order in which each distinct value first
    appears in ``df[col]``, so the same raw value can receive different codes in
    two different DataFrames. To encode a second DataFrame consistently, pass the
    ``{value: code}`` mapping that was used for the first one as ``categories``.

    Args:
        df: DataFrame to encode. It is modified in place.
        col: Label of the column to encode.
        categories: Optional ``{value: code}`` mapping to apply. When omitted, a
            mapping is built from the column's own distinct values, numbered
            0, 1, 2, ... in order of first appearance.

    Returns:
        The same DataFrame object, with ``df[col]`` replaced by its codes. Values
        that have no entry in ``categories`` come out as NaN.
    """
    if categories is None:
        # Default behaviour: number the distinct values in first-appearance order.
        categories = {value: code for code, value in enumerate(df[col].unique())}
    df[col] = df[col].map(categories)
    return df

# Encode every column of the training frame, numeric ones included, as integer codes.
# NOTE(review): the codes follow first-appearance order in this frame, so they depend on
# row order and will not necessarily match codes produced for another frame — confirm
# this is intended before comparing encoded frames.
for column_name in df.columns:
    df = categorize_column(df, column_name)

df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,1,1,1,0,0,1,0,1,0,0
2,2,2,2,1,1,2,2,0,0,0,1,0,0,0,0
3,3,2,3,2,2,1,2,1,1,0,1,0,0,0,0
4,4,2,4,0,0,1,3,2,1,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,36,2,20261,6,6,1,9,2,0,1,1,0,15,0,0
32557,16,2,579,1,1,1,8,1,0,0,1,0,0,0,1
32558,47,2,18256,1,1,6,0,4,0,1,1,0,0,0,0
32559,24,2,20262,1,1,0,0,3,0,0,1,0,9,0,0


In [None]:
"""
One-hot encoding for categorical features
"""
# df = pd.get_dummies(df, columns=['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'])
# df.head()

In [8]:
"""
Verify the encoding by listing the minimum and maximum value of each column
"""
# One summary row per column: the smallest and the largest value it contains.
df_min_max = pd.DataFrame(columns=['min', 'max'])
for column_name, column_values in df.items():
    df_min_max.loc[column_name] = [column_values.min(), column_values.max()]

df_min_max

Unnamed: 0,min,max
age,0,71
workclass,0,6
fnlwgt,0,20262
education,0,15
education-num,0,15
marital-status,0,6
occupation,0,13
relationship,0,5
race,0,4
sex,0,1


In [15]:
"""
Initialize the features, drop the target column and the education column because we already have education-num
"""
# Features: everything except the label ('salary') and 'education'.
X = df.drop(columns=['salary', 'education'])
# Label column.
y = df['salary']

In [9]:
"""
Read in the test data
"""
# Load the test split; the first CSV row supplies the column names.
test_csv_path = '../data/adult.test.csv'
df_test = pd.read_csv(test_csv_path, header=0)

# Bare last expression -> rich preview of the raw frame.
df_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [10]:
"""
Remove missing values from test data
"""
# Same cleaning as for the training frame: ' ?' (leading space) becomes pd.NA and
# every row with a missing entry is discarded.
df_test = df_test.replace(' ?', pd.NA).dropna()

In [11]:
"""
Drop duplicates from test data
"""
# Keep a single copy of every fully identical test row.
df_test = df_test.drop_duplicates()

In [12]:
"""
Categorize the test data
"""
# Encode every column of the test frame as integer codes.
# NOTE(review): each call here builds its mapping from df_test's own first-appearance
# order, independently of the codes assigned to the training frame, so the same raw
# value may get a different code in the two frames — confirm before trusting metrics.
for column_name in df_test.columns:
    df_test = categorize_column(df_test, column_name)

In [13]:
"""
Initialize the test features, drop the target column and the education column because we already have education-num
"""
# Features: everything except the label ('salary') and 'education'.
X_test = df_test.drop(columns=['salary', 'education'])
# Label column.
y_test = df_test['salary']

In [16]:
"""
Combine X and y into a single dataframe for training data and convert to a TensorFlow dataset
"""
# Training split: put features and label side by side, then wrap the frame as a
# TensorFlow dataset with 'salary' as the label.
df_train = pd.concat([X, y], axis=1)
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(df_train, label='salary')

# Test split: combine X_test and y_test the same way. This rebinds df_test to the
# combined frame, which no longer has the 'education' column.
df_test = pd.concat([X_test, y_test], axis=1)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(df_test, label='salary')



2023-12-09 17:54:45.336422: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:07:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-09 17:54:45.341379: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:07:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-09 17:54:45.341616: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:07:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-09 17:54:45.342665: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:07:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-09 17:54:45.342863: I tensorflow/compile

In [None]:
"""
Compute the class weights to address the class imbalance problems
"""

# Disabled: every statement in this cell is commented out, so no class weights are computed.
# NOTE(review): compute_class_weight is not imported in the notebook's import cell;
# re-enabling these lines presumably needs
# `from sklearn.utils.class_weight import compute_class_weight` — confirm.
# class_weights = compute_class_weight('balanced', classes=[0, 1], y=y)

# class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

# class_weights_dict

In [None]:
# Open a TF1-compatibility session configured to log device placement, and print it.
placement_config = tf.compat.v1.ConfigProto(log_device_placement=True)
sess = tf.compat.v1.Session(config=placement_config)
print(sess)

# Query the GPUs TensorFlow can see. Only the last expression of the cell is
# displayed, so the list returned by the first call is not shown.
tf.config.list_physical_devices('GPU')
tf.test.gpu_device_name()

In [None]:
# Random-search tuner that runs num_trials trials; use_predefined_hps=True presumably
# selects the library's built-in hyperparameter search space — confirm in the TF-DF docs.
tuner = tfdf.tuner.RandomSearch(num_trials=num_trials, use_predefined_hps=True)

# The model is created and fitted inside a '/GPU:0' device scope.
# NOTE(review): confirm that TF-DF training actually uses the GPU and that this scope
# behaves as intended on machines without one.
with tf.device('/GPU:0'):
    tuned_model = tfdf.keras.GradientBoostedTreesModel(tuner=tuner)
    # tuned_model.fit(train_ds, verbose=2, class_weight=class_weights_dict) # Run the model with class weights
    tuned_model.fit(train_ds, verbose=2)

In [None]:
# Pull the tuning log from the model's inspector and rank it by score, highest first.
tuning_logs = tuned_model.make_inspector().tuning_logs()
sorted_logs = tuning_logs.sort_values(by='score', ascending=False)

# Persist the ranked log; the filename carries the run index.
logs_csv_path = f'../results/gbtm-auto-auto/gbtm-auto-auto-labeled-unweighted-{train_iteration}-LOGS.csv'
sorted_logs.to_csv(logs_csv_path)

In [None]:
# Display the tuning log sorted by score in descending order.
sorted_logs

In [None]:
# Plot the score of each tuning trial together with the running best score,
# save the figure next to the log CSV, then show it.
fig, ax = plt.subplots(figsize=(10, 5))
trial_scores = tuning_logs["score"]
ax.plot(trial_scores, label="current trial")
ax.plot(trial_scores.cummax(), label="best trial")
ax.set_title(f"gbtm-auto-auto-{train_iteration}")
ax.set_xlabel("Tuning step")
ax.set_ylabel("Tuning score")
ax.legend()
fig.savefig(f'../results/gbtm-auto-auto/gbtm-auto-auto-labeled-unweighted-{train_iteration}-LOGS.png')
plt.show()

In [None]:
# Raw model outputs for the test dataset, printed as returned.
predictions = tuned_model.predict(test_ds)
print(predictions)

# Threshold the scores: strictly greater than 0.5 -> 1, otherwise 0.
# Assumes predict() returned a NumPy array holding one score per row — TODO confirm.
predictions = (predictions > 0.5).astype(int).ravel().tolist()

# Attach the hard labels to df_test as a new 'predictions' column and write the
# whole frame to CSV.
df_test['predictions'] = predictions
df_test.to_csv(f'../results/gbtm-auto-auto/gbtm-auto-auto-labeled-unweighted-{train_iteration}-PREDICTIONS.csv')

In [16]:
# Reload the predictions CSV of the current run. The filename is built from
# train_iteration (rather than a hard-coded 1) so it matches the file that the
# prediction cell writes for this run.
df_predictions = pd.read_csv(f'../results/gbtm-auto-auto/gbtm-auto-auto-labeled-unweighted-{train_iteration}-PREDICTIONS.csv', header=0)

# Predicted labels ('predictions' column) and true labels ('salary' column) as plain lists.
predictions = df_predictions['predictions'].tolist()
actual = df_predictions['salary'].tolist()

In [19]:
# Accuracy = share of rows whose predicted label equals the actual label.
total = len(predictions)
correct = sum(1 for i in range(total) if predictions[i] == actual[i])
incorrect = total - correct
accuracy = correct / total

print("Accuracy: ", accuracy, "=", correct, "/", str(total))

Accuracy:  0.8624751161247511 = 5199 / 6028


In [17]:
# Display confusion matrix.
# The result gets its own name: assigning it to `confusion_matrix` would overwrite the
# imported sklearn function, and re-running this cell would then fail because the
# name would refer to an array instead of a callable.
conf_matrix = confusion_matrix(actual, predictions)
print(conf_matrix)

[[4256  271]
 [ 558  943]]


In [18]:
# Display classification report: build the text report from the actual and
# predicted labels, then print it.
class_report = classification_report(actual, predictions)
print(class_report)

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      4527
           1       0.78      0.63      0.69      1501

    accuracy                           0.86      6028
   macro avg       0.83      0.78      0.80      6028
weighted avg       0.86      0.86      0.86      6028

