In [41]:
# from twisted.conch.scripts.tkconch import frame
from ucimlrepo import fetch_ucirepo 
import ssl

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [42]:
# giving an SSLCertVerificationError when trying to fetch UCI repo
ssl._create_default_https_context = ssl._create_unverified_context

# Data Preprocessing
For milestone 2, we are completing all data preprocessing operations including fetching the data, dropping unnecessary columns, encoding categorical data and normalizing continous columns. Finally, we will be splitting the data into train, test and validation sets using a 80/20 split between test and training data, and 70/30 split between split training data and a validation set (because the data set is so large we can afford to do this).

In [43]:
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
features = pd.DataFrame(adult.data.features)
target = pd.DataFrame(adult.data.targets)


#drop the education column as it is already represented in the education_num column
features = features.drop(columns=['education'])
data = pd.concat([features, target], axis=1)

print("before drop missing values: ", data.shape)
# drop missing values
data = data.dropna()

print("after drop missing values: ", data.shape)


before drop missing values:  (48842, 14)
after drop missing values:  (47621, 14)


In [44]:
# print(data['workclass'].value_counts())
# print(data['marital-status'].value_counts())
# print(data['occupation'].value_counts())
# print(data['relationship'].value_counts())
# print(data['race'].value_counts())
# print(data['sex'].value_counts())
# print(data['native-country'].value_counts())
# print(data['income'].value_counts())

before = {'workclass': data['workclass'].unique(),
          'marital-status': data['marital-status'].unique(),
          'occupation': data['occupation'].unique(),
          'relationship': data['relationship'].unique(),
          'race': data['race'].unique(),
          'native-country': data['native-country'].unique(),
          'income': data['income'].unique()
          }

In [45]:
# print(data['workclass'].value_counts())
# 
# print(data['marital-status'].value_counts())
# print(data['occupation'].value_counts())
# print(data['relationship'].value_counts())
# print(data['race'].value_counts())
# print(data['sex'].value_counts())
# print(data['native-country'].value_counts())
# print(data['income'].value_counts())

## Categorical Data Preprocessing
using LabelEncoder to convert categorical data to numerical data

In [46]:
cate_colname = ['workclass', 'marital-status', 'occupation', 'relationship', 'native-country','race']

from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()

for i in cate_colname:
    data[i] = labelEncoder.fit_transform(data[i])
data.head()


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,77516,13,4,1,1,4,Male,2174,0,40,39,<=50K
1,50,6,83311,13,2,4,0,4,Male,0,0,13,39,<=50K
2,38,4,215646,9,0,6,1,4,Male,0,0,40,39,<=50K
3,53,4,234721,7,2,6,0,2,Male,0,0,40,39,<=50K
4,28,4,338409,13,2,10,5,2,Female,0,0,40,5,<=50K


In [47]:
after = {'workclass': data['workclass'].unique(),
          'marital-status': data['marital-status'].unique(),
          'occupation': data['occupation'].unique(),
          'relationship': data['relationship'].unique(),
          'race': data['race'].unique(),
          'native-country': data['native-country'].unique(),
          'income': data['income'].unique()
          }

In [48]:
# check before and after by comparing the unique values
for i in before.keys():
    print(f"col_name: {i} before: {len(before[i])} after: {len(after[i])}")

col_name: workclass before: 9 after: 9
col_name: marital-status before: 7 after: 7
col_name: occupation before: 15 after: 15
col_name: relationship before: 6 after: 6
col_name: race before: 5 after: 5
col_name: native-country before: 42 after: 42
col_name: income before: 4 after: 4


## Handling binary data

male is 0 
female is 1

income less then 50k is 0
income greater then 50k is 1

In [49]:
sex_map = {
    'Male': 0,
    'Female': 1
}

income_map = {
    '<=50K': 0,
    '>50K': 1,
    '<=50K.': 0,
    '>50K.': 1
}

# replace the values in the column
data['sex'] = data['sex'].replace(sex_map)
data['income'] = data['income'].replace(income_map)


In [50]:
continues_colname = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

from sklearn.preprocessing import StandardScaler

#normalize the continues data
scaler = StandardScaler()
data[continues_colname] = scaler.fit_transform(data[continues_colname])

## Splitting Data

In [51]:
#split the data into training and testing data
from sklearn.model_selection import train_test_split

X = data.drop(columns=['income'])
y = data['income']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [52]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.026501,7,-1.062924,13,4,1,1,4,0,0.144629,-0.217456,-0.048943,39,0
1,0.837781,6,-1.008031,13,2,4,0,4,0,-0.145735,-0.217456,-2.251188,39,0
2,-0.047252,4,0.245517,9,0,6,1,4,0,-0.145735,-0.217456,-0.048943,39,0
3,1.059039,4,0.426206,7,2,6,0,2,0,-0.145735,-0.217456,-0.048943,39,0
4,-0.78478,4,1.408394,13,2,10,5,2,1,-0.145735,-0.217456,-0.048943,5,0


In [53]:
import sys
from pathlib import Path

# Add the src directory to the system path
src_path = Path('./src')  # Path to the src directory relative to your notebook
sys.path.append(str(src_path.resolve()))

In [54]:
from model import svm
import importlib
importlib.reload(svm)

from model.svm import svm_

C = 0.001
learning_rate = 0.0005
epoch = 500


my_svm = svm_(learning_rate=learning_rate,epoch=epoch,C_value=C,X=X_train,Y=y_train)
# X_train = X_train.to_numpy()
# y_train = y_train.to_numpy().reshape(-1, 1)

# train model
# ensuring y is in the set {-1, 1}
y_train_preprocessed = 2 * y_train -1
print("Training SVM...")
training_losses, validation_losses = my_svm.train(X_train, y_train_preprocessed)
print("Training complete.")

Training SVM...
Epoch 0: Train Loss = 30.3506, Val Loss = 7.5892
Epoch 50: Train Loss = 24.0858, Val Loss = 6.0481
Epoch 100: Train Loss = 17.8211, Val Loss = 4.5070
Epoch 150: Train Loss = 14.6460, Val Loss = 3.7260
Epoch 200: Train Loss = 14.6460, Val Loss = 3.7260
Epoch 250: Train Loss = 14.6460, Val Loss = 3.7260
Epoch 300: Train Loss = 14.6460, Val Loss = 3.7260
Epoch 350: Train Loss = 14.6460, Val Loss = 3.7260
Epoch 400: Train Loss = 14.6460, Val Loss = 3.7260
Epoch 450: Train Loss = 14.6460, Val Loss = 3.7260
Epoch 499: Train Loss = 14.6460, Val Loss = 3.7260
Training complete.


In [55]:
# testing the model
print("Evaluating SVM...")
y_test_preprocessed = 2 * y_test -1
my_svm.evaluate(X_test,y_test_preprocessed)

Evaluating SVM...
Accuracy on test dataset: 0.752755905511811


In [56]:
# Logistic Regression with Regularization
from model import LogisticRegression
import importlib
importlib.reload(LogisticRegression)

from model.LogisticRegression import LogisticRegression_


lr_learning_rate = 0.001
lr_epoch = 1500
c = 0.001

lr = LogisticRegression_(c,lr_learning_rate, lr_epoch, X_train, y_train)

lr.train()

  from .autonotebook import tqdm as notebook_tqdm


Epoch 1: Loss 0.6931471805600515
Epoch 101: Loss 0.5447287219413788
Epoch 201: Loss 0.5321826515990068
Epoch 301: Loss 0.52183006212708
Epoch 401: Loss 0.5130931614773312
Epoch 501: Loss 0.5055978158209238
Epoch 601: Loss 0.4990878790312947
Epoch 701: Loss 0.4933791503760858
Epoch 801: Loss 0.4883338362782691
Epoch 901: Loss 0.4838456675252571
Epoch 1001: Loss 0.4798307378657138
Epoch 1101: Loss 0.4762215633168857
Epoch 1201: Loss 0.47296305243611403
Epoch 1301: Loss 0.4700096700632505
Epoch 1401: Loss 0.46732338123361283
Train end


In [57]:
print("Evaluating LogisticRegression ...")
lr.evaluate(X_test, y_test)

Evaluating LogisticRegression ...
Accuracy: 0.7625196850393701


In [64]:
%load_ext autoreload
%autoreload 2

#Random Forests

from model.random_forest import random_forest

rf = random_forest(forest_size=100)

rf.fit(X_train.values, y_train_preprocessed) 

y_pred, y_pred_probabilities = rf.predict(X_test.values)

accuracy_score, entropy_loss = rf.evaluate(y_test_preprocessed, y_pred, y_pred_probabilities)

print(f"Accuracy: {accuracy_score}")
print(f"Cross Entropy Loss: {entropy_loss}")



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
fitting model...
aggregated predictions for  0  samples.
aggregated predictions for  1000  samples.
aggregated predictions for  2000  samples.
aggregated predictions for  3000  samples.
aggregated predictions for  4000  samples.
aggregated predictions for  5000  samples.
aggregated predictions for  6000  samples.
aggregated predictions for  7000  samples.
aggregated predictions for  8000  samples.
aggregated predictions for  9000  samples.
Accuracy: 0.8038845144356955
Cross Entropy Loss: 5.852489974869777
