In [None]:
import data_loader
import numpy as np
import pandas as pd
import sklearn.preprocessing as preprocessing
import matplotlib.pyplot as plt
import sklearn.linear_model as linear_model
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# np.random.seed(0)
# np.__version__

In [None]:
# Labels for the 15 data columns; they are applied positionally to the
# combined frame further down, and 'salary' is the prediction target.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'salary',
]
# df = pd.read_csv("data/adult.data", header=None, names=column_names)
# print(df.shape)
# df.sample(n=3)

In [None]:
# The project loader returns two frames; tag the first one as the training
# partition (train=1, test=0) so it can be recovered after the frames are
# concatenated.
train, validation = data_loader.load_train_data('data/adult.data')
train = train.assign(train=1, test=0)
# train.sample(n=3)

In [None]:
# The validation partition is the one with both indicator columns set to 0.
validation = validation.assign(train=0, test=0)
# validation.sample(n=3)

In [None]:
# Load the test file through the project loader and tag it as the test
# partition (train=0, test=1).
test = data_loader.load_test_data('data/adult.test').assign(train=0, test=1)
# test.sample(n=3)

In [None]:
# Stack the three partitions so cleaning and encoding are applied to all of
# them at once; the indicator columns keep track of each row's origin.
partitions = [train, validation, test]
df_combined = pd.concat(partitions)

# Relabel positionally: the data columns first, then the two indicator columns.
df_combined.columns = [*column_names, 'train', 'test']
df_combined.sample(n=3)

In [None]:
# Labels occur with and without a trailing '.'; both spellings share one code.
SALARY_CODES = {' <=50K': 0, ' <=50K.': 0, ' >50K': 1, ' >50K.': 1}

# Cleaning pipeline, written as one non-mutating chain:
#   1. ' ?' is turned into NaN so that such rows can be dropped;
#   2. salary labels are mapped to 0/1 (values that are not in SALARY_CODES,
#      including already-encoded 0/1, pass through unchanged, which keeps the
#      cell safe to re-run);
#   3. every row with a missing value is dropped;
#   4. salary is cast to int64 explicitly. Relying on Series.replace to turn a
#      string column into an integer one depends on silent downcasting, which
#      pandas has deprecated; an object-dtype label column would be one-hot
#      encoded by pd.get_dummies later on. The cast also raises ValueError if
#      an unexpected label is still present instead of letting it through.
df_combined = (
    df_combined
    .replace(' ?', np.nan)
    .assign(salary=lambda frame: frame['salary'].map(
        lambda label: SALARY_CODES.get(label, label)))
    .dropna(how='any')
    .astype({'salary': 'int64'})
)

In [None]:
# Display the distinct values present in the label column.
pd.unique(df_combined['salary'])

In [None]:
# One-hot encode using pandas' default column selection (non-numeric columns).
df_encoded = pd.get_dummies(df_combined)

# Print every resulting column label, one per line.
for column_label in list(df_encoded):
    print(column_label)

In [None]:
# df_encoded.to_csv('data/combined_adult.csv')

In [None]:
# Recover the three partitions from the indicator columns:
#   train=1, test=0 -> training; train=0, test=0 -> validation;
#   train=0, test=1 -> test.
train_flag = df_encoded['train']
test_flag = df_encoded['test']

df_train = df_encoded.loc[train_flag.eq(1) & test_flag.eq(0)].copy()
df_valid = df_encoded.loc[train_flag.eq(0) & test_flag.eq(0)].copy()
df_test = df_encoded.loc[train_flag.eq(0) & test_flag.eq(1)].copy()

In [None]:
# Columns that are not model inputs: the two partition indicators and the label.
NON_FEATURE_COLUMNS = ["train", "test", "salary"]

# Targets: the encoded salary column of each partition.
Y_train = df_train['salary'].copy()
Y_valid = df_valid['salary'].copy()
Y_test = df_test['salary'].copy()

# Features: everything except NON_FEATURE_COLUMNS. drop() returns a new frame,
# so df_train / df_valid / df_test are left untouched and this cell can be
# re-run without a KeyError on the already-removed columns.
X_train = df_train.drop(NON_FEATURE_COLUMNS, axis=1)
X_valid = df_valid.drop(NON_FEATURE_COLUMNS, axis=1)
X_test = df_test.drop(NON_FEATURE_COLUMNS, axis=1)

In [None]:
# Models
# All three estimators involve randomness during fitting; a shared fixed seed
# makes the reported accuracies reproducible across runs.
RANDOM_STATE = 0

#lr = LogisticRegression(solver='lbfgs', max_iter=500)
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
gb = GradientBoostingClassifier(n_estimators=200, random_state=RANDOM_STATE)

#models = [lr, dt, rf, gb]
# Display names, indexed by the evaluation cells: 0 = LR, 1 = Decision Tree,
# 2 = Random Forest, 3 = Gradient Boosting.
names = ['LR', 'Decision Tree','Random Forest', 'Gradient Boosting']

In [None]:
# Logistic Regression: compare several caps on the number of lbfgs iterations.

iter_val = [500, 1000, 5000, 10000]

for max_iter in iter_val:
    lr = LogisticRegression(solver='lbfgs', max_iter=max_iter)
    print('Iterations: {}\n'.format(max_iter))

    # Fit once per setting; the same fitted model scores both splits, so the
    # validation and test numbers are directly comparable and no time is spent
    # on a second, identical training run.
    lr.fit(X_train, Y_train)

    # Predict validation data
    val_lr_prediction = lr.predict(X_valid)
    val_lr_acc_score = accuracy_score(Y_valid, val_lr_prediction)
    print('Validation - Model: {}, Accuracy: {}\n'.format(names[0], val_lr_acc_score))

    # Predict test data
    test_lr_prediction = lr.predict(X_test)
    test_lr_acc_score = accuracy_score(Y_test, test_lr_prediction)
    print('Test - Model: {}, Accuracy: {}\n'.format(names[0], test_lr_acc_score))

    print('------------------\n')

In [None]:
# Decision Tree

# Fit once so that the validation and test scores describe the same fitted
# tree; refitting between the two evaluations retrains the model and, for an
# unseeded estimator, can produce a different tree.
dt.fit(X_train, Y_train)

# Predict validation data
val_dt_prediction = dt.predict(X_valid)
val_dt_acc_score = accuracy_score(Y_valid, val_dt_prediction)
print('Validation - Model: {}, Accuracy: {}'.format(names[1], val_dt_acc_score))

# Predict test data
test_dt_prediction = dt.predict(X_test)
test_dt_acc_score = accuracy_score(Y_test, test_dt_prediction)
print('Test - Model: {}, Accuracy: {}'.format(names[1], test_dt_acc_score))

In [None]:
# Random Forest

# Fit once so that the validation and test scores describe the same fitted
# forest; refitting between the two evaluations doubles the training time and,
# for an unseeded estimator, can produce a different model.
rf.fit(X_train, Y_train)

# Predict validation data
val_rf_prediction = rf.predict(X_valid)
val_rf_acc_score = accuracy_score(Y_valid, val_rf_prediction)
print('Validation - Model: {}, Accuracy: {}'.format(names[2], val_rf_acc_score))

# Predict test data
test_rf_prediction = rf.predict(X_test)
test_rf_acc_score = accuracy_score(Y_test, test_rf_prediction)
print('Test - Model: {}, Accuracy: {}'.format(names[2], test_rf_acc_score))

In [None]:
# Gradient Boosting

# Fit once so that the validation and test scores describe the same fitted
# ensemble; refitting between the two evaluations doubles the training time
# and, for an unseeded estimator, can produce a different model.
gb.fit(X_train, Y_train)

# Predict validation data
val_gb_prediction = gb.predict(X_valid)
val_gb_acc_score = accuracy_score(Y_valid, val_gb_prediction)
print('Validation - Model: {}, Accuracy: {}'.format(names[3], val_gb_acc_score))

# Predict test data
test_gb_prediction = gb.predict(X_test)
test_gb_acc_score = accuracy_score(Y_test, test_gb_prediction)
print('Test - Model: {}, Accuracy: {}'.format(names[3], test_gb_acc_score))

### Memo:

In [None]:
# Disabled draft: the whole cell is a single bare string literal, so none of
# the code inside it is executed.
# NOTE(review): the draft loops over `models`, which in the visible notebook is
# only assigned in a commented-out line of the model-setup cell; re-enabling
# this text as code without restoring that list would raise NameError.
"""
# Prediction on data
print('Accuracy for validation data: \n')

for i in range(len(models)):
    score = models[i].fit(X_train, Y_train)
    prediction = models[i].predict(X_valid)
    acc_score = accuracy_score(Y_valid, prediction)
    print('-'*40)
    print('Model: {}, Accuracy: {}'.format(names[i], acc_score))
"""   

In [None]:
# Disabled draft: the whole cell is a single bare string literal, so none of
# the code inside it is executed.
# NOTE(review): the draft loops over `models`, which in the visible notebook is
# only assigned in a commented-out line of the model-setup cell; re-enabling
# this text as code without restoring that list would raise NameError.
"""
# Prediction on test data
print('Accuracy for test data:\n')

for i in range(len(models)):
    score = models[i].fit(X_train, Y_train)
    prediction = models[i].predict(X_test)
    acc_score = accuracy_score(Y_test, prediction)
    print('-'*40)
    print('Model: {}, Accuracy: {}'.format(names[i], acc_score))
"""

In [None]:
# Disabled draft: the whole cell is a single bare string literal, so none of
# the code inside it is executed.
# NOTE(review): the first pair of statements binds the Series to `coefs` but
# prints `coefs2`, a name that is never assigned in the visible notebook; that
# line would raise NameError if this text were re-enabled as code.
# NOTE(review): the values shown are the estimators' `feature_importances_`,
# not correlations, despite the heading inside the string.
"""
# Printing correlation:

coefs = pd.Series(dt.feature_importances_, index=X_train.columns)
print(coefs2.sort_values(ascending = False))

coefs3 = pd.Series(rf.feature_importances_, index=X_train.columns)
print(coefs3.sort_values(ascending = False))

coefs4 = pd.Series(gb.feature_importances_, index=X_train.columns)
print(coefs4.sort_values(ascending = False))
"""