In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings

warnings.filterwarnings("ignore") # to ignore warnings being printed on the console


def get_workclass_value(x):
    if x == "Without-pay":
        return 7
    elif x == "Private":
        return 6
    elif x == "State-gov":
        return 5
    elif x == "Self-emp-not-inc":
        return 4
    elif x == "Local-gov":
        return 3
    elif x == "Federal-gov":
        return 2
    else:
        return 1
    
    
def get_marital_status_value(x):
    if x == "Never-married":
        return 7
    elif x == "Separated":
        return 6
    elif x == "Married-spouse-absent":
        return 5
    elif x == "Widowed":
        return 4
    elif x == "Divorced":
        return 3
    elif x == "Married-civ-spouse":
        return 2
    else:
        return 1


def get_occupation_value(x):
    if x in ["Exec-managerial", "Prof-specialty", "Protective-serv"]:
        return 1
    elif x in ["Sales", "Transport-moving", "Tech-support", "Craft-repair"]:
        return 2
    else:
        return 3


def get_relationship_value(x):
    if x == "Own-child":
        return 6
    elif x == "Other-relative":
        return 5
    elif x == "Unmarried":
        return 4
    elif x == "Not-in-family":
        return 3
    elif x == "Husband":
        return 2
    else:
        return 1


def get_race_value(x):
    if x == "Other":
        return 5
    elif x == "Amer-Indian-Eskimo":
        return 4
    elif x == "Black":
        return 3
    elif x == "White":
        return 2
    else:
        return 1


def get_sex_value(x):
    if x == "Male":
        return 2
    else:
        return 1

    
def get_native_country_value(x):
    if x in ["United-States", "Cuba", "Poland", "Thailand", "Ecuador", "China", "South", "Scotland", "Greece", "Ireland", "Hungary"]:
        return 2
    elif x in ["India", "England", "Canada", "Germany", "Iran", "Philippines", "Cambodia", "Taiwan", "France", "Italy", "Japan", "Yugoslavia", "Hong"]:
        return 1
    else:
        return 3
    

def get_class_value(x):
    if x == ">50K":
        return 1
    else:
        return 0
    

temp_df1 = pd.read_csv("readonly/adult.txt", header=None, sep=", ")
temp_df2 = pd.read_csv("readonly/testdata.txt", header=None, sep=", ")
df = pd.concat([temp_df1, temp_df2]).sample(frac=1)
df.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "class"]
df["class"] = df["class"].apply(lambda x: x.replace(".", ""))
df = df[df["workclass"] != '?']
df = df[df["education"] != '?']
df = df[df["marital-status"] != '?']
df = df[df["occupation"] != '?']
df = df[df["relationship"] != '?']
df = df[df["race"] != '?']
df = df[df["sex"] != '?']
df = df[df["native-country"] != '?']

below = df[df["class"] == "<=50K"].sample(n=11208)
above = df[df["class"] == ">50K"]

train_data = pd.concat([above, below]).sample(frac=1)
train_data = train_data.drop("capital-loss", axis=1)
train_data = train_data.drop("education", axis=1)
train_data = train_data.drop("fnlwgt", axis=1)
train_data['workclass'] = train_data['workclass'].apply(get_workclass_value)
train_data['marital-status'] = train_data['marital-status'].apply(get_marital_status_value)
train_data['occupation'] = train_data['occupation'].apply(get_occupation_value)
train_data['relationship'] = train_data['relationship'].apply(get_relationship_value)
train_data['race'] = train_data['race'].apply(get_race_value)
train_data['sex'] = train_data['sex'].apply(get_sex_value)
train_data['native-country'] = train_data['native-country'].apply(get_native_country_value)
train_data['class'] = train_data['class'].apply(get_class_value)
train_data.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,hours-per-week,native-country,class
5498,64,6,11,2,2,2,1,2,0,40,1,0
31164,49,6,10,2,1,2,2,2,0,40,2,1
26606,54,6,1,2,2,2,1,2,0,40,2,0
9470,43,6,13,2,1,2,2,2,0,47,2,1
1347,25,6,13,7,1,6,2,2,0,40,2,0


In [2]:
feature_matrix_df = train_data.iloc[:, :-1]
labels_df = train_data.iloc[:, -1]
feature_matrix = feature_matrix_df.values
labels = labels_df.values

train_data, test_data, train_labels, test_labels = train_test_split(feature_matrix, labels, test_size=0.2, random_state=42)

transformed_train_data = MinMaxScaler().fit_transform(train_data)
transformed_test_data = MinMaxScaler().fit_transform(test_data)

transformed_train_data[:5,:]

array([[0.26027397, 0.        , 0.6       , 0.16666667, 0.        ,
        0.2       , 0.25      , 1.        , 0.        , 0.29591837,
        0.5       ],
       [0.65753425, 0.83333333, 0.53333333, 0.5       , 1.        ,
        0.6       , 0.5       , 0.        , 0.        , 0.19387755,
        1.        ],
       [0.4109589 , 0.83333333, 0.8       , 0.16666667, 0.        ,
        0.2       , 0.25      , 1.        , 0.1502415 , 0.39795918,
        0.5       ],
       [0.16438356, 0.33333333, 0.86666667, 1.        , 0.        ,
        0.4       , 0.25      , 1.        , 0.        , 0.5       ,
        0.5       ],
       [0.06849315, 0.66666667, 0.6       , 1.        , 1.        ,
        0.8       , 0.5       , 1.        , 0.        , 0.3877551 ,
        1.        ]])

In [3]:
def train_model(model_name, Model, tuned_parameters, X_train, y_train, fold_num=10, scoring_function="f1_macro",
                useRandomizedSearch=True):
    if useRandomizedSearch == False:
        clf = GridSearchCV(Model, tuned_parameters, cv=fold_num, scoring=scoring_function)
    else:
        clf = RandomizedSearchCV(Model, tuned_parameters, cv=fold_num, scoring=scoring_function)
    clf.fit(X_train, y_train)

    print("Best parameters set found:")
    print(clf.best_params_)
    print("Best score: %0.3f" % clf.best_score_)
    print("Scores on training set:")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    meanMap = {}
    stdMap = {}
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        meanMap[str(params)] = mean
        stdMap[str(params)] = std
    for params in sorted(meanMap, key=meanMap.get, reverse=True):
        print("%0.3f (+/-%0.03f) for %r" % (meanMap[params], stdMap[params] * 2, params))

    return clf.best_params_

models_dict = {}
print("\n\nTRAINING DATA RESULTS:")
print("\n\nBaggingClassifier Classifier")
tuned_parameters = {'random_state': [10],
                    'n_estimators': np.arange(10, 41, 4),
                    'max_samples': [0.7, 0.8],
                    'max_features': [0.7, 0.8],
                    'oob_score': [True, False]}
models_dict["BaggingClassifier()"] = train_model("BaggingClassifier", BaggingClassifier(),
                                                tuned_parameters, transformed_train_data, train_labels,
                                                useRandomizedSearch=True)

print("\n\nMultinomialNB Classifier")
tuned_parameters = {'fit_prior': [True, False],
                    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 2]}
models_dict["MultinomialNB()"] = train_model("MultinomialNB", MultinomialNB(), tuned_parameters,
                                             transformed_train_data, train_labels, useRandomizedSearch=True)

print("\n\nLogisticRegression Classifier")
tuned_parameters = {'random_state': [10],
                    'C': [0.01, 0.1, 1, 0.001],
                    'fit_intercept': [True, False],
                    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                    'warm_start': [True, False]}
models_dict["LogisticRegression()"] = train_model("LogisticRegression", LogisticRegression(), tuned_parameters,
                                                  transformed_train_data, train_labels, useRandomizedSearch=True)

print("\n\nAdaBoostClassifier Classifier")
tuned_parameters = {'random_state': [10],
                    'n_estimators': range(10, 41, 4),
                    'learning_rate': [0.001, 0.01, 0.1, 1],
                    'algorithm': ['SAMME.R', 'SAMME']}
models_dict["AdaBoostClassifier()"] = train_model("AdaBoostClassifier", AdaBoostClassifier(), tuned_parameters,
                                                  transformed_train_data, train_labels, useRandomizedSearch=True)

print("\n\nRandomForestClassifier Classifier")
tuned_parameters = {'random_state': [10],
                    'n_estimators': np.arange(10, 41, 4),
                    'max_depth': [10, 20, 30, 40],
                    'min_samples_split': [2, 3, 5, 10],
                    'warm_start': [True, False],
                    'min_samples_leaf': [1, 2, 4]}
models_dict["RandomForestClassifier()"] = train_model("RandomForestClassifier", RandomForestClassifier(),
                                                      tuned_parameters, transformed_train_data, train_labels,
                                                      useRandomizedSearch=True)

print("\n\nGradientBoostingClassifier Classifier")
tuned_parameters = {'random_state': [10],
                    'n_estimators': range(10, 41, 4),
                    'max_depth': range(3, 8),
                    'min_samples_split': range(2, 5),
                    'min_samples_leaf': range(40, 60, 4),
                    'max_features': range(5, 10),
                    'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]}
models_dict["GradientBoostingClassifier()"] = train_model("GradientBoostingClassifier",
                                                          GradientBoostingClassifier(), tuned_parameters,
                                                          transformed_train_data, train_labels,
                                                          useRandomizedSearch=True)
print("\n\nDecisionTreeClassifier Classifier")
tuned_parameters = {'random_state': [10],
                    'max_depth': range(10, 30, 3),
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 3, 4]}
models_dict["DecisionTreeClassifier()"] = train_model("DecisionTreeClassifier", DecisionTreeClassifier(),
                                                      tuned_parameters, transformed_train_data, train_labels,
                                                      useRandomizedSearch=True)

print("\n\nMLPClassifier Classifier")
tuned_parameters = {'random_state': [10],
                    'max_iter': range(200, 500, 100),
                    'hidden_layer_sizes': [(40, 40, 40), (50, 50, ), (70, 50, 30,), (70, 30,), (30,)],
                    'activation': ['relu', 'identity', 'logistic', 'tanh'],
                    'solver': ['lbfgs', 'sgd', 'adam'],
                    'shuffle': [True, False],
                    'learning_rate_init': [0.001, 0.01, 0.1],
                    'learning_rate': ['constant', 'invscaling', 'adaptive']}
models_dict["MLPClassifier()"] = train_model("MLPClassifier", MLPClassifier(), tuned_parameters,
                                             transformed_train_data, train_labels, useRandomizedSearch=True)

print("\n\nSVC Classifier")
tuned_parameters = {'random_state': [10],
                    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
                    'degree': np.arange(2, 20),
                    'C': np.arange(1, 10)}
models_dict["SVC()"] = train_model("SVC", SVC(), tuned_parameters, transformed_train_data, train_labels,
                                   useRandomizedSearch=True)

print("\n\nKNeighborsClassifier Classifier")
tuned_parameters = {'n_neighbors': np.arange(1, 31),
                    'weights': ["uniform", "distance"],
                    'p': np.arange(1, 6),
                    'metric': ["minkowski", "braycurtis", "canberra", "matching", "dice", "kulsinski",
                               "rogerstanimoto", "russellrao", "sokalmichener", "sokalsneath"]}
models_dict["KNeighborsClassifier()"] = train_model("KNeighborsClassifier", KNeighborsClassifier(),
                                                    tuned_parameters, transformed_train_data, train_labels,
                                                    useRandomizedSearch=True)



TRAINING DATA RESULTS:


BaggingClassifier Classifier
Best parameters set found:
{'random_state': 10, 'oob_score': False, 'n_estimators': 26, 'max_samples': 0.8, 'max_features': 0.7}
Best score: 0.827
Scores on training set:
0.827 (+/-0.015) for "{'random_state': 10, 'oob_score': False, 'n_estimators': 26, 'max_samples': 0.8, 'max_features': 0.7}"
0.824 (+/-0.013) for "{'random_state': 10, 'oob_score': False, 'n_estimators': 38, 'max_samples': 0.7, 'max_features': 0.8}"
0.824 (+/-0.013) for "{'random_state': 10, 'oob_score': True, 'n_estimators': 38, 'max_samples': 0.7, 'max_features': 0.8}"
0.823 (+/-0.016) for "{'random_state': 10, 'oob_score': False, 'n_estimators': 18, 'max_samples': 0.7, 'max_features': 0.7}"
0.823 (+/-0.016) for "{'random_state': 10, 'oob_score': True, 'n_estimators': 18, 'max_samples': 0.7, 'max_features': 0.7}"
0.823 (+/-0.016) for "{'random_state': 10, 'oob_score': False, 'n_estimators': 26, 'max_samples': 0.7, 'max_features': 0.8}"
0.823 (+/-0.016) for "{'r

Best parameters set found:
{'random_state': 10, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 16}
Best score: 0.801
Scores on training set:
0.801 (+/-0.007) for "{'random_state': 10, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 16}"
0.799 (+/-0.007) for "{'random_state': 10, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 16}"
0.798 (+/-0.010) for "{'random_state': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_depth': 16}"
0.797 (+/-0.009) for "{'random_state': 10, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 19}"
0.791 (+/-0.012) for "{'random_state': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_depth': 19}"
0.789 (+/-0.008) for "{'random_state': 10, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 19}"
0.788 (+/-0.013) for "{'random_state': 10, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 25}"
0.788 (+/-0.012) for "{'random_state': 10, 'min_samples_split': 10, 'min_samples_leaf': 1

In [4]:
def run_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
#     print()
#     try:
#         print("Name: " + name + "\nFeature Importance:")
#         print(model.feature_importances_)
#     except:
#         print("Feature Importance missing.")
    return accuracy_score(y_test, y_predict), f1_score(y_test, y_predict), precision_score(y_test,
                                                                                           y_predict), recall_score(
        y_test, y_predict)


def run_test(models_dict, X_train, X_test, y_train, y_test):
    print("Acc\tF1\tPrec\tRecall\tModel")
    for name in models_dict.keys():
        model = eval(name)
        model.set_params(**models_dict[name])
        acc, f1, prec, rec = run_model(name, model, X_train, X_test, y_train, y_test)
        print("%.4f\t%.4f\t%.4f\t%.4f\t%s" % (acc, f1, prec, rec, name))
        
print("\n\n\nTESTING DATA RESULTS:\n\n")
run_test(models_dict, transformed_train_data, transformed_test_data, train_labels, test_labels)




TESTING DATA RESULTS:


Acc	F1	Prec	Recall	Model
0.8118	0.8193	0.7902	0.8506	BaggingClassifier()
0.7799	0.7946	0.7469	0.8488	MultinomialNB()
0.8044	0.8136	0.7793	0.8510	LogisticRegression()
0.8080	0.8122	0.7971	0.8279	AdaBoostClassifier()
0.8189	0.8292	0.7868	0.8764	RandomForestClassifier()
0.8218	0.8297	0.7966	0.8657	GradientBoostingClassifier()
0.7944	0.8025	0.7743	0.8328	DecisionTreeClassifier()
0.8120	0.8205	0.7872	0.8568	MLPClassifier()
0.8093	0.8230	0.7699	0.8839	SVC()
0.7870	0.7922	0.7758	0.8092	KNeighborsClassifier()
