In [None]:
# Import the libraries required for this work
import sklearn.preprocessing as prep
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import math
import itertools as iter

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# Load the training dataframe and the public test data.
# The training rows are shuffled because the classifier class below splits
# train/validation by head/tail position; the shuffle is seeded so that
# Restart & Run All reproduces the same split.
SEED = 42
df_test = pd.read_csv(r'test-pub.csv')
df = pd.read_csv(r'train.csv').sample(frac=1, random_state=SEED)

In [None]:
###### PREPROCESSING ######

In [None]:
# One-hot encode every categorical column (approach taken from Jun's example).
# Dummy columns for unknown ('?') entries are dropped, and so is every column
# whose name ends in "50K" (the Salary dummies), leaving only predictor columns.
df_onehot = pd.get_dummies(df)

keys = df_onehot.keys()
data_keys = [
    column for column in keys
    if not ('?' in column or column.endswith("50K"))
]
data_train = df_onehot[data_keys]
target_train = df_onehot["Salary_ >50K"]

# Encode the test set the same way, then add an all-zero column for every
# training key it lacks so both frames expose the same columns.
df_onehot1 = pd.get_dummies(df_test)
absent_columns = [column for column in data_keys if column not in df_onehot1.keys()]
for column in absent_columns:
    df_onehot1[column] = 0

data_test = df_onehot1[data_keys]

In [None]:
# Build the list of feature columns used for training
data_train_features = list(data_train.keys())

# ID and the final weight (Fnlwgt) are left out of the training features;
# they may be used at a later stage
for excluded in ('ID', 'Fnlwgt'):
    data_train_features.remove(excluded)

# One-hot encoding creates one column per native country; only the
# United-States indicator is kept as a training feature
native_keys = [column for column in data_train.keys() if 'Native' in column]
native_keys.remove('Native country_ United-States')
data_train_features = [column for column in data_train_features if column not in native_keys]

In [None]:
# Min-max normalisation of the continuous (non one-hot) columns
NUMERIC_COLUMNS = ['Work hours per week', 'Age', 'Education years', 'Capital gain', 'Capital loss']


def normalize_values(dataframe, scaler=None):
    """Return a copy of ``dataframe`` with the numeric columns min-max scaled.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing every column listed in ``NUMERIC_COLUMNS``.
    scaler : fitted scaler, optional
        Object exposing ``transform``.  When omitted, a ``MinMaxScaler`` is
        fitted on ``dataframe`` itself (the previous behaviour).  Pass the
        scaler fitted on the training data to scale other frames consistently.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched, so this is safe to call on a
        slice of another frame and safe to re-run.
    """
    if scaler is None:
        scaler = prep.MinMaxScaler().fit(dataframe[NUMERIC_COLUMNS].values)

    scaled = dataframe.copy()
    scaled[NUMERIC_COLUMNS] = scaler.transform(scaled[NUMERIC_COLUMNS].values)
    return scaled


# Fit the scaler on the training data only and reuse it for the test data, so
# a given raw value maps to the same scaled value in both frames.
train_scaler = prep.MinMaxScaler().fit(data_train[NUMERIC_COLUMNS].values)
data_train = normalize_values(data_train, scaler=train_scaler)
data_test = normalize_values(data_test, scaler=train_scaler)

In [None]:
# Summary statistics of the selected feature columns in the normalised test set
# (swap in data_train to inspect the training set instead).
data_test[data_train_features].describe()

In [None]:
###### Classification Class ######

In [None]:
class classifier():
    """Wrap a scikit-learn style estimator with a fixed train/validation split.

    The estimator class is assigned to ``clf`` after construction;
    ``init_model`` instantiates it, ``train`` / ``validate`` fit and score it
    (ROC AUC) and ``export`` writes test-set probabilities to CSV.  Data is read
    from the notebook globals ``data_train``, ``target_train``,
    ``data_train_features``, ``data_test`` and ``df_test``.
    """

    # Class-level defaults.  Every one of these is only ever *rebound* on an
    # instance, never mutated in place, so instances do not share state.
    features = []                 # feature column names currently in use
    clf = False                   # estimator class (not an instance)
    clf_model = False             # estimator instance built by init_model()
    setting_distributions = {}    # search space handed to RandomizedSearchCV
    tuned_settings = {}           # best parameters found by tune_settings()
    train_data = False
    train_target = False
    validation_data = False
    validation_target = False
    best_roc_score = 0            # best validation ROC AUC seen by validate()
    best_output = False           # test predictions captured at that best score

    best_feature_scores = 0       # validation ROC AUC of the latest selection round
    best_features = []            # features picked so far by find_best_features()

    def __init__(self):
        """Build the train/validation split from the notebook globals.

        The first 90% of the rows become the training split and the remainder
        the validation split.  Only the attributes assigned here are touched,
        so calling ``__init__`` again resets the data and feature list while
        keeping ``clf``, the tuned settings and the selected features.
        """
        self.features = data_train_features

        data = data_train[self.features]
        sample_weight = data_train['Fnlwgt']
        data_target = target_train

        n_training_samples = int(len(data) *.90)
        n_validation_samples = len(data) - n_training_samples
        self.train_data = data.head(n_training_samples)
        self.train_target = data_target.head(n_training_samples)
        self.train_weight = sample_weight.head(n_training_samples)
        self.validation_data = data.tail(n_validation_samples)
        self.validation_target = data_target.tail(n_validation_samples)

    def init_model(self,settings=False):
        """Instantiate ``clf`` into ``clf_model``.

        ``settings`` is a dict of keyword arguments for the estimator; a falsy
        value (including an empty dict) builds it with its defaults.  Does
        nothing when no estimator class has been assigned to ``clf``.
        """
        if not self.clf:
            return

        if settings:
            self.clf_model = self.clf(**settings)
        else:
            self.clf_model = self.clf()

    def train(self):
        """Fit ``clf_model`` on the training split using the current features.

        The fit is first attempted with ``train_weight`` as sample weights; if
        that raises ``TypeError`` (e.g. the estimator's ``fit`` takes no
        ``sample_weight``) it is retried without them.
        """
        try:
            self.clf_model.fit(X=self.train_data[self.features], y=self.train_target, sample_weight=self.train_weight)
        except TypeError:
            print('Sample Weight Ignored')
            self.clf_model.fit(X=self.train_data[self.features], y=self.train_target)

    def validate(self):
        """Return the ROC AUC of ``clf_model`` on the validation split.

        When the score beats ``best_roc_score`` the score is recorded and the
        model's test-set probabilities are stored in ``best_output`` (a copy of
        ``df_test`` with an added ``Predicted`` column).
        """
        validation_probabilities = self.clf_model.predict_proba(self.validation_data[self.features])[:,1]
        score = roc_auc_score(y_true=self.validation_target, y_score=validation_probabilities)

        if score > self.best_roc_score:
            self.best_roc_score = score
            test_probabilities = self.clf_model.predict_proba(data_test[self.features])[:,1]
            self.best_output = df_test.copy()
            self.best_output['Predicted'] = test_probabilities
            print('Score Improved')
        return score

    def export(self, file_name, best=False):
        """Write the ``ID`` and ``Predicted`` columns to ``file_name`` as CSV.

        With ``best=True`` the predictions captured by ``validate`` at the best
        validation score are written; otherwise the current ``clf_model``
        predicts the test set afresh.
        """
        if best:
            output = self.best_output
        else:
            test_probabilities = self.clf_model.predict_proba(data_test[self.features])[:,1]
            output = df_test.copy()
            output['Predicted'] = test_probabilities

        output[["ID","Predicted"]].to_csv(file_name, index=False)

    def tune_settings(self):
        """Randomised search over ``setting_distributions`` scored by ROC AUC.

        The search is fitted on the training split with the current features.
        The best parameters are stored in ``tuned_settings`` and returned.
        """
        tuning_clf = RandomizedSearchCV(self.clf_model, self.setting_distributions, scoring='roc_auc')
        search = tuning_clf.fit(X=self.train_data[self.features],y=self.train_target)
        self.tuned_settings = search.best_params_
        return self.tuned_settings

    def find_best_features(self, n_features):
        """Add the best-scoring combination of ``n_features`` unused features.

        Every combination of ``n_features`` features not yet in
        ``best_features`` is joined with the features already selected, fitted
        on the training split and scored by ROC AUC on the validation split.
        The winning combination is appended to ``best_features`` and its score
        is stored in ``best_feature_scores``.

        Returns the best validation score of this round.  When there is nothing
        left to evaluate, the selection is left unchanged and the previously
        stored ``best_feature_scores`` is returned.
        """
        candidates = [i for i in self.features if i not in self.best_features]
        if len(candidates) < n_features:
            n_combinations = 0
        else:
            # Exact integer binomial coefficient (float division would print
            # e.g. "12.0" and lose precision for large counts).
            n_combinations = math.factorial(len(candidates)) // (
                math.factorial(n_features) * math.factorial(len(candidates) - n_features)
            )

        print('Combinations: ' + str(n_combinations))

        status = 0
        best_key = None
        best_roc_auc_score = 0
        for combination in iter.combinations(candidates, r=n_features):
            feats = list(combination) + self.best_features
            self.clf_model.fit(X=self.train_data[feats],y=self.train_target)
            validation_probabilities = self.clf_model.predict_proba(self.validation_data[feats])[:,1]
            score = roc_auc_score(y_true=self.validation_target, y_score=validation_probabilities)
            if best_key is None or score > best_roc_auc_score:
                best_roc_auc_score = score
                best_key = combination
                print(str(best_key) + ' : ' + str(best_roc_auc_score))

            status += 1
            if status % 50 == 0:
                print('Completed ' + str(status) + ' of ' + str(n_combinations))

        if best_key is None:
            print('No feature combinations left to evaluate')
            return self.best_feature_scores

        print(str(best_key) + '\n' + str(best_roc_auc_score))
        # Rebind to a new list instead of using "+=": in-place extension would
        # mutate the class-level default list shared by every instance.
        self.best_features = self.best_features + list(best_key)
        self.best_feature_scores = best_roc_auc_score
        return best_roc_auc_score

In [None]:
##### Model Training #####

In [None]:
#### Random Forest Classifier ####
# Wrap scikit-learn's RandomForestClassifier in the shared `classifier` helper.
# Only the estimator class is stored here; init_model() instantiates it later.
from sklearn.ensemble import RandomForestClassifier as rdmfrst
rdm_frst_clf = classifier()
rdm_frst_clf.clf = rdmfrst

In [None]:
# Re-run the constructor to rebuild the train/validation split and restore the
# full feature list; attributes not assigned in __init__ (e.g. clf) are kept.
rdm_frst_clf.__init__() # reset data sets

In [None]:
# Train and Validate
# Build the model from the tuned settings (estimator defaults while none are
# stored), fit it on the training split and score ROC AUC on the validation split.
rdm_frst_clf.init_model(settings=rdm_frst_clf.tuned_settings)
rdm_frst_clf.train()
rdm_frst_clf.validate()

In [None]:
# Greedy forward selection: each call adds the single unused feature that scores
# best on validation together with the features already chosen; run 10 rounds.
for i in range(10):
    score = rdm_frst_clf.find_best_features(1)

In [None]:
# From here on train, tune and predict with the selected features only.
rdm_frst_clf.features = rdm_frst_clf.best_features

In [None]:
# Tune Settings
# Candidate values sampled by RandomizedSearchCV inside tune_settings().
rdm_frst_clf.setting_distributions = {
    'n_estimators': list(range(10,2000,10)), # number of trees in the forest (0 trees is not a valid forest)
    'bootstrap': [True,False], # repeatedly sample from training data
    # test with points not used in set
    # NOTE(review): oob_score=True is only valid together with bootstrap=True,
    # so draws combining oob_score=True with bootstrap=False cannot be fitted.
    'oob_score': [True,False],
    'criterion': ['gini', 'entropy'], # how node splits are scored
    'max_depth': list(range(1,100)) + [None], # max depth of the tree
    # n features considered for a split decision; 'auto' was an alias of 'sqrt'
    # for classifiers and is rejected by newer scikit-learn releases
    'max_features': ['sqrt', 'log2'],
}
rdm_frst_clf.tune_settings()

In [None]:
# Output
# best=True writes the test predictions captured by validate() at its best
# validation score. NOTE: only validate() refreshes that snapshot; the feature
# selection and tuning steps above do not update it.
rdm_frst_clf.export('rdm_frst_v5.csv', best=True)

In [None]:
def return_feature_importance(keys,clf):
    """Pair each feature name with its importance, most important first.

    ``keys`` is an indexable sequence of feature names and ``clf`` an object
    exposing ``feature_importances_`` in the same order.  Returns a list of
    ``(name, importance)`` tuples sorted by descending importance.
    """
    # Read the attribute once: it may be a computed property, so fetching it
    # on every iteration would repeat that work for each feature.
    importances = clf.feature_importances_
    key_importance = [(keys[i], importances[i]) for i in range(len(keys))]
    return sorted(key_importance, key=lambda a: a[1], reverse=True)

# Names of the (up to) 20 highest-ranked features of the random forest model
ranked_features = return_feature_importance(rdm_frst_clf.features, rdm_frst_clf.clf_model)
top_tree_features = [name for name, _ in ranked_features[:20]]
top_tree_features

In [None]:
#### Nearest Neighbour Classifier ####
# Wrap scikit-learn's KNeighborsClassifier in the shared `classifier` helper.
# Only the estimator class is stored here; init_model() instantiates it later.
from sklearn.neighbors import KNeighborsClassifier as knn
knn_clf = classifier()
knn_clf.clf = knn

In [None]:
# Re-run the constructor to rebuild the train/validation split and restore the
# full feature list; attributes not assigned in __init__ (e.g. clf) are kept.
knn_clf.__init__() # reset data sets

In [None]:
# Train and Validate
# Build the model from the tuned settings (estimator defaults while none are
# stored), fit it on the training split and score ROC AUC on the validation split.
knn_clf.init_model(settings=knn_clf.tuned_settings)
knn_clf.train()
knn_clf.validate()

In [None]:
# Greedy forward selection: each call adds the single unused feature that scores
# best on validation together with the features already chosen; run 10 rounds.
for i in range(10):
    score = knn_clf.find_best_features(1)

In [None]:
# From here on train, tune and predict with the selected features only,
# and display the selection.
knn_clf.features = knn_clf.best_features
knn_clf.features

In [None]:
# Tune Settings
# Search space sampled by RandomizedSearchCV inside tune_settings():
#   n_neighbors - number of neighbours to use
#   weights     - neighbours weighted uniformly or by distance
#   algorithm   - algorithm used to determine the neighbours
#   p           - 1: Manhattan distance, 2: Euclidean distance
knn_clf.setting_distributions = dict(
    n_neighbors=list(range(1, 100, 10)),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)
knn_clf.tune_settings()

In [None]:
# Output
# best=True writes the test predictions captured by validate() at its best
# validation score. NOTE: only validate() refreshes that snapshot; the feature
# selection and tuning steps above do not update it.
knn_clf.export('knn_v5.csv', best=True)

In [None]:
#### Logistic Regression Classifier ####
# Wrap scikit-learn's LogisticRegression in the shared `classifier` helper.
# Only the estimator class is stored here; init_model() instantiates it later.
from sklearn.linear_model import LogisticRegression as log_reg
log_reg_clf = classifier()
log_reg_clf.clf = log_reg

In [None]:
# Re-run the constructor to rebuild the train/validation split and restore the
# full feature list; attributes not assigned in __init__ (e.g. clf) are kept.
log_reg_clf.__init__() # reset data sets

In [None]:
# Train and Validate
# Build the model from the tuned settings (estimator defaults while none are
# stored), fit it on the training split and score ROC AUC on the validation split.
temp_settings = log_reg_clf.tuned_settings
log_reg_clf.init_model(settings=temp_settings)

log_reg_clf.train()
log_reg_clf.validate()

In [None]:
# Greedy forward selection, a single round here: adds the one unused feature
# that scores best on validation together with the features already chosen.
for i in range(1):
    score = log_reg_clf.find_best_features(1)

In [None]:
# From here on train, tune and predict with the selected features only.
log_reg_clf.features = log_reg_clf.best_features
# Optional manual reset of the selection before another search:
# log_reg_clf.best_features = []

In [None]:
# Tune Settings
# Candidate values sampled by RandomizedSearchCV inside tune_settings().
log_reg_clf.setting_distributions = {
    'solver':['lbfgs', 'liblinear'], # algorithm used for optimization
    # maximum iterations for the solver to converge; starts at 100 because a
    # budget of 0 iterations cannot fit anything and only wastes a search draw
    'max_iter': list(range(100,6000,100)),
    'fit_intercept': [True, False], # should a bias be added to the decision function
}
log_reg_clf.tune_settings()


In [None]:
# Output
# best=True writes the test predictions captured by validate() at its best
# validation score. NOTE: only validate() refreshes that snapshot; the feature
# selection and tuning steps above do not update it.
log_reg_clf.export('lr_v5.csv', best=True)

In [None]:
#### Adaboost Classifier ####
# Wrap scikit-learn's AdaBoostClassifier in the shared `classifier` helper.
# Only the estimator class is stored here; init_model() instantiates it later.
from sklearn.ensemble import AdaBoostClassifier as ada_boost
ada_boost_clf = classifier()
ada_boost_clf.clf = ada_boost

In [None]:
# Re-run the constructor to rebuild the train/validation split and restore the
# full feature list; attributes not assigned in __init__ (e.g. clf) are kept.
ada_boost_clf.__init__() # reset data sets

In [None]:
# Train and Validate
# Build the model from the tuned settings (estimator defaults while none are
# stored), fit it on the training split and score ROC AUC on the validation split.
ada_boost_clf.init_model(settings=ada_boost_clf.tuned_settings)

ada_boost_clf.train()
ada_boost_clf.validate()

In [None]:
# Tune Settings
# Search space sampled by RandomizedSearchCV inside tune_settings():
#   n_estimators  - maximum number of estimators to boost
#   learning_rate - weight of each subsequent classifier
ada_boost_clf.setting_distributions = dict(
    n_estimators=list(range(1, 1000)),
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1],
)
ada_boost_clf.tune_settings()

In [None]:
# ada_boost_clf.best_features = []
ada_boost_clf.features = ada_boost_clf.best_features

In [None]:
for i in range(10):
    score = ada_boost_clf.find_best_features(1)

In [None]:
# Output
# best=True writes the test predictions captured by validate() at its best
# validation score. NOTE: only validate() refreshes that snapshot; the feature
# selection and tuning steps above do not update it.
ada_boost_clf.export('ada_boost_v1.csv', best=True)