In [1]:
import pandas as pd
import os
from os import listdir
import numpy as np
from scipy.stats import skew, kurtosis
import math
from scipy import stats
import timeit

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier,VotingClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis

# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale,StandardScaler,LabelEncoder 
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score,GridSearchCV, learning_curve,train_test_split,StratifiedKFold, KFold 
from sklearn.metrics import accuracy_score

In [2]:
def entropy(array):
    a = list(array)
    d = {x:a.count(x) for x in a}
    value = list(d.values())
    h = 0
    for i in value:
        h = h + i/len(a) * math.log2(i/len(a))
    return -h

In [3]:
def describe(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
    train_number, test_number,  features  = X_train.shape[0], X_test.shape[0],X_train.shape[1] 
    classes = len(y.unique())
    category = []
    each_outlier = []
    for col in X:
        if len(X[col].unique()) == 2:
            category.append('bin')          
        elif len(X[col].unique()) < 21:   # %
            category.append('cat')    
        else:
            category.append('con')
            z = np.abs(stats.zscore(X[col]))
            each_outlier.append(len(z[z>3])/(train_number+test_number))
    binary, categorical, continous  = category.count('bin')/features, category.count('cat')/features, category.count('con')/features   
    meancor_class = []
    for i in y.unique():
        subdataset = X[y==i]
        if subdataset.shape[0] > 10:
            a = [subdataset[col].nunique() for col in subdataset.columns]
            cutdim = features - a.count(1)
            indices = [i for i, x in enumerate(a) if x != 1]
            subdataset = subdataset[subdataset.columns[indices]]
            meancor_class.append(np.mean(np.unique(np.reshape(abs(subdataset.corr()).values,cutdim*cutdim))[:-1]))
    meancor = np.mean(meancor_class)
    outliers = np.mean(each_outlier)
    classentropy = entropy(y)/math.log2(classes)
    des = np.array([train_number, test_number,classes,features,binary,categorical,continous,meancor,outliers,classentropy])  
    description = pd.DataFrame(des)       # %
    description.index=index
    return description
index=['train_number','test_number','classes','features','binary','categorical','continous','meancor','outliers','classentropy']

In [7]:
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
def implement(X_train,y_train, X_test, y_test):
    Kfold = StratifiedKFold(n_splits=10)
    ### scale 
    scaler = StandardScaler()  
    scaler.fit(X_train)
    X_train= scaler.transform(X_train)
    X_test= scaler.transform(X_test)
    # learn
    classifiers=[DecisionTreeClassifier(random_state=7),SVC(random_state=7),LogisticRegression(random_state=7),
                 KNeighborsClassifier(),MLPClassifier(random_state=7),
                 GaussianNB(),RandomForestClassifier(random_state=7),AdaBoostClassifier(DecisionTreeClassifier(random_state=0), random_state=7)]
    time = []
    cv_results = []
    test_score=[]
    for classifier in classifiers:
        start = timeit.default_timer()
        cv_results.append(cross_val_score(classifier,X_train,y_train, scoring = "accuracy", cv = Kfold, n_jobs=4))
        classifier.fit(X_train,y_train)
        test_score.append(classifier.score(X_test, y_test))
        stop = timeit.default_timer()
        time.append(stop-start)
    
    cv_means = []
    cv_std = []
    for cv_result in cv_results:
        cv_means.append(cv_result.mean())
        cv_std.append(cv_result.std())

    cv_res = pd.DataFrame({"Algorithm":["DecisionTree","SVC","LogisticRegression","KNeighboors","MultipleLayerPerceptron",
                                        'NaiveBayes',"RandomForest","AdaBoost"],"CrossValMeans":cv_means,"CrossValerrors": cv_std,
                      'Time':time,'test':test_score})
    return cv_res

In [5]:
arcene=pd.read_csv('Data/arcene.csv')
label = 'Class'
y = arcene[label]  # %
X = arcene.drop([label],axis=1)
description = describe(X,y)
description

Unnamed: 0,0
train_number,150.0
test_number,50.0
classes,2.0
features,10000.0
binary,0.0061
categorical,0.0878
continous,0.9061
meancor,0.188664
outliers,0.013436
classentropy,0.989588


In [31]:
i=0
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=100)
arcene_res=implement(X_train,y_train, X_test, y_test)
arcene_res.index = arcene_res.Algorithm
arcene_res=arcene_res.drop(['Algorithm'],axis=1)
for seed in range(100):
    i = i+1 
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=seed)
    res=implement(X_train,y_train, X_test, y_test)
    res.index = res.Algorithm
    res=res.drop(['Algorithm'],axis=1)
    arcene_res = (i*arcene_res+res)/(i+1)
arcene_res

Unnamed: 0_level_0,CrossValMeans,CrossValerrors,Time,test
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DecisionTree,0.703787,0.107101,1.281386,0.707129
SVC,0.752136,0.108303,1.340125,0.757426
LogisticRegression,0.763742,0.092641,1.615689,0.844554
KNeighboors,0.808154,0.097384,0.550664,0.798614
MultipleLayerPerceptron,0.760116,0.101103,6.754241,0.747327
NaiveBayes,0.554663,0.115142,0.380704,0.546139
RandomForest,0.76263,0.101794,0.464299,0.778614
AdaBoost,0.70166,0.108003,1.284511,0.70198


In [38]:
#arcene_res=arcene_res.drop(['Algorithm'],axis=1)
arcene_res.reset_index(level=0, inplace=True)

In [40]:
arcene_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.703787,0.107101,1.281386,0.707129
1,SVC,0.752136,0.108303,1.340125,0.757426
2,LogisticRegression,0.763742,0.092641,1.615689,0.844554
3,KNeighboors,0.808154,0.097384,0.550664,0.798614
4,MultipleLayerPerceptron,0.760116,0.101103,6.754241,0.747327
5,NaiveBayes,0.554663,0.115142,0.380704,0.546139
6,RandomForest,0.76263,0.101794,0.464299,0.778614
7,AdaBoost,0.70166,0.108003,1.284511,0.70198


In [41]:
arcene_res.to_csv('output/arcene.csv')

In [12]:
description.columns = ['arcene']
description.to_csv('describe/arcene.csv')
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=200)
arcene_res=implement(X_train,y_train, X_test, y_test)
arcene_res.to_csv('output/arcene.csv')
arcene_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.794762,0.089016,3.185719,0.6
1,SVC,0.786726,0.097955,1.575428,0.7
2,LogisticRegression,0.860714,0.104838,1.614382,0.82
3,KNeighboors,0.853512,0.065398,0.554779,0.68
4,MultipleLayerPerceptron,0.807619,0.094574,6.609307,0.66
5,NaiveBayes,0.505833,0.122206,0.381905,0.46
6,RandomForest,0.789345,0.107007,0.438919,0.64
7,AdaBoost,0.774702,0.069124,1.270584,0.58


In [14]:
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
def implement1(X_train,y_train, X_test, y_test):
    Kfold = StratifiedKFold(n_splits=4)
    ### scale 
    scaler = StandardScaler()  
    scaler.fit(X_train)
    X_train= scaler.transform(X_train)
    X_test= scaler.transform(X_test)
   # Tune parameters  1. Decision Tree
    param_grid = {'criterion': ['gini', 'entropy'],  #scoring methodology; two supported formulas for calculating information gain - default is gini
              'splitter': ['best', 'random'], #splitting methodology; two supported strategies - default is best
             # 'max_depth': [4,10,50,None], #max depth tree can grow; default is none
              #'min_samples_split': [2,5,10], #minimum subset size BEFORE new split (fraction is % of total); default is 2 [2,5,10,.03,.05]
              #'min_samples_leaf': [1,5,10], #minimum subset size AFTER new split split (fraction is % of total); default is 1 [1,5,10,.03,.05],
              'max_features': ['auto',None], #max features to consider when performing split; default none or all
              'random_state': [0] #seed or control random number generator: https://www.quora.com/What-is-seed-in-random-number-generation
             }

    dt_model = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, scoring = 'accuracy', cv = Kfold,n_jobs= 4, verbose = 0)
    dt_model.fit(X_train,y_train)
    dt_best = dt_model.best_estimator_
    # 2. SVC classifier
    SVMC = SVC(probability=True)
    svc_param_grid = {'kernel': ['rbf'], 
                  'gamma': [ 0.001,0.01, 1],
                  'C': [0.01,1,  100 ],
                 'random_state': [0]}
    gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=Kfold, scoring="accuracy", n_jobs= 4, verbose = 0)
    gsSVMC.fit(X_train,y_train)
    SVMC_best = gsSVMC.best_estimator_
    ### 3. Logistic Regression Classifier
    lgr = LogisticRegression(penalty='l2',solver='lbfgs')
    lgr_param_grid = {
       'max_iter':[40,100,200],
       'C': [0.01,1, 100],
       'random_state': [0]}
    gslr = GridSearchCV(lgr,param_grid = lgr_param_grid, scoring = 'accuracy', cv=Kfold,n_jobs= 4, verbose=0)
    gslr.fit(X_train,y_train)
    lr_best = gslr.best_estimator_
    ### 6. KNN
    knn = KNeighborsClassifier()
    knn_param_grid = {"n_neighbors": np.arange(3, 25, 2),
    "metric": ["euclidean", "cityblock"]}
    gsknn = GridSearchCV(knn,param_grid = knn_param_grid, cv=Kfold,scoring = 'accuracy', n_jobs= 4,verbose=0)
    gsknn.fit(X_train,y_train)
    knn_best = gsknn.best_estimator_
    ### 8.MLP
    mlp = MLPClassifier(max_iter=1000,tol=0.0001)
    mlp_param_grid = {
            'hidden_layer_sizes': [(100,),(50,)],
            'activation': ['tanh'],
            'solver': ['adam'],
            'alpha': [0.0001,  0.01],
            'learning_rate': ['constant','adaptive'],
            'random_state': [0]
        }
    gsmlp = GridSearchCV(mlp,param_grid = mlp_param_grid,cv=Kfold,verbose=0,  n_jobs= 4)
    gsmlp.fit(X_train,y_train)
    gsmlp_best = gsmlp.best_estimator_
      # 10. RandomForest
    RFC = RandomForestClassifier()
    rf_param_grid = {#"max_depth": [4,10,50,None],
                      "max_features": ['auto',None],
                     # "min_samples_split": [3, 10],
                     # "min_samples_leaf": [1, 5],
                      #"bootstrap": [False],
                      "n_estimators" :[10,50,100],
                      "criterion": ['gini', 'entropy'],
                        'random_state': [0]}
    gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=Kfold, scoring="accuracy", n_jobs= 4, verbose = 0)
    gsRFC.fit(X_train,y_train)
    RFC_best = gsRFC.best_estimator_
    # 11. Adaboost
    DTC = DecisionTreeClassifier(random_state=0)
    adaDTC = AdaBoostClassifier(DTC, random_state=7)
    ada_param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
                  "base_estimator__splitter" :   ["best", "random"],
                  "n_estimators" :[10,  100],
                  "learning_rate":  [0.01, 1.0, 100.0],
                     'random_state': [0]}
    gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=Kfold, scoring="accuracy", n_jobs= 4, verbose = 0)
    gsadaDTC.fit(X_train,y_train)
    ada_best = gsadaDTC.best_estimator_
    
    # learn
    classifiers=[dt_best,SVMC_best,lr_best,knn_best,gsmlp_best,GaussianNB(),RFC_best,ada_best]
    time = []
    cv_results = []
    test_score=[]
    for classifier in classifiers:
        start = timeit.default_timer()
        cv_results.append(cross_val_score(classifier,X_train,y_train, scoring = "accuracy", cv = Kfold, n_jobs=4))
        classifier.fit(X_train,y_train)
        test_score.append(classifier.score(X_test, y_test))
        stop = timeit.default_timer()
        time.append(stop-start)
    
    cv_means = []
    cv_std = []
    for cv_result in cv_results:
        cv_means.append(cv_result.mean())
        cv_std.append(cv_result.std())

    cv_res = pd.DataFrame({"Algorithm":["DecisionTree","SVC","LogisticRegression","KNeighboors","MultipleLayerPerceptron",
                                        'NaiveBayes',"RandomForest","AdaBoost"],"CrossValMeans":cv_means,"CrossValerrors": cv_std,
                      'Time':time,'test':test_score})
    return cv_res

Fitting 4 folds for each of 288 candidates, totalling 1152 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:   15.4s
[Parallel(n_jobs=4)]: Done 876 tasks      | elapsed:   36.9s
[Parallel(n_jobs=4)]: Done 1152 out of 1152 | elapsed:   49.8s finished


0.7533333333333333

In [18]:
credit=pd.read_csv('Data/credit.csv')
credit = credit.drop(['ID'],axis=1)
label = 'default.payment.next.month'
y = credit[label]  # %
X = credit.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
credit_res=implement(X_train,y_train, X_test, y_test)
credit_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.72831,0.009321,3.606902,0.7252
1,SVC,0.820488,0.005207,82.099281,0.8144
2,LogisticRegression,0.812045,0.002171,0.788667,0.803333
3,KNeighboors,0.792354,0.005936,14.930116,0.786267
4,MultipleLayerPerceptron,0.814089,0.004594,53.467542,0.810933
5,NaiveBayes,0.669822,0.031445,0.151334,0.663733
6,RandomForest,0.807466,0.004164,2.324265,0.800533
7,AdaBoost,0.785466,0.006868,18.395862,0.7748


In [19]:
credit_res.to_csv('output/credit.csv')

In [20]:
adult=pd.read_csv('Data/adult.csv')

In [21]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [28]:
adult.workclass=adult.workclass.replace(' ?',adult.workclass.mode()[0])

In [31]:
adult.occupation=adult.occupation.replace(' ?',adult.occupation.mode()[0])
adult['native-country']=adult['native-country'].replace(' ?',adult['native-country'].mode()[0])

In [33]:
cols=['workclass','education','marital-status','occupation','relationship','race','sex','native-country']
labelencoder=LabelEncoder()
for col in cols:
    adult[col] = labelencoder.fit_transform(adult[col])
label = 'class'
y = adult[label]  # %
X = adult.drop([label],axis=1)
description = describe(X,y)
description.columns = ['adult']

FileNotFoundError: [Errno 2] No such file or directory: '../describe/adult.csv'

In [34]:
description.to_csv('describe/adult.csv')

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
adult_res=implement(X_train,y_train, X_test, y_test)
adult_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.811963,0.006127,1.099756,0.811727
1,SVC,0.848925,0.006018,101.689901,0.853411
2,LogisticRegression,0.823564,0.006248,0.793124,0.829089
3,KNeighboors,0.82998,0.005255,20.056257,0.82917
4,MultipleLayerPerceptron,0.850018,0.00747,81.654333,0.850872
5,NaiveBayes,0.804892,0.005196,0.412078,0.803783
6,RandomForest,0.850919,0.004026,1.800816,0.850954
7,AdaBoost,0.828862,0.008026,32.940332,0.827369


In [42]:
adult_res.to_csv('output/adult.csv')

In [43]:
bioresponse=pd.read_csv('Data/bioresponse.csv')
label = 'target'
y = bioresponse[label]  # %
X = bioresponse.drop([label],axis=1)
description = describe(X,y)
description.columns = ['bioresponse']
description.to_csv('describe/bioresponse.csv')
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
bioresponse_res=implement(X_train,y_train, X_test, y_test)
bioresponse_res.to_csv('output/bioresponse.csv')
bioresponse_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.718799,0.028363,6.240876,0.745203
1,SVC,0.771076,0.029711,54.459361,0.775053
2,LogisticRegression,0.726968,0.028033,31.002467,0.733475
3,KNeighboors,0.750092,0.020989,14.270545,0.744136
4,MultipleLayerPerceptron,0.758262,0.028624,71.477207,0.767591
5,NaiveBayes,0.605396,0.021591,1.040884,0.565032
6,RandomForest,0.762893,0.025287,1.239404,0.799574
7,AdaBoost,0.708484,0.027577,4.347092,0.724947


In [44]:
bioresponse_res.to_csv('output/bioresponse.csv')

In [53]:
digit=pd.read_csv('Data/digit.csv')
du = digit.nunique()
digit = digit.drop(digit.columns[du ==1],axis=1)
label = 'label'
y = digit[label]  # %
X = digit.drop([label],axis=1)
description = describe(X,y)
description.columns = ['digit']
description.to_csv('describe/digit.csv')
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
digit_res=implement(X_train,y_train, X_test, y_test)
digit_res.to_csv('output/digit.csv')
digit_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.850953,0.007497,38.496567,0.846857
1,SVC,0.956189,0.003465,989.778891,0.957429
2,LogisticRegression,0.907173,0.004502,1482.803813,0.905905
3,KNeighboors,0.937365,0.00439,700.641372,0.936952
4,MultipleLayerPerceptron,0.966158,0.0021,123.627193,0.967714
5,NaiveBayes,0.528064,0.008095,6.855807,0.528095
6,RandomForest,0.935143,0.00533,11.553082,0.933429
7,AdaBoost,0.851873,0.007886,37.507837,0.845619


In [57]:
internetad=pd.read_csv('Data/internetad.csv')
labelencoder=LabelEncoder()
internetad['class'] = labelencoder.fit_transform(internetad['class'])
label = 'class'
y = internetad[label]  # %
X = internetad.drop([label],axis=1)
description = describe(X,y)
description.columns = ['internetad']
description.to_csv('describe/internetad.csv')

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
internetad_res=implement(X_train,y_train, X_test, y_test)
internetad_res.to_csv('output/internetad.csv')
internetad_res 

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.961793,0.013087,5.939167,0.963415
1,SVC,0.962201,0.012281,19.339123,0.967073
2,LogisticRegression,0.964232,0.014129,13.42006,0.963415
3,KNeighboors,0.952045,0.016741,7.130864,0.95
4,MultipleLayerPerceptron,0.966681,0.015783,40.720596,0.964634
5,NaiveBayes,0.787308,0.018222,0.779171,0.779268
6,RandomForest,0.96992,0.015403,1.444854,0.980488
7,AdaBoost,0.961793,0.016219,18.123823,0.969512


In [62]:
micromass=pd.read_csv('Data/micromass.csv')
label = 'Class'
y = micromass[label]  # %
X = micromass.drop([label],axis=1)
description = describe(X,y)
description.columns = ['micromass']
description.to_csv('describe/micromass.csv')
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
micromass_res=implement(X_train,y_train, X_test, y_test)
micromass_res.to_csv('output/micromass.csv')
micromass_res 

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.786057,0.052659,0.334813,0.755245
1,SVC,0.508543,0.050301,2.25421,0.41958
2,LogisticRegression,0.885139,0.027279,11.985113,0.797203
3,KNeighboors,0.592936,0.06795,0.357981,0.58042
4,MultipleLayerPerceptron,0.867042,0.052934,6.570259,0.804196
5,NaiveBayes,0.713681,0.064792,0.170112,0.699301
6,RandomForest,0.802764,0.080051,0.218039,0.783217
7,AdaBoost,0.782791,0.061121,0.345133,0.79021
