In [1]:
import pandas as pd
import os
from os import listdir
import numpy as np
from scipy.stats import skew, kurtosis
import math
from scipy import stats
import timeit

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier,VotingClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis

# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale,StandardScaler,LabelEncoder 
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score,GridSearchCV, learning_curve,train_test_split,StratifiedKFold, KFold 
from sklearn.metrics import accuracy_score

In [48]:
##### ignore warnings
import warnings
warnings.filterwarnings('ignore')
def implement(X_train,y_train, X_test, y_test):
    Kfold = StratifiedKFold(n_splits=5)
    ### scale 
    scaler = StandardScaler()  
    scaler.fit(X_train)
    X_train= scaler.transform(X_train)
    X_test= scaler.transform(X_test)
   # Tune parameters  1. Decision Tree
    param_grid = {'criterion': ['gini', 'entropy'],  #scoring methodology; two supported formulas for calculating information gain - default is gini
              'splitter': ['best', 'random'], #splitting methodology; two supported strategies - default is best
              'max_depth': [4,10,50,None], #max depth tree can grow; default is none
              'min_samples_split': [2,5,10], #minimum subset size BEFORE new split (fraction is % of total); default is 2 [2,5,10,.03,.05]
              'min_samples_leaf': [1,5,10], #minimum subset size AFTER new split split (fraction is % of total); default is 1 [1,5,10,.03,.05],
              'max_features': ['auto',None], #max features to consider when performing split; default none or all
              'random_state': [0] #seed or control random number generator: https://www.quora.com/What-is-seed-in-random-number-generation
             }

    dt_model = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, scoring = 'accuracy', cv = Kfold,n_jobs= 4, verbose = 0)
    dt_model.fit(X_train,y_train)
    dt_best = dt_model.best_estimator_
    # 2. SVC classifier
    SVMC = SVC(probability=True)
    svc_param_grid = {'kernel': ['rbf'], 
                  'gamma': [ 0.001,0.05, 1],
                  'C': [0.001, 0.05,1, 50, 1000],
                 'random_state': [0]}
    gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=Kfold, scoring="accuracy", n_jobs= 4, verbose = 0)
    gsSVMC.fit(X_train,y_train)
    SVMC_best = gsSVMC.best_estimator_
    ### 3. Logistic Regression Classifier
    lgr = LogisticRegression(penalty='l2',solver='lbfgs')
    lgr_param_grid = {
       'max_iter':[40,60,80,100,200],
       'C': [0.001,0.01,0.1,1,10, 100, 1000],
       'random_state': [0]}
    gslr = GridSearchCV(lgr,param_grid = lgr_param_grid, scoring = 'accuracy', cv=Kfold,n_jobs= 4, verbose=0)
    gslr.fit(X_train,y_train)
    lr_best = gslr.best_estimator_
    ### 6. KNN
    knn = KNeighborsClassifier()
    knn_param_grid = {"n_neighbors": np.arange(3, 29, 2),
    "metric": ["euclidean", "cityblock"]}
    gsknn = GridSearchCV(knn,param_grid = knn_param_grid, cv=Kfold,scoring = 'accuracy', n_jobs= 4,verbose=0)
    gsknn.fit(X_train,y_train)
    knn_best = gsknn.best_estimator_
    ### 8.MLP
    mlp = MLPClassifier(max_iter=1000,tol=0.0001)
    mlp_param_grid = {
            'hidden_layer_sizes': [(100,),(50,)],
            'activation': ['tanh','relu'],
            'solver': ['adam'],
            'alpha': [0.0001,  0.01],
            'learning_rate': ['constant','adaptive'],
            'random_state': [0]
        }
    gsmlp = GridSearchCV(mlp,param_grid = mlp_param_grid,cv=Kfold,verbose=0,  n_jobs= 4)
    gsmlp.fit(X_train,y_train)
    gsmlp_best = gsmlp.best_estimator_
      # 10. RandomForest
    RFC = RandomForestClassifier()
    rf_param_grid = {"max_depth": [4,10,50,None],
                      "max_features": [0.1, 0.25,'auto',None],
                      "min_samples_split": [3, 10],
                      "min_samples_leaf": [1, 5],
                      #"bootstrap": [False],
                      "n_estimators" :[10,50,100],
                      "criterion": ['gini', 'entropy'],
                        'random_state': [0]}
    gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=Kfold, scoring="accuracy", n_jobs= 4, verbose = 0)
    gsRFC.fit(X_train,y_train)
    RFC_best = gsRFC.best_estimator_
    # 11. Adaboost
    DTC1 = DecisionTreeClassifier(random_state=0,max_depth=1)
    DTC2 = DecisionTreeClassifier(random_state=0,max_depth=3)
    #DTC3 = dt_best
    adaDTC = AdaBoostClassifier(random_state=7)
    ada_param_grid = { "base_estimator": [DTC1,DTC2],
              "base_estimator__criterion" : ["gini", "entropy"],
                  "base_estimator__splitter" :   ["best", "random"],
                  "n_estimators" :[10, 50, 100, 500],
                  "learning_rate":  [0.001, 0.05, 1.0, 50],
                     'random_state': [0]}
    gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=Kfold, scoring="accuracy", n_jobs= 4, verbose = 0)
    gsadaDTC.fit(X_train,y_train)
    ada_best = gsadaDTC.best_estimator_
    
    # learn
    classifiers=[dt_best,SVMC_best,lr_best,knn_best,gsmlp_best,GaussianNB(),RFC_best,ada_best]
    time = []
    cv_results = []
    test_score=[]
    for classifier in classifiers:
        start = timeit.default_timer()
        cv_results.append(cross_val_score(classifier,X_train,y_train, scoring = "accuracy", cv = Kfold, n_jobs=4))
        classifier.fit(X_train,y_train)
        test_score.append(classifier.score(X_test, y_test))
        stop = timeit.default_timer()
        time.append(stop-start)
    
    cv_means = []
    cv_std = []
    for cv_result in cv_results:
        cv_means.append(cv_result.mean())
        cv_std.append(cv_result.std())

    cv_res = pd.DataFrame({"Algorithm":["DecisionTree","SVC","LogisticRegression","KNeighboors","MultipleLayerPerceptron",
                                        'NaiveBayes',"RandomForest","AdaBoost"],"CrossValMeans":cv_means,"CrossValerrors": cv_std,
                      'Time':time,'test':test_score})
    return cv_res

In [62]:
australia = pd.read_csv('australia.csv')
label = 'A15'
y = australia[label]  # %
X = australia.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
Kfold = StratifiedKFold(n_splits=5)
    ### scale 
scaler = StandardScaler()  
scaler.fit(X_train)
X_train= scaler.transform(X_train)
X_test= scaler.transform(X_test)
    # 11. Adaboost
DTC1 = DecisionTreeClassifier(random_state=0,max_depth=1)
DTC2 = DecisionTreeClassifier(random_state=0,max_depth=2)
    #DTC3 = dt_best
adaDTC = AdaBoostClassifier(random_state=7)
ada_param_grid = { "base_estimator": [DTC1,DTC2],
              "base_estimator__criterion" : ["gini", "entropy"],
                  "base_estimator__splitter" :   ["best", "random"],
                  "n_estimators" :[10, 50, 100, 500],
                  "learning_rate":  [0.001, 0.05, 1.0, 50],
                     'random_state': [0]}
gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=Kfold, scoring="accuracy", n_jobs= 4, verbose = 0)
gsadaDTC.fit(X_train,y_train)
ada_best = gsadaDTC.best_estimator_
ada_best

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='random'),
          learning_rate=0.05, n_estimators=100, random_state=0)

In [4]:
os.chdir('Data/')

In [15]:
australia = pd.read_csv('australia.csv')
label = 'A15'
y = australia[label]  # %
X = australia.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
australia_res=implement(X_train,y_train, X_test, y_test)
australia_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.86854,0.024594,0.013761,0.809249
1,SVC,0.88596,0.025053,0.13082,0.815029
2,LogisticRegression,0.868484,0.025563,0.018205,0.855491
3,KNeighboors,0.880172,0.027313,0.021524,0.849711
4,MultipleLayerPerceptron,0.893652,0.017124,3.540894,0.791908
5,NaiveBayes,0.849104,0.014745,0.01786,0.878613
6,RandomForest,0.895556,0.028839,0.224608,0.849711
7,AdaBoost,0.893652,0.015985,0.048266,0.83815


In [16]:
heart=pd.read_csv('heart.csv')
label = 'target'
y = heart[label]  # %
X = heart.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
heart_res=implement(X_train,y_train, X_test, y_test)
heart_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.72744,0.057208,0.010602,0.75
1,SVC,0.819517,0.048822,0.030564,0.842105
2,LogisticRegression,0.837005,0.04115,0.016026,0.868421
3,KNeighboors,0.823961,0.049866,0.014156,0.815789
4,MultipleLayerPerceptron,0.788502,0.04983,1.805728,0.802632
5,NaiveBayes,0.828406,0.044245,0.013114,0.776316
6,RandomForest,0.841353,0.045112,0.235822,0.828947
7,AdaBoost,0.85913,0.029442,0.037001,0.815789


In [38]:
heart=pd.read_csv('heart.csv')
label = 'target'
y = heart[label]  # %
X = heart.drop([label],axis=1)
i=0
heart_res.index = heart_res.Algorithm
heart_res=heart_res.drop(['Algorithm'],axis=1)
for seed in [10,50,111,544]:
    i = i+1 
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=seed)
    res=implement(X_train,y_train, X_test, y_test)
    res.index = res.Algorithm
    res=res.drop(['Algorithm'],axis=1)
    heart_res = (i*heart_res+res)/(i+1)
heart_res

Unnamed: 0_level_0,CrossValMeans,CrossValerrors,Time,test
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DecisionTree,0.791022,0.063087,0.012497,0.75
SVC,0.826272,0.048929,0.02599,0.828947
LogisticRegression,0.830735,0.04507,0.017068,0.818421
KNeighboors,0.844805,0.049468,0.014158,0.823684
MultipleLayerPerceptron,0.803254,0.054861,1.528185,0.794737
NaiveBayes,0.822851,0.045111,0.013493,0.805263
RandomForest,0.859761,0.053189,0.172275,0.828947
AdaBoost,0.857924,0.049205,0.136641,0.818421


In [41]:
heart_res.to_csv('../output/heart.csv')
australia_res.to_csv('../output/australia.csv')

In [18]:
abalone = pd.read_csv('abalone.csv')
# abalone = abalone.drop(['id','Unnamed: 32'],axis=1)     
le = LabelEncoder()              # label encoding
col = 'Sex'
abalone[col] = le.fit_transform(abalone[col])
label = 'Class_number_of_rings'
y = abalone[label]  # %
X = abalone.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
abalone_res=implement(X_train,y_train, X_test, y_test)
abalone_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.265313,0.016428,0.026204,0.266029
1,SVC,0.280314,0.012911,7.855573,0.249761
2,LogisticRegression,0.271542,0.024174,0.805574,0.25933
3,KNeighboors,0.267254,0.011736,0.064555,0.242105
4,MultipleLayerPerceptron,0.27708,0.005969,18.683232,0.262201
5,NaiveBayes,0.241284,0.021799,0.046803,0.211483
6,RandomForest,0.281538,0.015173,0.40762,0.239234
7,AdaBoost,0.278047,0.020433,0.430373,0.262201


In [26]:
iris = pd.read_csv('iris.csv')
iris = iris.drop(['Id'],axis=1)
label = 'Species'
y = iris[label]  # %
X = iris.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
iris_res=implement(X_train,y_train, X_test, y_test)
iris_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.973485,0.036015,0.016378,0.973684
1,SVC,0.974242,0.033744,0.013937,0.973684
2,LogisticRegression,0.956385,0.026469,0.034087,0.947368
3,KNeighboors,0.983333,0.020412,0.017028,0.947368
4,MultipleLayerPerceptron,0.974242,0.033744,0.618183,0.947368
5,NaiveBayes,0.948052,0.040501,0.012126,0.973684
6,RandomForest,0.965909,0.031419,0.031712,0.973684
7,AdaBoost,0.974242,0.033744,0.151764,0.921053


In [27]:
i=0
iris_res.index = iris_res.Algorithm
iris_res=iris_res.drop(['Algorithm'],axis=1)
for seed in [10,50,111,134,544]:
    i = i+1 
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=seed)
    res=implement(X_train,y_train, X_test, y_test)
    res.index = res.Algorithm
    res=res.drop(['Algorithm'],axis=1)
    iris_res = (i*iris_res+res)/(i+1)
iris_res

Unnamed: 0_level_0,CrossValMeans,CrossValerrors,Time,test
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DecisionTree,0.943203,0.036575,0.011399,0.938596
SVC,0.96892,0.0363,0.01347,0.951754
LogisticRegression,0.95512,0.039853,0.033133,0.95614
KNeighboors,0.973555,0.024553,0.014229,0.942982
MultipleLayerPerceptron,0.964104,0.033908,0.687887,0.951754
NaiveBayes,0.948067,0.039463,0.013189,0.95614
RandomForest,0.96904,0.03481,0.045004,0.938596
AdaBoost,0.971824,0.032765,0.086309,0.934211


In [34]:
iris_res.reset_index(level=0, inplace=True)
iris_res.to_csv('../output/iris.csv')

In [20]:
breast = pd.read_csv('breast.csv')
breast = breast.drop(['id','Unnamed: 32'],axis=1)     # drop columns
le = LabelEncoder()              # label encoding
col = 'diagnosis'
breast[col] = le.fit_transform(breast[col])
label = 'diagnosis'
y = breast[label]  # %
X = breast.drop([label],axis=1)
breast_res=implement(X_train,y_train, X_test, y_test)
breast_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.973485,0.036015,0.010776,0.973684
1,SVC,0.974242,0.033744,0.012752,0.973684
2,LogisticRegression,0.956385,0.026469,0.032391,0.947368
3,KNeighboors,0.983333,0.020412,0.014742,0.947368
4,MultipleLayerPerceptron,0.974242,0.033744,0.615049,0.947368
5,NaiveBayes,0.948052,0.040501,0.016503,0.973684
6,RandomForest,0.965909,0.031419,0.031161,0.973684
7,AdaBoost,0.974242,0.033744,0.15199,0.921053


In [21]:
vehicle = pd.read_csv('vehicle.csv')
le = LabelEncoder()              # label encoding
col = 'Class'
vehicle[col] = le.fit_transform(vehicle[col])
label = 'Class'
y = vehicle[label]  # %
X = vehicle.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
vehicle_res=implement(X_train,y_train, X_test, y_test)
vehicle_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.701834,0.024235,0.015881,0.669811
1,SVC,0.824875,0.015774,0.191353,0.858491
2,LogisticRegression,0.804416,0.024037,0.134638,0.787736
3,KNeighboors,0.718982,0.043256,0.023229,0.665094
4,MultipleLayerPerceptron,0.8472,0.02846,5.69834,0.834906
5,NaiveBayes,0.45739,0.033663,0.01821,0.466981
6,RandomForest,0.760042,0.031972,0.2119,0.721698
7,AdaBoost,0.757017,0.021714,1.799966,0.75


In [22]:
churn=pd.read_csv('churn.csv')
label = 'class'
y = churn[label]  # %
X = churn.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
churn_res=implement(X_train,y_train, X_test, y_test)
churn_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.93227,0.004901,0.036208,0.9328
1,SVC,0.917071,0.006822,6.312235,0.9104
2,LogisticRegression,0.86427,0.005202,0.031503,0.872
3,KNeighboors,0.891731,0.00395,0.40553,0.9008
4,MultipleLayerPerceptron,0.923204,0.006416,15.831526,0.9328
5,NaiveBayes,0.872007,0.010369,0.023425,0.868
6,RandomForest,0.955739,0.006354,2.137583,0.9592
7,AdaBoost,0.95894,0.009788,1.533242,0.9632


In [28]:
abalone_res.to_csv('../output/abalone.csv')
breast_res.to_csv('../output/breast.csv')
vehicle_res.to_csv('../output/vehicle.csv')
churn_res.to_csv('../output/churn.csv')
iris_res.to_csv('../output/iris.csv')

In [30]:
nursery=pd.read_csv('nursery.csv')
labelencoder=LabelEncoder()
for col in nursery.columns:
    nursery[col] = labelencoder.fit_transform(nursery[col])
label = 'class'
y = nursery[label]  # %
X = nursery.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
nursery_res=implement(X_train,y_train, X_test, y_test)
nursery_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.99321,0.002016,0.042991,0.994136
1,SVC,0.99537,0.00126,23.596945,0.998765
2,LogisticRegression,0.763889,0.009092,0.132048,0.765741
3,KNeighboors,0.941461,0.001049,0.216553,0.961111
4,MultipleLayerPerceptron,0.999691,0.000252,29.452827,0.999383
5,NaiveBayes,0.639506,0.012736,0.054225,0.641049
6,RandomForest,0.993416,0.002042,0.789455,0.995062
7,AdaBoost,0.993107,0.002448,0.049574,0.993827


In [31]:
glass=pd.read_csv('glass.csv')
le = LabelEncoder()              # label encoding
col = 'Type'
glass[col] = le.fit_transform(glass[col])
label = 'Type'
y = glass[label]  # %
X = glass.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
glass_res=implement(X_train,y_train, X_test, y_test)
glass_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.711383,0.090077,0.012819,0.666667
1,SVC,0.690963,0.095707,0.029522,0.722222
2,LogisticRegression,0.695589,0.039725,0.111053,0.611111
3,KNeighboors,0.728628,0.059785,0.015082,0.685185
4,MultipleLayerPerceptron,0.718506,0.054962,1.758305,0.740741
5,NaiveBayes,0.467414,0.141637,0.016575,0.62963
6,RandomForest,0.75528,0.053558,0.147412,0.833333
7,AdaBoost,0.767045,0.044485,0.335022,0.814815


In [39]:
glass=pd.read_csv('glass.csv')
le = LabelEncoder()              # label encoding
col = 'Type'
glass[col] = le.fit_transform(glass[col])
label = 'Type'
y = glass[label]  # %
X = glass.drop([label],axis=1)
i=0
glass_res.index = glass_res.Algorithm
glass_res=glass_res.drop(['Algorithm'],axis=1)
for seed in [10,50,111,544]:
    i = i+1 
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=seed)
    res=implement(X_train,y_train, X_test, y_test)
    res.index = res.Algorithm
    res=res.drop(['Algorithm'],axis=1)
    glass_res = (i*glass_res+res)/(i+1)
glass_res

Unnamed: 0_level_0,CrossValMeans,CrossValerrors,Time,test
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DecisionTree,0.674078,0.077687,0.011994,0.577778
SVC,0.69623,0.066641,0.032909,0.688889
LogisticRegression,0.654506,0.047467,0.11716,0.644444
KNeighboors,0.70599,0.068,0.014394,0.696296
MultipleLayerPerceptron,0.710823,0.062894,1.605365,0.692593
NaiveBayes,0.400509,0.091538,0.014133,0.481481
RandomForest,0.790156,0.044929,0.215842,0.781481
AdaBoost,0.767754,0.061145,1.431788,0.751852


In [40]:
nursery_res.to_csv('../output/nursery.csv')
glass_res.to_csv('../output/glass.csv')

In [35]:
dermatology=pd.read_csv('dermatology.csv')
dermatology['Age']=dermatology['Age'].replace('?','40').astype(int)
np.mean(dermatology['Age'])
label = 'class'
y = dermatology[label]  # %
X = dermatology.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
dermatology_res=implement(X_train,y_train, X_test, y_test)
dermatology_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.941709,0.047755,0.017373,0.945652
1,SVC,0.974928,0.024493,0.102364,0.98913
2,LogisticRegression,0.970937,0.024438,0.062781,0.98913
3,KNeighboors,0.971335,0.024013,0.020335,0.967391
4,MultipleLayerPerceptron,0.97101,0.02123,1.191955,0.98913
5,NaiveBayes,0.872308,0.021988,0.022115,0.858696
6,RandomForest,0.97829,0.013478,0.288239,0.98913
7,AdaBoost,0.967171,0.024301,0.196695,0.967391


In [36]:
i=0
dermatology_res.index = dermatology_res.Algorithm
dermatology_res=dermatology_res.drop(['Algorithm'],axis=1)
for seed in [10,134,544]:
    i = i+1 
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=seed)
    res=implement(X_train,y_train, X_test, y_test)
    res.index = res.Algorithm
    res=res.drop(['Algorithm'],axis=1)
    dermatology_res = (i*dermatology_res+res)/(i+1)

In [53]:
dermatology_res.reset_index(level=0, inplace=True)
glass_res.reset_index(level=0, inplace=True)
heart_res.reset_index(level=0, inplace=True)
glass_res.to_csv('../output/glass.csv')
dermatology_res.to_csv('../output/dermatology.csv')
heart_res.to_csv('../output/heart.csv')

In [42]:
splice=pd.read_csv('splice.csv')
splice = splice.drop(['Instance_name'],axis=1)
labelencoder=LabelEncoder()
for col in splice.columns:
    splice[col] = labelencoder.fit_transform(splice[col])
label = 'Class'
y = splice[label]  # %
X = splice.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
splice_res=implement(X_train,y_train, X_test, y_test)
splice_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.883384,0.016293,0.065721,0.892231
1,SVC,0.844894,0.004656,3.448954,0.853383
2,LogisticRegression,0.812284,0.008028,0.082867,0.83208
3,KNeighboors,0.785943,0.021205,0.360578,0.824561
4,MultipleLayerPerceptron,0.866642,0.006853,3.938624,0.873434
5,NaiveBayes,0.8917,0.041872,0.044389,0.924812
6,RandomForest,0.949841,0.0093,1.123701,0.9599
7,AdaBoost,0.960277,0.012217,16.438581,0.963659


In [43]:
thyroid=pd.read_csv('thyroid.csv')
label = 'Class'
y = thyroid[label]  # %
X = thyroid.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
thyroid_res=implement(X_train,y_train, X_test, y_test)
thyroid_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.690055,0.020093,0.025182,0.677143
1,SVC,0.712892,0.015352,2.837037,0.711429
2,LogisticRegression,0.707685,0.021493,0.128406,0.714286
3,KNeighboors,0.694327,0.019815,0.141572,0.708571
4,MultipleLayerPerceptron,0.661964,0.018085,12.517951,0.688571
5,NaiveBayes,0.180031,0.013038,0.026856,0.18
6,RandomForest,0.717662,0.019551,0.548575,0.687143
7,AdaBoost,0.710052,0.01606,0.832977,0.705714


In [44]:
banknote=pd.read_csv('banknote.csv')
label = 'Class'
y = banknote[label]  # %
X = banknote.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
banknote_res=implement(X_train,y_train, X_test, y_test)
banknote_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.986398,0.003623,0.013158,1.0
1,SVC,1.0,0.0,0.039168,1.0
2,LogisticRegression,0.986422,0.012036,0.026405,0.994169
3,KNeighboors,0.998063,0.002372,0.016387,1.0
4,MultipleLayerPerceptron,1.0,0.0,1.520235,1.0
5,NaiveBayes,0.838663,0.02981,0.014736,0.845481
6,RandomForest,0.992228,0.004939,0.167983,0.994169
7,AdaBoost,1.0,0.0,0.346665,1.0


In [45]:
diabetes=pd.read_csv('diabetes.csv')
label = 'Outcome'
y = diabetes[label]  # %
X = diabetes.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
diabetes_res=implement(X_train,y_train, X_test, y_test)
diabetes_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.720486,0.008244,0.011249,0.765625
1,SVC,0.762031,0.028705,0.079726,0.776042
2,LogisticRegression,0.756814,0.023897,0.015787,0.807292
3,KNeighboors,0.770802,0.012281,0.018683,0.760417
4,MultipleLayerPerceptron,0.750023,0.028386,3.471224,0.723958
5,NaiveBayes,0.746499,0.032657,0.013027,0.755208
6,RandomForest,0.770817,0.004818,0.321262,0.78125
7,AdaBoost,0.770879,0.018996,0.05438,0.78125


In [46]:
dermatology_res.to_csv('../output/dermatology.csv')
splice_res.to_csv('../output/splice.csv')
thyroid_res.to_csv('../output/thyroid.csv')
banknote_res.to_csv('../output/banknote.csv')
diabetes_res.to_csv('../output/diabetes.csv')

In [49]:
wine=pd.read_csv('wine.csv')
label = 'quality'
y = wine[label]  # %
X = wine.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
wine_res=implement(X_train,y_train, X_test, y_test)
wine_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.589436,0.033903,0.039787,0.6425
1,SVC,0.629647,0.023405,1.310914,0.6175
2,LogisticRegression,0.581294,0.040867,0.08837,0.5875
3,KNeighboors,0.595444,0.040252,0.041529,0.595
4,MultipleLayerPerceptron,0.616271,0.030125,9.472935,0.6275
5,NaiveBayes,0.534713,0.032359,0.016956,0.54
6,RandomForest,0.668839,0.031319,0.253021,0.6925
7,AdaBoost,0.592211,0.032718,0.058572,0.5525


In [50]:
mushroom=pd.read_csv('mushroom.csv')
labelencoder=LabelEncoder()
for col in mushroom.columns:
    mushroom[col] = labelencoder.fit_transform(mushroom[col])
label = 'class'
y = mushroom[label]  # %
X = mushroom.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)
mushroom_res=implement(X_train,y_train, X_test, y_test)
mushroom_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,1.0,0.0,0.047934,1.0
1,SVC,1.0,0.0,1.982856,1.0
2,LogisticRegression,0.965371,0.003407,0.111862,0.966519
3,KNeighboors,1.0,0.0,0.579747,1.0
4,MultipleLayerPerceptron,1.0,0.0,3.301533,1.0
5,NaiveBayes,0.92319,0.002978,0.046192,0.933038
6,RandomForest,1.0,0.0,0.082508,1.0
7,AdaBoost,1.0,0.0,0.483681,1.0


In [51]:
wine_res.to_csv('../output/wine.csv')
mushroom_res.to_csv('../output/mushroom.csv')