In [1]:
import pandas as pd
import os
from os import listdir
import numpy as np
from scipy.stats import skew, kurtosis
import math
from scipy import stats
import timeit

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier,VotingClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis

# Modelling Helpers
from sklearn.preprocessing import Imputer , Normalizer , scale,StandardScaler,LabelEncoder 
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score,GridSearchCV, learning_curve,train_test_split,StratifiedKFold, KFold 
from sklearn.metrics import accuracy_score

In [59]:
# Work from inside the data directory: the later cells read '<name>.csv' and
# write to '../describe/' and '../output/' relative to it.
# The guard keeps this cell re-runnable; a second os.chdir('Data/') would fail
# once the kernel is already inside the directory.
if os.path.basename(os.getcwd()) != 'Data':
    os.chdir('Data/')

# List the directory with the standard library instead of shelling out to `ls`
# through os.popen (portable, and no pipe is left open). Dot-files are skipped
# and the names sorted, mirroring what a plain `ls` prints.
files = sorted(name for name in os.listdir('.') if not name.startswith('.'))

['Iris.csv',
 'abalone.csv',
 'adult.csv',
 'australia.csv',
 'banknote.csv',
 'bioresponse.csv',
 'breast.csv',
 'churn.csv',
 'credit.csv',
 'dermatology.csv',
 'diabetes.csv',
 'digit.csv',
 'heart.csv',
 'internetad.csv',
 'micromass.csv',
 'mushrooms.csv',
 'nursery.csv',
 'splice.csv',
 'thyroid.csv',
 'vehicle.csv']

In [29]:
def entropy(array):
    """Return the Shannon entropy, in bits, of the values in ``array``.

    Parameters
    ----------
    array : iterable of hashable values
        The observations (for example a column of class labels).

    Returns
    -------
    float or int
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. An empty input yields 0.
    """
    # Local import so the function does not depend on the notebook's import cell.
    from collections import Counter

    values = list(array)
    total = len(values)
    h = 0
    # Counter tallies all distinct values in one pass; calling list.count()
    # once per element (as before) is quadratic in the number of rows.
    for count in Counter(values).values():
        p = count / total
        h = h + p * math.log2(p)
    return -h

In [180]:
# Row labels of the single-column frame returned by describe(); kept at module
# level under its original name.
index=['train_number','test_number','classes','features','binary','categorical','continous','meancor','outliers','classentropy']


def describe(X,y):
    """Compute ten summary statistics (meta-features) of a classification dataset.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Class label of every row of ``X``.

    Returns
    -------
    pandas.DataFrame
        One column (named ``0``) indexed by the module-level ``index`` list:

        * ``train_number`` / ``test_number`` - row counts of a 75/25 split
          made with ``random_state=4``;
        * ``classes`` - number of distinct labels; ``features`` - number of columns;
        * ``binary`` / ``categorical`` / ``continous`` - share of columns with
          exactly 2, fewer than 21 (and not 2), or at least 21 distinct values;
        * ``meancor`` - mean absolute pairwise correlation between the
          non-constant columns, computed per class (classes with more than 10
          rows only) and then averaged over those classes;
        * ``outliers`` - mean, over the continuous columns, of the fraction of
          rows with ``|z-score| > 3``;
        * ``classentropy`` - label entropy divided by ``log2(classes)``.

        ``meancor`` and ``outliers`` are NaN when there is nothing to average.
    """
    # Only the sizes of the split are needed here.
    X_train, X_test, _, _ = train_test_split(X,y,test_size=0.25,random_state=4)
    train_number, test_number = X_train.shape[0], X_test.shape[0]
    features = X_train.shape[1]
    n_rows = train_number + test_number
    classes = len(y.unique())

    # Classify every column by its number of distinct values.
    category = []
    each_outlier = []
    for col in X:
        n_unique = len(X[col].unique())
        if n_unique == 2:
            category.append('bin')
        elif n_unique < 21:
            category.append('cat')
        else:
            category.append('con')
            z = np.abs(stats.zscore(X[col]))
            each_outlier.append(np.count_nonzero(z > 3) / n_rows)
    binary = category.count('bin') / features
    categorical = category.count('cat') / features
    continous = category.count('con') / features

    # Mean absolute correlation between features, computed within each class.
    meancor_class = []
    for cls in y.unique():
        subdataset = X[y == cls]
        if subdataset.shape[0] > 10:
            # Columns that are constant inside this class have no defined correlation.
            varying = [c for c in subdataset.columns if subdataset[c].nunique() != 1]
            corr = subdataset[varying].corr().abs().values
            # Take each feature pair exactly once from the strict upper
            # triangle. De-duplicating the flattened matrix with np.unique and
            # dropping the largest value merged pairs that happen to share a
            # correlation and relied on the diagonal being the only maximum.
            pairwise = corr[np.triu_indices(corr.shape[0], k=1)]
            if pairwise.size:
                meancor_class.append(np.mean(pairwise))
    meancor = np.mean(meancor_class) if meancor_class else float('nan')
    outliers = np.mean(each_outlier) if each_outlier else float('nan')

    # log2(1) == 0, so a single-class label used to raise ZeroDivisionError;
    # its entropy is zero.
    classentropy = entropy(y) / math.log2(classes) if classes > 1 else 0.0

    des = np.array([train_number, test_number,classes,features,binary,categorical,continous,meancor,outliers,classentropy])
    description = pd.DataFrame(des, index=index)
    return description

In [70]:
# Iris: drop the row-id column and use 'Species' as the class label.
# The directory listing shown earlier in this notebook names the file
# 'Iris.csv' (capital I); the lower-case spelling only resolves on
# case-insensitive file systems.
iris = pd.read_csv('Iris.csv')
iris = iris.drop(['Id'],axis=1)
label = 'Species'

y = iris[label]
X = iris.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X,y)
description.columns = ['iris']
description.to_csv('../describe/iris.csv')

In [71]:
# Peek at the first rows of the training features produced by the previous cell.
X_train.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
86,6.7,3.1,4.7,1.5
46,5.1,3.8,1.6,0.2
135,7.7,3.0,6.1,2.3
7,5.0,3.4,1.5,0.2
65,6.7,3.1,4.4,1.4


In [97]:
#ignore warnings
import warnings
warnings.filterwarnings('ignore')


def _tune_best(estimator, param_grid, X, y, cv, n_jobs):
    """Grid-search ``estimator`` on accuracy and return the refitted best model."""
    search = GridSearchCV(estimator, param_grid=param_grid, scoring='accuracy',
                          cv=cv, n_jobs=n_jobs, verbose=0)
    search.fit(X, y)
    return search.best_estimator_


def implement(X_train,y_train, X_test, y_test, n_jobs=4):
    """Tune, cross-validate and test eight classifiers on one train/test split.

    The features are standardised with a ``StandardScaler`` fitted on
    ``X_train`` only. Seven models (decision tree, RBF SVC, logistic
    regression, k-NN, MLP, random forest, AdaBoost over a decision tree) are
    tuned with an exhaustive grid search scored on accuracy over 10 stratified
    folds; Gaussian naive Bayes is used untuned. Each resulting model is then
    cross-validated on the training data with the same folds, refitted on the
    whole training split and scored on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test features.
    y_train, y_test : array-like
        Training and test labels.
    n_jobs : int, default 4
        Worker count passed to every ``GridSearchCV`` and ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with columns ``Algorithm``, ``CrossValMeans``
        and ``CrossValerrors`` (mean and standard deviation of the fold
        accuracies), ``Time`` (seconds spent on cross-validation + refit +
        test scoring, excluding the grid search) and ``test`` (test accuracy).
    """
    Kfold = StratifiedKFold(n_splits=10)

    # Standardise with statistics learned from the training split only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Search spaces, one per tuned model.
    dt_param_grid = {
        'criterion': ['gini', 'entropy'],      # split quality measure
        'splitter': ['best', 'random'],        # split selection strategy
        'max_depth': [4, 10, 50, None],        # None lets the tree grow fully
        'min_samples_split': [2, 5, 10],       # minimum node size before a split
        'min_samples_leaf': [1, 5, 10],        # minimum node size after a split
        'max_features': ['auto', None],        # features considered per split
        'random_state': [0],
    }
    svc_param_grid = {
        'kernel': ['rbf'],
        'gamma': [0.001, 0.01, 0.1, 1],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'random_state': [0],
    }
    lgr_param_grid = {
        'max_iter': [40, 60, 80, 100, 200],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'random_state': [0],
    }
    knn_param_grid = {
        'n_neighbors': np.arange(1, 31, 2),
        'metric': ['euclidean', 'cityblock'],
    }
    mlp_param_grid = {
        'hidden_layer_sizes': [(100,), (50,)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam'],
        'alpha': [0.0001, 0.01],
        'learning_rate': ['constant', 'adaptive'],
        'random_state': [0],
    }
    rf_param_grid = {
        'max_depth': [4, 10, 50, None],
        'max_features': [0.1, 0.25, 0.5, 'auto', None],
        'min_samples_split': [3, 10],
        'min_samples_leaf': [1, 5],
        'n_estimators': [10, 50, 100],
        'criterion': ['gini', 'entropy'],
        'random_state': [0],
    }
    # The grid's random_state is applied to the AdaBoost ensemble itself, so it
    # replaces the constructor's random_state=7 in every candidate.
    ada_param_grid = {
        'base_estimator__criterion': ['gini', 'entropy'],
        'base_estimator__splitter': ['best', 'random'],
        'n_estimators': [10, 50, 100, 500],
        'learning_rate': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
        'random_state': [0],
    }

    # (report name, base estimator, search space); a None space means "use as is".
    # Every search is scored on accuracy explicitly; the MLP search previously
    # relied on GridSearchCV's default scorer.
    candidates = [
        ("DecisionTree", DecisionTreeClassifier(), dt_param_grid),
        ("SVC", SVC(probability=True), svc_param_grid),
        ("LogisticRegression", LogisticRegression(penalty='l2', solver='lbfgs'), lgr_param_grid),
        ("KNeighboors", KNeighborsClassifier(), knn_param_grid),
        ("MultipleLayerPerceptron", MLPClassifier(max_iter=1000, tol=0.0001), mlp_param_grid),
        ('NaiveBayes', GaussianNB(), None),
        ("RandomForest", RandomForestClassifier(), rf_param_grid),
        ("AdaBoost",
         AdaBoostClassifier(DecisionTreeClassifier(random_state=0), random_state=7),
         ada_param_grid),
    ]

    names = []
    classifiers = []
    for name, estimator, grid in candidates:
        names.append(name)
        if grid is None:
            classifiers.append(estimator)
        else:
            classifiers.append(_tune_best(estimator, grid, X_train, y_train, Kfold, n_jobs))

    # Evaluate each selected model: CV accuracy on the training split, then a
    # refit on all training rows and a single score on the held-out test split.
    elapsed = []
    cv_results = []
    test_score = []
    for classifier in classifiers:
        start = timeit.default_timer()
        cv_results.append(cross_val_score(classifier, X_train, y_train,
                                          scoring="accuracy", cv=Kfold, n_jobs=n_jobs))
        classifier.fit(X_train, y_train)
        test_score.append(classifier.score(X_test, y_test))
        stop = timeit.default_timer()
        elapsed.append(stop - start)

    cv_means = [scores.mean() for scores in cv_results]
    cv_std = [scores.std() for scores in cv_results]

    cv_res = pd.DataFrame({"Algorithm": names,
                           "CrossValMeans": cv_means,
                           "CrossValerrors": cv_std,
                           'Time': elapsed,
                           'test': test_score})
    return cv_res

In [109]:
# Benchmark the classifiers on the iris split.
iris_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [113]:
# Persist the iris benchmark table.
iris_res.to_csv('../output/iris.csv')

In [121]:
# Breast cancer: discard the 'id' and 'Unnamed: 32' columns, then integer-encode
# the 'diagnosis' column, which is also the class label.
breast = pd.read_csv('breast.csv').drop(columns=['id', 'Unnamed: 32'])
le = LabelEncoder()
col = label = 'diagnosis'
breast[col] = le.fit_transform(breast[col])

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = breast.drop(columns=[label]), breast[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['breast']
description.to_csv('../describe/breast.csv')

In [122]:
# Benchmark the classifiers on the breast-cancer split.
breast_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [123]:
# Display the breast-cancer benchmark table.
breast_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.957473,0.036322,0.021802,0.916084
1,SVC,0.976462,0.021063,0.059527,0.972028
2,LogisticRegression,0.97879,0.012786,0.034617,0.965035
3,KNeighboors,0.96456,0.032407,0.031993,0.972028
4,MultipleLayerPerceptron,0.978788,0.022218,1.56692,0.979021
5,NaiveBayes,0.927011,0.034638,0.025976,0.93007
6,RandomForest,0.971592,0.027797,0.66991,0.916084
7,AdaBoost,0.948382,0.033228,0.028091,0.902098


In [124]:
# Persist the breast-cancer benchmark table.
breast_res.to_csv('../output/breast.csv')

In [128]:
# Inspect the raw abalone file: summary statistics of every column, one row per column.
abalone = pd.read_csv('abalone.csv')
abalone.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Sex,4177,3.0,M,1528.0,,,,,,,
Length,4177,,,,0.523992,0.120093,0.075,0.45,0.545,0.615,0.815
Diameter,4177,,,,0.407881,0.0992399,0.055,0.35,0.425,0.48,0.65
Height,4177,,,,0.139516,0.0418271,0.0,0.115,0.14,0.165,1.13
Whole_weight,4177,,,,0.828742,0.490389,0.002,0.4415,0.7995,1.153,2.8255
Shucked_weight,4177,,,,0.359367,0.221963,0.001,0.186,0.336,0.502,1.488
Viscera_weight,4177,,,,0.180594,0.109614,0.0005,0.0935,0.171,0.253,0.76
Shell_weight,4177,,,,0.238831,0.139203,0.0015,0.13,0.234,0.329,1.005
Class_number_of_rings,4177,,,,9.93368,3.22417,1.0,8.0,9.0,11.0,29.0


In [129]:
# Abalone: integer-encode the 'Sex' column; the ring count is the class label.
abalone = pd.read_csv('abalone.csv')
le = LabelEncoder()
col = 'Sex'
abalone[col] = le.fit_transform(abalone[col])
label = 'Class_number_of_rings'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = abalone.drop(columns=[label]), abalone[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['abalone']
description.to_csv('../describe/abalone.csv')

In [130]:
# Benchmark the classifiers on the abalone split and show the table.
abalone_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
abalone_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.261765,0.02022,0.038715,0.252632
1,SVC,0.280313,0.022903,9.647782,0.249761
2,LogisticRegression,0.272549,0.031626,1.301132,0.25933
3,KNeighboors,0.274218,0.025982,0.090781,0.235407
4,MultipleLayerPerceptron,0.279731,0.021577,22.984903,0.254545
5,NaiveBayes,0.244574,0.029464,0.055922,0.211483
6,RandomForest,0.280715,0.025083,0.849657,0.241148
7,AdaBoost,0.20737,0.022425,0.077349,0.192344


In [136]:
# Australia: column 'A15' is the class label; all other columns are features.
australia = pd.read_csv('australia.csv')
label = 'A15'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = australia.drop(columns=[label]), australia[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['australia']
description.to_csv('../describe/australia.csv')

In [137]:
# Benchmark the classifiers on the australia split and show the table.
australia_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
australia_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.88586,0.045197,0.020114,0.809249
1,SVC,0.891742,0.046467,0.145858,0.83237
2,LogisticRegression,0.874359,0.048188,0.031018,0.855491
3,KNeighboors,0.874359,0.03846,0.031812,0.855491
4,MultipleLayerPerceptron,0.88405,0.04099,5.056243,0.791908
5,NaiveBayes,0.849246,0.049723,0.024456,0.878613
6,RandomForest,0.891817,0.051585,0.264628,0.861272
7,AdaBoost,0.851056,0.053121,0.03352,0.745665


In [138]:
# Inspect the first rows of the raw vehicle file.
vehicle = pd.read_csv('vehicle.csv')
vehicle.head()

Unnamed: 0,COMPACTNESS,CIRCULARITY,DISTANCE_CIRCULARITY,RADIUS_RATIO,PR.AXIS_ASPECT_RATIO,MAX.LENGTH_ASPECT_RATIO,SCATTER_RATIO,ELONGATEDNESS,PR.AXIS_RECTANGULARITY,MAX.LENGTH_RECTANGULARITY,SCALED_VARIANCE_MAJOR,SCALED_VARIANCE_MINOR,SCALED_RADIUS_OF_GYRATION,SKEWNESS_ABOUT_MAJOR,SKEWNESS_ABOUT_MINOR,KURTOSIS_ABOUT_MAJOR,KURTOSIS_ABOUT_MINOR,HOLLOWS_RATIO,Class
0,95,48,83,178,72,10,162,42,20,159,176,379,184,70,6,16,187,197,van
1,91,41,84,141,57,9,149,45,19,143,170,330,158,72,9,14,189,199,van
2,104,50,106,209,66,10,207,32,23,158,223,635,220,73,14,9,188,196,saab
3,93,41,82,159,63,9,144,46,19,143,160,309,127,63,6,10,199,207,van
4,85,44,70,205,103,52,149,45,19,144,241,325,188,127,9,11,180,183,bus


In [140]:
# Vehicle: integer-encode the 'Class' column, which is also the class label.
vehicle = pd.read_csv('vehicle.csv')
le = LabelEncoder()
col = label = 'Class'
vehicle[col] = le.fit_transform(vehicle[col])

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = vehicle.drop(columns=[label]), vehicle[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['vehicle']
description.to_csv('../describe/vehicle.csv')

In [143]:
# Benchmark the classifiers on the vehicle split and show the table.
vehicle_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
vehicle_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.717704,0.047241,0.026921,0.70283
1,SVC,0.843637,0.023886,0.30304,0.834906
2,LogisticRegression,0.805961,0.044972,0.302055,0.787736
3,KNeighboors,0.73482,0.043853,0.034577,0.65566
4,MultipleLayerPerceptron,0.8549,0.031311,6.572449,0.811321
5,NaiveBayes,0.444889,0.044754,0.027133,0.466981
6,RandomForest,0.758634,0.034055,0.677931,0.716981
7,AdaBoost,0.687911,0.057003,0.044036,0.70283


In [145]:
# Churn: column 'class' is the class label; all other columns are features.
churn = pd.read_csv('churn.csv')
label = 'class'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = churn.drop(columns=[label]), churn[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['churn']
description.to_csv('../describe/churn.csv')

In [149]:
# Benchmark the classifiers on the churn split and show the table.
churn_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
churn_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.944805,0.012713,0.1946,0.9512
1,SVC,0.925338,0.012984,9.104529,0.9376
2,LogisticRegression,0.864547,0.015946,0.088513,0.8744
3,KNeighboors,0.89333,0.009501,0.38214,0.8984
4,MultipleLayerPerceptron,0.926668,0.011069,31.448615,0.9328
5,NaiveBayes,0.872276,0.012148,0.045634,0.868
6,RandomForest,0.960272,0.008697,3.179116,0.9624
7,AdaBoost,0.922406,0.011049,0.25565,0.9328


In [150]:
# Persist the benchmark tables of the four datasets processed above.
abalone_res.to_csv('../output/abalone.csv')
australia_res.to_csv('../output/australia.csv')
vehicle_res.to_csv('../output/vehicle.csv')
churn_res.to_csv('../output/churn.csv')

In [153]:
# Inspect the raw nursery file: summary statistics of every column, one row per column.
nursery=pd.read_csv('nursery.csv')
nursery.describe(include='all').T

Unnamed: 0,count,unique,top,freq
parents,12960,3,pretentious,4320
has_nurs,12960,5,proper,2592
form,12960,4,incomplete,3240
children,12960,4,1,3240
housing,12960,3,less_conv,4320
finance,12960,2,inconv,6480
social,12960,3,slightly_prob,4320
health,12960,3,recommended,4320
class,12960,5,not_recom,4320


In [181]:
# Nursery: integer-encode every column (label included), reusing one encoder
# that is refitted on each column in turn.
nursery = pd.read_csv('nursery.csv')
labelencoder = LabelEncoder()
for col in nursery.columns:
    nursery[col] = labelencoder.fit_transform(nursery[col])
label = 'class'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = nursery.drop(columns=[label]), nursery[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['nursery']
description.to_csv('../describe/nursery.csv')

In [183]:
# Benchmark the classifiers on the nursery split and show the table.
nursery_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
nursery_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.994547,0.002949,0.198281,0.994136
1,SVC,0.996296,0.001852,33.118687,0.997531
2,LogisticRegression,0.763776,0.015008,0.178434,0.765741
3,KNeighboors,0.947016,0.005435,0.547293,0.961111
4,MultipleLayerPerceptron,0.999897,0.000308,55.114204,0.999383
5,NaiveBayes,0.639087,0.014958,0.053256,0.641049
6,RandomForest,0.994959,0.001977,2.925033,0.995679
7,AdaBoost,0.994032,0.002754,0.167261,0.993827


In [184]:
# Persist the nursery benchmark table.
nursery_res.to_csv('../output/nursery.csv')

In [188]:
# Glass: integer-encode the 'Type' column, which is also the class label.
# NOTE(review): 'glass.csv' does not appear in the directory listing shown
# earlier in this notebook - confirm the file is present in the data folder.
glass = pd.read_csv('glass.csv')
le = LabelEncoder()
col = label = 'Type'
glass[col] = le.fit_transform(glass[col])

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = glass.drop(columns=[label]), glass[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['glass']
description.to_csv('../describe/glass.csv')

In [190]:
# Benchmark the classifiers on the glass split and show the table.
glass_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
glass_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.723886,0.178835,0.019522,0.685185
1,SVC,0.70017,0.083986,0.048421,0.740741
2,LogisticRegression,0.681904,0.056903,0.303863,0.592593
3,KNeighboors,0.720641,0.095739,0.025382,0.685185
4,MultipleLayerPerceptron,0.717425,0.087929,3.008001,0.759259
5,NaiveBayes,0.512606,0.164192,0.0358,0.62963
6,RandomForest,0.772314,0.070489,0.53215,0.777778
7,AdaBoost,0.673283,0.100884,0.02504,0.685185


In [212]:
# Dermatology: 'Age' contains '?' placeholders that block the integer cast, so
# they are replaced before converting the column.
# NOTE(review): 40 is a hard-coded fill value; where it comes from is not shown
# in this notebook - confirm it is the intended imputation.
# (A stray `np.mean(dermatology['Age'])` expression used to follow; its result
# was discarded, so it has been removed.)
dermatology=pd.read_csv('dermatology.csv')
dermatology['Age']=dermatology['Age'].replace('?','40').astype(int)
label = 'class'

# Features / target, and the fixed 75/25 split used throughout the notebook.
y = dermatology[label]
X = dermatology.drop([label],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X,y)
description.columns = ['dermatology']
description.to_csv('../describe/dermatology.csv')

In [214]:
# Benchmark the classifiers on the dermatology split and show the table.
dermatology_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
dermatology_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.956439,0.039636,0.020653,0.967391
1,SVC,0.97146,0.031184,0.090327,0.978261
2,LogisticRegression,0.974908,0.032226,0.093921,0.98913
3,KNeighboors,0.971583,0.026428,0.02801,0.967391
4,MultipleLayerPerceptron,0.971062,0.031283,1.38075,0.98913
5,NaiveBayes,0.872493,0.038025,0.024771,0.858696
6,RandomForest,0.985021,0.024736,0.222214,0.98913
7,AdaBoost,0.944167,0.038326,0.02907,0.956522


In [218]:
# Splice: drop the 'Instance_name' column, then integer-encode every remaining
# column (label included), reusing one encoder refitted on each column in turn.
splice = pd.read_csv('splice.csv').drop(columns=['Instance_name'])
labelencoder = LabelEncoder()
for col in splice.columns:
    splice[col] = labelencoder.fit_transform(splice[col])
label = 'Class'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = splice.drop(columns=[label]), splice[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['splice']
description.to_csv('../describe/splice.csv')

In [220]:
# Benchmark the classifiers on the splice split and show the table.
splice_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
splice_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.911406,0.019692,0.113761,0.897243
1,SVC,0.858261,0.01733,6.247995,0.863409
2,LogisticRegression,0.814802,0.021315,0.126022,0.83208
3,KNeighboors,0.800125,0.015362,0.43957,0.825815
4,MultipleLayerPerceptron,0.870803,0.016761,5.868808,0.868421
5,NaiveBayes,0.913883,0.019351,0.062585,0.924812
6,RandomForest,0.95194,0.012667,0.871482,0.952381
7,AdaBoost,0.90598,0.016202,0.138518,0.889724


In [221]:
# Persist the benchmark tables of the three datasets processed above.
glass_res.to_csv('../output/glass.csv')
dermatology_res.to_csv('../output/dermatology.csv')
splice_res.to_csv('../output/splice.csv')

In [224]:
# Thyroid: column 'Class' is the class label; all other columns are features.
thyroid = pd.read_csv('thyroid.csv')
label = 'Class'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = thyroid.drop(columns=[label]), thyroid[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['thyroid']
description.to_csv('../describe/thyroid.csv')

In [226]:
# Benchmark the classifiers on the thyroid split and show the table.
thyroid_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
thyroid_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.685789,0.024406,0.038348,0.688571
1,SVC,0.712041,0.024172,3.845595,0.702857
2,LogisticRegression,0.708241,0.026242,0.118715,0.714286
3,KNeighboors,0.697295,0.025681,0.148629,0.708571
4,MultipleLayerPerceptron,0.672,0.018792,19.11188,0.688571
5,NaiveBayes,0.176761,0.027131,0.03741,0.18
6,RandomForest,0.71821,0.021637,0.90477,0.702857
7,AdaBoost,0.612386,0.027889,0.091245,0.624286


In [230]:
# Banknote: column 'Class' is the class label; all other columns are features.
banknote = pd.read_csv('banknote.csv')
label = 'Class'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = banknote.drop(columns=[label]), banknote[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['banknote']
description.to_csv('../describe/banknote.csv')

In [233]:
# Benchmark the classifiers on the banknote split and show the table.
banknote_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
banknote_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.987387,0.011485,0.022732,0.991254
1,SVC,1.0,0.0,0.081137,1.0
2,LogisticRegression,0.985483,0.011571,0.049898,0.997085
3,KNeighboors,0.998068,0.003865,0.031564,1.0
4,MultipleLayerPerceptron,1.0,0.0,2.162261,1.0
5,NaiveBayes,0.834836,0.04362,0.024941,0.845481
6,RandomForest,0.992242,0.007264,0.570577,0.994169
7,AdaBoost,0.987415,0.013002,0.028408,0.988338


In [235]:
# Diabetes: column 'Outcome' is the class label; all other columns are features.
diabetes = pd.read_csv('diabetes.csv')
label = 'Outcome'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = diabetes.drop(columns=[label]), diabetes[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['diabetes']
description.to_csv('../describe/diabetes.csv')

In [237]:
# Benchmark the classifiers on the diabetes split and show the table.
diabetes_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
diabetes_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.746551,0.025533,0.017729,0.755208
1,SVC,0.770512,0.047934,0.152158,0.791667
2,LogisticRegression,0.756746,0.049498,0.027085,0.807292
3,KNeighboors,0.772656,0.029659,0.028292,0.760417
4,MultipleLayerPerceptron,0.767332,0.035733,5.026371,0.765625
5,NaiveBayes,0.744733,0.052177,0.022146,0.755208
6,RandomForest,0.77593,0.041063,0.333765,0.786458
7,AdaBoost,0.732517,0.051391,0.033581,0.744792


In [238]:
# Persist the benchmark tables of the three datasets processed above.
thyroid_res.to_csv('../output/thyroid.csv')
banknote_res.to_csv('../output/banknote.csv')
diabetes_res.to_csv('../output/diabetes.csv')

In [240]:
# Heart: column 'target' is the class label; all other columns are features.
heart = pd.read_csv('heart.csv')
label = 'target'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = heart.drop(columns=[label]), heart[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['heart']
description.to_csv('../describe/heart.csv')

In [242]:
# Benchmark the classifiers on the heart split and show the table.
heart_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
heart_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.852536,0.112491,0.026101,0.776316
1,SVC,0.850264,0.044501,0.052899,0.828947
2,LogisticRegression,0.833416,0.047788,0.037462,0.842105
3,KNeighboors,0.842523,0.059983,0.029357,0.75
4,MultipleLayerPerceptron,0.837385,0.061389,1.332322,0.842105
5,NaiveBayes,0.829068,0.060501,0.023047,0.776316
6,RandomForest,0.851581,0.07331,0.224223,0.828947
7,AdaBoost,0.794433,0.089266,0.024875,0.644737


In [244]:
# Wine: column 'quality' is the class label; all other columns are features.
# NOTE(review): 'wine.csv' does not appear in the directory listing shown
# earlier in this notebook - confirm the file is present in the data folder.
wine = pd.read_csv('wine.csv')
label = 'quality'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = wine.drop(columns=[label]), wine[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['wine']
description.to_csv('../describe/wine.csv')

In [246]:
# Benchmark the classifiers on the wine split and show the table.
wine_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
wine_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,0.59544,0.046487,0.025946,0.6075
1,SVC,0.634784,0.039702,2.112117,0.6175
2,LogisticRegression,0.579754,0.051518,0.177886,0.5875
3,KNeighboors,0.598893,0.028087,0.051198,0.5925
4,MultipleLayerPerceptron,0.614077,0.048338,16.451973,0.6025
5,NaiveBayes,0.537317,0.045358,0.030639,0.54
6,RandomForest,0.674763,0.036932,0.967383,0.68
7,AdaBoost,0.593337,0.06593,0.042797,0.6275


In [247]:
# Mushroom: integer-encode every column (label included), reusing one encoder
# that is refitted on each column in turn.
# NOTE(review): the directory listing shown earlier in this notebook names the
# file 'mushrooms.csv' - confirm which spelling exists in the data folder.
mushroom = pd.read_csv('mushroom.csv')
labelencoder = LabelEncoder()
for col in mushroom.columns:
    mushroom[col] = labelencoder.fit_transform(mushroom[col])
label = 'class'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = mushroom.drop(columns=[label]), mushroom[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['mushroom']
description.to_csv('../describe/mushroom.csv')

In [249]:
# Benchmark the classifiers on the mushroom split and show the table.
mushroom_res = implement(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
mushroom_res

Unnamed: 0,Algorithm,CrossValMeans,CrossValerrors,Time,test
0,DecisionTree,1.0,0.0,0.073558,1.0
1,SVC,1.0,0.0,5.799253,1.0
2,LogisticRegression,0.965535,0.006224,0.202595,0.967996
3,KNeighboors,1.0,0.0,0.689679,1.0
4,MultipleLayerPerceptron,1.0,0.0,4.792615,1.0
5,NaiveBayes,0.922203,0.005963,0.066466,0.933038
6,RandomForest,1.0,0.0,0.132852,1.0
7,AdaBoost,1.0,0.0,0.078253,1.0


In [250]:
# Persist the benchmark tables of the three datasets processed above.
heart_res.to_csv('../output/heart.csv')
wine_res.to_csv('../output/wine.csv')
mushroom_res.to_csv('../output/mushroom.csv')

In [253]:
# Credit: drop the 'ID' column; 'default.payment.next.month' is the class label.
credit = pd.read_csv('credit.csv').drop(columns=['ID'])
label = 'default.payment.next.month'

# Features / target, and the fixed 75/25 split used throughout the notebook.
X, y = credit.drop(columns=[label]), credit[label]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=4)

# Store the dataset's meta-features under the dataset name.
description = describe(X, y)
description.columns = ['credit']
description.to_csv('../describe/credit.csv')

In [255]:
# Row and column counts of the credit frame after the 'ID' column was dropped.
credit.shape

(30000, 24)