# Bicycle classification - Pipeline, RandomzidedSearch, CrossValidation

## 1. Import and split

In [116]:
import pandas as pd
from Functions import BIKED_classif_functions as BIKED
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier     #Depth-8 Decision Tree
from sklearn.neighbors import KNeighborsClassifier  #K-Neighbors
from sklearn.ensemble import RandomForestClassifier #Random Forest


from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn import set_config


For other Classification models see here:

In [2]:
#more models used by the author:

from sklearn.ensemble import AdaBoostClassifier        #AdaBoost
from sklearn.svm import SVC                            #Support Vector Clf.

from sklearn.gaussian_process import GaussianProcessClassifier     #Gaussian Pr. Clf.
from sklearn.neural_network import MLPClassifier                   #3-layer Neural Net, #6-Layer Neural Net
                                                                    

#other models:
#from sklearn.gaussian_process.kernels import RBF
#from sklearn.naive_bayes import GaussianNB
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### 1.1. Training data

In [117]:
#Read Data
data = pd.read_csv("Data/BIKED_reduced.csv", index_col=0)

classes = data["BIKESTYLE"].value_counts() # 19 categories, one of them has only one member
classes = pd.DataFrame(classes)
classes["percentage"] = (classes.BIKESTYLE / data.shape[0] * 100).round(2)


* label `BIKESTYLE` has 19 categories
* class `FAT`has only one row
* class `CRUISER`, `HYBRID`, `TRIALS`, `GRAVEL`, `CARGO`, `CHILDRENS`, and `FAT` are extremly unbalanced with < 1% of total rows
* If all classes would be evenly distributed, each of them had 5 %

In [118]:
#call function to drop <0.1% of values
data = BIKED.remove_classes_with_less_than_x_percent(data)
data.BIKESTYLE.value_counts()


ROAD          1856
MTB            616
TRACK          470
OTHER          315
DIRT_JUMP      293
TOURING        201
CYCLOCROSS     151
POLO           128
TIMETRIAL       89
BMX             86
COMMUTER        75
CITY            72
CRUISER         42
HYBRID          39
TRIALS          35
GRAVEL          19
CARGO           14
CHILDRENS       10
Name: BIKESTYLE, dtype: int64

In [119]:
#Split
y = data.pop("BIKESTYLE")

In [120]:
X = data

In [76]:
# check missing values # HIER KÖNNTE ICH LERNEN WIE ICH ES KÜRZER SCHREIBE
missing_values_col = X.isna().sum()
missing_values_col = pd.DataFrame(missing_values_col)
missing_values_col.columns = ["Missing_values_Count"]

missing_values_col = missing_values_col.loc[missing_values_col["Missing_values_Count"] > 0, :]
print(len(missing_values_col))
missing_values_col



35


Unnamed: 0,Missing_values_Count
HBAR type,219
Headset type,91
Dropout model,73
Top tube angle textfield,1
Seatpost type,166
Stack,73
SIZE,774
dropoutInsert,680
Cranks type,115
Dim A Inch TextField,2


**Conclusion:**
* There are 35 columns with missing values.
* Column `SIZE` has 774 missing values. Interesting, because `SIZE` results in 555 columns after OneHotEncoding.
* Columns `bottle xxx` have many missing values. Might be useful to put them into fewer columns. i.e. `bottle DOWNTUBE` and `bottle SEATTUBE`

In [36]:
#check data types (which columns are not numeric and what to do with them)
X.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4512 entries, 1 to 4800
Columns: 1319 entries, SSSIDECX3 to FDBSD
dtypes: bool(150), float64(979), int64(153), object(37)
memory usage: 40.9+ MB


In [43]:
categorical_cols = X.select_dtypes(include="object").columns
print(len(list(categorical_cols)))
categorical_cols

#boolean_cols = X.select_dtypes(include="bool").columns
#print(len(boolean_cols))
#boolean_cols

37


Index(['HBAR type', 'Headset type', 'Rack type', 'DIRECTMOUNTFDTYPE',
       'Dropout model', 'HEADTUBEtype', 'BATTERYmount', 'Seatpost type',
       'FRONTbrake type', 'Stem type', 'Pump size', 'MATERIAL',
       'FRONTdiscB type', 'SIZE', 'StaticDropout', 'RIM_STYLE front',
       'SPROCKETS type', 'PhBlock typeDOWNTUBE', 'GEARBOXtype',
       'dropoutInsert', 'REARdiscB type', 'Cranks type', 'MOUNT_TYPE',
       'Yoke type', 'RIM_STYLE rear', 'Brake lever brand', 'Pedals type',
       'Wheel choice rear', 'REARbrake type', 'CLAMPFDTYPE',
       'Wheel choice front', 'PhBlock typeTOPTUBE', 'DROPOUT STYLE',
       'Fit scheme', 'Fork choice', 'Saddle type', 'BRAZEonFDTYPE'],
      dtype='object')

**Conclusion:** 

* There are 187 non-numeric column. of which 150 columns are boolean.
* There are 37 object columns, which need to be encoded (OneHot or Ordinal).




LIST OF COLUMNS:
OBJECT: 
'HBAR type', 'Headset type', 'Rack type', 'DIRECTMOUNTFDTYPE',
       'Dropout model', 'HEADTUBEtype', 'BATTERYmount', 'Seatpost type',
       'FRONTbrake type', 'Stem type', 'Pump size', 'MATERIAL',
       'FRONTdiscB type', 'SIZE', 'StaticDropout', 'RIM_STYLE front',
       'SPROCKETS type', 'PhBlock typeDOWNTUBE', 'GEARBOXtype',
       'dropoutInsert', 'REARdiscB type', 'Cranks type', 'MOUNT_TYPE',
       'Yoke type', 'RIM_STYLE rear', 'Brake lever brand', 'Pedals type',
       'Wheel choice rear', 'REARbrake type', 'CLAMPFDTYPE',
       'Wheel choice front', 'PhBlock typeTOPTUBE', 'DROPOUT STYLE',
       'Fit scheme', 'Fork choice', 'Saddle type', 'BRAZEonFDTYPE'

BOOL: (just a few)
'REARDiscAdaptOnPost', 'Seatpost AERO', 'DOWNTUBE1SnSCheck',
       'USEgearbox', 'REARROTOR_INCLUDE', 'nSeat stay Curv Check',
       'CSSIDEISBENT', 'LINKpump2SCHEME', 'SEATSTAYholeCheck', 'LOOPED_STAYS',
       ...
       'Female', 'Seatpost MAST', 'CASS_aux', 'DUAL_CROWN',
       'REARDiscTabIsPost', 'CHEVRONDOWNTUBE1ON', 'CHEVRONTOPTUBE1ON',
       'Dim A CheckBox', 'bottle SEATTUBE0 show', 'bottle DOWNTUBE0 show'

In [47]:
# which columns have missing values and also have categorical? - yes there are some. Make sure in the preprocessor that they are properly taken care of.
# # # # # #  I wonder how the author took care of it.

categorical_cols.isin(missing_values_col.index)
#boolean_cols.isin(missing_values_col.index)

array([ True,  True, False, False,  True, False, False,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False,  True, False,  True, False, False, False, False, False,
        True, False, False,  True, False, False, False,  True,  True,
       False])

In [121]:
X.shape

(4511, 1318)

In [122]:
# This time take only numerical columns
X = X.select_dtypes(include="number").copy()
X.shape

(4511, 1131)

In [123]:
# Train / Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)

## 2. Building `preprocessor` pipeline

In [124]:
X_train.head(2)

Unnamed: 0,SSSIDECX3,SSSIDECX2,SSSIDECX1,SSSIDECY2,SSSIDECY1,STEMBENDS,FRONTROTORBOLTS,Shoe up angle,Rollout units,Down tube front diameter,...,bottle DOWNTUBE0 WBL,bottle DOWNTUBE0 WBD,bottle SEATTUBE0 CAGE,bottle DOWNTUBE0 flip,bottle DOWNTUBE0 X,bottle DOWNTUBE0 CAGE,rockerPosition,RDERD,FDERD,RDBSD
1845,65.5,130.4,304.2,-10.0,10.0,0,6,39,0,38.1,...,201.0,73.5,1.0,0.0,200.0,1.0,50.0,92.0,92.0,52.0
3510,71.8,134.9,350.0,-10.0,10.0,0,6,36,0,58.0,...,,,,,,,,92.0,92.0,52.0


In [125]:
# 2.a) Select categorical and numerical columns
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()


In [126]:

# 2.b) Numerical Pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="constant")   #strategy="constant"
    #KNNImputer(5)
)


In [127]:
# NOT EXECUTED YET
# 2.c) Categorical Pipeline

# # # i. Defining the categorical encoder
ordinal_col_names = ['ExterQual', 
                     'ExterCond', 
                     'BsmtQual', 
                     'BsmtCond',
                     'BsmtExposure', 
                     'BsmtFinType1', 
                     'KitchenQual', 
                     'FireplaceQu', 
                     'LotShape', 
                     'BsmtFinType2',
                     'HeatingQC',
                     'GarageFinish', 
                     'GarageQual', 
                     'GarageCond', 
                     'PoolQC', 
                     'Fence']

ordinal_cols = X_cat.columns.get_indexer(ordinal_col_names)
onehot_cols = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_col_names)))

# # # ii. Defining values for Ordinal Encoding
ExterQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["NA", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats = ["NA", 'Reg', 'IR1', 'IR2', 'IR3']
BsmtFinType2_cats = ['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ']
HeatingQC_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageFinish_cats = ['NA','Unf','RFn','Fin']
GarageQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
Fence_cats = ['NA','MnWw','GdWo','MnPrv','GdPrv']

ordinal_cats_list = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats, 
                    BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats, FireplaceQu_cats, 
                    LotShape_cats, BsmtFinType2_cats, HeatingQC_cats, GarageFinish_cats, GarageQual_cats,
                    GarageCond_cats, PoolQC_cats, Fence_cats]


# # # iii. Defining the ColumnTransformer with 2 branches: ordinal & onehot (categorical encoder)
categorical_encoder = ColumnTransformer(
    transformers=[
#        ("cat_ordinal", OrdinalEncoder(categories=ordinal_cats_list), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols),
    ]
)

# # # iv. Categorical pipeline = "NA" imputer + categorical encoder
categoric_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="NA"),
                                 categorical_encoder
                                )

# 2.d) Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

In [145]:
X_train.shape

(3608, 1131)

**EXPLANATION FOR CATEGORICAL PIPELINE: --2.c)--**

**Start from the bottom! (# # # iv.):**
* `categoric_pipe` = make_pipeline(SimpleImputer(), `categorical_encoder`)



**needs `categorical_encoder` (# # # iii.):**
* `categorical_encoder` = ColumnTransformer(tranformers=[
    * ("cat_ordinal", OrdinalEncoder(categories=`ordinal_cats_list`), `ordinal_cols`), 
    * ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), `onehot_cols`)])


**needs `ordinal_cats_list` (# # # ii.) and needs `ordinal_cols`and `onehot_cols`(# # # i.):**

(\# # # i.) `ordinal_cols`and `onehot_cols`:
* ordinal_col_names = ['ExterQual', '...']
* `ordinal_cols` = X_cat.columns.get_indexer(ordinal_col_names)
* `onehot_cols` = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_col_names)))


(\# # # ii.) `ordinal_cats_list`: define values and then make a list:**
* ExterQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
* ExterCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
* etc.
* `ordinal_cats_list` = [ExterQual_cats, ExterCond_cats, ...]

## 3. Create `full_pipe` with DecisionTreeClassifier

In [128]:
#create the full_pipeline
full_pipe = make_pipeline(preprocessor, 
                          StandardScaler(with_mean=False),   #
                          DecisionTreeClassifier(random_state=123))

# create parameter grid
param_grid = {
#    "columntransformer__num_pipe__simpleimputer__strategy":["constant", "median"],
#    "standardscaler__with_mean":[True, False],
#    "standardscaler__with_std":[True, False],
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(2, 12),
#    "decisiontreeclassifier__min_samples_split": range(3, 40, 2),
    "decisiontreeclassifier__criterion":["gini", "entropy"]
}

# define cross validation
search = RandomizedSearchCV(full_pipe,
                      param_grid,
                      cv=10,
                      verbose=1,
                      scoring="accuracy",
                      n_jobs=-2, 
                      n_iter=10)

# fit
search.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models 
scores = {"dtree" : search.best_score_}
best_params = {"dtree" : search.best_params_}


Fitting 10 folds for each of 10 candidates, totalling 100 fits




In [129]:
# create more dictionaries:
### training accuracy ON the ENTIRE TRAIN-DATA
y_train_pred = search.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred)
scores_train_data = {"dtree" : acc_train}

In [130]:
### testing accuracy
y_test_pred = search.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)
scores_test_data = {"dtree" : acc_test}

In [131]:
#look at full_pipe and check if its good
from sklearn import set_config

set_config(display="diagram")
full_pipe

In [132]:
scores

{'dtree': 0.6682294552169898}

In [133]:
search.best_params_

{'decisiontreeclassifier__min_samples_leaf': 3,
 'decisiontreeclassifier__max_depth': 8,
 'decisiontreeclassifier__criterion': 'entropy'}

## 4. Create `knn_full_pipe` for KNN

In [134]:
#create the full_pipeline
knn_full_pipe = make_pipeline(preprocessor, 
                          StandardScaler(with_mean=False),
                          KNeighborsClassifier()
                             )

# create parameter grid
param_grid = {
#    "columntransformer__num_pipe__simpleimputer__strategy":["constant", "median"],
#    "standardscaler__with_mean":[True, False],
#    "standardscaler__with_std":[True, False],
    "kneighborsclassifier__n_neighbors": range(2, 50),
    "kneighborsclassifier__weights": ["uniform", "distance"]
}

# define cross validation
knn_search = GridSearchCV(knn_full_pipe,
                      param_grid,
                      cv=10,
                      verbose=1,
                      scoring="accuracy",
                      n_jobs=-2, 
#                      n_iter=20
                    )

# fit
knn_search.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models 
scores["knn"] = knn_search.best_score_
best_params["knn"] = knn_search.best_params_


# create more dictionaries:
### training accuracy ON the ENTIRE TRAIN-DATA
y_train_pred = knn_search.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred)
scores_train_data["knn"] = acc_train

### testing accuracy
y_test_pred = knn_search.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)
scores_test_data["knn"] = acc_test

Fitting 10 folds for each of 96 candidates, totalling 960 fits




In [135]:
pd.DataFrame(scores, index=["best_score_"])

Unnamed: 0,dtree,knn
best_score_,0.668229,0.677103


## 5. Create `rforest_full_pipe` for RandomForest

I couldn't recreate the model with the best score but these were the parameters:
* 'rforest': {'randomforestclassifier__n_estimators': 200,
* 'randomforestclassifier__min_samples_leaf': 5,
* 'randomforestclassifier__max_depth': 11,
* 'randomforestclassifier__criterion': 'gini'}}

In [136]:
#create the full_pipeline
rforest_full_pipe = make_pipeline(preprocessor, 
                          StandardScaler(with_mean=False),
                          RandomForestClassifier()
                             )

# create parameter grid
param_grid = {
#    "columntransformer__num_pipe__simpleimputer__strategy":["constant", "median"],
#    "standardscaler__with_mean":[True, False],
#    "standardscaler__with_std":[True, False],
#    "randomforestclassifier__warm_start": [True, False],
    "randomforestclassifier__n_estimators": [100, 200],
    "randomforestclassifier__max_depth": range(2, 14),
    "randomforestclassifier__min_samples_leaf": range(2, 10),
    "randomforestclassifier__criterion":["gini", "entropy"]
}

# define cross validation
rforest_search = RandomizedSearchCV(rforest_full_pipe,
                      param_grid,
                      cv=10,
                      verbose=1,
                      scoring="accuracy",
                      n_jobs=-2, 
                      n_iter=30
                    )

# fit
rforest_search.fit(X_train, y_train)

# create a dictionary to keep track of the scores of different models 
scores["rforest"] = rforest_search.best_score_
best_params["rforest"] = rforest_search.best_params_


# create more dictionaries:
### training accuracy ON the ENTIRE TRAIN-DATA
y_train_pred = rforest_search.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred)
scores_train_data["rforest"] = acc_train

### testing accuracy
y_test_pred = rforest_search.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)
scores_test_data["rforest"] = acc_test

Fitting 10 folds for each of 30 candidates, totalling 300 fits




In [137]:
pd.DataFrame(scores, index=["best_score_"])


Unnamed: 0,dtree,knn,rforest
best_score_,0.668229,0.677103,0.721166


In [138]:
best_params

{'dtree': {'decisiontreeclassifier__min_samples_leaf': 3,
  'decisiontreeclassifier__max_depth': 8,
  'decisiontreeclassifier__criterion': 'entropy'},
 'knn': {'kneighborsclassifier__n_neighbors': 4,
  'kneighborsclassifier__weights': 'distance'},
 'rforest': {'randomforestclassifier__n_estimators': 100,
  'randomforestclassifier__min_samples_leaf': 4,
  'randomforestclassifier__max_depth': 13,
  'randomforestclassifier__criterion': 'entropy'}}

In [139]:
pd.DataFrame(scores_test_data, index=["acc_test"])

Unnamed: 0,dtree,knn,rforest
acc_test,0.637874,0.678848,0.717608


In [196]:
from sklearn.ensemble import AdaBoostClassifier        #AdaBoost
from sklearn.svm import SVC                            #Support Vector Clf.

from sklearn.gaussian_process import GaussianProcessClassifier     #Gaussian Pr. Clf.
from sklearn.neural_network import MLPClassifier                   #3-layer Neural Net, #6-Layer Neural Net


def get_model_and_params(model_name, param_grid):
    if model_name == "rforest":
        model = RandomForestClassifier()
        param_grid.update({"randomforestclassifier__warm_start": [True, False],
                "randomforestclassifier__n_estimators": [100, 200],
                "randomforestclassifier__max_depth": range(2, 14),
                "randomforestclassifier__min_samples_leaf": range(2, 10),
                "randomforestclassifier__criterion":["gini", "entropy"]})
    elif model_name == "knn":
        model = KNeighborsClassifier()
        param_grid.update({"kneighborsclassifier__n_neighbors": range(2, 50),
                "kneighborsclassifier__weights": ["uniform", "distance"]})
    elif model_name == "dt":
        model = DecisionTreeClassifier()
        param_grid.update({"decisiontreeclassifier__max_depth": range(2, 14),
                "decisiontreeclassifier__min_samples_leaf": range(2, 12),
#               "decisiontreeclassifier__min_samples_split": range(3, 40, 2),
                "decisiontreeclassifier__criterion":["gini", "entropy"]})
    return model, param_grid


#elseif "Ada": 
    # set the classifiername: 
    #    model = AdaBoostClassifier()
    # update param_grid: param_grid.update(    
    # "randomforestclassifier__n_estimators": [100, 200],
    # "randomforestclassifier__max_depth": range(2, 14),)

    # elif "SVC":
    # model = SVC()
    # param_grid.update()

    # elif "GaussPrC":
    # model = GaussianProcessClassifier()
    # param_grid.update()

    # elif "MLP":
    # model = MLPClassifier()
    # param_grid.update()


def evaluate_model(data, list_of_algorithms, param_grid_preproc):
    param_grid = {}
    for i in list_of_models:
        #reset values to be empty : 
        model = ""
        param_grid.clear()
        param_grid = param_grid_preproc

        # # # # it would nice if I make this into a function get_model_and_params
        model, param_grid = get_model_and_params(i, param_grid)

        #create the full_pipeline
        full_pipe = make_pipeline(preprocessor, 
                          StandardScaler(with_mean=False),
                          model
                             )

        # define cross validation
        search = RandomizedSearchCV(full_pipe,
                      param_grid,
                      cv=10,
                      verbose=1,
                      scoring="accuracy",
                      n_jobs=-2, 
                      #n_iter=30
                    )                             

        # fit
        search.fit(X_train, y_train)
        print(search.best_params_)

        score_new["model"].append(i)

        ### testing accuracy
        y_test_pred = search.predict(X_test)
        acc_test = accuracy_score(y_test, y_test_pred)
        score_new["test_acc_score"].append(acc_test)

        #get score of TRAIN-Data (overfitting?)
        y_train_pred = search.predict(X_train)
        acc_train = accuracy_score(y_train, y_train_pred)
        score_new["train_acc_score"].append(acc_train)

        # keep scores of model (optional)
        score_new["best_score_of_CV"].append(search.best_score_)
        score_new["best_params"].append(search.best_params_)

        print(i, model)
        print("end")
    return pd.DataFrame(score_new)



In [193]:
pd.DataFrame(score_new)


Unnamed: 0,model,test_acc_score,train_acc_score,best_score_of_CV,best_params
0,rforest,0.704319,0.812084,0.707583,"{'randomforestclassifier__warm_start': False, ..."
1,knn,0.717608,0.85837,0.720059,"{'randomforestclassifier__warm_start': False, ..."
2,dt,0.738649,0.892461,0.735303,"{'randomforestclassifier__warm_start': True, '..."
3,rforest,0.715393,0.863359,0.720335,"{'randomforestclassifier__warm_start': True, '..."
4,knn,0.747508,0.926829,0.742233,"{'randomforestclassifier__warm_start': True, '..."
5,rforest,0.743079,0.928215,0.74639,"{'randomforestclassifier__warm_start': True, '..."
6,rforest,0.744186,0.93459,0.745283,"{'randomforestclassifier__warm_start': True, '..."
7,rforest,0.681063,0.764412,0.694281,"{'randomforestclassifier__warm_start': True, '..."
8,rforest,0.710963,0.86031,0.723104,"{'randomforestclassifier__warm_start': False, ..."
9,rforest,0.717608,0.863359,0.723108,"{'randomforestclassifier__warm_start': True, '..."


In [None]:


    ### testing accuracy
    y_test_pred = search.predict(X_test)
    acc_test = accuracy_score(y_test, y_test_pred)
    scores_test_data[i] = acc_test

    #keep scores of TRAIN-Data (overfitting?)
    y_train_pred = search.predict(X_train)
    acc_train = accuracy_score(y_train, y_train_pred)
    scores_train_data[i] = acc_train

    # keep scores of model (optional)
    scores[i] = search.best_score_
    best_params[i] = search.best_params_




In [181]:
#search.fit(X_train, y_train)

score_new = {
    "model": [],
    "test_acc_score": [],
    "train_acc_score": [],
    "best_score_of_CV": [],
    "best_params": []
}


In [178]:

score_new["model"].append(i)

### testing accuracy
y_test_pred = search.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)
score_new["test_acc_score"].append(acc_test)

#keep scores of TRAIN-Data (overfitting?)
y_train_pred = search.predict(X_train)
acc_train = accuracy_score(y_train, y_train_pred)
score_new["train_acc_score"].append(acc_train)

# keep scores of model (optional)
score_new["best_score_of_CV"].append(search.best_score_)
score_new["best_params"].append(search.best_params_)

pd.DataFrame(score_new)



Unnamed: 0,model,test_acc_score,train_acc_score,best_score_of_CV,best_params
0,Ada,0.724252,0.891353,0.736692,"{'randomforestclassifier__warm_start': False, ..."


In [170]:
scores_new = {
"dataset": [],
"preprocessing": [],
"algorithm": [], 
"test_acc_score" : [],
"train_acc_score" : [],
#"best_params": [],
"best_score_CV": [],
}



for i in score_new: 
    scores_new["dataset"].append(dataset)
    scores_new["preprocessing"].append(preprocessing)
    scores_new["algorithm"].append(i)
    scores_new["test_acc_score"].append(scores_test_data[i])
    scores_new["train_acc_score"].append(scores_train_data[i])
#    score["best_params"].append()
    scores_new["best_score_CV"].append(scores[i])

In [198]:

# set models, pass param_grid for preprocessor-pipeline
list_of_models = ["rforest", "knn", "dt"] #"Ada", "SVC", "GaussPrC", "MLP"
param_grid_preproc = {}

score_new = evaluate_model(data, list_of_models, param_grid_preproc)




Fitting 10 folds for each of 10 candidates, totalling 100 fits




{'randomforestclassifier__warm_start': False, 'randomforestclassifier__n_estimators': 200, 'randomforestclassifier__min_samples_leaf': 3, 'randomforestclassifier__max_depth': 12, 'randomforestclassifier__criterion': 'gini'}
rforest RandomForestClassifier()
end
Fitting 10 folds for each of 10 candidates, totalling 100 fits




{'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__n_neighbors': 4}
knn KNeighborsClassifier()
end
Fitting 10 folds for each of 10 candidates, totalling 100 fits




{'decisiontreeclassifier__min_samples_leaf': 2, 'decisiontreeclassifier__max_depth': 9, 'decisiontreeclassifier__criterion': 'entropy'}
dt DecisionTreeClassifier()
end


In [199]:
#description of the current round
dataset = "reduced,18labels"   #"reduced,19labels"
preprocessing = "only_numeric_cols,SimpleImputer_Mean"  #"all_columns,SimpleImputer_Mean,OneHot"

score_new["dataset"] = dataset
score_new["preprocessing"] = preprocessing

pd.concat([score_new, score_all], ignore_index=True)

Unnamed: 0,model,test_acc_score,train_acc_score,best_score_of_CV,best_params,dataset,preprocessing
0,rforest,0.704319,0.812084,0.707583,"{'randomforestclassifier__warm_start': False, ...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
1,knn,0.717608,0.85837,0.720059,"{'randomforestclassifier__warm_start': False, ...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
2,dt,0.738649,0.892461,0.735303,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
3,rforest,0.715393,0.863359,0.720335,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
4,knn,0.747508,0.926829,0.742233,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
5,rforest,0.743079,0.928215,0.74639,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
6,rforest,0.744186,0.93459,0.745283,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
7,rforest,0.681063,0.764412,0.694281,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
8,rforest,0.710963,0.86031,0.723104,"{'randomforestclassifier__warm_start': False, ...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
9,rforest,0.717608,0.863359,0.723108,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean"


Unnamed: 0,model,test_acc_score,train_acc_score,best_score_of_CV,best_params,dataset,preprocessing,best_score_CV
0,rforest,0.704319,0.812084,0.707583,"{'randomforestclassifier__warm_start': False, ...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean",
1,knn,0.717608,0.85837,0.720059,"{'randomforestclassifier__warm_start': False, ...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean",
2,dt,0.738649,0.892461,0.735303,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean",
3,rforest,0.715393,0.863359,0.720335,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean",
4,knn,0.747508,0.926829,0.742233,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean",
5,rforest,0.743079,0.928215,0.74639,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean",
6,rforest,0.744186,0.93459,0.745283,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean",
7,rforest,0.681063,0.764412,0.694281,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean",
8,rforest,0.710963,0.86031,0.723104,"{'randomforestclassifier__warm_start': False, ...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean",
9,rforest,0.717608,0.863359,0.723108,"{'randomforestclassifier__warm_start': True, '...","reduced,18labels","only_numeric_cols,SimpleImputer_Mean",


## Evaluating scores

In [209]:
##preparing to collect results:
score = {
"dataset": [],
"preprocessing": [],
"model": [], 
"test_acc_score" : [],
"train_acc_score" : [],
"best_params": [],
"best_score_CV": [],
}

dict

In [210]:
#description of the current round
dataset = "reduced,18labels"   #"reduced,19labels"
preprocessing = "only_numeric_cols,SimpleImputer_Mean"  #"all_columns,SimpleImputer_Mean,OneHot"
scores_test_data



{'dtree': 0.6378737541528239,
 'knn': 0.6788482834994463,
 'rforest': 0.7043189368770764,
 'Ada': 0.7109634551495017}

In [211]:
for i in scores_test_data: 
    score["dataset"].append(dataset)
    score["preprocessing"].append(preprocessing)
    score["model"].append(i)
    score["test_acc_score"].append(scores_test_data[i])
    score["train_acc_score"].append(scores_train_data[i])
    score["best_params"].append(best_params[i])
    score["best_score_CV"].append(scores[i])



In [213]:
#score
score = pd.DataFrame(score)

In [115]:
#store results
pd.DataFrame(score).to_csv("all_scores.csv")


