# Bicycle classification - Pipeline, RandomizedSearch, CrossValidation

## 1. Import and split

In [102]:
import pandas as pd
from Functions import BIKED_classif_functions as BIKED
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier     #Depth-8 Decision Tree
from sklearn.neighbors import KNeighborsClassifier  #K-Neighbors
from sklearn.ensemble import RandomForestClassifier #Random Forest


from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn import set_config


For other Classification models see here:

In [103]:
#more models used by the author:

from sklearn.ensemble import AdaBoostClassifier        #AdaBoost
from sklearn.svm import SVC                            #Support Vector Clf.

from sklearn.gaussian_process import GaussianProcessClassifier     #Gaussian Pr. Clf.
from sklearn.gaussian_process.kernels import ConstantKernel, RBF
from sklearn.neural_network import MLPClassifier                   #3-layer Neural Net, #6-Layer Neural Net
                                                                    

#other models:
#from sklearn.gaussian_process.kernels import RBF
#from sklearn.naive_bayes import GaussianNB
#from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

### 1.1. Training data

In [104]:
# Read the reduced BIKED dataset; the first CSV column becomes the row index.
data = pd.read_csv("Data/BIKED_reduced.csv", index_col=0)


# Optional alternative (currently disabled): swap in the processed dataset and
# copy the BIKESTYLE label over from the reduced one. Uncomment to use it.
#processed = pd.read_csv("Data/BIKED_processed.csv", index_col=0)
#processed["BIKESTYLE"] = data["BIKESTYLE"]
#print(processed.shape, data.shape)
#data = processed

(4512, 2402) (4512, 1320)


In [105]:

# Class distribution of the label: absolute count and share of all rows per
# BIKESTYLE value (19 categories, one of them has only one member).
class_counts = data["BIKESTYLE"].value_counts()
# Name the count column explicitly. pandas >= 2.0 names the result of
# value_counts() "count" instead of after the source column, so relying on an
# implicit "BIKESTYLE" column (classes.BIKESTYLE) breaks on newer versions.
classes = class_counts.to_frame(name="BIKESTYLE")
classes["percentage"] = (class_counts / data.shape[0] * 100).round(2)


In [106]:
# (rows, columns) of the loaded dataset, label column included.
data.shape

(4512, 2402)

* label `BIKESTYLE` has 19 categories
* class `FAT` has only one row
* classes `CRUISER`, `HYBRID`, `TRIALS`, `GRAVEL`, `CARGO`, `CHILDRENS`, and `FAT` are extremely underrepresented, each with < 1% of total rows
* If all classes were evenly distributed, each of them would hold about 5 % of the rows

In [107]:
# Drop rare classes via the project helper. The threshold is the helper's
# default and is not visible in this notebook (presumably 0.1 %, per the
# original note - TODO confirm against Functions/BIKED_classif_functions).
# NOTE: this cell needs the BIKESTYLE column, so it has to run before the split
# cell pops the label out of `data`; re-running it afterwards fails with the
# AttributeError shown in the output.
data = BIKED.remove_classes_with_less_than_x_percent(data)
data.BIKESTYLE.value_counts()


AttributeError: 'DataFrame' object has no attribute 'BIKESTYLE'

In [100]:
# Split off the label. `pop` removes BIKESTYLE from `data` in place, so this
# cell can only run once per data load - a second run raises KeyError.
y = data.pop("BIKESTYLE")

KeyError: 'BIKESTYLE'

In [6]:
# Feature matrix: `data` after the label has been popped. X is the same object
# as `data`, not a copy.
X = data

In [7]:
# Count missing values per column and keep only the columns that have any.
missing_values_col = (
    X.isna()
    .sum()
    .to_frame(name="Missing_values_Count")
    .loc[lambda counts: counts["Missing_values_Count"] > 0, :]
)

# Number of affected columns, followed by the per-column counts.
print(missing_values_col.shape[0])
missing_values_col



35


Unnamed: 0,Missing_values_Count
HBAR type,219
Headset type,91
Dropout model,73
Top tube angle textfield,1
Seatpost type,166
Stack,73
SIZE,774
dropoutInsert,680
Cranks type,115
Dim A Inch TextField,2


**Conclusion:**
* There are 35 columns with missing values.
* Column `SIZE` has 774 missing values. Interesting, because `SIZE` results in 555 columns after OneHotEncoding.
* Columns `bottle xxx` have many missing values. Might be useful to put them into fewer columns. i.e. `bottle DOWNTUBE` and `bottle SEATTUBE`

In [8]:
# Check dtypes: which columns are not numeric and need special handling?
X.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4511 entries, 0 to 4510
Columns: 1318 entries, SSSIDECX3 to RDBSD
dtypes: bool(150), float64(978), int64(153), object(37)
memory usage: 40.9+ MB


In [9]:
# Columns stored with dtype object; these would need encoding before modelling.
categorical_cols = X.select_dtypes(include="object").columns
print(categorical_cols.size)
categorical_cols

# Same check for the boolean columns (disabled):
#boolean_cols = X.select_dtypes(include="bool").columns
#print(len(boolean_cols))
#boolean_cols

37


Index(['HBAR type', 'Headset type', 'Rack type', 'DIRECTMOUNTFDTYPE',
       'Dropout model', 'HEADTUBEtype', 'BATTERYmount', 'Seatpost type',
       'FRONTbrake type', 'Stem type', 'Pump size', 'MATERIAL',
       'FRONTdiscB type', 'SIZE', 'StaticDropout', 'RIM_STYLE front',
       'SPROCKETS type', 'PhBlock typeDOWNTUBE', 'GEARBOXtype',
       'dropoutInsert', 'REARdiscB type', 'Cranks type', 'MOUNT_TYPE',
       'Yoke type', 'RIM_STYLE rear', 'Brake lever brand', 'Pedals type',
       'Wheel choice rear', 'REARbrake type', 'CLAMPFDTYPE',
       'Wheel choice front', 'PhBlock typeTOPTUBE', 'DROPOUT STYLE',
       'Fit scheme', 'Fork choice', 'Saddle type', 'BRAZEonFDTYPE'],
      dtype='object')

**Conclusion:** 

* There are 187 non-numeric columns, of which 150 are boolean.
* There are 37 object columns, which need to be encoded (OneHot or Ordinal).




LIST OF COLUMNS:
OBJECT: 
'HBAR type', 'Headset type', 'Rack type', 'DIRECTMOUNTFDTYPE',
       'Dropout model', 'HEADTUBEtype', 'BATTERYmount', 'Seatpost type',
       'FRONTbrake type', 'Stem type', 'Pump size', 'MATERIAL',
       'FRONTdiscB type', 'SIZE', 'StaticDropout', 'RIM_STYLE front',
       'SPROCKETS type', 'PhBlock typeDOWNTUBE', 'GEARBOXtype',
       'dropoutInsert', 'REARdiscB type', 'Cranks type', 'MOUNT_TYPE',
       'Yoke type', 'RIM_STYLE rear', 'Brake lever brand', 'Pedals type',
       'Wheel choice rear', 'REARbrake type', 'CLAMPFDTYPE',
       'Wheel choice front', 'PhBlock typeTOPTUBE', 'DROPOUT STYLE',
       'Fit scheme', 'Fork choice', 'Saddle type', 'BRAZEonFDTYPE'

BOOL: (just a few)
'REARDiscAdaptOnPost', 'Seatpost AERO', 'DOWNTUBE1SnSCheck',
       'USEgearbox', 'REARROTOR_INCLUDE', 'nSeat stay Curv Check',
       'CSSIDEISBENT', 'LINKpump2SCHEME', 'SEATSTAYholeCheck', 'LOOPED_STAYS',
       ...
       'Female', 'Seatpost MAST', 'CASS_aux', 'DUAL_CROWN',
       'REARDiscTabIsPost', 'CHEVRONDOWNTUBE1ON', 'CHEVRONTOPTUBE1ON',
       'Dim A CheckBox', 'bottle SEATTUBE0 show', 'bottle DOWNTUBE0 show'

In [10]:
# Which object columns also contain missing values? One boolean per entry of
# `categorical_cols`; True means the column appears in `missing_values_col`.
# Several do, so the preprocessor must impute them before encoding.
# Open question: how did the dataset's author handle these?

categorical_cols.isin(missing_values_col.index)
#boolean_cols.isin(missing_values_col.index)

array([ True,  True, False, False,  True, False, False,  True, False,
       False, False, False, False,  True, False, False, False, False,
       False,  True, False,  True, False, False, False, False, False,
        True, False, False,  True, False, False, False,  True,  True,
       False])

In [11]:
# Shape of the feature matrix before the numeric-only selection.
X.shape

(4511, 1318)

In [12]:
# This time take only numerical columns.
# Per the X.info() output above and the shape printed below, this drops the 37
# object columns and also the 150 bool columns (1318 -> 1131 columns).
X = X.select_dtypes(include="number").copy()
X.shape

(4511, 1131)

In [13]:
# Train / Test Split: 80 % / 20 % with a fixed seed.
# NOTE(review): the split is not stratified although the classes are very
# imbalanced - consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=31416)

## 2. Building `preprocessor` pipeline

In [14]:
# Peek at two training rows; some columns contain NaN and need imputing.
X_train.head(2)

Unnamed: 0,SSSIDECX3,SSSIDECX2,SSSIDECX1,SSSIDECY2,SSSIDECY1,STEMBENDS,FRONTROTORBOLTS,Shoe up angle,Rollout units,Down tube front diameter,...,bottle DOWNTUBE0 WBL,bottle DOWNTUBE0 WBD,bottle SEATTUBE0 CAGE,bottle DOWNTUBE0 flip,bottle DOWNTUBE0 X,bottle DOWNTUBE0 CAGE,rockerPosition,RDERD,FDERD,RDBSD
1845,65.5,130.4,304.2,-10.0,10.0,0,6,39,0,38.1,...,201.0,73.5,1.0,0.0,200.0,1.0,50.0,92.0,92.0,52.0
3510,71.8,134.9,350.0,-10.0,10.0,0,6,36,0,58.0,...,,,,,,,,92.0,92.0,52.0


In [15]:
# 2.a) Select categorical and numerical columns
# NOTE: X was already reduced to numeric columns above, so X_cat has no
# columns here and X_num holds every column of X.
X_cat = X.select_dtypes(exclude="number").copy()
X_num = X.select_dtypes(include="number").copy()


In [16]:

# 2.b) Numerical pipeline: a single SimpleImputer step.
# NOTE(review): the original heading said strategy="mean", but the code uses
# strategy="constant". With no fill_value given, scikit-learn documents that
# numeric data is filled with 0. Confirm which strategy is intended.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="constant")   # alternative: strategy="mean"
    #KNNImputer(5)
)


In [17]:
# 2.c) Categorical Pipeline
# NOTE(review): the original note here said "NOT EXECUTED YET", but the
# `preprocessor` defined at the end of this cell is what evaluate_model puts
# into the pipeline, so this cell has to run.
# X_cat has no columns in this round (X is numeric-only), so the categorical
# branch below receives an empty column selection.

# # # i. Defining the categorical encoder
# NOTE(review): none of these names occur in the object columns listed for this
# dataset; they look carried over from another project - TODO replace with
# BIKED columns or remove.
ordinal_col_names = ['ExterQual', 
                     'ExterCond', 
                     'BsmtQual', 
                     'BsmtCond',
                     'BsmtExposure', 
                     'BsmtFinType1', 
                     'KitchenQual', 
                     'FireplaceQu', 
                     'LotShape', 
                     'BsmtFinType2',
                     'HeatingQC',
                     'GarageFinish', 
                     'GarageQual', 
                     'GarageCond', 
                     'PoolQC', 
                     'Fence']

# Positional indices are used because the encoder sits behind a SimpleImputer
# inside `categoric_pipe` (presumably because the imputer outputs a plain
# array without column names - TODO confirm). Per the pandas documentation,
# Index.get_indexer returns -1 for names that are not found.
ordinal_cols = X_cat.columns.get_indexer(ordinal_col_names)
# Every remaining categorical column is one-hot encoded. The order comes from a
# set difference and is therefore not guaranteed.
onehot_cols = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_col_names)))

# # # ii. Defining values for Ordinal Encoding
# One ordered category list per entry of `ordinal_col_names`, in the same order.
ExterQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
ExterCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
BsmtExposure_cats = ["NA", "No", "Mn", "Av", "Gd"]
BsmtFinType1_cats = ["NA", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
KitchenQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
FireplaceQu_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
LotShape_cats = ["NA", 'Reg', 'IR1', 'IR2', 'IR3']
BsmtFinType2_cats = ['NA','Unf','LwQ','Rec','BLQ','ALQ','GLQ']
HeatingQC_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageFinish_cats = ['NA','Unf','RFn','Fin']
GarageQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
GarageCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
PoolQC_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
Fence_cats = ['NA','MnWw','GdWo','MnPrv','GdPrv']

ordinal_cats_list = [ExterQual_cats, ExterCond_cats, BsmtQual_cats, BsmtCond_cats, 
                    BsmtExposure_cats, BsmtFinType1_cats, KitchenQual_cats, FireplaceQu_cats, 
                    LotShape_cats, BsmtFinType2_cats, HeatingQC_cats, GarageFinish_cats, GarageQual_cats,
                    GarageCond_cats, PoolQC_cats, Fence_cats]


# # # iii. Defining the ColumnTransformer with 2 branches: ordinal & onehot (categorical encoder)
# The ordinal branch is disabled, so `ordinal_cols` and `ordinal_cats_list`
# are currently unused; only the one-hot branch is active.
categorical_encoder = ColumnTransformer(
    transformers=[
#        ("cat_ordinal", OrdinalEncoder(categories=ordinal_cats_list), ordinal_cols),
        ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), onehot_cols),
    ]
)

# # # iv. Categorical pipeline = "NA" imputer + categorical encoder
# Missing categories are replaced by the literal string "NA" before encoding.
categoric_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="NA"),
                                 categorical_encoder
                                )

# 2.d) Preprocessor
# Routes numeric columns through `numeric_pipe` and categorical columns through
# `categoric_pipe`. evaluate_model uses this object as the first pipeline step.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ]
)

In [18]:
# Size of the training split (rows, feature columns).
X_train.shape

(3608, 1131)

**EXPLANATION FOR CATEGORICAL PIPELINE: --2.c)--**

**Start from the bottom! (# # # iv.):**
* `categoric_pipe` = make_pipeline(SimpleImputer(), `categorical_encoder`)



**needs `categorical_encoder` (# # # iii.):**
* `categorical_encoder` = ColumnTransformer(transformers=[
    * ("cat_ordinal", OrdinalEncoder(categories=`ordinal_cats_list`), `ordinal_cols`), 
    * ("cat_onehot", OneHotEncoder(handle_unknown="ignore"), `onehot_cols`)])


**needs `ordinal_cats_list` (# # # ii.) and needs `ordinal_cols` and `onehot_cols` (# # # i.):**

(\# # # i.) `ordinal_cols` and `onehot_cols`:
* ordinal_col_names = ['ExterQual', '...']
* `ordinal_cols` = X_cat.columns.get_indexer(ordinal_col_names)
* `onehot_cols` = X_cat.columns.get_indexer(list(set(X_cat) - set(ordinal_col_names)))


(\# # # ii.) `ordinal_cats_list`: define values and then make a list:
* ExterQual_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
* ExterCond_cats = ["NA", "Po", "Fa", "TA", "Gd", "Ex"]
* etc.
* `ordinal_cats_list` = [ExterQual_cats, ExterCond_cats, ...]

# 3. Evaluating Models

In [73]:
from sklearn.ensemble import AdaBoostClassifier        #AdaBoost
from sklearn.svm import SVC                            #Support Vector Clf.

from sklearn.gaussian_process import GaussianProcessClassifier     #Gaussian Pr. Clf.
from sklearn.neural_network import MLPClassifier                   #3-layer Neural Net, #6-Layer Neural Net
from sklearn.gaussian_process.kernels import ConstantKernel, RBF

def get_model_and_params(model_name, param_grid):
    """Return an unfitted classifier and the search space for its hyperparameters.

    Parameters
    ----------
    model_name : str
        One of "rforest", "knn", "dt", "Ada", "GaussPrC" or "mlp".
    param_grid : dict
        Search space collected so far (e.g. preprocessing parameters). The
        model-specific entries are added to this dict in place; pass a copy
        if the original must stay untouched.

    Returns
    -------
    tuple
        ``(model, param_grid)``; ``param_grid`` is the same dict object that
        was passed in, extended with the model's parameters.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. Previously this
        case surfaced as an UnboundLocalError on the return statement.

    Notes
    -----
    Keys follow the ``<step name>__<parameter>`` convention, with the step
    name being the lowercased class name, as assigned by ``make_pipeline``.
    """
    if model_name == "rforest":
        model = RandomForestClassifier()
        param_grid.update({"randomforestclassifier__warm_start": [True, False],
                "randomforestclassifier__n_estimators": [100, 200],
                "randomforestclassifier__max_depth": range(2, 14),
                "randomforestclassifier__min_samples_leaf": range(2, 10),
                "randomforestclassifier__criterion":["gini", "entropy"]})
    elif model_name == "knn":
        model = KNeighborsClassifier()
        param_grid.update({"kneighborsclassifier__n_neighbors": range(2, 50),
                "kneighborsclassifier__weights": ["uniform", "distance"]})
    elif model_name == "dt":
        model = DecisionTreeClassifier()
        param_grid.update({"decisiontreeclassifier__max_depth": range(2, 14),
                "decisiontreeclassifier__min_samples_leaf": range(2, 12),
#               "decisiontreeclassifier__min_samples_split": range(3, 40, 2),
                "decisiontreeclassifier__criterion":["gini", "entropy"]})
    elif model_name == "Ada":
        model = AdaBoostClassifier()
        param_grid.update({#"adaboostclassifier__estimator": [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=3)],
                "adaboostclassifier__n_estimators": [50, 80],
                "adaboostclassifier__learning_rate": [0.5, 1.0, 1.5, 2.0, 2.5]})
    # SVC is disabled: runtime too long, stopped after 2 hours.
    # If re-enabled, note that class_weight must be None, not the string "None".
    #elif model_name == "SVC":
    #    model = SVC()
    #    param_grid.update({"svc__kernel": ["linear", "poly", "rbf", "sigmoid"],
    #            "svc__class_weight": ["balanced", None]})
    elif model_name == "GaussPrC":
        model = GaussianProcessClassifier(1.0 * RBF(1.0))
        param_grid.update({"gaussianprocessclassifier__multi_class": ["one_vs_rest", "one_vs_one"]})
    elif model_name == "mlp":
        model = MLPClassifier()
        # (200,) is a one-element tuple: a single hidden layer of 200 units.
        # The original `(200)` was just the int 200.
        param_grid.update({"mlpclassifier__hidden_layer_sizes": [(200,), (200, 200)],
            "mlpclassifier__max_iter": [3000, 5000]})
    else:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of "
            '"rforest", "knn", "dt", "Ada", "GaussPrC", "mlp".'
        )

    return model, param_grid




In [68]:

def evaluate_model(list_of_models, param_grid_preproc, random_state=None):
    """Tune and score each requested model with a randomized search.

    For every name in ``list_of_models`` a pipeline
    ``preprocessor -> StandardScaler(with_mean=False) -> model`` is built and
    tuned with ``RandomizedSearchCV`` (5 folds, 5 sampled candidates, accuracy).
    The refitted best estimator is then scored on the test and training data.

    Relies on the notebook-level names ``preprocessor``, ``X_train``,
    ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    list_of_models : iterable of str
        Model names understood by ``get_model_and_params``.
    param_grid_preproc : dict
        Search space for the preprocessing steps. It is shared by all models
        and is not modified.
    random_state : int or None, default None
        Passed to ``RandomizedSearchCV`` to make the sampling of candidates
        repeatable. It does not seed the models themselves.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``test_acc_score``,
        ``train_acc_score``, ``best_score_of_CV`` and ``best_params``.
    """
    score_new = {
        "model": [],
        "test_acc_score": [],
        "train_acc_score": [],
        "best_score_of_CV": [],
        "best_params": []
    }

    for model_name in list_of_models:
        # Give every model its own copy of the preprocessing grid.
        # get_model_and_params extends the dict it receives in place, so
        # handing over the caller's dict would change it, and (as the old
        # clear() call did) drop the preprocessing parameters for every model
        # after the first one.
        model, param_grid = get_model_and_params(model_name, dict(param_grid_preproc))

        #create the full_pipeline
        full_pipe = make_pipeline(preprocessor,
                                  StandardScaler(with_mean=False),
                                  model)

        # define cross validation
        search = RandomizedSearchCV(full_pipe,
                                    param_grid,
                                    cv=5,
                                    verbose=1,
                                    scoring="accuracy",
                                    n_jobs=-2,
                                    n_iter=5,
                                    random_state=random_state)

        # fit
        search.fit(X_train, y_train)
        print(search.best_params_)

        score_new["model"].append(model_name)

        # accuracy on the held-out test data
        y_test_pred = search.predict(X_test)
        score_new["test_acc_score"].append(accuracy_score(y_test, y_test_pred))

        # accuracy on the training data (a large gap to the test score hints at overfitting)
        y_train_pred = search.predict(X_train)
        score_new["train_acc_score"].append(accuracy_score(y_train, y_train_pred))

        # keep the cross-validation result of the best candidate
        score_new["best_score_of_CV"].append(search.best_score_)
        score_new["best_params"].append(search.best_params_)

        print(model_name, model)
        print("end")
    return pd.DataFrame(score_new)



In [74]:
# Choose the models to evaluate and the search space for the preprocessing steps.
# Supported names: "rforest", "knn", "dt", "Ada", "GaussPrC", "mlp"
# ("SVC" is commented out in get_model_and_params).
list_of_models = ["mlp"]

# No preprocessing hyperparameters are searched in this round.
param_grid_preproc = {}

score_new = evaluate_model(list_of_models, param_grid_preproc)


Fitting 5 folds for each of 4 candidates, totalling 20 fits




{'mlpclassifier__max_iter': 3000, 'mlpclassifier__hidden_layer_sizes': 200}
mlp MLPClassifier()
end


# Keep track of scores

In [76]:
# Set initialize_all_scores = True when you want to start over with an empty table.
initialize_all_scores = False

# Create the empty results table on request, and also when none exists yet.
# Without the second condition a fresh kernel (flag left at False) never
# defines `score_all`, and the concat cell further down fails with a NameError.
if initialize_all_scores or "score_all" not in globals():
    score_all = pd.DataFrame({
        "model": [],
        "test_acc_score": [],
        "train_acc_score": [],
        "best_params": [],
        "best_score_of_CV": [],
        "dataset": [],
        "preprocessing": [],
    })

In [77]:
score_new
# Results of the current round (one row per evaluated model).
# Note: AdaBoost - an ensemble method that combines weak learners rather than a
# standalone model - performs really badly on this data.

Unnamed: 0,model,test_acc_score,train_acc_score,best_score_of_CV,best_params
0,mlp,0.100775,0.105044,0.346657,"{'mlpclassifier__max_iter': 3000, 'mlpclassifi..."


In [78]:
# Describe the current round so rows from different experiments can be told apart.
# (score_all is created only once, in the initialisation cell.)
dataset = "reduced,18labels"   # alternative: "reduced,19labels"
# NOTE(review): the label says "SimpleImputer_Mean", but numeric_pipe is built
# with SimpleImputer(strategy="constant") - confirm which one is correct.
preprocessing = "only_numeric_cols,SimpleImputer_Mean"  # alternative: "all_columns,SimpleImputer_Mean,OneHot"

# Tag every row of this round with both labels.
score_new["dataset"] = dataset
score_new["preprocessing"] = preprocessing


In [79]:

# Append this round's results to the running table. Re-running this cell
# appends the same rows again.
score_all = pd.concat([score_all, score_new], ignore_index=True)

In [80]:
# All rounds collected so far.
score_all

Unnamed: 0,model,test_acc_score,train_acc_score,best_params,best_score_of_CV,dataset,preprocessing
0,rforest,0.704319,0.807373,"{'randomforestclassifier__warm_start': True, '...",0.70897,"reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
1,knn,0.640089,0.809035,"{'kneighborsclassifier__weights': 'uniform', '...",0.634143,"reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
2,dt,0.665559,0.860865,{'decisiontreeclassifier__min_samples_leaf': 2...,0.668788,"reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
3,rforest,0.714286,0.836197,"{'randomforestclassifier__warm_start': True, '...",0.713129,"reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
4,knn,0.668882,0.999169,"{'kneighborsclassifier__weights': 'distance', ...",0.672114,"reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
5,dt,0.630122,0.828991,{'decisiontreeclassifier__min_samples_leaf': 5...,0.668786,"reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
6,Ada,0.51495,0.533259,"{'adaboostclassifier__n_estimators': 80, 'adab...",0.51386,"reduced,18labels","only_numeric_cols,SimpleImputer_Mean"
7,mlp,0.100775,0.105044,"{'mlpclassifier__max_iter': 3000, 'mlpclassifi...",0.346657,"reduced,18labels","only_numeric_cols,SimpleImputer_Mean"


In [115]:
# Persist the collected scores; the row index is written as the first CSV column.
score_all.to_csv("all_scores.csv")


