# Model Performance Transformations

Lets practice some basic data transformation for ML performance enhancement

In [1]:
# Imports

import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Categorical data analyser

def cat_var(df, cols):
    '''
    Return: a Pandas dataframe object with the following columns:
        - "categorical_variable" => every categorical variable include as an input parameter (string).
        - "number_of_possible_values" => the amount of unique values that can take a given categorical variable (integer).
        - "values" => a list with the posible unique values for every categorical variable (list).

    Input parameters:
        - df -> Pandas dataframe object: a dataframe with categorical variables.
        - cols -> list object: a list with the name (string) of every categorical variable to analyse.
    '''
    cat_list = []
    for col in cols:
        cat = df[col].unique()
        cat_num = len(cat)
        cat_dict = {"categorical_variable":col,
                    "number_of_possible_values":cat_num,
                    "values":cat}
        cat_list.append(cat_dict)
    df = pd.DataFrame(cat_list).sort_values(by="number_of_possible_values", ascending=False)
    return df.reset_index(drop=True)

## Scaling

Some ML algorithms have problems performing well whenever the data scale differ greatly between features. In those cases scaling the data is your best option.

- [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler)

- [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler)

Try both options and see what happens with performance (i.e.: AUC).

<img src="../images/scaling.png" alt="Drawing" style="width: 500px;"/>

In [3]:
# Weather dataset (https://www.kaggle.com/jsphyg/weather-dataset-rattle-package)

weather = pd.read_csv('../data/weatherAUS.csv')
print(weather.shape)
weather.head()

(145460, 23)


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [4]:
# Uluru weather (numerical features)

weather = weather[weather['Location'].isin(['Uluru'])].reset_index(drop=True)
weather = weather[weather['RainToday'].isin(['No','Yes'])].reset_index(drop=True)
weather = weather[weather['RainTomorrow'].isin(['No','Yes'])]
weather = weather[['MinTemp',
                   'MaxTemp',
                   'Rainfall',
                   'WindSpeed9am',
                   'WindSpeed3pm',
                   'Humidity9am',
                   'Humidity3pm',
                   'Pressure9am',
                   'Pressure3pm',
                   'Temp9am',
                   'Temp3pm',
                   'RainTomorrow']]
weather = weather.dropna().reset_index(drop=True)
col_weather = list(weather.columns)
print(col_weather)
print(weather.shape)
print(weather.describe())
weather.head()

['MinTemp', 'MaxTemp', 'Rainfall', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm', 'RainTomorrow']
(1479, 12)
           MinTemp      MaxTemp     Rainfall  WindSpeed9am  WindSpeed3pm  \
count  1479.000000  1479.000000  1479.000000   1479.000000   1479.000000   
mean     14.368627    30.402299     0.716700     17.613928     17.050710   
std       7.432857     7.624058     4.208585      7.887082      6.893016   
min      -1.900000    11.300000     0.000000      0.000000      0.000000   
25%       8.100000    23.800000     0.000000     11.000000     11.000000   
50%      14.900000    31.200000     0.000000     17.000000     17.000000   
75%      20.800000    37.100000     0.000000     24.000000     22.000000   
max      31.000000    44.400000    83.800000     41.000000     48.000000   

       Humidity9am  Humidity3pm  Pressure9am  Pressure3pm      Temp9am  \
count  1479.000000  1479.000000  1479.000000  1479.000000  1479.0

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainTomorrow
0,19.7,30.0,0.8,30.0,24.0,76.0,54.0,1010.6,1007.5,21.7,28.4,No
1,21.6,33.1,0.0,22.0,11.0,44.0,33.0,1010.5,1006.5,24.6,31.3,No
2,21.3,36.1,0.0,24.0,13.0,39.0,27.0,1006.9,1002.7,27.6,34.5,No
3,22.9,37.7,0.0,28.0,13.0,35.0,22.0,1006.0,1002.1,28.7,35.4,No
4,24.0,39.0,0.0,20.0,19.0,33.0,21.0,1006.9,1003.5,29.9,37.3,No


In [5]:
# Features + target

X = weather[['MinTemp',
          'MaxTemp',
          'Rainfall',
          'WindSpeed9am',
          'WindSpeed3pm',
          'Humidity9am',
          'Humidity3pm',
          'Pressure9am',
          'Pressure3pm',
          'Temp9am',
          'Temp3pm']]
y = pd.get_dummies(weather['RainTomorrow'], drop_first=True)['Yes']
print(X.shape,y.shape)

(1479, 11) (1479,)


In [6]:
# Train + test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (1183, 11), X_test: (296, 11), y_train: (1183,), y_test: (296,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [7]:
# Scaling using Standarization
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X)
scaled_data

array([[ 0.71751357, -0.05278487,  0.01979946, ..., -0.90039756,
         0.09167267, -0.09181875],
       [ 0.97322179,  0.35396027, -0.17035247, ..., -1.05506695,
         0.4660711 ,  0.30207985],
       [ 0.93284681,  0.7475846 , -0.17035247, ..., -1.64281064,
         0.85337982,  0.73672658],
       ...,
       [-1.44927714, -0.66946299, -0.17035247, ...,  0.89376738,
        -1.30263873, -0.62154446],
       [-1.20702724, -0.45953001, -0.17035247, ...,  0.53802778,
        -1.09607408, -0.40422109],
       [-0.88402739, -0.4464092 , -0.17035247, ...,  0.49162696,
        -0.76040652, -0.4178038 ]])

In [8]:
scaled_df = pd.DataFrame(scaled_data, columns=['MinTemp',
          'MaxTemp',
          'Rainfall',
          'WindSpeed9am',
          'WindSpeed3pm',
          'Humidity9am',
          'Humidity3pm',
          'Pressure9am',
          'Pressure3pm',
          'Temp9am',
          'Temp3pm'])
scaled_df.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm
count,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0
mean,-3.4605330000000006e-17,-1.142877e-16,7.172161e-16,8.955349000000001e-17,-2.951587e-16,1.337297e-16,1.472556e-16,-1.995008e-14,-2.044229e-14,-6.981118e-18,5.43101e-16
std,1.000338,1.000338,1.000338,1.000338,1.000338,1.000338,1.000338,1.000338,1.000338,1.000338,1.000338
min,-2.189485,-2.506377,-0.1703525,-2.234018,-2.474458,-1.712349,-1.28117,-3.504662,-3.730847,-2.064346,-2.794778
25%,-0.8436524,-0.8662752,-0.1703525,-0.838861,-0.8780999,-0.7905785,-0.6650588,-0.7514505,-0.7457282,-0.8765991,-0.8456592
50%,0.07151386,0.1046649,-0.1703525,-0.07786606,-0.007359201,-0.1914278,-0.2337808,0.01073801,-0.01878203,0.05294179,0.09833919
75%,0.8655552,0.8787927,-0.1703525,0.8099614,0.7182581,0.6842539,0.3823307,0.7418168,0.7081641,0.8792004,0.8725537
max,2.238305,1.836612,19.74806,2.966114,4.491468,2.66606,4.695111,2.530627,2.54873,1.989485,1.836926


In [9]:
X_esc = scaled_df[['MinTemp',
          'MaxTemp',
          'Rainfall',
          'WindSpeed9am',
          'WindSpeed3pm',
          'Humidity9am',
          'Humidity3pm',
          'Pressure9am',
          'Pressure3pm',
          'Temp9am',
          'Temp3pm']]

In [10]:
X_train_esc, X_test_esc, y_train_esc, y_test_esc = train_test_split(X_esc, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (1183, 11), X_test: (296, 11), y_train: (1183,), y_test: (296,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [11]:
# Scaling using Normalization
scaler = MinMaxScaler()
scaled_data_nor = scaler.fit_transform(X)
scaled_data_nor

array([[0.65653495, 0.56495468, 0.00954654, ..., 0.45073892, 0.53184713,
        0.58357771],
       [0.71428571, 0.65861027, 0.        , ..., 0.42610837, 0.62420382,
        0.6686217 ],
       [0.70516717, 0.74924471, 0.        , ..., 0.33251232, 0.71974522,
        0.76246334],
       ...,
       [0.16717325, 0.42296073, 0.        , ..., 0.7364532 , 0.18789809,
        0.46920821],
       [0.2218845 , 0.47129909, 0.        , ..., 0.67980296, 0.2388535 ,
        0.51612903],
       [0.29483283, 0.47432024, 0.        , ..., 0.67241379, 0.32165605,
        0.51319648]])

In [12]:
scaled_df_nor = pd.DataFrame(scaled_data_nor, columns=['MinTemp',
          'MaxTemp',
          'Rainfall',
          'WindSpeed9am',
          'WindSpeed3pm',
          'Humidity9am',
          'Humidity3pm',
          'Pressure9am',
          'Pressure3pm',
          'Temp9am',
          'Temp3pm'])
scaled_df_nor.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm
count,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0
mean,0.494487,0.577109,0.008553,0.429608,0.355223,0.391089,0.214376,0.580695,0.594124,0.509233,0.603402
std,0.225923,0.230334,0.050222,0.192368,0.143604,0.228471,0.167385,0.165748,0.1593,0.246764,0.215976
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.303951,0.377644,0.0,0.268293,0.229167,0.210526,0.103093,0.456186,0.475369,0.292994,0.420821
50%,0.510638,0.601208,0.0,0.414634,0.354167,0.347368,0.175258,0.582474,0.591133,0.522293,0.624633
75%,0.68997,0.779456,0.0,0.585366,0.458333,0.547368,0.278351,0.703608,0.706897,0.726115,0.791789
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
X_esc_nor = scaled_df_nor[['MinTemp',
          'MaxTemp',
          'Rainfall',
          'WindSpeed9am',
          'WindSpeed3pm',
          'Humidity9am',
          'Humidity3pm',
          'Pressure9am',
          'Pressure3pm',
          'Temp9am',
          'Temp3pm']]

In [14]:
X_train_esc_nor, X_test_esc_nor, y_train_esc_nor, y_test_esc_nor = train_test_split(X_esc_nor, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (1183, 11), X_test: (296, 11), y_train: (1183,), y_test: (296,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [15]:
# Scaling using Robust
scaler = RobustScaler()
scaled_data_rob = scaler.fit_transform(X)
scaled_data_rob

array([[ 0.37795276, -0.09022556,  0.8       , ..., -0.60638298,
         0.02205882, -0.11067194],
       [ 0.52755906,  0.14285714,  0.        , ..., -0.71276596,
         0.23529412,  0.11857708],
       [ 0.50393701,  0.36842105,  0.        , ..., -1.11702128,
         0.45588235,  0.3715415 ],
       ...,
       [-0.88976378, -0.44360902,  0.        , ...,  0.62765957,
        -0.77205882, -0.41897233],
       [-0.7480315 , -0.32330827,  0.        , ...,  0.38297872,
        -0.65441176, -0.29249012],
       [-0.55905512, -0.31578947,  0.        , ...,  0.35106383,
        -0.46323529, -0.30039526]])

In [16]:
scaled_df_rob = pd.DataFrame(scaled_data_rob, columns=['MinTemp',
          'MaxTemp',
          'Rainfall',
          'WindSpeed9am',
          'WindSpeed3pm',
          'Humidity9am',
          'Humidity3pm',
          'Pressure9am',
          'Pressure3pm',
          'Temp9am',
          'Temp3pm'])
scaled_df_rob.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm
count,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0,1479.0
mean,-0.04184,-0.059978,0.7167,0.047225,0.00461,0.129796,0.223203,-0.007191,0.012918,-0.030153,-0.057233
std,0.585264,0.573237,4.208585,0.606699,0.626638,0.678272,0.955078,0.669899,0.688042,0.569734,0.582197
min,-1.322835,-1.496241,0.0,-1.307692,-1.545455,-1.03125,-1.0,-2.354167,-2.553191,-1.205882,-1.683794
25%,-0.535433,-0.556391,0.0,-0.461538,-0.545455,-0.40625,-0.411765,-0.510417,-0.5,-0.529412,-0.549407
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.464567,0.443609,0.0,0.538462,0.454545,0.59375,0.588235,0.489583,0.5,0.470588,0.450593
max,1.267717,0.992481,83.8,1.846154,2.818182,1.9375,4.705882,1.6875,1.765957,1.102941,1.011858


In [17]:
X_esc_rob = scaled_df_rob[['MinTemp',
          'MaxTemp',
          'Rainfall',
          'WindSpeed9am',
          'WindSpeed3pm',
          'Humidity9am',
          'Humidity3pm',
          'Pressure9am',
          'Pressure3pm',
          'Temp9am',
          'Temp3pm']]

In [18]:
X_train_esc_rob, X_test_esc_rob, y_train_esc_rob, y_test_esc_rob = train_test_split(X_esc_rob, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (1183, 11), X_test: (296, 11), y_train: (1183,), y_test: (296,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [19]:
# Linear model

linear_model = LogisticRegression(max_iter=1000, random_state=42)
linear_param = linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)
linear_auc = roc_auc_score(y_test, linear_pred)
print(f"Linear model AUC no scaling is: {linear_auc}")

linear_model_esc = LogisticRegression(max_iter=1000, random_state=42)
linear_param_esc = linear_model_esc.fit(X_train_esc, y_train_esc)
linear_pred_esc = linear_model_esc.predict(X_test_esc)
linear_auc_esc = roc_auc_score(y_test_esc, linear_pred_esc)
print(f"Linear model AUC Standarization scaling is: {linear_auc_esc}")

linear_model_esc_nor = LogisticRegression(max_iter=1000, random_state=42)
linear_param_esc_nor = linear_model_esc_nor.fit(X_train_esc_nor, y_train_esc_nor)
linear_pred_esc_nor = linear_model_esc_nor.predict(X_test_esc_nor)
linear_auc_esc_nor = roc_auc_score(y_test_esc_nor, linear_pred_esc_nor)
print(f"Linear model AUC Normalized scaling is: {linear_auc_esc_nor}")

linear_model_esc_rob = LogisticRegression(max_iter=1000, random_state=42)
linear_param_esc_rob = linear_model_esc_rob.fit(X_train_esc_rob, y_train_esc_rob)
linear_pred_esc_rob = linear_model_esc_rob.predict(X_test_esc_rob)
linear_auc_esc_rob = roc_auc_score(y_test_esc_rob, linear_pred_esc_rob)
print(f"Linear model AUC Robust scaling is: {linear_auc_esc_rob}")

Linear model AUC no scaling is: 0.7278168345050351
Linear model AUC Standarization scaling is: 0.6787953638609159
Linear model AUC Normalized scaling is: 0.6542846285388563
Linear model AUC Robust scaling is: 0.6787953638609159


In [20]:
# Ensemble model

ensemble_model = RandomForestClassifier(random_state=42)
ensemble_param = ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Linear model AUC no scaling is: {ensemble_auc}")

ensemble_model_esc = RandomForestClassifier(random_state=42)
ensemble_param_esc = ensemble_model_esc.fit(X_train_esc, y_train_esc)
ensemble_pred_esc = ensemble_model_esc.predict(X_test_esc)
ensemble_auc_esc = roc_auc_score(y_test_esc, ensemble_pred_esc)
print(f"Linear model AUC Standarization scaling is: {ensemble_auc_esc}")

ensemble_model_esc_nor = RandomForestClassifier(random_state=42)
ensemble_param_esc_nor = ensemble_model_esc_nor.fit(X_train_esc_nor, y_train_esc_nor)
ensemble_pred_esc_nor = ensemble_model_esc_nor.predict(X_test_esc_nor)
ensemble_auc_esc_nor = roc_auc_score(y_test_esc_nor, ensemble_pred_esc_nor)
print(f"Linear model AUC Normalization scaling is: {ensemble_auc_esc_nor}")

ensemble_model_esc_rob = RandomForestClassifier(random_state=42)
ensemble_param_esc_rob = ensemble_model_esc_rob.fit(X_train_esc_rob, y_train_esc_rob)
ensemble_pred_esc_rob = ensemble_model_esc_rob.predict(X_test_esc_rob)
ensemble_auc_esc_rob = roc_auc_score(y_test_esc_rob, ensemble_pred_esc_rob)
print(f"Linear model AUC Robust scaling is: {ensemble_auc_esc_rob}")

Linear model AUC no scaling is: 0.6715751472544176
Linear model AUC Standarization scaling is: 0.6715751472544176
Linear model AUC Normalization scaling is: 0.6715751472544176
Linear model AUC Robust scaling is: 0.6715751472544176


---

## Enconding

ML algorithms do not support categorical data. Therefore you need to find a way to transform categorical data into numerical. You must compare the results using both techniques: __One Hot Encoding__ or __Label Encoding__

- [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder)

- [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder)

<img src="../images/encoding.png" alt="Drawing" style="width: 500px;"/>

In [21]:
# Mushrooms dataset (https://www.kaggle.com/uciml/mushroom-classification)

mushrooms = pd.read_csv('../data/mushrooms.csv')
col_mushrooms = list(mushrooms.columns)
print(mushrooms.shape)
mushrooms.head()

(8124, 23)


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [22]:
# Features analysis

cat_mushrooms = cat_var(mushrooms, col_mushrooms)
cat_mushrooms

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,gill-color,12,"[k, n, g, p, w, h, u, e, b, r, y, o]"
1,cap-color,10,"[n, y, w, g, e, p, b, u, c, r]"
2,spore-print-color,9,"[k, n, u, h, w, r, o, y, b]"
3,odor,9,"[p, a, l, n, f, c, y, s, m]"
4,stalk-color-below-ring,9,"[w, p, g, b, n, e, y, o, c]"
5,stalk-color-above-ring,9,"[w, g, p, n, b, e, o, c, y]"
6,habitat,7,"[u, g, m, d, p, w, l]"
7,cap-shape,6,"[x, b, s, f, k, c]"
8,population,6,"[s, n, a, v, y, c]"
9,ring-type,5,"[p, e, l, f, n]"


In [23]:
mushrooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [27]:
# Features + target (encoding). IMPORTANT: you may pick any of the 2-labeled features as you target (choose wisely!!!)

mushrooms_features = mushrooms.drop('class', axis=1)
mushrooms_target = mushrooms['class']

mushrooms_features_dum = pd.get_dummies(mushrooms_features, drop_first=True)

mushrooms_target_dum = pd.get_dummies(mushrooms_target, drop_first=True)
mushrooms_target_dum

Unnamed: 0,p
0,1
1,0
2,0
3,1
4,0
...,...
8119,0
8120,0
8121,0
8122,1


In [28]:
# Train + test
X_train, X_test, y_train, y_test = train_test_split(mushrooms_features_dum, mushrooms_target_dum, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (6499, 95), X_test: (1625, 95), y_train: (6499, 1), y_test: (1625, 1)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.frame.DataFrame'>, y_test: <class 'pandas.core.frame.DataFrame'>


In [None]:
# Scaling









In [29]:
# Linear model

linear_model = LogisticRegression(max_iter=1000)
linear_param = linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)
linear_auc = roc_auc_score(y_test, linear_pred)
print(f"Linear model AUC is: {linear_auc}")

Linear model AUC is: 1.0


  y = column_or_1d(y, warn=True)


In [30]:
# Ensemble model

ensemble_model = RandomForestClassifier()
ensemble_param = ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Linear model AUC is: {ensemble_auc}")

  ensemble_param = ensemble_model.fit(X_train, y_train)


Linear model AUC is: 1.0


---

## Bonus

Now that you can grasp the potential of pre-processing your data...what would you do about the following dataset?

<img src="../images/bonus.jpg" alt="Drawing" style="width: 500px;"/>

In [None]:
# Netflix dataset (https://www.kaggle.com/shivamb/netflix-shows)

netflix = pd.read_csv('../data/netflix_titles.csv')
col_netflix = list(netflix.columns)
print(netflix.shape)
netflix.head()

In [None]:
# ML workflow -> ¿what would you do?










---