In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import mlflow
import mlflow.sklearn

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Load and Explore data

In [2]:
df = pd.read_csv('../assignment_3/mushroom2020_dataset.csv')

df.head()

Unnamed: 0,id,label,cap-shape,cap-surface,bruises,odor,gill-attachment,gill-spacing,gill-size,stalk-shape,...,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate,gill-color-rate,veil-color-rate,stalk-color-above-ring-rate,stalk-color-below-ring-rate
0,1,p,x,s,t,p,f,c,n,e,...,o,p,k,s,u,1.0,3.0,1.0,1.0,1.0
1,2,e,x,s,t,a,f,c,b,e,...,o,p,n,n,g,2.0,3.0,1.0,1.0,1.0
2,3,e,b,s,t,l,f,c,b,e,...,o,p,n,n,m,3.0,1.0,1.0,1.0,1.0
3,4,p,x,y,t,p,f,c,n,e,...,o,p,k,s,u,3.0,1.0,1.0,1.0,1.0
4,5,e,x,s,f,n,f,w,b,t,...,o,e,n,a,g,4.0,3.0,1.0,1.0,1.0


In [3]:
target_col = 'gill-size'

na_amt = df[df[target_col].isna()].shape[0]

print(f"Column {target_col} has {na_amt} rows that be NaN")

Column gill-size has 121 rows that be NaN


In [4]:
drop_col = ['id','gill-attachment', 'gill-spacing', 'gill-size','gill-color-rate','stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', \
            'stalk-color-above-ring-rate','stalk-color-below-ring-rate','veil-color-rate','veil-type']
df.drop(columns=drop_col, inplace=True)

df.dropna(subset=['label'], inplace=True)

df.reset_index()

df

Unnamed: 0,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,p,x,s,t,p,e,o,p,k,s,u,1.0
1,e,x,s,t,a,e,o,p,n,n,g,2.0
2,e,b,s,t,l,e,o,p,n,n,m,3.0
3,p,x,y,t,p,e,o,p,k,s,u,3.0
4,e,x,s,f,n,t,o,e,n,a,g,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5819,e,k,s,f,n,e,o,p,b,c,l,1.0
5820,e,x,s,f,n,e,o,p,b,v,l,1.0
5821,e,f,s,f,n,e,o,p,b,c,l,1.0
5822,p,k,y,f,y,t,o,e,w,v,l,1.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   label              5764 non-null   object 
 1   cap-shape          5764 non-null   object 
 2   cap-surface        5737 non-null   object 
 3   bruises            5665 non-null   object 
 4   odor               5665 non-null   object 
 5   stalk-shape        5643 non-null   object 
 6   ring-number        5702 non-null   object 
 7   ring-type          5702 non-null   object 
 8   spore-print-color  5708 non-null   object 
 9   population         5708 non-null   object 
 10  habitat            5733 non-null   object 
 11  cap-color-rate     5737 non-null   float64
dtypes: float64(1), object(11)
memory usage: 585.4+ KB


In [6]:
print(f"Shape of data is : {df.shape}")

missed_col = []

for col in df.columns:
    na_amt = df[col].isna().sum()
    if na_amt != 0:
        print(f"For column {col:20}has {na_amt:4} rows that be NaN ({100*na_amt/df.shape[0]:3.4f}%)")
        missed_col.append(col)

Shape of data is : (5764, 12)
For column cap-surface         has   27 rows that be NaN (0.4684%)
For column bruises             has   99 rows that be NaN (1.7176%)
For column odor                has   99 rows that be NaN (1.7176%)
For column stalk-shape         has  121 rows that be NaN (2.0992%)
For column ring-number         has   62 rows that be NaN (1.0756%)
For column ring-type           has   62 rows that be NaN (1.0756%)
For column spore-print-color   has   56 rows that be NaN (0.9715%)
For column population          has   56 rows that be NaN (0.9715%)
For column habitat             has   31 rows that be NaN (0.5378%)
For column cap-color-rate      has   27 rows that be NaN (0.4684%)


# Process data

In [7]:
df

Unnamed: 0,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,p,x,s,t,p,e,o,p,k,s,u,1.0
1,e,x,s,t,a,e,o,p,n,n,g,2.0
2,e,b,s,t,l,e,o,p,n,n,m,3.0
3,p,x,y,t,p,e,o,p,k,s,u,3.0
4,e,x,s,f,n,t,o,e,n,a,g,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5819,e,k,s,f,n,e,o,p,b,c,l,1.0
5820,e,x,s,f,n,e,o,p,b,v,l,1.0
5821,e,f,s,f,n,e,o,p,b,c,l,1.0
5822,p,k,y,f,y,t,o,e,w,v,l,1.0


In [8]:
num_cols = ['cap-color-rate']
cat_cols = ['cap-shape', 'cap-surface', 'bruises', 'odor', 'stalk-shape', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']

num_pl = Pipeline(steps=[('impute', SimpleImputer(strategy='mean')),
                         ('scale', StandardScaler())])

cat_pl = Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')),
                         ('one-hot', OneHotEncoder(drop='first'))])

column_transform = ColumnTransformer(transformers=[('num_pl', num_pl, num_cols),
                                                   ('cat_pl', cat_pl, cat_cols)], n_jobs=-1, remainder='drop')

column_transform

In [9]:
label = 'label'
features = list(df.columns)
features.remove(label)

X_train, X_test, y_train, y_test = train_test_split(df[features], 
                                                    df[label], 
                                                    test_size=0.2,
                                                    stratify=df[label],
                                                    random_state=2020)

print(f"X_train shape : {X_train.shape}")
print(f"X_test shape : {X_test.shape}")
print(f"y_train shape : {y_train.shape}")
print(f"y_test shape : {y_test.shape}")

X_train shape : (4611, 11)
X_test shape : (1153, 11)
y_train shape : (4611,)
y_test shape : (1153,)


# Fit model and track with MLFlow

In [10]:
class MLFlowPipelinedRF(Pipeline):

    def fit(self, X, y, **kwargs):
        super().fit(X, y)

        with mlflow.start_run():
            params = super().get_params()
            mlflow.log_param('criterion', params['model__criterion'])
            mlflow.log_param('max_depth', params['model__max_depth'])
            mlflow.log_param('min_samples_leaf', params['model__min_samples_leaf'])
            mlflow.log_param('n_estimators', params['model__n_estimators'])
            
            score = super().score(X, y)
            mlflow.log_metric('accuracy', score)

            mlflow.sklearn.log_model(self, 'model')

In [11]:
model = MLFlowPipelinedRF(steps=[('col_trans', column_transform),
                                 ('model', RandomForestClassifier())])

model.get_params()

{'memory': None,
 'steps': [('col_trans', ColumnTransformer(n_jobs=-1,
                     transformers=[('num_pl',
                                    Pipeline(steps=[('impute', SimpleImputer()),
                                                    ('scale', StandardScaler())]),
                                    ['cap-color-rate']),
                                   ('cat_pl',
                                    Pipeline(steps=[('impute',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('one-hot',
                                                     OneHotEncoder(drop='first'))]),
                                    ['cap-shape', 'cap-surface', 'bruises', 'odor',
                                     'stalk-shape', 'ring-number', 'ring-type',
                                     'spore-print-color', 'population',
                                     'habitat'])])),
  ('model', RandomFo

In [12]:
param_grid = {'model__criterion': ['gini','entropy'],
              'model__max_depth': [2,3,5,6,8],
              'model__min_samples_leaf':[2,5,7,10],
              'model__n_estimators':[100,150,200],
              'model__random_state': [2020]
}

best_model = GridSearchCV(model, param_grid)

local_registry = 'sqlite:///mlruns.db'
mlflow.set_tracking_uri(local_registry)
mlflow.set_experiment('grid_search')

best_model.fit(X_train, y_train)

best_model.best_params_

2024/02/11 16:53:13 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/02/11 16:53:13 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

{'model__criterion': 'gini',
 'model__max_depth': 2,
 'model__min_samples_leaf': 2,
 'model__n_estimators': 100,
 'model__random_state': 2020}

In [13]:
best_model_df = mlflow.search_runs(order_by=['metrics.accuracy DESC'], max_results=5)

best_model_df

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.accuracy,params.n_estimators,params.max_depth,params.criterion,params.min_samples_leaf,tags.mlflow.runName,tags.mlflow.source.type,tags.mlflow.log-model.history,tags.mlflow.user,tags.mlflow.source.name
0,522004517f154716991e6cfe47350ab6,1,FINISHED,/Users/jirayuwat/Desktop/2110446-DS-and-DE/ass...,2024-02-11 10:06:31.746000+00:00,2024-02-11 10:06:33.070000+00:00,0.999458,200,8,entropy,2,burly-shrimp-612,LOCAL,"[{""run_id"": ""522004517f154716991e6cfe47350ab6""...",jirayuwat,/Users/jirayuwat/anaconda3/envs/DSandDE/lib/py...
1,428c5a17324a473caa9ee0df2f82431f,1,FINISHED,/Users/jirayuwat/Desktop/2110446-DS-and-DE/ass...,2024-02-11 10:06:16.221000+00:00,2024-02-11 10:06:17.489000+00:00,0.999458,100,8,entropy,2,capable-crab-254,LOCAL,"[{""run_id"": ""428c5a17324a473caa9ee0df2f82431f""...",jirayuwat,/Users/jirayuwat/anaconda3/envs/DSandDE/lib/py...
2,0d05f61bdba144c6b5873e4b9374151b,1,FINISHED,/Users/jirayuwat/Desktop/2110446-DS-and-DE/ass...,2024-02-11 10:06:33.289000+00:00,2024-02-11 10:06:34.161000+00:00,0.999187,200,8,entropy,2,casual-bass-916,LOCAL,"[{""run_id"": ""0d05f61bdba144c6b5873e4b9374151b""...",jirayuwat,/Users/jirayuwat/anaconda3/envs/DSandDE/lib/py...
3,cb7106c7a07e49e186f6687df637d4b2,1,FINISHED,/Users/jirayuwat/Desktop/2110446-DS-and-DE/ass...,2024-02-11 10:06:20.775000+00:00,2024-02-11 10:06:21.648000+00:00,0.999187,100,8,entropy,2,masked-chimp-577,LOCAL,"[{""run_id"": ""cb7106c7a07e49e186f6687df637d4b2""...",jirayuwat,/Users/jirayuwat/anaconda3/envs/DSandDE/lib/py...
4,f2d9dd5a23554d5fb7b38f54d83b48ad,1,FINISHED,/Users/jirayuwat/Desktop/2110446-DS-and-DE/ass...,2024-02-11 10:06:19.389000+00:00,2024-02-11 10:06:20.271000+00:00,0.999187,100,8,entropy,2,clumsy-stag-838,LOCAL,"[{""run_id"": ""f2d9dd5a23554d5fb7b38f54d83b48ad""...",jirayuwat,/Users/jirayuwat/anaconda3/envs/DSandDE/lib/py...


In [14]:
best_model = mlflow.sklearn.load_model(model_uri=best_model_df.iloc[0].artifact_uri+'/model')

best_model

In [15]:
!mlflow ui --port 5001 --backend-store-uri sqlite:///mlruns.db

[2024-02-11 17:07:47 +0700] [29183] [INFO] Starting gunicorn 21.2.0
[2024-02-11 17:07:47 +0700] [29183] [INFO] Listening at: http://127.0.0.1:5001 (29183)
[2024-02-11 17:07:47 +0700] [29183] [INFO] Using worker: sync
[2024-02-11 17:07:47 +0700] [29184] [INFO] Booting worker with pid: 29184
[2024-02-11 17:07:47 +0700] [29185] [INFO] Booting worker with pid: 29185
[2024-02-11 17:07:47 +0700] [29186] [INFO] Booting worker with pid: 29186
[2024-02-11 17:07:47 +0700] [29187] [INFO] Booting worker with pid: 29187
^C
[2024-02-11 17:10:07 +0700] [29183] [INFO] Handling signal: int
[2024-02-11 17:10:07 +0700] [29186] [INFO] Worker exiting (pid: 29186)
[2024-02-11 17:10:07 +0700] [29185] [INFO] Worker exiting (pid: 29185)
[2024-02-11 17:10:07 +0700] [29184] [INFO] Worker exiting (pid: 29184)
[2024-02-11 17:10:07 +0700] [29187] [INFO] Worker exiting (pid: 29187)


# Evaluate

In [16]:
model = best_model

model

In [17]:
y_pred = model.predict(X_test)

In [18]:
print(classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

           e      0.998     1.000     0.999       421
           p      1.000     0.999     0.999       732

    accuracy                          0.999      1153
   macro avg      0.999     0.999     0.999      1153
weighted avg      0.999     0.999     0.999      1153



In [19]:
print(f'\t   False    True')
for idx, row in enumerate(confusion_matrix(y_test, y_pred)):
    print(['Negative','Positive'][idx], end='')
    for element in row:
        print(f"{element:8}", end='')
    print()

	   False    True
Negative     421       0
Positive       1     731
