In [49]:
%pip install mlflow 

Note: you may need to restart the kernel to use updated packages.


In [53]:
#Import the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import mlflow
import mlflow.sklearn


In [30]:
# 1
df = pd.read_csv('https://raw.githubusercontent.com/pvateekul/2110446_DSDE_2023s2/main/code/Week03_ML/mushroom2020_dataset.csv')

In [31]:
# na in gill-size
df["gill-size"].isnull().sum()

121

In [32]:
# 2
df = df.dropna(subset=['label'])

In [33]:
# 3
df = df.drop(columns=['id','gill-attachment', 'gill-spacing', 'gill-size','gill-color-rate',
                      'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
                      'stalk-color-above-ring-rate','stalk-color-below-ring-rate','veil-color-rate','veil-type'])

In [34]:
# 4
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   label              5764 non-null   object 
 1   cap-shape          5764 non-null   object 
 2   cap-surface        5737 non-null   object 
 3   bruises            5665 non-null   object 
 4   odor               5665 non-null   object 
 5   stalk-shape        5643 non-null   object 
 6   ring-number        5702 non-null   object 
 7   ring-type          5702 non-null   object 
 8   spore-print-color  5708 non-null   object 
 9   population         5708 non-null   object 
 10  habitat            5733 non-null   object 
 11  cap-color-rate     5737 non-null   float64
dtypes: float64(1), object(11)
memory usage: 585.4+ KB
None


In [35]:
# 5
for col in df.columns:
    if df[col].dtype == np.number:
        df[col].fillna(df[col].mean(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)


  if df[col].dtype == np.number:


In [36]:
df

Unnamed: 0,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,p,x,s,t,p,e,o,p,k,s,u,1.0
1,e,x,s,t,a,e,o,p,n,n,g,2.0
2,e,b,s,t,l,e,o,p,n,n,m,3.0
3,p,x,y,t,p,e,o,p,k,s,u,3.0
4,e,x,s,f,n,t,o,e,n,a,g,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5819,e,k,s,f,n,e,o,p,b,c,l,1.0
5820,e,x,s,f,n,e,o,p,b,v,l,1.0
5821,e,f,s,f,n,e,o,p,b,c,l,1.0
5822,p,k,y,f,y,t,o,e,w,v,l,1.0


In [37]:
# 6
df['label'] = df['label'].map({'e': 1, 'p': 0})
print(df['label'].value_counts())


label
0    3660
1    2104
Name: count, dtype: int64


In [38]:
# 7
df = pd.get_dummies(df, drop_first=True)


In [39]:
# 8
X = df.drop('label', axis=1)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2020)
print("training set:", y_train.shape[0])
print("testing set:", y_test.shape[0])

training set: 4611
testing set: 1153


In [40]:
# 9
param_grid = {
    'criterion':['gini','entropy'],
    'max_depth': [2,3,6],
    'min_samples_leaf':[2,5,10],
    'n_estimators':[100,200],
    'random_state': [2020]
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)


In [41]:
print(grid_search.best_params_)

{'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 2, 'n_estimators': 100, 'random_state': 2020}


In [42]:
# 10
y_pred = grid_search.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[731   1]
 [  5 416]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       732
           1       1.00      0.99      0.99       421

    accuracy                           0.99      1153
   macro avg       1.00      0.99      0.99      1153
weighted avg       0.99      0.99      0.99      1153



In [45]:
# pipeline in sklearn
num_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = X_train.select_dtypes(include=['object']).columns.tolist()

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state=2020))])


In [46]:
param_grid = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [2, 3, 6],
    'classifier__min_samples_leaf': [2, 5, 10],
    'classifier__n_estimators': [100, 200]
}

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [47]:
print(grid_search.best_params_)

{'classifier__criterion': 'gini', 'classifier__max_depth': 2, 'classifier__min_samples_leaf': 2, 'classifier__n_estimators': 100}


In [48]:
y_pred = grid_search.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[728   4]
 [418   3]]
              precision    recall  f1-score   support

           0       0.64      0.99      0.78       732
           1       0.43      0.01      0.01       421

    accuracy                           0.63      1153
   macro avg       0.53      0.50      0.39      1153
weighted avg       0.56      0.63      0.50      1153



In [51]:
# MLflow
mlflow.set_tracking_uri("file:///Users/inwpuun/mlruns")

In [54]:
with mlflow.start_run():
    mlflow.log_param("max_depth", 2)
    mlflow.log_param("n_estimators", 100)
    
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', RandomForestClassifier(max_depth=2, n_estimators=100, random_state=2020))])
    clf.fit(X_train, y_train)
    mlflow.sklearn.log_model(clf, "model")
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)
    
    print(f"Model accuracy: {accuracy}")

Model accuracy: 0.6339982653946227
