In [1]:
import pandas as pd
from sklearn import metrics

In [2]:
# Read the penguin measurements from the CSV next to the notebook and
# preview the first five rows.
df = pd.read_csv("penguin.csv")
df.head(n=5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
# Count the missing values in each column.
df.isna().sum(axis=0)

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [4]:
# Print the frame summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
# Drop the two categorical columns that are not used as model features.
# Rebinding `df` (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for columns that are already gone.
df = df.drop(columns=["island", "sex"], errors="ignore")

In [6]:
# Remove, in place, every row that still has at least one missing value.
df.dropna(axis=0, how="any", inplace=True)

In [7]:
# Preview the first five rows again after the column and row drops.
df.head(n=5)

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,39.1,18.7,181.0,3750.0
1,Adelie,39.5,17.4,186.0,3800.0
2,Adelie,40.3,18.0,195.0,3250.0
4,Adelie,36.7,19.3,193.0,3450.0
5,Adelie,39.3,20.6,190.0,3650.0


In [8]:
import mlflow
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics

import pickle
from pathlib import Path

# Switch on MLflow autologging before any estimator is fit.
mlflow.autolog()

# Features are every remaining column except the label; the label is the species.
x = df.drop(columns = ['species'])
y = df['species']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no information from the test split leaks into preprocessing.
scaler = StandardScaler()

x_train_rescaled = pd.DataFrame(scaler.fit_transform(x_train), columns = x_train.columns, index = x_train.index)

x_test_rescaled = pd.DataFrame(scaler.transform(x_test), columns = x_test.columns, index = x_test.index)

# Persist the fitted scaler to the path that the experiment runs upload with
# mlflow.log_artifact. Writing it here guarantees the file exists on a fresh
# kernel and that the logged artifact is the scaler actually used above.
scaler_path = Path("models/standard_scaler.pkl")
scaler_path.parent.mkdir(parents = True, exist_ok = True)
with scaler_path.open("wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

2023/05/05 14:23:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [9]:
# KNN baseline: one MLflow run carrying the tags, the hyperparameter, the
# test accuracy, the fitted model and the scaler file.
with mlflow.start_run():
    mlflow.set_tags({"dev": "Harshit", "algo": "KNN"})

    k = 53
    # NOTE(review): the notebook reads 'penguin.csv' but logs
    # 'data/penguin.csv' here; confirm which path is intended.
    mlflow.log_params({"data-path": "data/penguin.csv", "n_neighbors": k})

    knn_classifier = KNeighborsClassifier(n_neighbors = k)
    knn_classifier.fit(x_train_rescaled, y_train)

    # Score on the held-out, rescaled test rows.
    y_test_pred = knn_classifier.predict(x_test_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)

    mlflow.sklearn.log_model(knn_classifier, artifact_path = "models")
    # Uploads the scaler pickle; the file must already exist at this local path.
    mlflow.log_artifact("models/standard_scaler.pkl")



In [10]:
# Logistic-regression baseline: tags, regularisation strength, test accuracy,
# fitted model and scaler file are recorded in a single MLflow run.
with mlflow.start_run():
    mlflow.set_tags({"dev": "Harshit", "algo": "Logit"})

    C = 0.1
    # NOTE(review): the notebook reads 'penguin.csv' but logs
    # 'data/penguin.csv' here; confirm which path is intended.
    mlflow.log_params({"data-path": "data/penguin.csv", "C": C})

    lr_classifier = LogisticRegression(C = C)
    lr_classifier.fit(x_train_rescaled, y_train)

    # Score on the held-out, rescaled test rows.
    y_test_pred = lr_classifier.predict(x_test_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)

    mlflow.sklearn.log_model(lr_classifier, artifact_path="models")
    # Uploads the scaler pickle; the file must already exist at this local path.
    mlflow.log_artifact("models/standard_scaler.pkl")

In [11]:
# Gaussian naive Bayes baseline (no tuned hyperparameters): tags, test
# accuracy, fitted model and scaler file are recorded in a single MLflow run.
with mlflow.start_run():
    mlflow.set_tags({"dev": "Harshit", "algo": "GaussianNB"})

    # NOTE(review): the notebook reads 'penguin.csv' but logs
    # 'data/penguin.csv' here; confirm which path is intended.
    mlflow.log_param("data-path", "data/penguin.csv")

    nb_classifier = GaussianNB()
    nb_classifier.fit(x_train_rescaled, y_train)

    # Score on the held-out, rescaled test rows.
    y_test_pred = nb_classifier.predict(x_test_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)

    mlflow.sklearn.log_model(nb_classifier, artifact_path="models")
    # Uploads the scaler pickle; the file must already exist at this local path.
    mlflow.log_artifact("models/standard_scaler.pkl")

In [12]:
# Decision-tree baseline: tags, hyperparameters, test accuracy, fitted model
# and scaler file are recorded in a single MLflow run.
with mlflow.start_run():
    mlflow.set_tag("dev", "Harshit")
    mlflow.set_tag("algo", "DecisionTree")
    # NOTE(review): the notebook reads 'penguin.csv' but logs
    # 'data/penguin.csv' here; confirm which path is intended.
    mlflow.log_param("data-path", "data/penguin.csv")
    depth = 3
    mlflow.log_param("max_depth", depth)
    # scikit-learn permutes the candidate features at every split, so ties
    # between equally good splits can resolve differently from run to run.
    # Fixing the seed (same value as the train/test split) makes the logged
    # accuracy reproducible.
    mlflow.log_param("random_state", 42)
    dt_classifier = DecisionTreeClassifier(max_depth = depth, random_state = 42)
    dt_classifier.fit(x_train_rescaled, y_train)
    # Score on the held-out, rescaled test rows.
    y_test_pred = dt_classifier.predict(x_test_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(dt_classifier, artifact_path="models")
    # Uploads the scaler pickle; the file must already exist at this local path.
    mlflow.log_artifact("models/standard_scaler.pkl")

In [13]:
# Support-vector baseline: tags, regularisation strength, test accuracy,
# fitted model and scaler file are recorded in a single MLflow run.
with mlflow.start_run():
    mlflow.set_tags({"dev": "Harshit", "algo": "SVM"})

    C = 0.1
    # NOTE(review): the notebook reads 'penguin.csv' but logs
    # 'data/penguin.csv' here; confirm which path is intended.
    mlflow.log_params({"data-path": "data/penguin.csv", "C": C})

    sv_classifier = SVC(C=C)
    sv_classifier.fit(x_train_rescaled, y_train)

    # Score on the held-out, rescaled test rows.
    y_test_pred = sv_classifier.predict(x_test_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)

    mlflow.sklearn.log_model(sv_classifier, artifact_path="models")
    # Uploads the scaler pickle; the file must already exist at this local path.
    mlflow.log_artifact("models/standard_scaler.pkl")

In [14]:
from sklearn.model_selection import GridSearchCV

# Enable scikit-learn autologging for the search. Per the MLflow docs,
# max_tuning_runs=None records a child run for every candidate instead of
# only the best few.
mlflow.sklearn.autolog(max_tuning_runs=None)

try:
    with mlflow.start_run():
        # 50 neighbourhood sizes x 2 Minkowski powers (p=1 Manhattan,
        # p=2 Euclidean) = 100 candidates, each scored with 5-fold CV.
        tuned_parameters = [{'n_neighbors': list(range(1, 51)), 'p': [1, 2]}]

        clf = GridSearchCV(
            estimator = KNeighborsClassifier(),
            param_grid = tuned_parameters,
            scoring = 'accuracy',
            cv = 5,
            return_train_score = True,
            verbose = 1
        )
        # KNN is distance-based, so search on the standardised features used
        # by every other run; on the raw columns body_mass_g (thousands)
        # would dominate the distance.
        # NOTE(review): the scaler was fit on the whole training split, so
        # the CV folds share scaling statistics; wrap the scaler and KNN in a
        # Pipeline if fold-level isolation is required.
        clf.fit(x_train_rescaled, y_train)
finally:
    # Always switch autologging back off, even when the search fails, so
    # later cells are not silently autologged.
    mlflow.sklearn.autolog(disable=True)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
