# Scikit-learn Iris Dataset experiment

In [23]:
import pandas as pd
from sklearn.datasets import load_iris

def load_data():
    """
    Load the scikit-learn Iris dataset as a pandas DataFrame.

    output: dataframe with one column per entry in the dataset's
            `feature_names`, followed by a 'target' column holding the
            dataset's `target` values
    """
    bunch = load_iris()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    # assign() appends 'target' as the last column of a new frame
    return features.assign(target=bunch.target)

In [32]:
# Build the Iris dataframe (feature columns plus a 'target' column) used by the cells below.
df = load_data()

In [165]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd

def preprocess_iris_dataframe(df, target_column):
    """
    Standardise the feature columns and label-encode the target column.

    input: dataframe and the name of its target column
    output: (preprocessed copy of the dataframe, fitted LabelEncoder)

    The input dataframe is left untouched; all work happens on a copy.
    The fitted StandardScaler is not returned.
    """
    result = df.copy()

    # Every column except the target is treated as a feature
    is_feature = result.columns != target_column
    feature_names = result.columns[is_feature]

    # Fit the scaler on the feature columns and replace them with the scaled values
    result[feature_names] = StandardScaler().fit_transform(result[feature_names])

    # Fit the encoder on the target column and replace it with the encoded labels
    encoder = LabelEncoder()
    result[target_column] = encoder.fit_transform(result[target_column])

    return result, encoder

In [166]:
# Standardise the features and label-encode 'target'; the fitted LabelEncoder is discarded.
# NOTE(review): this rebinds `df`, so the unscaled frame is no longer reachable afterwards,
# and the scaler is fitted on every row before any train/test split is made.
df, _ = preprocess_iris_dataframe(df, 'target')

In [167]:
import pandas as pd
from sklearn.model_selection import train_test_split

def split_data(df,test_ratio,random_state):
    """
    Split a dataframe into a train part and a test part.

    input: dataframe, the test_size passed to train_test_split, and the seed
    output: (train, test) tuple
    """
    train_part, test_part = train_test_split(
        df,
        test_size=test_ratio,
        random_state=random_state,
    )
    return train_part, test_part

In [168]:
# Hold out 20% of the rows as a test set, with a fixed seed
train, test = split_data(df, test_ratio=0.2, random_state=42)

In [169]:
# Display the shapes of the train part, the test part and the full frame
train.shape, test.shape, df.shape

((120, 5), (30, 5), (150, 5))

In [170]:
def get_feat_and_target(df,target):
    """
    Split a dataframe into its feature columns and its target values.

    input: dataframe and the name of the target column
    output: (x, y) where x is the dataframe without the target column and
            y is a flat 1-D numpy array of the target column's values
    """
    features = df.drop(columns=target)
    # Select as a one-column frame, then flatten (n, 1) -> (n,)
    labels = df[[target]].to_numpy().ravel()
    return features, labels

In [171]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

def train_and_evaluate(df, test_ratio=0.7, random_state=42, max_depth=3,
                       n_estimators=50, min_f1=0.9, tracking_uri=None):
    """
    Train a RandomForestClassifier on `df`, log the run to MLflow and return its run ID.

    input:
        df            dataframe containing the feature columns and a 'target' column
        test_ratio    test_size handed to split_data (default 0.7)
        random_state  seed used for both the split and the forest
        max_depth     max_depth of the forest
        n_estimators  number of trees in the forest
        min_f1        macro F1 the model must reach on the test part before it is
                      also saved locally under models/<run_id>
        tracking_uri  optional MLflow tracking URI; when None the tracking
                      configuration already in effect is left alone
    output: run ID (string) of the MLflow run created by this call

    The model is always logged to the run as the artifact "random_forest_model",
    so it can be loaded with runs:/<run_id>/random_forest_model.
    """
    # NOTE(review): the default holds out 70% of the rows for testing, while the
    # stand-alone split cell in this notebook uses 0.2 - confirm which is intended.
    train, test = split_data(df, test_ratio=test_ratio, random_state=random_state)
    train_x, train_y = get_feat_and_target(train, target='target')
    test_x, test_y = get_feat_and_target(test, target='target')

    ################### MLFLOW ###############################
    if tracking_uri is not None:
        mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment("Iris RandomForest Experiment")

    with mlflow.start_run() as run:
        # Take the ID from the run we just opened. Searching for "the latest run
        # with f1 >= 0.9" could return an older run, or nothing at all.
        run_id = run.info.run_id

        rf_classifier = RandomForestClassifier(
            max_depth=max_depth,
            n_estimators=n_estimators,
            random_state=random_state,
        )
        rf_classifier.fit(train_x, train_y)

        y_pred = rf_classifier.predict(test_x)

        # Calculate metrics (macro average: every class weighs the same)
        accuracy = accuracy_score(test_y, y_pred)
        precision = precision_score(test_y, y_pred, average='macro')
        recall = recall_score(test_y, y_pred, average='macro')
        f1score = f1_score(test_y, y_pred, average='macro')

        # Log parameters, metrics, and the model
        mlflow.log_params({
            "max_depth": max_depth,
            "n_estimators": n_estimators,
            "test_ratio": test_ratio,
            "random_state": random_state,
        })
        mlflow.log_metrics({
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1score,
        })
        mlflow.sklearn.log_model(rf_classifier, "random_forest_model")

        # Keep a local copy only when the model clears the F1 threshold.
        # save_model takes the fitted estimator itself, not a runs:/ URI.
        if f1score >= min_f1:
            mlflow.sklearn.save_model(rf_classifier, f"models/{run_id}")

        print(f"Run with max_depth={max_depth}, n_estimators={n_estimators} logged with accuracy={accuracy}")

    return run_id

In [172]:
# Train on the preprocessed frame, log the run to MLflow and display the returned run ID
train_and_evaluate(df)



Run with max_depth=3, n_estimators=50 logged with accuracy=0.9238095238095239


'335616595d86472e8c58b2d3222c492b'

In [175]:
# NOTE(review): this run ID is pasted by hand from the output of the training cell,
# so it only resolves against the tracking store that holds that run.
model_id = '335616595d86472e8c58b2d3222c492b'

# Fetch the random forest logged under that run
model = mlflow.sklearn.load_model("runs:/" + model_id + "/random_forest_model")

# Score one row of the preprocessed frame: position 120, first four columns.
# This row comes from `df` itself, so it is not unseen data.
predictions = model.predict(df.iloc[[120], 0:4])

# Show the predicted class label
print(predictions)

[2]


In [177]:
# Preview the row at position 10 (first four columns) as a one-row dataframe
df.iloc[[10], 0:4]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
10,-0.537178,1.479398,-1.283389,-1.315444
