In [1]:
import pandas as pd
import math
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# Load Titanic dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv?raw=True'
df = pd.read_csv(url)

# Select number columns (features)
def select_number_columns(df: DataFrame) -> DataFrame:
    return df[['Age', 'Fare', 'Parch', 'Pclass', 'SibSp', 'Survived']]

# Fill missing values with the median
def fill_missing_values_with_median(df: DataFrame) -> DataFrame:
    for col in df.columns:
        values = sorted(df[col].dropna().tolist())
        median_value = values[math.floor(len(values) / 2)]
        df[[col]] = df[[col]].fillna(median_value)
    return df

# Function to transform data
def transform_df(df: DataFrame) -> DataFrame:
    print(f"Type of data before transformation: {type(df)}")
    assert isinstance(df, pd.DataFrame), f"Input data is not a DataFrame! Type: {type(df)}"
    transformed_df = fill_missing_values_with_median(select_number_columns(df))
    assert isinstance(transformed_df, pd.DataFrame), f"Output of transform_df is not a DataFrame! Type: {type(transformed_df)}"
    print(f"Type of Transformed df is: {type(transformed_df)}")
    return transformed_df

# Model training function with pipeline
def train_models(df: DataFrame):
    # Split data into features (X) and target (y)
    X = df.drop('Survived', axis=1)
    y = df['Survived']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define models and create a pipeline
    models = {
        'RandomForest': RandomForestClassifier(),
        'LogisticRegression': LogisticRegression(max_iter=1000),
        'SVC': SVC(),
        'KNeighbors': KNeighborsClassifier()
    }

    best_accuracy = 0
    best_model = None
    best_model_name = ""

    # Iterate through each model and evaluate performance using a pipeline
    for model_name, model in models.items():
        print(f"\nTraining {model_name} with pipeline...")

        # Create a pipeline that first scales data and then applies the model
        pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Feature scaling
            ('classifier', model)          # Model training
        ])

        # Train the model
        pipeline.fit(X_train, y_train)

        # Evaluate the model
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"{model_name} Accuracy: {accuracy:.4f}")

        # Save the best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = pipeline
            best_model_name = model_name

    print(f"\nBest Model: {best_model_name} with accuracy: {best_accuracy:.4f}")

    # Save the best model to a file
    joblib.dump(best_model, 'best_titanic_model_with_pipeline.pkl')
    print(f"Saving model of type: {type(best_model)}")

    return best_model

# Apply transformations and train models
df_transformed = transform_df(df)
best_model = train_models(df_transformed)

# Optional: Test output (e.g., print first few rows of the transformed data)
print("\nTransformed data preview:")
print(df_transformed.head())



Type of data before transformation: <class 'pandas.core.frame.DataFrame'>
Type of Transformed df is: <class 'pandas.core.frame.DataFrame'>

Training RandomForest with pipeline...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[col]] = df[[col]].fillna(median_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[col]] = df[[col]].fillna(median_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[col]] = df[[col]].fillna(median_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

RandomForest Accuracy: 0.7207

Training LogisticRegression with pipeline...
LogisticRegression Accuracy: 0.7318

Training SVC with pipeline...
SVC Accuracy: 0.7430

Training KNeighbors with pipeline...
KNeighbors Accuracy: 0.7095

Best Model: SVC with accuracy: 0.7430
Saving model of type: <class 'sklearn.pipeline.Pipeline'>

Transformed data preview:
    Age     Fare  Parch  Pclass  SibSp  Survived
0  22.0   7.2500      0       3      1         0
1  38.0  71.2833      0       1      1         1
2  26.0   7.9250      0       3      0         1
3  35.0  53.1000      0       1      1         1
4  35.0   8.0500      0       3      0         0


In [2]:
import pandas as pd
import math
import mlflow
import mlflow.sklearn
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib

# Load Titanic dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv?raw=True'
df = pd.read_csv(url)

# Select number columns (features)
def select_number_columns(df: DataFrame) -> DataFrame:
    return df[['Age', 'Fare', 'Parch', 'Pclass', 'SibSp', 'Survived']]

# Fill missing values with the median
def fill_missing_values_with_median(df: DataFrame) -> DataFrame:
    for col in df.columns:
        values = sorted(df[col].dropna().tolist())
        median_value = values[math.floor(len(values) / 2)]
        df[[col]] = df[[col]].fillna(median_value)
    return df

# Function to transform data
def transform_df(df: DataFrame) -> DataFrame:
    print(f"Type of data before transformation: {type(df)}")
    assert isinstance(df, pd.DataFrame), f"Input data is not a DataFrame! Type: {type(df)}"
    transformed_df = fill_missing_values_with_median(select_number_columns(df))
    assert isinstance(transformed_df, pd.DataFrame), f"Output of transform_df is not a DataFrame! Type: {type(transformed_df)}"
    print(f"Type of Transformed df is: {type(transformed_df)}")
    return transformed_df

# Model training function with pipeline
def train_models(df: DataFrame):
    # Split data into features (X) and target (y)
    X = df.drop('Survived', axis=1)
    y = df['Survived']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define models and create a pipeline
    models = {
        'RandomForest': RandomForestClassifier(),
        'LogisticRegression': LogisticRegression(max_iter=1000),
        'SVC': SVC(),
        'KNeighbors': KNeighborsClassifier()
    }

    best_accuracy = 0
    best_model = None
    best_model_name = ""

    # Initialize MLflow experiment
    mlflow.set_experiment("Titanic_Model_Experiment")
    mlflow.set_tracking_uri("http://127.0.0.1:5000")  # Update with your MLflow server URI if necessary

    # Iterate through each model and evaluate performance using a pipeline
    for model_name, model in models.items():
        print(f"\nTraining {model_name} with pipeline...")

        # Create a pipeline that first scales data and then applies the model
        pipeline = Pipeline([ 
            ('scaler', StandardScaler()),  # Feature scaling
            ('classifier', model)          # Model training
        ])

        # Start MLflow run for each model
        with mlflow.start_run(run_name=model_name):
            # Log model parameters
            mlflow.log_params(model.get_params())

            # Train the model
            pipeline.fit(X_train, y_train)

            # Evaluate the model
            y_pred = pipeline.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            print(f"{model_name} Accuracy: {accuracy:.4f}")

            # Log metrics for the model
            mlflow.log_metrics({
                'accuracy': accuracy,
            })

            # Log the trained model to MLflow
            mlflow.sklearn.log_model(pipeline, "model")

            # Save the best model
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = pipeline
                best_model_name = model_name

    print(f"\nBest Model: {best_model_name} with accuracy: {best_accuracy:.4f}")

    # Optionally save the best model locally
    joblib.dump(best_model, 'best_titanic_model_with_pipeline.pkl')
    print(f"Saving model of type: {type(best_model)}")

    return best_model

# Apply transformations and train models
df_transformed = transform_df(df)
best_model = train_models(df_transformed)

# Optional: Test output (e.g., print first few rows of the transformed data)
print("\nTransformed data preview:")
print(df_transformed.head())


Type of data before transformation: <class 'pandas.core.frame.DataFrame'>
Type of Transformed df is: <class 'pandas.core.frame.DataFrame'>

Training RandomForest with pipeline...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[col]] = df[[col]].fillna(median_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[col]] = df[[col]].fillna(median_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[col]] = df[[col]].fillna(median_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

RestException: RESOURCE_DOES_NOT_EXIST: Could not find experiment with ID 632744194449365604

In [3]:
import pandas as pd
import math
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib

# Load Titanic dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv?raw=True'
df = pd.read_csv(url)

# Select number columns (features)
def select_number_columns(df: DataFrame) -> DataFrame:
    return df[['Age', 'Fare', 'Parch', 'Pclass', 'SibSp', 'Survived']]

# Fill missing values with the median
def fill_missing_values_with_median(df: DataFrame) -> DataFrame:
    for col in df.columns:
        values = sorted(df[col].dropna().tolist())
        median_value = values[math.floor(len(values) / 2)]
        df[[col]] = df[[col]].fillna(median_value)
    return df

In [4]:
# Function to transform data
def transform_df(df: DataFrame) -> DataFrame:
    print(f"Type of data before transformation: {type(df)}")
    assert isinstance(df, pd.DataFrame), f"Input data is not a DataFrame! Type: {type(df)}"
    transformed_df = fill_missing_values_with_median(select_number_columns(df))
    assert isinstance(transformed_df, pd.DataFrame), f"Output of transform_df is not a DataFrame! Type: {type(transformed_df)}"
    print(f"Type of Transformed df is: {type(transformed_df)}")
    return transformed_df

In [5]:
# Apply transformations and train models
df_transformed = transform_df(df)
df_transformed

Type of data before transformation: <class 'pandas.core.frame.DataFrame'>
Type of Transformed df is: <class 'pandas.core.frame.DataFrame'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[col]] = df[[col]].fillna(median_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[col]] = df[[col]].fillna(median_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[col]] = df[[col]].fillna(median_value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,Survived
0,22.0,7.2500,0,3,1,0
1,38.0,71.2833,0,1,1,1
2,26.0,7.9250,0,3,0,1
3,35.0,53.1000,0,1,1,1
4,35.0,8.0500,0,3,0,0
...,...,...,...,...,...,...
886,27.0,13.0000,0,2,0,0
887,19.0,30.0000,0,1,0,1
888,28.0,23.4500,2,3,1,0
889,26.0,30.0000,0,1,0,1


In [14]:
X = df_transformed.drop('Survived', axis=1)
y = df_transformed['Survived']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
print(f'The shape of X_train is {X_train.shape}')
print(f'The shape of X_test is {X_test.shape}')

The shape of X_train is (712, 5)
The shape of X_test is (179, 5)


### Experiment 1: Train Logistic Regression Classifier

In [22]:
from sklearn.metrics import classification_report

log_reg = LogisticRegression(C=1, solver='liblinear', class_weight='balanced')
pipeline = Pipeline([
            ('scaler', StandardScaler()),  # Feature scaling
            ('classifier', log_reg)          # Model training
        ])

        # Train the model
pipeline.fit(X_train, y_train)

        # Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.74      0.76       105
           1       0.66      0.70      0.68        74

    accuracy                           0.73       179
   macro avg       0.72      0.72      0.72       179
weighted avg       0.73      0.73      0.73       179



### Experiment 2: Train Random Forest Classifier

In [24]:
rf_clf = RandomForestClassifier(n_estimators=30, max_depth=3, class_weight='balanced')

pipeline2 = Pipeline([
            ('scaler', StandardScaler()),  # Feature scaling
            ('classifier', rf_clf)          # Model training
        ])

        # Train the model
pipeline2.fit(X_train, y_train)

        # Evaluate the model
y_pred = pipeline2.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.71      0.75       105
           1       0.64      0.72      0.68        74

    accuracy                           0.72       179
   macro avg       0.71      0.72      0.71       179
weighted avg       0.72      0.72      0.72       179



### Experiment 3: Train Random Forest Classifier

In [28]:
SVC = SVC(class_weight='balanced')

pipeline3 = Pipeline([
            ('scaler', StandardScaler()),  # Feature scaling
            ('SVC', SVC)          # Model training
        ])

        # Train the model
pipeline3.fit(X_train, y_train)

        # Evaluate the model
y_pred = pipeline3.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.76      0.64      0.69       105
           1       0.58      0.72      0.64        74

    accuracy                           0.67       179
   macro avg       0.67      0.68      0.67       179
weighted avg       0.69      0.67      0.67       179



In [34]:
models = [
    (
        "LR Normal", 
        {},
        LogisticRegression(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "LR With params", 
        {"C": 1, "solver": 'liblinear'},
        LogisticRegression(C=1, solver='liblinear', class_weight='balanced'), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "RF Normal", 
        {},
        RandomForestClassifier(), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "RF With params", 
        {"n_estimators": 30, "max_depth": 3, "class_weight":'balanced'},
        RandomForestClassifier(n_estimators=30, max_depth=3, class_weight='balanced'), 
        (X_train, y_train),
        (X_test, y_test)
    ),
]


In [36]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Define models with pipelines
models = [
    (
        "LR Normal", 
        {},
        Pipeline([('scaler', StandardScaler()), ('LR', LogisticRegression())]), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "LR With params", 
        {"LR__C": 1, "LR__solver": 'liblinear'},
        Pipeline([('scaler', StandardScaler()), ('LR', LogisticRegression())]), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "RF Normal", 
        {},
        Pipeline([('scaler', StandardScaler()), ('RF', RandomForestClassifier())]), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "RF With params", 
        {"RF__n_estimators": 30, "RF__max_depth": 3, "RF__class_weight": 'balanced'},
        Pipeline([('scaler', StandardScaler()), ('RF', RandomForestClassifier())]), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "SVC With params", 
        {"SVC__C": 1, "SVC__kernel": 'linear', "SVC__class_weight": 'balanced'},
        Pipeline([('scaler', StandardScaler()), ('SVC', SVC())]), 
        (X_train, y_train),
        (X_test, y_test)
    ),
]

reports = []

for model_name, params, pipeline, (X_train, y_train), (X_test, y_test) in models:
    print(f'Model name: {model_name}')
    
    # Set model-specific parameters if provided
    if params:
        pipeline.set_params(**params)

    # Fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Generate and print the classification report
    report = classification_report(y_test, y_pred, output_dict=True)
    print(report)
    print(f'#################################\n')
    
    # Append the report to the list
    reports.append((model_name, report))

# Optionally, you can print out the reports
for model_name, report in reports:
    print(f"Report for {model_name}:")
    print(report)
    print(f"###############################")


Model name: LR Normal
{'0': {'precision': 0.7175572519083969, 'recall': 0.8952380952380953, 'f1-score': 0.7966101694915254, 'support': 105.0}, '1': {'precision': 0.7708333333333334, 'recall': 0.5, 'f1-score': 0.6065573770491803, 'support': 74.0}, 'accuracy': 0.7318435754189944, 'macro avg': {'precision': 0.7441952926208651, 'recall': 0.6976190476190476, 'f1-score': 0.7015837732703529, 'support': 179.0}, 'weighted avg': {'precision': 0.7395820006539015, 'recall': 0.7318435754189944, 'f1-score': 0.7180408586494387, 'support': 179.0}}
#################################

Model name: LR With params
{'0': {'precision': 0.7175572519083969, 'recall': 0.8952380952380953, 'f1-score': 0.7966101694915254, 'support': 105.0}, '1': {'precision': 0.7708333333333334, 'recall': 0.5, 'f1-score': 0.6065573770491803, 'support': 74.0}, 'accuracy': 0.7318435754189944, 'macro avg': {'precision': 0.7441952926208651, 'recall': 0.6976190476190476, 'f1-score': 0.7015837732703529, 'support': 179.0}, 'weighted avg':

In [39]:
import mlflow

# Set or create experiment if necessary
mlflow.set_experiment('Titanic_Survival_Prediction')

2025/02/06 22:23:08 INFO mlflow.tracking.fluent: Experiment with name 'Titanic_Survival_Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/191165618774552540', creation_time=1738869788246, experiment_id='191165618774552540', last_update_time=1738869788246, lifecycle_stage='active', name='Titanic_Survival_Prediction', tags={}>

In [None]:
import mlflow
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Initialize MLflow
mlflow.set_experiment("Titanics Detection")
mlflow.set_tracking_uri("http://127.0.0.1:5000")

for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    pipeline = element[2]
    (X_train, y_train), (X_test, y_test) = element[3], element[4]
    
    with mlflow.start_run(run_name=model_name):
        # Set model-specific parameters
        if params:
            pipeline.set_params(**params)
        
        # Fit the model pipeline
        pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_pred = pipeline.predict(X_test)
        
        # Generate classification report
        report = classification_report(y_test, y_pred, output_dict=True)
        
        # Log the parameters
        mlflow.log_params(params)
        
        # Log the metrics (handling edge cases for class imbalance)
        metrics = {
            'accuracy': report['accuracy'],
            'precision_class_1': report['1']['precision'] if '1' in report else 0,
            'precision_class_0': report['0']['precision'] if '0' in report else 0,
            'recall_class_1': report['1']['recall'] if '1' in report else 0,
            'recall_class_0': report['0']['recall'] if '0' in report else 0,
            'f1_score_class_1': report['1']['f1-score'] if '1' in report else 0,
            'f1_score_class_0': report['0']['f1-score'] if '0' in report else 0,
            'f1_score_macro': report['macro avg']['f1-score']
        }
        mlflow.log_metrics(metrics)
        
        # Log the model based on type
        if "XGB" in model_name:
            mlflow.xgboost.log_model(pipeline, "model")
        else:
            mlflow.sklearn.log_model(pipeline, "model")


2025/02/06 22:25:29 INFO mlflow.tracking.fluent: Experiment with name 'Titanics Detections' does not exist. Creating a new experiment.




🏃 View run LR Normal at: http://127.0.0.1:5000/#/experiments/521150557808231275/runs/9f666a34ebdd469db61b226a21040f5b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/521150557808231275




🏃 View run LR With params at: http://127.0.0.1:5000/#/experiments/521150557808231275/runs/315190e4e3984b17931b377b32b8d067
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/521150557808231275




🏃 View run RF Normal at: http://127.0.0.1:5000/#/experiments/521150557808231275/runs/41f99c19243f4e7782c6dac506b1b748
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/521150557808231275




🏃 View run RF With params at: http://127.0.0.1:5000/#/experiments/521150557808231275/runs/8c4d2ecbaf684506ae542ca426f69ac3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/521150557808231275




🏃 View run SVC With params at: http://127.0.0.1:5000/#/experiments/521150557808231275/runs/c1548e7474d2477da6b75312ae698858
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/521150557808231275


In [42]:
model_name = 'LR With params'  
run_id = input('Please type RunID: ')  

# Form the model URI from the run ID
model_uri = f'runs:/{run_id}/model' 

# Register the model
with mlflow.start_run(run_id=run_id):
    mlflow.register_model(model_uri=model_uri, name=model_name)

print(f"Model registered with name: {model_name} and run ID: {run_id}")


Successfully registered model 'LR With params'.
2025/02/06 22:28:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LR With params, version 1


🏃 View run LR With params at: http://127.0.0.1:5000/#/experiments/521150557808231275/runs/315190e4e3984b17931b377b32b8d067
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/521150557808231275
Model registered with name: LR With params and run ID: 315190e4e3984b17931b377b32b8d067


Created version '1' of model 'LR With params'.


### Load the Model

In [43]:
# After registration, use the model name and version to load the model
model_version = 1

# model_uri = f"models:/{model_name}/{model_version}"
model_uri = f"models:/{model_name}/{model_version}"

# Load the registered model
loaded_model = mlflow.sklearn.load_model(model_uri)

# Make predictions
y_pred = loaded_model.predict(X_test)

# Display the first 4 predictions
print("First 4 predictions:", y_pred[:4])

First 4 predictions: [0 0 0 1]


### Offline Evaluation

In [44]:
# Get predicted probabilities
y_pred_proba = loaded_model.predict_proba(X_test)

# Show the first 4 probabilities (for class 1)
print("First 4 probabilities for class 1:", y_pred_proba[:4, 1])


First 4 probabilities for class 1: [0.24568529 0.40657272 0.29246188 0.67636816]


In [45]:
# Evaluate the model (e.g., using accuracy and classification report)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate a classification report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Accuracy: 0.7318435754189944
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.90      0.80       105
           1       0.77      0.50      0.61        74

    accuracy                           0.73       179
   macro avg       0.74      0.70      0.70       179
weighted avg       0.74      0.73      0.72       179



In [46]:
# Save the classification report to a file
report_filename = "classification_report.txt"
with open(report_filename, "w") as f:
    f.write(report)

# Log the metrics and report to MLflow
with mlflow.start_run():
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_artifact(report_filename)  

# Optionally, print the report
print("Classification Report:")
print(report)

🏃 View run luxuriant-fowl-680 at: http://127.0.0.1:5000/#/experiments/521150557808231275/runs/b191600ea976452d961d85f2e7e3ec15
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/521150557808231275
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.90      0.80       105
           1       0.77      0.50      0.61        74

    accuracy                           0.73       179
   macro avg       0.74      0.70      0.70       179
weighted avg       0.74      0.73      0.72       179

