In [53]:
import os
import joblib  # For loading the saved model
import whylogs as why
import pandas as pd
import warnings
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [54]:
# Configure WhyLabs API
os.environ["WHYLABS_DEFAULT_ORG_ID"] = "org-jAKdPA" # ORG-ID is case sensitive
os.environ["WHYLABS_API_KEY"] = "UIn56UYSLA.BtGRwJIMbbzlHp0oYyGMGB9mauslM1MOHD9zmo9Lsl0nr1elwgs9W:org-jAKdPA"
os.environ["WHYLABS_DEFAULT_DATASET_ID"] = "model-2" # The selected project "mi_fatality_prediction (model-2)" is "model-2"

In [55]:
training_data_path = "../../data/raw_data.csv"
new_data_path = "../../data/new_data.csv"
training_profile_path = "../../data/training_data_profile.bin"
new_data_profile_path = "../../data/new_data_profile.bin"
model_path = "../../model/best_model.joblib"

In [56]:
def process_data(training_data_path):
    data = pd.read_csv(training_data_path)
    print(f"Dataset loaded: {data.shape[0]} rows, {data.shape[1]} columns.")

    # Remove columns with more than `missing_threshold` missing values
    missing_fraction = data.isnull().mean()
    cols_to_drop = missing_fraction[missing_fraction > 0.5].index
    data = data.drop(columns=cols_to_drop)
    print(f"Removed {len(cols_to_drop)} columns with more than 50% missing values.")

    # Reduce to binary classification
    data['output'] = data['output'].apply(lambda x: 1 if x > 1 else x)
    print(f"Reduced target column to binary classification.")

    # Separate features and target
    X = data.drop('output', axis=1)
    y = data['output']

    # Impute missing values in remaining columns
    imputer = SimpleImputer(strategy="mean")  # Replace missing values with column mean
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    print("Missing values imputed.")

    # Oversample the minority class using SMOTE
    smote = SMOTE(random_state=42)
    data, y_resampled = smote.fit_resample(X_imputed, y)
    print("Oversampled the minority class using SMOTE.")
    data['output'] = y_resampled
    return data

In [57]:
data = process_data(training_data_path)

Dataset loaded: 1700 rows, 112 columns.
Removed 4 columns with more than 50% missing values.
Reduced target column to binary classification.
Missing values imputed.
Oversampled the minority class using SMOTE.


  data['output'] = y_resampled


In [58]:
data.head()

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,...,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,output
0,77.0,1.0,2.0,1.0,1.0,2.0,3.0,0.0,7.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
1,55.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0
2,52.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,...,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0
3,68.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,3.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0
4,60.0,1.0,0.0,0.0,0.0,2.0,3.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0


In [59]:
# Log and Save Training Data Profile
def log_training_data(training_data_path):
    # Load training data
    training_data = process_data(training_data_path)

    # Generate a data profile
    training_profile = why.log(pandas=training_data)

    # Save the training profile locally
    training_profile_path = "../../data/training_data_profile.bin"
    training_profile.writer("local").write(training_profile_path)

    # Upload the profile to WhyLabs
    training_profile.writer("whylabs").write()

    return training_profile, training_profile_path

In [60]:
training_profile, training_profile_pat = log_training_data(training_data_path)

Dataset loaded: 1700 rows, 112 columns.
Removed 4 columns with more than 50% missing values.
Reduced target column to binary classification.
Missing values imputed.
Oversampled the minority class using SMOTE.


  data['output'] = y_resampled


In [61]:
def log_new_data(new_data_path):

    # loan and preprocess new data
    new_data = process_data(new_data_path)

    ground_truth = new_data['output']
    new_data = new_data.drop("output", axis = 1)
    

    # Generate a data profile
    new_data_profile = why.log(pandas=new_data)

    # Save the training profile locally
    new_data_profile_path = "../../data/new_data_profile.bin"
    new_data_profile.writer("local").write(new_data_profile_path)

    # Upload the profile to WhyLabs
    new_data_profile.writer("whylabs").write()

    return new_data_profile, new_data_profile_path, new_data, ground_truth

In [62]:
new_data_profile, new_data_profile_path, new_data, ground_truth = log_new_data(new_data_path)

Dataset loaded: 1700 rows, 112 columns.
Removed 4 columns with more than 50% missing values.
Reduced target column to binary classification.
Missing values imputed.
Oversampled the minority class using SMOTE.


  data['output'] = y_resampled


In [63]:
# compare training and new data profile
def compare_data(training_profile_path, new_data_profile_path):

    # Load the training profile for comparison
    training_profile = why.read(training_profile_path)

    # load new data profile for comparison
    new_data_profile = why.read(new_data_profile_path)

    # Compare the two profiles
    comparison_report = training_profile.merge(new_data_profile)
    print("Comparison Report:\n", comparison_report)

    return training_profile, new_data_profile

In [64]:
training_profile, new_data_profile = compare_data(training_profile_path, new_data_profile_path)

Comparison Report:
 <whylogs.api.logger.result_set.ViewResultSet object at 0x0000027E2567D2A0>


In [65]:
model = joblib.load(model_path)

In [66]:
# Step 3: Predict and Log Model Outputs
def log_predictions(new_data, model):
    # Predict labels and confidence scores
    prediction = model.predict(new_data)
    confidence = model.predict_proba(new_data).max(axis=1)
    results = pd.DataFrame({
        "prediction": prediction,
        "confidence": confidence
    }, index=new_data.index)

    new_data = pd.concat([new_data, results], axis=1)
        
    # Log the data with predictions
    prediction_profile = why.log(pandas=new_data)

    # Upload the predictions profile to WhyLabs
    prediction_profile.writer("whylabs").write()

    return new_data

In [67]:
new_data = log_predictions(new_data, model)

In [68]:
new_data.head()

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,...,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,prediction,confidence
0,77.0,1.0,2.0,1.0,1.0,2.0,3.0,0.0,7.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0,1.0
1,55.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0,0.965
2,52.0,1.0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,...,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0,1.0
3,68.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,3.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0,1.0
4,60.0,1.0,0.0,0.0,0.0,2.0,3.0,0.0,7.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0,1.0
