In [6]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

data_path = "..//data//diabetes.csv"

df = pd.read_csv(data_path)

In [7]:
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler 

X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_features = X.columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler())
]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features)
    ]
)

In [8]:
from sklearn.ensemble import RandomForestClassifier 

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
    
])

pipeline.fit(X_train, y_train)

In [9]:
from sklearn.metrics import classification_report 

y_pred = pipeline.predict(X_test)

# model evaluation 
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154



In [10]:
# save the model 
import pickle 

with open("..//models//pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

In [11]:
import requests 

url = "http://localhost:8000/predict"

data = {
    "data" : [[1, 85, 66, 29, 0, 26.6, 0.351, 31]]
}

response = requests.post(url, json=data)
print(response.json())

ConnectionError: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /predict (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000029133377280>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [12]:
import numpy as np

def introduce_drift(data, drift_features, drift_amount=0.1, random_seed=42):
    np.random.seed(random_seed)
    drifted_data = data.copy()
    
    for feature in drift_features:
        if feature in data.columns:
            drifted_data[feature] += np.random.normal(loc=0, scale=drift_amount, size=data.shape[0])
    
    return drifted_data
    
features_to_drift = ['Glucose', 'BloodPressure', 'SkinThickness', 'Pregnancies']

drifted_data = introduce_drift(X_test, features_to_drift, drift_amount=50)
drifted_data = drifted_data.reset_index(drop = True)

In [14]:
drifted_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               154 non-null    float64
 1   Glucose                   154 non-null    float64
 2   BloodPressure             154 non-null    float64
 3   SkinThickness             154 non-null    float64
 4   Insulin                   154 non-null    int64  
 5   BMI                       154 non-null    float64
 6   DiabetesPedigreeFunction  154 non-null    float64
 7   Age                       154 non-null    int64  
dtypes: float64(6), int64(2)
memory usage: 9.8 KB


In [17]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 154 entries, 668 to 462
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               154 non-null    int64  
 1   Glucose                   154 non-null    int64  
 2   BloodPressure             154 non-null    int64  
 3   SkinThickness             154 non-null    int64  
 4   Insulin                   154 non-null    int64  
 5   BMI                       154 non-null    float64
 6   DiabetesPedigreeFunction  154 non-null    float64
 7   Age                       154 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 10.8 KB


In [18]:
reference_data = X_test.copy()

reference_data['Outcome'] = y_train.reset_index(drop = True)
drifted_data['Outcome'] = y_test.reset_index(drop = True)

drifted_data.to_csv('..//data//new_data.csv', index=False)
reference_data.to_csv('..//data//reference_data.csv', index=False)

In [19]:
from evidently.metric_preset import DataDriftPreset
from evidently.report import Report

data_drift_report = Report(metrics=[
    DataDriftPreset(),
])

data_drift_report.run(current_data=drifted_data.drop('Outcome', axis =1), 
reference_data=reference_data.drop('Outcome', axis =1), column_mapping=None)
report_json = data_drift_report.as_dict()
drift_detected = report_json['metrics'][0]['result']['dataset_drift'] 

In [20]:
print(drift_detected)

True
