In [1]:
#!pip install evidently

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from evidently import Dataset, DataDefinition
from evidently import Report
from evidently.presets import DataDriftPreset
from evidently.ui.workspace import CloudWorkspace
from dotenv import load_dotenv
import os
from evidently import Report
from evidently.presets import DataDriftPreset


In [3]:
# 1. Load the raw CSV and reduce it to the modelling frame in one chain.
df = (
    pd.read_csv('cancer_reg.csv', encoding='utf-8', encoding_errors='ignore')
    # These two columns are excluded from the feature set (non-numeric / not useful).
    .drop(columns=["Geography", "binnedInc"])
    # For simplicity, any row with a missing value is discarded.
    .dropna()
)

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,avgAnnCount,avgDeathsPerYear,TARGET_deathRate,incidenceRate,medIncome,popEst2015,povertyPercent,studyPerCap,MedianAge,MedianAgeMale,...,PctPrivateCoverageAlone,PctEmpPrivCoverage,PctPublicCoverage,PctPublicCoverageAlone,PctWhite,PctBlack,PctAsian,PctOtherRace,PctMarriedHouseholds,BirthRate
1,173.0,70,161.3,411.6,48127,43269,18.6,23.111234,33.0,32.2,...,53.8,43.6,31.1,15.3,89.228509,0.969102,2.246233,3.741352,45.3725,4.333096
3,427.0,202,194.8,430.4,44243,75882,17.1,342.637253,42.8,42.2,...,40.3,35.0,45.3,25.0,91.744686,0.782626,1.161359,1.362643,51.021514,4.603841
4,57.0,26,144.4,350.1,49955,10321,12.5,0.0,48.3,47.8,...,43.9,35.1,44.0,22.7,94.104024,0.270192,0.66583,0.492135,54.02746,6.796657
7,146.0,71,183.6,404.0,40189,20848,17.8,0.0,51.7,50.8,...,33.1,25.9,50.9,24.1,89.406636,0.305159,1.889077,2.286268,48.967033,5.889179
14,2265.0,901,171.0,440.7,50083,490945,16.3,462.373586,37.2,35.7,...,50.6,42.5,36.5,21.4,89.038167,1.827041,2.315986,1.033625,48.188377,5.355836


In [4]:

# Target series first, then every remaining column becomes a feature.
y = df["TARGET_deathRate"]
X = df.drop(columns=[y.name])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Fit a seeded random forest on the training split (fit returns the estimator).
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Baseline error on the untouched test split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Initial Test MSE:", mse)

Initial Test MSE: 541.1440000252096


In [6]:
# Scenario A: lower median income by 40,000 on a copy of the test features.
X_test_A = X_test.assign(medIncome=X_test["medIncome"] - 40000)

# Scenario A+B: on top of A, raise the poverty percentage by 20.
X_test_AB = X_test_A.assign(povertyPercent=X_test_A["povertyPercent"] + 20)

# Scenario A+B+C: on top of A+B, raise the average household size by 2.
X_test_ABC = X_test_AB.assign(
    AvgHouseholdSize=X_test_AB["AvgHouseholdSize"] + 2
)

In [7]:
def evaluate_model(X_modified, y_true, label, estimator=None):
    """Predict on a feature set, print the resulting MSE, and return the predictions.

    Parameters
    ----------
    X_modified
        Feature data handed to ``estimator.predict``.
    y_true
        Ground-truth targets the predictions are scored against.
    label
        Text printed in front of ``" Test MSE:"`` to identify the scenario.
    estimator
        Optional object exposing ``predict``. When omitted, the notebook-level
        ``model`` is used, which keeps existing three-argument calls working.

    Returns
    -------
    The predictions exactly as returned by ``estimator.predict``.
    """
    if estimator is None:
        # Fall back to the notebook-level fitted model so older calls still work.
        estimator = model
    y_pred = estimator.predict(X_modified)
    mse = mean_squared_error(y_true, y_pred)
    print(f"{label} Test MSE:", mse)
    return y_pred

# Score each cumulative drift scenario in order (A, then A+B, then A+B+C)
# against the same held-out targets, keeping one prediction array per scenario.
y_pred_A, y_pred_AB, y_pred_ABC = (
    evaluate_model(features, y_test, label)
    for features, label in (
        (X_test_A, "A (medianIncome - 40k)"),
        (X_test_AB, "A+B (povertyPercent +20)"),
        (X_test_ABC, "A+B+C (AvgHouseholdSize +2)"),
    )
)


A (medianIncome - 40k) Test MSE: 572.3218530672266
A+B (povertyPercent +20) Test MSE: 582.0083268571423
A+B+C (AvgHouseholdSize +2) Test MSE: 561.8297985042012


In [8]:
# Schema: every feature column is declared as numerical.
schema = DataDefinition(numerical_columns=X.columns.tolist())

# Wrap the untouched test features (reference) and each perturbed variant
# (production) as Evidently datasets sharing the same schema.
ref_dataset, prod_dataset_A, prod_dataset_AB, prod_dataset_ABC = (
    Dataset.from_pandas(frame, data_definition=schema)
    for frame in (X_test, X_test_A, X_test_AB, X_test_ABC)
)

In [9]:
def _drift_snapshot(current, html_path):
    """Run a data-drift report for ``current`` against ``ref_dataset`` and save it as HTML.

    Returns the ``(report, snapshot)`` pair so callers can keep both objects.
    """
    report = Report([DataDriftPreset()])
    snapshot = report.run(current, ref_dataset)
    snapshot.save_html(html_path)
    return report, snapshot


# One local HTML report per cumulative drift scenario.
report_A, snapshot_A = _drift_snapshot(prod_dataset_A, "report_A.html")
report_AB, snapshot_AB = _drift_snapshot(prod_dataset_AB, "report_AB.html")
report_ABC, snapshot_ABC = _drift_snapshot(prod_dataset_ABC, "report_ABC.html")


In [10]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Read the API token. The notebook's own variable name is tried first;
# EVIDENTLY_API_KEY is accepted as an alternative
# (NOTE(review): believed to be the name Evidently's docs use - confirm).
api_token = os.getenv("evidently_api_token") or os.getenv("EVIDENTLY_API_KEY")
if not api_token:
    # Fail here with an actionable message instead of passing None downstream.
    raise RuntimeError(
        "No Evidently API token found: set 'evidently_api_token' "
        "(or 'EVIDENTLY_API_KEY') in the environment or in .env."
    )

# Connect to Evidently Cloud
ws = CloudWorkspace(token=api_token, url="https://app.evidently.cloud")

# Organisation that owns the project. Set EVIDENTLY_ORG_ID to use your own UUID
# from Evidently Cloud; the previous hard-coded value remains the default.
org_id = os.getenv("EVIDENTLY_ORG_ID") or "01987ddc-8e10-753d-8391-0b92642e5a4c"

# NOTE: this creates a project on every execution of the cell.
project = ws.create_project("Cancer Death Rate Prediction", org_id=org_id)

# Run one drift report per scenario and upload each snapshot (without raw data).
uploaded_runs = []
for current_dataset in (prod_dataset_A, prod_dataset_AB, prod_dataset_ABC):
    drift_report = Report([DataDriftPreset()])
    drift_snapshot = drift_report.run(current_dataset, ref_dataset)
    ws.add_run(project.id, drift_snapshot, include_data=False)
    uploaded_runs.append((drift_report, drift_snapshot))

# Keep the per-scenario names bound so later cells can still refer to them.
(
    (report_A, snapshot_A),
    (report_AB, snapshot_AB),
    (report_ABC, snapshot_ABC),
) = uploaded_runs