In [0]:
import os

import evidently
import pandas as pd
import xgboost as xgb
from evidently.presets import DataDriftPreset
from evidently.sdk.models import PanelMetric
from evidently.sdk.panels import DashboardPanelPlot
from evidently.ui.workspace import CloudWorkspace
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

### Data Preparation

In [0]:
# Load the raw dataset; the file is decoded as latin1 and parsed with pandas'
# Python engine. The last expression previews the first rows.
df = pd.read_csv(
    'cancer_reg.csv',
    encoding='latin1',
    engine='python',
)
df.head()

In [0]:
# Summary statistics for the columns of df, used as a quick sanity check of value ranges.
df.describe()

In [0]:
# Count of missing (NA) values in each column of df.
df.isna().sum()

Based on the size of the dataset, we're better off just removing the columns with NA values rather than removing rows with NA values (not enough rows would be left over for training/testing). The two non-numerical features will not provide additional power in predicting death rates, so we remove those as well. We also assume no data cleaning/engineering is required since this dataset was already pre-processed and aggregated.

In [0]:
# Columns left out of the feature matrix.
cols_to_remove = [
    'PctSomeCol18_24',
    'PctEmployed16_Over',
    'PctPrivateCoverageAlone',
    'Geography',
    'binnedInc',
]

# Target is the death rate; features are every remaining column except the
# excluded ones and the target itself.
y = df['TARGET_deathRate']
X = df.drop(columns=cols_to_remove + ['TARGET_deathRate'])

# 80/20 train/test split with a pinned seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

### Data Modeling

In [0]:
# Hyperparameters for the XGBoost regressor; random_state is pinned for reproducibility.
params = {
    'objective': 'reg:squarederror',
    'max_depth': 5,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'random_state': 24
}

# Fit and predict on the feature columns only. This cell adds a 'prediction'
# column to X_train/X_test at the bottom, so on a re-run that column is already
# present and must not be fed back into the model as a feature.
feature_cols = [col for col in X_train.columns if col != 'prediction']

xgb_model = xgb.XGBRegressor(**params)
xgb_model.fit(X_train[feature_cols], y_train)

baseline_preds = xgb_model.predict(X_train[feature_cols])
test0_preds = xgb_model.predict(X_test[feature_cols])

# RMSE is computed as sqrt(MSE). The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly works on every version.
rmse_0 = mean_squared_error(y_test, test0_preds) ** 0.5
r2_0 = r2_score(y_test, test0_preds)

# Attach the predictions so the drift report can track them alongside the
# features. `assign` returns new frames (overwriting any existing 'prediction'
# column), which avoids writing into the frames returned by train_test_split.
X_train = X_train.assign(prediction=baseline_preds)
X_test = X_test.assign(prediction=test0_preds)

In [0]:
# Report the baseline test-set metrics, one per line.
for label, value in (("RMSE: ", rmse_0), ("R^2: ", r2_0)):
    print(label, value)

### Evidently - Model Monitoring

In [0]:
# Read the Evidently Cloud API token from the environment rather than embedding
# it in the notebook, where anyone with access to the file could use it.
# NOTE(review): a token that has ever been saved in a shared notebook should be
# treated as leaked and revoked in the Evidently Cloud UI.
ev_token = os.environ.get('EVIDENTLY_API_KEY')
if not ev_token:
    raise RuntimeError(
        "Set the EVIDENTLY_API_KEY environment variable to your Evidently Cloud API token."
    )

# The organization id can be overridden via the environment; the default is the
# id this notebook has always used.
org_id = os.environ.get('EVIDENTLY_ORG_ID', '019848b2-2e4a-79c3-82f9-746d245392f0')

ws = CloudWorkspace(token=ev_token, url="https://app.evidently.cloud")

# NOTE: re-running this cell calls create_project again; use ws.get_project
# with an existing project id to reuse a project instead.
project = ws.create_project("assignment4", org_id=org_id)
project.description = "Model Monitoring Exercise"
project.save()

In [0]:
# All columns of X_train are declared as numerical for Evidently.
all_cols = X_train.columns.to_list()

# X_train normally already carries the 'prediction' column added after model
# fitting, so append it only when it is missing; appending unconditionally
# would list 'prediction' twice in the data definition.
if 'prediction' not in all_cols:
    all_cols.append('prediction')

schema = evidently.DataDefinition(
    numerical_columns=all_cols
)

# Wrap the train (reference) and unmodified test frames as Evidently datasets
# sharing the same data definition.
ev_train_data = evidently.Dataset.from_pandas(
    X_train,
    data_definition=schema
)

ev_test0_data = evidently.Dataset.from_pandas(
    X_test,
    data_definition=schema
)

In [0]:
# Drift report built from a single data-drift preset. drift_share=0.1 is passed
# to the preset (presumably the share of drifted columns at which the dataset
# as a whole is flagged — confirm against the Evidently docs).
drift_preset = DataDriftPreset(drift_share=0.1)
drift_report = evidently.Report([drift_preset])

# Baseline run: unmodified test data is passed first, training data second.
run0 = drift_report.run(ev_test0_data, ev_train_data)

In [0]:
# To upload into an already-existing project instead of the `project` object
# created earlier, uncomment the two lines below and set that project's id.
# project_id = "01984a6e-1fb9-7f5a-80c7-e6fcb0d637b0"
# project = ws.get_project(project_id)
# Upload the baseline run to the project. include_data=False is passed through
# to add_run (presumably so the underlying datasets are not uploaded — confirm).
ws.add_run(project.id, run0, include_data=False)

### Change A

In [0]:
# Change A: lower medIncome by 40,000 in the test features and re-score the model.
# NOTE: X_test is rebound and modified here, so the shift stays in X_test for
# any later cell that reads it, and re-running this cell applies the shift again.
X_test = X_test.drop(columns=['prediction'])
X_test['medIncome'] = X_test['medIncome'] - 40000
test1_preds = xgb_model.predict(X_test)

# RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse1 = mean_squared_error(y_test, test1_preds) ** 0.5
r2_1 = r2_score(y_test, test1_preds)

# Re-attach predictions so the drift report also covers the prediction column.
X_test['prediction'] = test1_preds
ev_test1_data = evidently.Dataset.from_pandas(
    X_test,
    data_definition=schema
)

# Run the drift report with the shifted test data first and the training data
# second, then upload the run to the project.
run1 = drift_report.run(ev_test1_data, ev_train_data)
ws.add_run(project.id, run1, include_data=False)

In [0]:
# Report the metrics obtained after Change A, one per line.
for label, value in (("RMSE: ", rmse1), ("R^2: ", r2_1)):
    print(label, value)

### Change B

In [0]:
# Change B: raise povertyPercent by 20 in the test features and re-score the model.
# NOTE: X_test is rebound and modified here, so this shift is applied on top of
# whatever X_test already contains, and re-running this cell applies it again.
X_test = X_test.drop(columns=['prediction'])
X_test['povertyPercent'] = X_test['povertyPercent'] + 20
test2_preds = xgb_model.predict(X_test)

# RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse2 = mean_squared_error(y_test, test2_preds) ** 0.5
r2_2 = r2_score(y_test, test2_preds)

# Re-attach predictions so the drift report also covers the prediction column.
X_test['prediction'] = test2_preds
ev_test2_data = evidently.Dataset.from_pandas(
    X_test,
    data_definition=schema
)

# Run the drift report with the shifted test data first and the training data
# second, then upload the run to the project.
run2 = drift_report.run(ev_test2_data, ev_train_data)
ws.add_run(project.id, run2, include_data=False)

In [0]:
# Report the metrics obtained after Change B, one per line.
for label, value in (("RMSE: ", rmse2), ("R^2: ", r2_2)):
    print(label, value)

### Change C

In [0]:
# Change C: raise AvgHouseholdSize by 2 in the test features and re-score the model.
# NOTE: X_test is rebound and modified here, so this shift is applied on top of
# whatever X_test already contains, and re-running this cell applies it again.
X_test = X_test.drop(columns=['prediction'])
X_test['AvgHouseholdSize'] = X_test['AvgHouseholdSize'] + 2
test3_preds = xgb_model.predict(X_test)

# RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse3 = mean_squared_error(y_test, test3_preds) ** 0.5
r2_3 = r2_score(y_test, test3_preds)

# Re-attach predictions so the drift report also covers the prediction column.
X_test['prediction'] = test3_preds
ev_test3_data = evidently.Dataset.from_pandas(
    X_test,
    data_definition=schema
)

# Run the drift report with the shifted test data first and the training data
# second, then upload the run to the project.
run3 = drift_report.run(ev_test3_data, ev_train_data)
ws.add_run(project.id, run3, include_data=False)

In [0]:
# Report the metrics obtained after Change C, one per line.
for label, value in (("RMSE: ", rmse3), ("R^2: ", r2_3)):
    print(label, value)

In [0]:
# Panel 1: line plot of the share of drifted columns (features plus prediction).
drifted_share_metric = PanelMetric(
    legend="Share",
    metric="DriftedColumnsCount",
    metric_labels={"value_type": "share"},
)
drifted_share_panel = DashboardPanelPlot(
    title="Feature + Prediction column drift",
    subtitle="Share of drifted columns",
    size="half",
    values=[drifted_share_metric],
    plot_params={"plot_type": "line"},
)
project.dashboard.add_panel(drifted_share_panel, tab="Data Drift")

# Panel 2: bar plot of the drift score of the `prediction` column.
# NOTE(review): the subtitle names Jensen-Shannon distance, but the drift
# report in this notebook does not set a drift method explicitly — confirm
# which method Evidently applies to the numerical `prediction` column and
# adjust the subtitle if it differs.
prediction_drift_metric = PanelMetric(
    legend="Drift score",
    metric="ValueDrift",
    metric_labels={"column": "prediction"},
)
prediction_drift_panel = DashboardPanelPlot(
    title="Prediction drift",
    subtitle="Drift in the prediction column, method: Jensen-Shannon distance",
    size="half",
    values=[prediction_drift_metric],
    plot_params={"plot_type": "bar"},
)
project.dashboard.add_panel(prediction_drift_panel, tab="Data Drift")