# Evidently Report

In [None]:
import requests
import datetime
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

from tqdm import tqdm

from optbinning import Scorecard, BinningProcess

In [None]:
# Numerical input columns: passed to the scorecard for prediction and
# declared to Evidently as the numerical features to monitor.
num_features = [
    "AverageMInFile",
    "MSinceMostRecentInqexcl7days",
    "PercentTradesNeverDelq",
    "ExternalRiskEstimate",
    "NetFractionRevolvingBurden",
    "NumSatisfactoryTrades",
    "PercentInstallTrades",
]

In [None]:
# Column roles handed to Evidently: label, model output and numerical inputs.
# No categorical features are declared.
# NOTE(review): the frames built later in this notebook keep only the feature
# columns plus `prediction`, so `RiskPerformance` may be absent from the data
# the report sees - confirm this is intended.
column_mapping = ColumnMapping(
    target="RiskPerformance",
    prediction="prediction",
    numerical_features=num_features,
)

In [None]:
# Metrics are listed in a fixed order because the result dictionary is read
# by position further down: [0] prediction drift, [1] dataset drift,
# [2] missing values.
report = Report(
    metrics=[
        ColumnDriftMetric(column_name="prediction"),
        DatasetDriftMetric(),
        DatasetMissingValuesMetric(),
    ],
)

In [None]:
# Name of the label column (same value as the `target` of the column mapping).
# NOTE(review): not referenced by any other cell visible in this notebook.
TARGET = "RiskPerformance"

In [None]:
# Load the fitted scorecard used to produce the `prediction` column.
# NOTE(review): absolute, machine-specific path. The `.pkl` suffix suggests a
# pickle-based artifact - only load files from a trusted source.
model = Scorecard.load("/home/fini/Learning/CreditRisk/crm-simple-scorecard/mlops-model/data/artifacts/rfecv/scorecard-model.pkl")

In [None]:

def get_data(base_path=None):
    """Load the train and validation splits restricted to the model features.

    Args:
        base_path: Directory containing ``X_train.parquet``,
            ``y_train.parquet``, ``X_val.parquet`` and ``y_val.parquet``.
            When omitted, the original hard-coded project directory is used.

    Returns:
        A ``(train, val)`` tuple of DataFrames containing only the columns
        listed in ``num_features``.
    """
    if base_path is None:
        base_path = "/home/fini/Learning/CreditRisk/crm-simple-scorecard/mlops-model/data"

    x_train = pd.read_parquet(f"{base_path}/X_train.parquet")
    y_train = pd.read_parquet(f"{base_path}/y_train.parquet")
    x_val = pd.read_parquet(f"{base_path}/X_val.parquet")
    y_val = pd.read_parquet(f"{base_path}/y_val.parquet")

    # Attach the label to each split.
    train = x_train.assign(RiskPerformance=y_train.values)
    val = x_val.assign(RiskPerformance=y_val.values)

    # NOTE(review): selecting only `num_features` drops the RiskPerformance
    # column that was just attached - confirm whether the label should be
    # part of the returned frames.
    return train[num_features], val[num_features]


train_data, val_data = get_data()

Create drift in the data: inject missing values, then add Gaussian noise

In [None]:
def add_drift(data, p=0.1):
    """Return a copy of ``data`` with a random share of cells set to missing.

    Every cell is blanked independently with probability ``p``, so the
    realised share of missing values only approximates ``p``.

    Args:
        data: DataFrame to corrupt. It is not modified.
        p: Probability, in ``[0, 1]``, that any single cell becomes missing.

    Returns:
        A new DataFrame with the same shape, index and columns as ``data``.

    Raises:
        ValueError: If ``p`` is outside ``[0, 1]``.
    """
    # Imported locally: this cell sits before the notebook's top-level
    # `import numpy as np`.
    import numpy as np

    if not 0.0 <= p <= 1.0:
        raise ValueError(f"p must be between 0 and 1, got {p!r}")

    # True marks a cell to blank out.
    mask = np.random.choice([True, False], size=data.shape, p=[p, 1 - p])
    return data.mask(mask)
  

In [None]:
# Replace both splits with versions that contain randomly injected missing
# values; everything below works on these corrupted frames.
train_data, val_data = add_drift(train_data), add_drift(val_data)

In [None]:
# Add Gaussian noise to some of the data.
import numpy as np 
def add_gaussian_noise(df):
    """Return a copy of ``df`` with zero-mean Gaussian noise added.

    The noise for each numeric column is drawn from ``N(0, sigma)`` where
    ``sigma`` is that column's sample standard deviation. Non-numeric
    columns are left untouched.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the same shape, index and columns as ``df``.
    """
    noisy = df.copy()
    # Look the standard deviation up by name instead of relying on it being
    # the third row of `describe()`, and restrict to numeric columns so that
    # other dtypes no longer raise KeyError.
    sigmas = noisy.select_dtypes(include="number").std()
    for column, sigma in sigmas.items():
        # One noise value per row of the column.
        noise = np.random.normal(0, sigma, noisy[column].shape)
        noisy[column] = noisy[column] + noise

    return noisy

def noisy_data(df):
    """Return a copy of ``df`` whose numeric columns are pure Gaussian samples.

    Each numeric column is replaced by values drawn from ``N(mean, sigma)``
    using that column's own mean and sample standard deviation, so the
    original row-level values are discarded. Non-numeric columns are left
    untouched.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with the same shape, index and columns as ``df``.
    """
    synthetic = df.copy()
    # Compute the statistics by name instead of indexing rows 1 and 2 of
    # `describe()`, and restrict to numeric columns so that other dtypes no
    # longer raise KeyError.
    numeric = synthetic.select_dtypes(include="number")
    means = numeric.mean()
    sigmas = numeric.std()
    for column in numeric.columns:
        # One sampled value per row of the column.
        synthetic[column] = np.random.normal(
            means[column], sigmas[column], synthetic[column].shape
        )

    return synthetic


In [None]:
# Build two corrupted samples of 1000 validation rows each: one with noise
# added on top of the real values, one replaced entirely by sampled values.
# NOTE(review): `sample(1000)` raises if `val_data` has fewer than 1000 rows.
drift_data = add_gaussian_noise(val_data.sample(1000))
noise_data = noisy_data(val_data.sample(1000))

In [None]:
# Stack both corrupted samples into a single frame (original index labels are kept).
raw_noisy_data = pd.concat([drift_data, noise_data])

In [None]:
# Stamp every noisy row with a random operation date in October 2023.
# `datetime.date` is used because the bare `date` name is only imported in a
# later cell, so the original line raised NameError on a top-to-bottom run.
# NOTE(review): the exclusive upper bound 13 limits days to 1-12 - confirm
# that this range is intended.
raw_noisy_data["operation_date"] = [
    datetime.date(2023, 10, int(day))
    for day in np.random.randint(1, 13, size=raw_noisy_data.shape[0])
]

In [None]:
# Preview the noisy rows, including the new `operation_date` column.
raw_noisy_data.head()

In [None]:

from datetime import date

In [None]:
# Scratch cell: construct a single date to check the `date` constructor.
date(2020, 10, 1)

In [None]:
def add_date(size, year=2023):
    """Generate ``size`` random calendar dates within ``year``.

    Days are drawn from 1-28 so that every (month, day) combination is a
    valid date, including February.

    Args:
        size: Number of dates to generate.
        year: Calendar year used for every generated date. Previously this
            argument was silently overwritten with 2023.

    Returns:
        A list of ``datetime.date`` objects of length ``size``.
    """
    days = np.random.randint(1, 29, size=size)
    months = np.random.randint(1, 13, size=size)
    # `.tolist()` converts NumPy integers to plain Python ints.
    return [
        date(year, month, day)
        for month, day in zip(months.tolist(), days.tolist())
    ]

# Smoke-test the generator with ten dates.
# NOTE(review): `date_col` is not used by any later cell visible here.
date_col = add_date(10)

In [None]:
# Preview the validation frame before dates are attached.
val_data.head()

In [None]:
# Copy of the validation frame with one random `operation_date` per row.
raw_data = val_data.assign(operation_date=add_date(val_data.shape[0]))

In [None]:
# Preview the dated validation rows.
raw_data.head()

In [None]:
# Combine the dated validation rows with the noisy rows.
# NOTE(review): the noisy rows were sampled from `val_data`, so index labels
# presumably repeat in the combined frame - confirm that this is acceptable.
all_raw_data = pd.concat([raw_data, raw_noisy_data])

In [None]:
# Display the combined raw frame.
all_raw_data

In [None]:
# Score the validation features and store the output as the `prediction` column.
val_preds = model.predict(val_data[num_features])
val_data['prediction'] = val_preds

In [None]:
# Summary statistics of the scored validation frame.
val_data.describe()

In [None]:
# Third summary row of `describe()` as a dict keyed by column name; for
# numeric data that row holds the standard deviations.
val_data.describe().to_dict(orient="records")[2]

In [None]:
# Preview the validation frame, now including `prediction`.
val_data.head()

In [None]:
# Score the training features and store the output as the `prediction` column.
train_preds = model.predict(train_data[num_features])
train_data['prediction'] = train_preds

In [None]:
# Persist the scored splits.
# NOTE(review): paths are relative to the working directory; the `data/`
# folder presumably has to exist already - confirm.
train_data.to_parquet("data/train.parquet")
val_data.to_parquet("data/val.parquet")

In [None]:
# Write the same scored training frame a second time under the name
# `reference.parquet` (the training frame is also the `reference_data` of the
# report below).
train_data.to_parquet("data/reference.parquet")

In [None]:
# Preview the scored training frame.
train_data.head()

In [None]:
# Preview the combined raw frame before it is written to disk.
all_raw_data.head()

In [None]:
# Persist the combined clean and noisy rows, including `operation_date`.
all_raw_data.to_parquet("data/raw.parquet")

In [None]:
# train_data = pd.read_parquet("data/train.parquet")
# val_data = pd.read_parquet("data/val.parquet")

In [None]:
# Re-score the training features and overwrite the `prediction` column.
# NOTE(review): these are the same statements as an earlier scoring cell;
# presumably only needed when the frames are reloaded from parquet.
train_preds = model.predict(train_data[num_features])
train_data['prediction'] = train_preds

In [None]:
# Preview the re-scored training frame.
train_data.head()

In [None]:
# Re-score the validation features and overwrite the `prediction` column.
# NOTE(review): these are the same statements as an earlier scoring cell;
# presumably only needed when the frames are reloaded from parquet.
val_preds = model.predict(val_data[num_features])
val_data['prediction'] = val_preds

In [None]:
# Peek at the first few predictions. Slicing works for both a NumPy array and
# a pandas Series; the original `val_preds.head` only referenced the method
# without calling it, and `head` does not exist on NumPy arrays.
val_preds[:5]

In [None]:
# Compute the metrics, comparing the validation frame (current) against the
# training frame (reference).
report.run(reference_data=train_data, current_data=val_data, column_mapping=column_mapping)

In [None]:
# Render the report inside the notebook.
report.show(mode='inline')

In [None]:
# Export the computed metrics as a plain dictionary for programmatic access.
result = report.as_dict()

In [None]:
# Display the full result dictionary.
result

In [None]:
# Prediction drift: drift score of the `prediction` column
# (first metric passed to the report).
result['metrics'][0]['result']['drift_score']

In [None]:
# Number of drifted columns, from the dataset-level drift metric
# (second metric passed to the report).
result['metrics'][1]['result']['number_of_drifted_columns']

In [None]:
# Share of missing values in the current (validation) data, from the
# missing-values metric (third metric passed to the report).
result['metrics'][2]['result']['current']['share_of_missing_values']

In [None]:
import datetime

In [None]:
# Scratch cell: a one-day time delta.
datetime.timedelta(days=1)