In [1]:
import pandas as pd
import numpy as np
import nannyml as nml
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn
import pickle

import warnings
warnings.filterwarnings("ignore")

In [2]:
# load model
with open('model_lgbm.pkl', 'rb') as f:
    model_lgbm = pickle.load(f)

In [3]:
model_lgbm

Pipeline(steps=[('scaler', RobustScaler()),
                ('regressor',
                 LGBMClassifier(bagging_fraction=0.76, bagging_freq=1,
                                bagging_seed=14,
                                feature_fraction=0.8600000000000001,
                                feature_fraction_seed=14,
                                learning_rate=0.017475148203262847,
                                min_data_in_leaf=19, n_estimators=620,
                                num_leaves=73, objective='multiclass'))])

In [261]:
# load dataset prod
df = pd.read_csv('dataset_production.csv')

def preprocess_data(df):
    
    df.drop('Unnamed: 0', axis=1, inplace=True)
    df['clarity'].replace({'I1':1, 'SI2':2, 'SI1':3, 'VS2':4, 'VS1':5, 'VVS2':6, 'VVS1':7, 'IF':8}, inplace=True)
    df['color'].replace({'J':1, 'I':2, 'H':3, 'G':4, 'F':5, 'E':6, 'D':7}, inplace=True)
    df['cut'].replace({'Fair':1, 'Good':2, 'Very Good':3, 'Premium':4, 'Ideal':5}, inplace=True)
    
    # lets add id col
    df['identifier'] = df.index
    
    return df

# get data for nannyml
def get_data_nml(df, model):
    
    # preporcess data
    df = preprocess_data(df)
    # get X and Y to predict
    X, y = df.drop(['Timestamp', 'identifier', 'cut'], axis=1), df.cut.values
    # still watch the score of the model
    print(f'score of the model on production is: {model_lgbm.score(X, y):.2%}')
    
    # get pred and proba
    y_pred = model_lgbm.predict(X)
    y_pred_proba_ = model_lgbm.predict_proba(X)
    # to df and replace by original vals for understanding purpose (cat var)
    df_proba = pd.DataFrame(y_pred_proba_, columns=['y_pred_proba_fair', 'y_pred_proba_good', 
                                                    'y_pred_proba_very_good', 'y_pred_proba_premium', 
                                                    'y_pred_proba_ideal'])
#     df_proba = pd.DataFrame(y_pred_proba_, columns=['y_pred_proba_1', 'y_pred_proba_2', 
#                                                     'y_pred_proba_3', 'y_pred_proba_4', 
#                                                     'y_pred_proba_5'])

    df_proba['y_pred'] = y_pred
    df_proba['y_pred'].replace({1:'fair', 2:'good', 3:'very_good', 4:'premium', 5:'ideal'}, inplace=True)
    
    df = df.join(df_proba)
    
    df['y_true'] = df['cut'].replace({1:'fair', 2:'good', 3:'very_good', 4:'premium', 5:'ideal'})
    # df['y_true'] = df['cut']
    df.drop('cut', axis=1, inplace=True)
    
    # actually need to put timestamp with a t
    df.rename(columns={'Timestamp':'timestamp'}, inplace=True)
    
    return df


In [262]:
df = get_data_nml(df, model_lgbm)

score of the model on production is: 73.44%


In [263]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32364 entries, 0 to 32363
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   carat                   32364 non-null  float64
 1   color                   32364 non-null  int64  
 2   clarity                 32364 non-null  int64  
 3   depth                   32364 non-null  float64
 4   table                   32364 non-null  float64
 5   price                   32364 non-null  int64  
 6   x                       32364 non-null  float64
 7   y                       32364 non-null  float64
 8   z                       32364 non-null  float64
 9   timestamp               32364 non-null  object 
 10  identifier              32364 non-null  int64  
 11  y_pred_proba_fair       32364 non-null  float64
 12  y_pred_proba_good       32364 non-null  float64
 13  y_pred_proba_very_good  32364 non-null  float64
 14  y_pred_proba_premium    32364 non-null

In [264]:
# separate into reference and analysis 

"""
Don’t use training data as a reference data set to prevent overfitting, e.g during model score calibration.
"""

reference = df.loc[:15000] # +/- 40% for ref --> chunk by 5000
reference['partition'] = 'reference'
analysis = df.loc[15001:] # 60 % for analysis
analysis['partition'] = 'analysis'

# create analysis target and remove y_true from target
analysis_targets = analysis.loc[:, ['identifier', 'y_true']]
# remove y_pred from analysis
analysis.drop(['y_true'], axis=1, inplace=True)

In [265]:
# lets try something else

In [266]:
analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17363 entries, 15001 to 32363
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   carat                   17363 non-null  float64
 1   color                   17363 non-null  int64  
 2   clarity                 17363 non-null  int64  
 3   depth                   17363 non-null  float64
 4   table                   17363 non-null  float64
 5   price                   17363 non-null  int64  
 6   x                       17363 non-null  float64
 7   y                       17363 non-null  float64
 8   z                       17363 non-null  float64
 9   timestamp               17363 non-null  object 
 10  identifier              17363 non-null  int64  
 11  y_pred_proba_fair       17363 non-null  float64
 12  y_pred_proba_good       17363 non-null  float64
 13  y_pred_proba_very_good  17363 non-null  float64
 14  y_pred_proba_premium    17363 non-

In [267]:
data = pd.concat([
    reference,
    analysis.set_index('identifier').join(analysis_targets.set_index('identifier'), on='identifier', rsuffix='_r')
], ignore_index=True).reset_index(drop=True)
display(data.loc[data['partition'] == 'analysis'].head(3))

Unnamed: 0,carat,color,clarity,depth,table,price,x,y,z,timestamp,identifier,y_pred_proba_fair,y_pred_proba_good,y_pred_proba_very_good,y_pred_proba_premium,y_pred_proba_ideal,y_pred,y_true,partition
15001,0.4,4,6,59.2,63.0,945,4.88,4.81,2.87,2060-02-22,,3.3e-05,0.017206,0.981077,0.000821,0.000863,very_good,very_good,analysis
15002,0.4,5,3,62.0,56.0,945,4.77,4.72,2.94,2060-02-23,,8e-06,0.001001,0.076323,0.006916,0.915752,ideal,premium,analysis
15003,0.4,6,3,61.3,61.0,945,4.76,4.73,2.91,2060-02-24,,0.000188,0.124595,0.760167,0.106926,0.008124,very_good,premium,analysis


In [268]:
metadata = nml.extract_metadata(
    reference,
    model_name='lgbm',
    model_type='classification_multiclass',
    exclude_columns=['identifier']
)
metadata.target_column_name = 'y_true'
display(metadata.is_complete())

(True, [])

In [269]:
performance_calculator = nml.PerformanceCalculator(
    model_metadata=metadata,
    metrics=['roc_auc', 'f1'],
    chunk_size=6000
).fit(reference_data=reference)

# always got an error with the CPBE estimator ? 
# - could be for chunk size between ref and analysis?
# [FIXED] TypeError: '<' not supported between instances of 'NoneType' and 'float' for the sklearn roc_auc_score but
# all my data were the same type in all cols (checks), maybe the missing vals ?

In [270]:
realized_performance = performance_calculator.calculate(data)

In [271]:
display(realized_performance.data.head(3))

Unnamed: 0,key,start_index,end_index,start_date,end_date,partition,targets_missing_rate,roc_auc,roc_auc_thresholds,roc_auc_alert,f1,f1_thresholds,f1_alert
0,[0:5999],0,5999,2019-01-27,2035-07-01,reference,0.0,0.912055,"(0.8998266524807966, 0.9347221255209291)",False,0.752268,"(0.6130630360820146, 0.8273928899213809)",False
1,[6000:11999],6000,11999,2035-07-02,2051-12-04,reference,0.0,0.925389,"(0.8998266524807966, 0.9347221255209291)",False,0.738033,"(0.6130630360820146, 0.8273928899213809)",False
2,[12000:17999],12000,17999,2051-12-05,2068-05-08,analysis,0.0,0.910279,"(0.8998266524807966, 0.9347221255209291)",False,0.671407,"(0.6130630360820146, 0.8273928899213809)",False


In [272]:
for metric in performance_calculator.metrics:
    figure = realized_performance.plot(kind='performance', metric=metric)
    figure.show()

In [273]:
# lets look at data drift 

# Let's initialize the object that will perform Data Reconstruction with PCA
# Let's use a chunk size of 5000 data points to create our drift statistics
rcerror_calculator = nml.DataReconstructionDriftCalculator(model_metadata=metadata, chunk_size=5000)
rcerror_calculator = rcerror_calculator.fit(reference_data=reference)
# let's see RC error statistics for all available data
data = pd.concat([reference, analysis], ignore_index=True)
rcerror_results = rcerror_calculator.calculate(data=data)

from sklearn.impute import SimpleImputer

# Let's initialize the object that will perform Data Reconstruction with PCA
rcerror_calculator = nml.DataReconstructionDriftCalculator(
    model_metadata=metadata,
    chunk_size=5000,
#     imputer_categorical=SimpleImputer(strategy='constant', fill_value='missing'),
#     imputer_continuous=SimpleImputer(strategy='median')
) 
# --> doesnt work if I dont remove the imputer but I dont need it anyway since I dont have missing vals

# NannyML compares drift versus the full reference dataset.
rcerror_calculator.fit(reference_data=reference)
# let's see RC error statistics for all available data
rcerror_results = rcerror_calculator.calculate(data=data)

# We use the data property of the results class to view the relevant data.
display(rcerror_results.data)

figure = rcerror_results.plot(kind='drift')
figure.show()

Unnamed: 0,key,start_index,end_index,start_date,end_date,partition,reconstruction_error,lower_threshold,upper_threshold,alert
0,[0:4999],0,4999,2019-01-27,2032-10-04,reference,1.617294,0.205794,2.480967,False
1,[5000:9999],5000,9999,2032-10-05,2046-06-13,reference,1.468679,0.205794,2.480967,False
2,[10000:14999],10000,14999,2046-06-14,2060-02-20,reference,1.504991,0.205794,2.480967,False
3,[15000:19999],15000,19999,2060-02-21,2073-10-29,analysis,1.535262,0.205794,2.480967,False
4,[20000:24999],20000,24999,2073-10-30,2087-07-08,analysis,1.50111,0.205794,2.480967,False
5,[25000:29999],25000,29999,2087-07-09,2101-03-17,analysis,1.643584,0.205794,2.480967,False
6,[30000:32363],30000,32363,2101-03-18,2107-09-06,analysis,1.559058,0.205794,2.480967,False


In [254]:
# Let's initialize the object that will perform the Univariate Drift calculations
univariate_calculator = nml.UnivariateStatisticalDriftCalculator(model_metadata=metadata, chunk_size=5000)
univariate_calculator = univariate_calculator.fit(reference_data=reference)
univariate_results = univariate_calculator.calculate(data=data)
# let's plot drift results for all model inputs
for feature in metadata.features:
    figure = univariate_results.plot(kind='feature_drift', metric='statistic', feature_label=feature.label)
    figure.show()

In [277]:
univariate_results

             key  start_index  end_index start_date   end_date  partition  \
0       [0:4999]            0       4999 2019-01-27 2032-10-04  reference   
1    [5000:9999]         5000       9999 2032-10-05 2046-06-13  reference   
2  [10000:14999]        10000      14999 2046-06-14 2060-02-20  reference   
3  [15000:19999]        15000      19999 2060-02-21 2073-10-28   analysis   
4  [20000:24999]        20000      24999 2073-10-29 2087-07-07   analysis   
5  [25000:29999]        25000      29999 2087-07-08 2101-03-16   analysis   
6  [30000:32364]        30000      32364 2101-03-17 2107-09-06   analysis   

   color_chi2  color_p_value  color_alert  color_threshold  ...  \
0  289.833122            0.0        False             0.05  ...   
1   43.359986            0.0        False             0.05  ...   
2  270.228819            0.0        False             0.05  ...   
3  249.033662            0.0        False             0.05  ...   
4  308.376434            0.0         True       

# Obs

The model seems to perform porrly between on the roc auc metric between april and may 2068 and regain a good score between septembre and novembre 2084.

We have data drift for all the features!! although it doesnt seems to make the model drift, lets take a closer look at that.

In [274]:
# check distribution of data before and after 

# all the feature are drifting at the same time (noviembre 2073), lets investigate

# price goes upppppp and new size of dimanods after our trainnig time --> should be that 

data.reset_index(drop=True, inplace=True)
data.to_csv("data_production.csv", encoding = 'utf-8', index=False)
analysis_targets.to_csv('targets_analysis.csv', encoding = 'utf-8', index=False)