# Case study - optymalizacja modelu dla fraudów

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization
import lightgbm as lgb
import seaborn as sns


In [2]:
# Run this cell when the notebook is started from the solutions folder
# while the data frame lives in the `data` folder one level up.
# The move is guarded so that re-running the cell does not climb one more
# directory level each time it is executed.
import os

if not os.path.exists('data/creditcard.csv'):
    os.chdir('../')

In [3]:
# Load the data from data/creditcard.csv into a DataFrame
df = pd.read_csv('data/creditcard.csv')


In [None]:
# Size of the data frame (rows, columns)
df.shape

In [None]:
# Number of observations per value of the target column `Class`
df['Class'].value_counts()

In [5]:
# Train / test split (80 % / 20 %, fixed seed).
# The split is stratified on the target so that the share of the rare
# positive class is the same in the training and the test part.
features = df.drop('Class', axis=1)
target = df['Class']
train_x, test_x, train_y, test_y = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=123,
    stratify=target,
)

In [None]:
# Class counts - training set
train_y.value_counts()

In [None]:
# Class counts - test set
test_y.value_counts()

## Minimalizacja kwoty fraudów

In [8]:
# Wrap the training data in a lgb.Dataset (free_raw_data=False is passed explicitly)
# NOTE(review): train_lgb is not referenced by any later cell visible here - confirm it is still needed
train_lgb = lgb.Dataset(data=train_x, label= train_y, free_raw_data=False)

In [10]:
# Search space handed to the optimiser: [lower, upper] bound per hyper-parameter
params = dict(
    max_depth=[3, 20],
    min_samples_leaf=[5, 50],
    max_leaf_nodes=[20, 100],
)

In [9]:
# Objective function - BayesianOptimization maximises it, hence the negated loss
def opt_fun(**params):
    """Return the negated amount of fraud missed by a candidate model.

    Every keyword argument is rounded to the nearest integer and passed to
    ``lgb.LGBMClassifier``, which is fitted on ``train_x`` / ``train_y``.
    The score is minus the sum of ``Amount`` over the rows of ``test_x``
    whose true class is 1 but whose predicted class is 0.

    NOTE(review): candidates are scored on the same test set that the
    simulation further down uses to evaluate the final model - consider a
    separate validation split.
    """
    params_new = {key: int(round(value)) for key, value in params.items()}
    model_opt = lgb.LGBMClassifier(**params_new, random_state=123).fit(train_x, train_y)
    # Select the missed frauds with a boolean mask instead of copying the
    # whole test frame and adding helper columns on every evaluation.
    missed_fraud = (test_y == 1) & (model_opt.predict(test_x) == 0)
    return -test_x.loc[missed_fraud, 'Amount'].sum()

In [11]:
# Optimiser object. The seed makes the search reproducible, in line with
# the random_state=123 already used for the split and the models.
optimization = BayesianOptimization(
    f=opt_fun,
    pbounds=params,
    random_state=123,
)

In [None]:
# Run the optimisation with n_iter=3 (init_points is left at the library default)
optimization.maximize(n_iter=3)

In [None]:
# Best result found by the optimiser (its 'params' entry is used below)
optimization.max

In [None]:
# Total Amount of the test-set rows whose true class is 1,
# i.e. the value the objective would lose if every fraud were missed
test_x.loc[test_y==1,'Amount'].sum()

In [18]:
# Round the best (continuous) parameter values to integers
best_params = optimization.max['params']
params_new = {name: int(round(best_params[name])) for name in best_params}

In [None]:
# Show the integer parameters selected above
params_new

## Symulacja miesięcznej rezerwy
(z wykorzystaniem klasy i prawdopodobieństwa)

In [None]:
# Fit the final classifier with the selected parameters on the training set
model_final = lgb.LGBMClassifier(random_state=123, **params_new)
model_final = model_final.fit(train_x, train_y)


In [24]:
# Simulation settings
n = 100  # number of simulation repetitions (iterations of the loops below)
monthly_number = 20000  # number of transactions drawn in each repetition


In [21]:
# Test frame: a copy of the test features with the true label attached as `Class`
test = test_x.assign(Class=test_y)

In [None]:
# Predictions of the final model on the test features:
# the predicted class and the second column of predict_proba
# (presumably the probability of class 1 - confirm against model_final.classes_)
predicted_class = model_final.predict(test_x)
class_one_proba = model_final.predict_proba(test_x)[:, 1]
test['pred'] = predicted_class
test['pred_proba'] = class_one_proba

In [33]:
# Simulation: `n` times draw `monthly_number` rows (with replacement) from
# the test frame and record, for each draw,
#   - the Amount of false positives (Class 0 predicted as 1),
#   - the Amount of false negatives (Class 1 predicted as 0),
#   - the Amount weighted by the predicted probability.
fp_lost = []
fn_lost = []
exp_amount = []

# One seeded generator shared by all draws: reruns reproduce the same
# results, while every iteration still gets a different sample.
sim_rng = np.random.RandomState(123)

for _ in range(n):
    sample_df = test.sample(n=monthly_number, replace=True, random_state=sim_rng)
    is_fp = (sample_df['Class'] == 0) & (sample_df['pred'] == 1)
    is_fn = (sample_df['Class'] == 1) & (sample_df['pred'] == 0)
    fp_lost.append(sample_df.loc[is_fp, 'Amount'].sum())
    fn_lost.append(sample_df.loc[is_fn, 'Amount'].sum())
    exp_amount.append((sample_df['Amount'] * sample_df['pred_proba']).sum())
    

In [34]:
# Baseline: instead of the model, flag a random ~10 % of the sampled rows
# (uniform random number >= 0.9) and record the same false-positive and
# false-negative amounts as in the model-based simulation.
fp_lost_random = []
fn_lost_random = []

# Seeded generator used both for the row sampling and for the random
# flags, so that reruns reproduce the same baseline.
baseline_rng = np.random.RandomState(123)

for _ in range(n):
    sample_df = test.sample(n=monthly_number, replace=True, random_state=baseline_rng)
    sample_df['random_number'] = baseline_rng.rand(len(sample_df))
    sample_df['pred'] = (sample_df['random_number'] >= 0.9).astype(int)
    is_fp = (sample_df['Class'] == 0) & (sample_df['pred'] == 1)
    is_fn = (sample_df['Class'] == 1) & (sample_df['pred'] == 0)
    fp_lost_random.append(sample_df.loc[is_fp, 'Amount'].sum())
    fn_lost_random.append(sample_df.loc[is_fn, 'Amount'].sum())

    

In [None]:
# Kernel density estimate (KDE) plot of the simulated false-negative amounts
sns.kdeplot(fn_lost)

In [None]:
# Summary statistics of the simulated false-negative amounts
pd.Series(fn_lost).describe()

In [None]:
# 2.5 % and 97.5 % quantiles of the simulated false-negative amounts
# (the bounds of the central 95 % range)
pd.Series(fn_lost).quantile(q=[0.025,0.975])

In [None]:
# Summary statistics of the expected amount, i.e. Amount weighted by the predicted probability
pd.Series(exp_amount).describe()

In [None]:
# Summary statistics of the false-negative amounts when a random 10 % of transactions are checked
pd.Series(fn_lost_random).describe()