# Metryki oparte na prawdopodobieństwie

In [1]:
#pip install scipy

In [2]:
import pandas as pd 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,roc_auc_score,precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats

In [3]:
# Run this cell when the notebook is started from the solutions folder
# while the data frame lives in the `data` folder one level up.
import os

# Only step up when `data/` is not reachable from the current directory, so
# re-running the cell does not keep climbing the directory tree.
if not os.path.isdir('data'):
    os.chdir('../')

In [4]:
# Load the data set and drop the 'Unnamed: 0' column
# (presumably a row index that was saved into the CSV - verify against the file).
df = pd.read_csv('data/credit_fraud_sample.csv').drop(columns='Unnamed: 0')

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [6]:
## Train the model
# Features are the columns at positions 1..5; the target is the 'Class' column.
train_x, test_x, train_y, test_y = train_test_split(
    df.iloc[:, 1:6],
    df['Class'],
    test_size=0.3,
    random_state=123,
)
model = LogisticRegression()
model.fit(train_x, train_y)

# Keep only the second probability column (index 1) for each split.
train_pred, test_pred = (
    model.predict_proba(part)[:, 1] for part in (train_x, test_x)
)

# Score the whole frame as well, using the columns the model was fitted on.
df['predict_proba'] = model.predict_proba(df[model.feature_names_in_])[:, 1]

In [None]:
# Display the training feature frame
train_x

## Krzywa ROC

In [8]:
# Prepare the data for the ROC plot, separately for the train and test splits
fpr_train, tpr_train, thresholds_train = roc_curve(y_true=train_y, y_score=train_pred)
fpr_test, tpr_test, thresholds_test = roc_curve(y_true=test_y, y_score=test_pred)


In [9]:
# Area under the ROC curve for each split, rounded to three decimals
auc_train, auc_test = (
    round(roc_auc_score(labels, scores), 3)
    for labels, scores in ((train_y, train_pred), (test_y, test_pred))
)

In [None]:
# ROC plot: one curve per split plus the dashed diagonal as a reference line
plt.plot(fpr_train, tpr_train, label='train')
plt.plot(fpr_test, tpr_test, label='test')
# Two points are enough to draw the straight diagonal from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], '--')
plt.legend()
# Print the AUC values inside the axes.
plt.annotate(f'AUC train: {auc_train}', xy=[0.2, 0.8])
plt.annotate(f'AUC test: {auc_test}', xy=[0.2, 0.75])
# Label the axes, as the Precision-Recall plot in this notebook does.
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('Krzywa ROC')
plt.show()

## Precision-recall 

In [11]:
# Prepare the precision-recall curve data for the train and test splits
pr_train, recall_train, pr_thresholds_train = precision_recall_curve(train_y,train_pred)
pr_test, recall_test, pr_thresholds_test = precision_recall_curve(test_y, test_pred)

In [None]:
# Precision-recall plot: recall on the x axis, precision on the y axis
plt.title('Precision-Recall')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.plot(recall_train, pr_train, label='train')
plt.plot(recall_test, pr_test, label='test')
plt.legend()
plt.show()

## K-S plot

In [13]:
# Train split: true labels next to the predicted scores
train_pred_df = pd.DataFrame({'y_true': train_y, 'y_pred': train_pred})

In [14]:
# Test split: true labels next to the predicted scores
test_pred_df = pd.DataFrame({'y_true': test_y, 'y_pred': test_pred})

In [None]:
# Display the test-split frame of true labels and predicted scores
test_pred_df

In [None]:
# Kernel density of the predicted scores on the train split, one curve per actual class.
# NOTE(review): seaborn's default common_norm=True scales each curve by its class's
# share of the rows - confirm that is the intended reading of "density by class".
sns.kdeplot(
    x='y_pred',
    hue='y_true',
    data=train_pred_df,
    fill=True,
)
plt.title('Density of predicted probability by actual class')
plt.show()

In [None]:
# Kernel density of the predicted scores on the test split, one curve per actual class
sns.kdeplot(data=test_pred_df, x='y_pred', hue='y_true', fill=True)
# Titled like the train-split plot so the two figures can be told apart.
plt.title('Density of predicted probability by actual class (test)')
plt.show()

In [18]:
# Another view: separate the test-split scores by actual class (0 and 1)
test_0, test_1 = (
    test_pred_df.loc[test_pred_df['y_true'] == cls, 'y_pred'] for cls in (0, 1)
)


In [19]:
# Two-sample Kolmogorov-Smirnov test between the score samples of the two classes
ks_stat, p_value = stats.ks_2samp(data1=test_0, data2=test_1)

In [None]:
# Display the p-value returned by the two-sample K-S test
p_value

In [None]:
# Cumulative, normalised step histograms of the test scores, one per actual class
plt.xlabel('Score')
plt.ylabel('Skumulowane prawdopodobieństwo')
plt.hist(
    test_0,
    label='Class 0',
    cumulative=True,
    density=True,
    histtype='step',
    bins=20,
)
plt.hist(
    test_1,
    label='Class 1',
    cumulative=True,
    density=True,
    histtype='step',
    bins=20,
)
plt.legend()
plt.show()

## Krzywa Gain i Lift
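Do narysowania krzywych Gain oraz Lift potrzebujemy, dla kolejnych punktów odcięcia, wartości TPR (true positive rate – odsetek obserwacji pozytywnych poprawnie wskazanych przez model) oraz RPP (rate of positive predictions – odsetek wszystkich obserwacji zaklasyfikowanych jako pozytywne).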

In [22]:
def lift_metrics(y_true, y_score, step=0.01):
    """Compute RPP, lift and TPR over a grid of score cut-offs.

    For every cut-off ``c`` an observation is predicted positive when its
    score is ``>= c``. Then:

    * RPP  = share of observations predicted positive,
    * TPR  = share of actual positives (label ``1``) predicted positive,
    * lift = TPR / RPP.

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Binary labels; ``1`` marks the positive class.
    y_score : array-like of shape (n,)
        Scores, matched with ``y_true`` by position (any pandas index is
        ignored, so inputs with different indexes still work).
    step : float, default 0.01
        Spacing of the cut-off grid ``np.arange(min(y_score), max(y_score), step)``.

    Returns
    -------
    rpps, lifts, tprs : list of float
        One value per cut-off. ``tprs`` (and so ``lifts``) are ``nan`` when
        ``y_true`` has no positives; a lift is ``nan`` where RPP is 0.
    cut_offs : numpy.ndarray
        The cut-off grid. It is empty when all scores are equal.

    Raises
    ------
    ValueError
        If the inputs are empty, not 1-D, of different lengths, or ``step``
        is not positive.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_score, dtype=float)
    if labels.ndim != 1 or labels.shape != scores.shape:
        raise ValueError("y_true and y_score must be 1-D and of equal length")
    if scores.size == 0:
        raise ValueError("y_true and y_score must not be empty")
    if step <= 0:
        raise ValueError("step must be positive")

    cut_offs = np.arange(scores.min(), scores.max(), step)

    # Counting "score >= cut_off" for every cut-off is a rank lookup in the
    # sorted scores: side="left" returns the index of the first element >= c.
    all_sorted = np.sort(scores)
    pos_sorted = np.sort(scores[labels == 1])
    n_predicted_pos = all_sorted.size - np.searchsorted(all_sorted, cut_offs, side="left")
    n_true_pos = pos_sorted.size - np.searchsorted(pos_sorted, cut_offs, side="left")

    rpps = n_predicted_pos / scores.size
    if pos_sorted.size:
        tprs = n_true_pos / pos_sorted.size
    else:
        # TPR is undefined without any actual positives.
        tprs = np.full(cut_offs.shape, np.nan)

    # Floating-point rounding in arange can put the last cut-off at or above
    # the maximum score, giving RPP == 0; leave the lift as nan there instead
    # of dividing by zero.
    lifts = np.full(cut_offs.shape, np.nan)
    np.divide(tprs, rpps, out=lifts, where=rpps > 0)

    return rpps.tolist(), lifts.tolist(), tprs.tolist(), cut_offs

In [23]:
# Compute the RPP / lift / TPR series and the cut-off grid for the train split
rpps_train, lifts_train, tprs_train, cut_off_train = lift_metrics(
    y_true=train_pred_df['y_true'],
    y_score=train_pred_df['y_pred'],
)

In [None]:
# Display the RPP values computed for the train split
rpps_train

In [None]:
# Display the lift values computed for the train split
lifts_train

In [26]:
# Compute the RPP / lift / TPR series and the cut-off grid for the test split
rpps_test, lifts_test, tprs_test, cut_off_test = lift_metrics(
    y_true=test_pred_df['y_true'],
    y_score=test_pred_df['y_pred'],
)

In [27]:
# Ideal gain line: it starts at (0, 0), reaches 1 when the x value equals
# sum(y_true) / number of rows in the test split (the share of positives,
# assuming 0/1 labels), and stays at 1 up to x = 1.
x = [
    0,
    test_pred_df['y_true'].sum() / test_pred_df.shape[0],
    1,
]
y = [0, 1, 1]

In [None]:
# Gain chart: TPR against RPP for both splits, with the random and ideal references
plt.plot(rpps_train, tprs_train, label='train')
plt.plot(rpps_test, tprs_test, label='test')
# Random reference: the gain equals the share of observations flagged positive.
plt.plot(rpps_train, rpps_train, label='random')
plt.plot(x, y, color='red', label='perfect')
plt.legend()
# Title and axis labels, matching the other charts in this notebook.
plt.xlabel('RPP')
plt.ylabel('TPR')
plt.title('Gain')
plt.show()

In [29]:
# x, y for the ideal lift curve.
# With lift = TPR / RPP, a perfect model has TPR = 1 for every RPP at or above
# the share of positives, so its lift there is 1 / RPP. That is a hyperbola
# from (share, 1 / share) down to (1, 1), not a straight segment.
train_positive_share = train_pred_df['y_true'].sum() / len(train_pred_df['y_true'])
x_lift_p = np.linspace(train_positive_share, 1, 50)
y_lift_p = 1 / x_lift_p

In [None]:
# Display the y-coordinates of the ideal lift line
y_lift_p

In [None]:
# Lift chart: lift against RPP for both splits, with the random and ideal references
plt.title('Lift')
plt.plot(rpps_train, lifts_train, label='train')
plt.plot(rpps_test, lifts_test, label='test')
# Random reference: a constant lift of 1 over the train RPP values.
plt.plot(rpps_train, np.repeat(1, len(rpps_train)), label='random')
plt.plot(x_lift_p, y_lift_p, '--', label='perfect')
plt.legend()
plt.show()