In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [34]:
# Seed NumPy's legacy global RNG so the synthetic data generated in this
# notebook is reproducible after a kernel restart; 42 matches the random_state
# passed to the scikit-learn objects further down.
np.random.seed(42)
# Per-feature means, drawn uniformly from [0, 100).
mean=np.random.rand(5)*100
# Random symmetric 5x5 matrix (A + A.T).
# NOTE(review): symmetry alone does not guarantee a valid (positive
# semi-definite) covariance matrix; `cov` is reassigned to a diagonal matrix in
# a later cell before it is passed to the sampler.
cov=np.random.rand(5,5)
cov=cov+cov.T

In [50]:
# Diagonal covariance: five uncorrelated features whose variances are drawn
# uniformly from [0, 10).
variances = np.random.rand(5) * 10
cov = np.diag(variances)

In [69]:
# Inspect the current covariance matrix (expected to be diagonal).
cov

array([[7.49542821, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 7.22337074, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 5.27733254, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 3.86453684, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.9378703 ]])

In [52]:
# Draw the synthetic sample: 10**5 rows from a multivariate normal N(mean, cov).
n_samples = 10**5
data = np.random.multivariate_normal(mean, cov, size=n_samples)

In [91]:
# Reciprocal of each diagonal entry of `cov` (the per-feature inverse variances).
1/cov.diagonal()

array([0.13341466, 0.13843952, 0.18948967, 0.25876322, 1.06624551])

In [97]:
# Per-row score: sum over features of (x - mean)^2 / variance, using only the
# diagonal of `cov`. For a diagonal covariance this is the squared Mahalanobis
# distance from `mean`.
# NOTE: despite its name, `log_prob` is not a log-density; larger values mean
# the row lies further from the mean.
centered = data - mean
log_prob = (centered * centered / cov.diagonal()).sum(axis=1)

In [100]:
# Decision threshold: the 0.95 quantile of the score, so about 5% of the rows
# lie above it.
T=np.quantile(log_prob, 0.95)

In [101]:
# Assemble the modelling frame: five feature columns plus a binary `target`
# that is 1 for rows whose score exceeds the threshold T and 0 otherwise.
feature_cols = ['x1', 'x2', 'x3', 'x4', 'x5']
df = pd.DataFrame(data, columns=feature_cols).assign(
    target=(log_prob > T).astype(int)
)

# Modelling

In [130]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [106]:
# Separate label and features, then hold out 20% of the rows for testing.
# Stratifying on y keeps the positive rate the same in both splits.
y = df['target']
X = df.drop(columns='target')
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [107]:
# Positive-class rate in the train and test splits; the two should match
# because the split above is stratified on y.
ytrain.mean(), ytest.mean()

(0.05, 0.05)

In [144]:
# Base estimator for the grid search. The n_estimators value given here is
# overridden by every candidate, because the grid below also sets n_estimators.
rf = RandomForestClassifier(random_state=42, n_estimators=10)
# Hyper-parameter grid: 2 depths x 4 forest sizes x 2 split thresholds = 16 candidates.
param_grid = dict(
    max_depth=[5, 10],
    n_estimators=[10, 20, 30, 80],
    min_samples_split=[5, 10],
)

In [145]:
# 5-fold stratified, shuffled cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Grid search over `param_grid`, ranking candidates by recall on the positive class.
gcv = GridSearchCV(
    rf,
    param_grid,
    scoring='recall',
    cv=cv,
    verbose=1,
)

In [146]:
# Run the grid search on the training split only; the test split stays held out.
gcv.fit(xtrain, ytrain)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestClassifier(n_estimators=10, random_state=42),
             param_grid={'max_depth': [5, 10], 'min_samples_split': [5, 10],
                         'n_estimators': [10, 20, 30, 80]},
             scoring='recall', verbose=1)

In [147]:
# Tabulate the per-candidate cross-validation results (fit/score times,
# per-split recall, mean/std and rank) recorded by the grid search.
pd.DataFrame(gcv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.797049,0.150792,0.017447,0.001129,5,5,10,"{'max_depth': 5, 'min_samples_split': 5, 'n_es...",0.06625,0.1025,0.13375,0.045,0.10375,0.09025,0.031139,13
1,1.385079,0.026018,0.024824,0.000952,5,5,20,"{'max_depth': 5, 'min_samples_split': 5, 'n_es...",0.07125,0.095,0.09875,0.03375,0.09375,0.0785,0.024372,16
2,2.091738,0.07499,0.037103,0.002024,5,5,30,"{'max_depth': 5, 'min_samples_split': 5, 'n_es...",0.07625,0.10625,0.0925,0.085,0.11125,0.09425,0.013005,11
3,5.572384,0.209897,0.0818,0.003569,5,5,80,"{'max_depth': 5, 'min_samples_split': 5, 'n_es...",0.07875,0.0925,0.08375,0.095,0.07875,0.08575,0.006828,14
4,0.691594,0.009078,0.017976,0.001907,5,10,10,"{'max_depth': 5, 'min_samples_split': 10, 'n_e...",0.0925,0.11375,0.1425,0.04875,0.11625,0.10275,0.031319,9
5,1.35475,0.026328,0.02753,0.001752,5,10,20,"{'max_depth': 5, 'min_samples_split': 10, 'n_e...",0.0725,0.10375,0.08375,0.065,0.0775,0.0805,0.013148,15
6,2.04284,0.008414,0.035137,0.00124,5,10,30,"{'max_depth': 5, 'min_samples_split': 10, 'n_e...",0.08,0.11375,0.09,0.095,0.095,0.09475,0.010966,10
7,5.473011,0.136761,0.080682,0.001509,5,10,80,"{'max_depth': 5, 'min_samples_split': 10, 'n_e...",0.07625,0.105,0.09375,0.1125,0.08,0.0935,0.013951,12
8,1.355015,0.070963,0.019963,0.001549,10,5,10,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.45125,0.36875,0.45,0.47,0.4325,0.4345,0.034952,5
9,2.677257,0.095085,0.031366,0.000457,10,5,20,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.4525,0.3925,0.4575,0.4725,0.4275,0.4405,0.028036,2


In [131]:
def true_positive_rate(y_true, y_pred):
    """Return the true positive rate of `y_pred` against `y_true`.

    Thin wrapper around `recall_score`, called with its default arguments.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels.

    Returns
    -------
    Whatever `recall_score(y_true, y_pred)` returns.
    """
    return recall_score(y_true, y_pred)

def false_positive_rate(y_true, y_pred):
    """Return the fraction of true negatives that were predicted positive.

    Computed as the mean of `y_pred` over the rows where `y_true == 0`, so it
    equals the false positive rate when the predictions are binary 0/1 labels.

    Parameters
    ----------
    y_true : array-like of ground-truth labels (0 = negative).
    y_pred : array-like of predicted labels, aligned positionally with `y_true`.

    Returns
    -------
    float
        Mean prediction over the negative rows, or NaN when `y_true` contains
        no negatives.
    """
    # Imported locally so the function does not depend on any module-level alias.
    import numpy

    # Convert both inputs to arrays so the mask is applied positionally,
    # regardless of any pandas index carried by `y_true`.
    y_true = numpy.asarray(y_true)
    y_pred = numpy.asarray(y_pred)

    negatives = y_true == 0
    if not negatives.any():
        return float('nan')
    # Use the `y_pred` argument, not a notebook-level global, so the result
    # always reflects the predictions passed in by the caller.
    return y_pred[negatives].mean()

In [148]:
# Score the refit best estimator on the held-out test split and report
# (F1, precision, recall) in that order.
ypred = gcv.predict(xtest)
tuple(metric(ytest, ypred) for metric in (f1_score, precision_score, recall_score))

(0.6061827956989247, 0.9241803278688525, 0.451)

In [149]:
# False positive rate of the grid-search model on the full test split.
false_positive_rate(ytest, ypred)

0.0019473684210526317

In [123]:
# Build a class-balanced evaluation set from the test split: keep every
# positive row and draw an equal number of negative rows without replacement.
# The positive count lives in `n_pos`; naming it `np` would rebind the NumPy
# alias imported at the top of the notebook and break every later `np.*` call.
n_pos=(ytest[ytest==1]).shape[0]
# Features and label side by side so rows can be filtered on `target`.
df_sample=pd.concat([xtest,ytest],axis=1)
sample_nn=df_sample.loc[df_sample.target==0].sample(n=n_pos,replace=False, random_state=42)
# Sampled negatives first, then all positives.
df_sample=pd.concat([sample_nn,df_sample.loc[df_sample.target==1]],axis=0)

In [133]:
# Split the balanced sample back into a label vector and a feature matrix.
ys = df_sample['target']
Xs = df_sample.drop(columns='target')

In [150]:
# Score the grid-search model on the class-balanced sample and report
# (F1, precision, recall) in that order.
ypred = gcv.predict(Xs)
tuple(metric(ys, ypred) for metric in (f1_score, precision_score, recall_score))

(0.6195054945054945, 0.9890350877192983, 0.451)

In [136]:
# False positive rate on the class-balanced sample.
false_positive_rate(ys, ypred)

0.005

In [137]:
# Logistic regression with default hyper-parameters and a fixed seed.
lgr=LogisticRegression(random_state=42)

In [142]:
# Fit the logistic regression on the class-balanced sample.
# NOTE(review): Xs/ys are built from xtest/ytest, so this trains on held-out
# test data rather than on xtrain/ytrain — confirm that this is intended.
lgr.fit(Xs, ys)

LogisticRegression(random_state=42)

In [143]:
# Score the logistic regression on Xs and report (F1, precision, recall).
# NOTE(review): Xs is the same data `lgr` was fitted on, so these are in-sample
# scores rather than an out-of-sample estimate.
ypred = lgr.predict(Xs)
tuple(metric(ys, ypred) for metric in (f1_score, precision_score, recall_score))

(0.5285285285285285, 0.5290581162324649, 0.528)