In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [2]:
mean=np.random.rand(5)*100
cov=np.random.rand(5,5)
cov=cov+cov.T

In [3]:
cov=np.diag(np.random.rand(5)*10)

In [4]:
cov

array([[8.58435009, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 6.26374005, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 8.44903795, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 2.00710841, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 5.93313966]])

In [5]:
data=np.random.multivariate_normal(mean=mean, cov=cov, size=10**5)

In [6]:
1/cov.diagonal()

array([0.11649106, 0.15964903, 0.11835667, 0.49822919, 0.16854483])

In [7]:
log_prob =data-mean
log_prob =log_prob*log_prob/cov.diagonal()
log_prob =log_prob.sum(axis=1)

In [8]:
T=np.quantile(log_prob, 0.95)

In [9]:
df=pd.DataFrame(data, columns=['x1','x2','x3','x4','x5'])
df['target']=(log_prob>T).astype(int)

# Modelling

In [None]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,average_precision_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [11]:
def true_positive_rate(y_true, y_pred):
    return recall_score(y_true, y_pred)

def false_positive_rate(y_true, y_pred):
    idx=y_true==0
    return ypred[idx].mean()

In [12]:
X=df.drop('target', axis=1)
y=df['target']
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)

In [13]:
xtrain.var(0)

x1    8.531523
x2    6.278495
x3    8.441658
x4    1.995327
x5    5.883511
dtype: float64

In [176]:
xtrain_sq=(xtrain-xtrain.mean(0))/xtrain.std(0)
#xtrain_sq=xtrain_sq*xtrain_sq

In [107]:
ytrain.mean(), ytest.mean()

(0.05, 0.05)

In [44]:
rf=RandomForestClassifier(n_estimators=10, random_state=42,n_jobs=-1)
param_grid = {'max_depth': [3,5],'n_estimators':[3,5,10], 'min_samples_split':[10]}

In [51]:
cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
gcv=GridSearchCV(estimator=rf, param_grid=param_grid, scoring='average_precision', cv=cv, verbose=1)

In [52]:
gcv.fit(xtrain, ytrain)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestClassifier(n_estimators=10, n_jobs=-1,
                                              random_state=42),
             param_grid={'max_depth': [3, 5], 'min_samples_split': [10],
                         'n_estimators': [3, 5, 10]},
             scoring='average_precision', verbose=1)

In [53]:
pd.DataFrame(gcv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.260565,0.04966,0.011279,0.001753,3,10,3,"{'max_depth': 3, 'min_samples_split': 10, 'n_e...",0.252095,0.249787,0.297693,0.24078,0.290944,0.26626,0.023317,6
1,0.313516,0.080579,0.013468,0.002551,3,10,5,"{'max_depth': 3, 'min_samples_split': 10, 'n_e...",0.302644,0.397041,0.405113,0.326334,0.363424,0.358911,0.039588,5
2,0.386871,0.046247,0.015035,0.00166,3,10,10,"{'max_depth': 3, 'min_samples_split': 10, 'n_e...",0.370975,0.482001,0.490607,0.374231,0.422343,0.428031,0.051011,3
3,0.268604,0.008571,0.011423,0.000943,5,10,3,"{'max_depth': 5, 'min_samples_split': 10, 'n_e...",0.388282,0.393945,0.372632,0.372032,0.406168,0.386612,0.013015,4
4,0.345952,0.032791,0.012878,0.000442,5,10,5,"{'max_depth': 5, 'min_samples_split': 10, 'n_e...",0.416059,0.437899,0.503756,0.414921,0.519379,0.458403,0.04445,2
5,0.492676,0.048495,0.014727,0.001118,5,10,10,"{'max_depth': 5, 'min_samples_split': 10, 'n_e...",0.49867,0.5746,0.579584,0.476602,0.579429,0.541777,0.044789,1


In [54]:
ypred=gcv.predict(xtrain)
yscore=gcv.predict_proba(xtrain)[:,1]
f1_score(ytrain, ypred),\
    precision_score(ytrain, ypred),\
         recall_score(ytrain, ypred),\
             false_positive_rate(ytrain, ypred),\
             average_precision_score(ytrain,yscore),\
                 roc_auc_score(ytrain, yscore)

(0.10919404396123848,
 1.0,
 0.05775,
 0.0,
 0.6291125039510129,
 0.8878373947368422)

In [55]:
ypred=gcv.predict(xtest)
yscore=gcv.predict_proba(xtest)[:,1]
f1_score(ytest, ypred),\
    precision_score(ytest, ypred),\
         recall_score(ytest, ypred),\
             false_positive_rate(ytest, ypred),\
             average_precision_score(ytest,yscore),\
                 roc_auc_score(ytest, yscore)

(0.1042654028436019, 1.0, 0.055, 0.0, 0.6001024389461679, 0.882755947368421)

In [20]:
ypred=gcv.predict(xtest)
f1_score(ytest, ypred),precision_score(ytest, ypred), recall_score(ytest, ypred)

(0.8250265111346766, 0.8781038374717833, 0.778)

In [22]:
false_positive_rate(ytest, ypred)

0.00568421052631579

## Resample

In [185]:
np=(ytest[ytest==1]).shape[0]
df_sample=pd.concat([xtest,ytest],axis=1)
sample_nn=df_sample.loc[df_sample.target==0].sample(n=np,replace=False, random_state=42)
df_sample=pd.concat([sample_nn,df_sample.loc[df_sample.target==1]],axis=0)

In [186]:
Xs=df_sample.drop('target', axis=1)
ys=df_sample['target']

In [197]:
rf=RandomForestClassifier(n_estimators=10, random_state=42,n_jobs=-1)
param_grid = {'max_depth': [5],'n_estimators':[10], 'min_samples_split':[5,10]}

In [198]:
cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
gcv=GridSearchCV(estimator=rf, param_grid=param_grid, scoring='f1', cv=cv, verbose=1)

In [199]:
gcv.fit(Xs, ys)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestClassifier(n_estimators=10, n_jobs=-1,
                                              random_state=42),
             param_grid={'max_depth': [5], 'min_samples_split': [5, 10],
                         'n_estimators': [10]},
             scoring='f1', verbose=1)

In [202]:
ypred=gcv.predict(Xs)
f1_score(ys, ypred), precision_score(ys, ypred), recall_score(ys, ypred), false_positive_rate(ys, ypred)

(0.8916030534351146, 0.9077720207253887, 0.876, 0.089)

In [203]:
ypred=gcv.predict(xtest)
f1_score(ytest, ypred), precision_score(ytest, ypred), recall_score(ytest, ypred), false_positive_rate(ytest, ypred)

(0.41943978932248027, 0.2757318224740321, 0.876, 0.12110526315789474)

In [201]:
false_positive_rate(ys, ypred)

0.089

In [137]:
lgr=LogisticRegression(random_state=42)

In [142]:
lgr.fit(Xs, ys)

LogisticRegression(random_state=42)

In [143]:
ypred=lgr.predict(Xs)
f1_score(ys, ypred), precision_score(ys, ypred), recall_score(ys, ypred)

(0.5285285285285285, 0.5290581162324649, 0.528)