In [586]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    confusion_matrix, roc_curve, 
    precision_recall_fscore_support, plot_roc_curve, roc_auc_score
)
from svm_utils import *


def get_confusion_matrix(pred_probs, y_test, threshold):
    """Threshold scores into 0/1 predictions and tabulate them against the truth.

    Parameters
    ----------
    pred_probs : array-like of float
        Score for the positive class, one per sample.
    y_test : array-like of {0, 1}
        Ground-truth labels, in the same order as ``pred_probs``.
    threshold : float
        A sample is predicted positive when its score is strictly greater
        than this value.

    Returns
    -------
    cm : pandas.DataFrame
        2x2 table of counts. Rows are ``('Predicted', 'F')`` and
        ``('Predicted', 'T')``; columns are ``('True', 'F')`` and
        ``('True', 'T')``.
    tpr : float
        ``tp / (tp + fn)``; NaN when ``y_test`` contains no positives.
    tnr : float
        ``tn / (tn + fp)``; NaN when ``y_test`` contains no negatives.
    """
    pred = (np.asarray(pred_probs) > threshold).astype(int)

    # The prediction is passed first on purpose: the first argument indexes
    # the rows, and rows are labelled "Predicted" below.
    # labels=[0, 1] pins the result to 2x2 even when one class never occurs
    # (e.g. a threshold so high that nothing is predicted positive).
    counts = confusion_matrix(pred, y_test, labels=[0, 1])

    cm = pd.DataFrame(
        counts,
        index=pd.MultiIndex.from_product([['Predicted'], ['F', 'T']]),
        columns=pd.MultiIndex.from_product([['True'], ['F', 'T']]),
    )

    tn, fn = counts[0]  # predicted negative: [true negative, false negative]
    fp, tp = counts[1]  # predicted positive: [false positive, true positive]

    # Return NaN explicitly for an empty class instead of relying on a
    # NumPy 0/0 RuntimeWarning.
    tpr = tp / (tp + fn) if (tp + fn) else float('nan')
    tnr = tn / (tn + fp) if (tn + fp) else float('nan')

    return cm, tpr, tnr

In [588]:
# Load the full table plus its train / validate / test splits for one period.
data_dir = '../../data'
pxm = 'p6m'

df_reg = pd.read_csv(os.path.join(data_dir, 'model/{}/full.csv'.format(pxm)))
train = pd.read_csv(os.path.join(data_dir, 'model/{}/train.csv'.format(pxm)))
val = pd.read_csv(os.path.join(data_dir, 'model/{}/validate.csv'.format(pxm)))
test = pd.read_csv(os.path.join(data_dir, 'model/{}/test.csv'.format(pxm)))

# The validation rows are folded into the held-out set.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; like append, it keeps each frame's original index.
test = pd.concat([test, val])

# Map every id to the position of its first row in df_reg in a single pass,
# instead of rescanning the whole column once per id.
first_pos_by_id = {}
for pos, cid in enumerate(df_reg['id']):
    first_pos_by_id.setdefault(cid, pos)

# Positional row indices into df_reg for each split. An id that is missing
# from df_reg raises KeyError here.
train_idxs = [first_pos_by_id[cid] for cid in train['id']]
test_idxs = [first_pos_by_id[cid] for cid in test['id']]

In [590]:
# Read every file in the feature directory into a DataFrame, keyed by the
# part of the file name that precedes its first dot.
dir_feat = 'data/features'
feats = {
    fname.split('.')[0]: pd.read_csv(os.path.join(dir_feat, fname))
    for fname in os.listdir(dir_feat)
}

In [544]:
# Fit one classifier per feature set and keep, for each, the second
# predict_proba column on the held-out rows.
probs = {}

# The target column is the same for every feature set.
y = df_reg['is_affluent_cur']

for model, feat_frame in feats.items():
    # Everything except the 'y' and 'id' columns is used as a predictor.
    # NOTE(review): assumes each feature file is row-aligned with df_reg,
    # since the positional indices were computed against df_reg; confirm.
    X = np.array(feat_frame.drop(columns=['y', 'id']))
    X_train, y_train = split_by_idxs(X, y, train_idxs)
    X_test, y_test = split_by_idxs(X, y, test_idxs)

    svm = get_svm_classifier()
    svm.fit(X_train, y_train)

    # Column 1 is presumably the positive-class probability; this depends on
    # what get_svm_classifier returns.
    probs[model] = svm.predict_proba(X_test)[:, 1]

In [717]:
# One column per fitted model, one row per held-out sample.
df_preds = pd.DataFrame(probs)

# Ensemble score = average across however many models were fitted.
# This equals the earlier hard-coded `sum / 6` when there are six feature
# sets, and stays a true average if that number changes.
preds_en = df_preds.mean(axis=1)

In [728]:
# Score the ensemble at a fixed decision threshold.
# NOTE(review): the notebook does not show how 0.0254 was chosen.
threshold = 0.0254
cm, tpr, tnr = get_confusion_matrix(preds_en, y_test, threshold)

# Second row of the table is "Predicted T": [false positives, true positives].
fp, tp = cm.values[1]
precision = tp / (tp + fp)

auc = roc_auc_score(y_test, preds_en)

In [729]:
# Print the headline metrics, then show the confusion matrix as the cell output.
report = """
    AUC       : {}
    Precision : {}
""".format(auc, precision)
print(report)
cm


    AUC       : 0.7223077378485585
    Precision : 0.10526315789473684



Unnamed: 0_level_0,Unnamed: 1_level_0,True,True
Unnamed: 0_level_1,Unnamed: 1_level_1,F,T
Predicted,F,1710,30
Predicted,T,68,8
