In [76]:
import json
import yaml
import importlib
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from scipy.sparse import load_npz

from functools import partial

In [12]:
class DummyEstimator(BaseEstimator):
    def fit(self): pass
    def score(self): pass

In [78]:
METRICS = {
    'accuracy':accuracy_score,
    'precision':precision_score,
    'specificity':partial(precision_score,pos_label = -1),
    'recall':recall_score,
    'f1':f1_score,
    'average_precision':average_precision_score,
    'roc_auc':roc_auc_score
}

def generate_sample(df,n,balance=True):
    if balance and n:
        pos = df.loc[df['aft_net_sign_helpful'] > 0].sample(int(n/2))
        neg = df.loc[df['aft_net_sign_helpful'] < 0].sample(int(n/2))
        sample = pos.append(neg)
    elif n:
        sample = df.loc[df['aft_net_sign_helpful'] != 0].sample(n)
    else:
        sample = df.loc[df['aft_net_sign_helpful'] != 0]
    return sample

def proba_to_preds(probability_list,threshold=.5):
    preds = []
    for proba in probability_list:
        if proba[1] > threshold:
            preds.append(1)
        elif proba[0] > threshold:
            preds.append(-1)
        else:
            preds.append(0)
    return preds

def results_to_table(proba,truth):
    result_dict = results_to_json(proba,truth)
    return pd.DataFrame([result_dict])
    #return results

def results_to_json(proba,truth):
    preds = proba_to_preds(proba)
    results_dict = {}
    for metric in METRICS:
        if metric == 'roc_auc':
            results_dict[metric] = METRICS[metric](truth,proba[:, 1])
        else:
            results_dict[metric] = METRICS[metric](truth,preds)
    return results_dict

In [14]:
#sparse_matrix_path = '/Users/klogg/dev/aft-classification/datasets/vectorized/vectorized_bow_100k_rating_2021-03-30.npz'
#feature_path = '/Users/klogg/dev/aft-classification/datasets/vectorized/vectorized_bow_100k_rating_2021-03-30.json'

feature_path_w2v = '../datasets/vectorized/vectorized_w2v_rating_2021-03-31.json'

def train_and_test_split(feature_path,sparse_matrix_path=None):
    with open(feature_path,'r') as filestream:
        df = pd.DataFrame(json.load(filestream))

    df = generate_sample(df, None)
    df = df.reset_index()
    
    if sparse_matrix_path:
        with open(sparse_matrix_path,'rb') as filestream:
            features = load_npz(filestream)
            
    else:
        features = pd.DataFrame(df['feature_vector'].values.tolist()).to_numpy()

    labels = df['aft_net_sign_helpful'].to_numpy()
    indicies = np.arange(len(labels))

    features_train, features_test, labels_train, labels_test, i_train, i_test = train_test_split(
        features,
        labels,
        indicies,
        test_size = .2,
        stratify = labels,
        shuffle = True,
        random_state = 1
    )
    
    return features_train, features_test, labels_train, labels_test, i_train, i_test

features_train, features_test, labels_train, labels_test, i_train, i_test = train_and_test_split(feature_path_w2v)

In [15]:
RANDOM_STATE = 0

def train_and_validate(cls, features_train,labels_train,features_test,labels_test):
    cls.fit(
        features_train,
        labels_train,
    )

    preds = cls.predict(features_test)
    results = results_to_table(preds,labels_test)
    return results, preds, cls

def get_results_sample(df,preds,i_test,labels_test,n):
    result_df = pd.DataFrame({
        'i':i_test,
        'preds':preds,
        'labels':labels_test
    })

    df = df.merge(result_df,
                 how='left',
                 left_index=True,
                 right_on='i')

    false_pos = result_df.loc[(result_df['preds'] == 1) & (result_df['labels'] == -1)]['i'].values
    false_neg = result_df.loc[(result_df['preds'] == -1) & (result_df['labels'] == 1)]['i'].values
    true_pos = result_df.loc[(result_df['preds'] == 1) & (result_df['labels'] == 1)]['i'].values
    true_neg = result_df.loc[(result_df['preds'] == -1) & (result_df['labels'] == -1)]['i'].values
    
    sample_df = df.iloc[false_pos].sample(n).append(
        df.iloc[false_neg].sample(n)).append(
        df.iloc[true_pos].sample(n)).append(
        df.iloc[true_neg].sample(n))

    sample_df = sample_df.sample(frac=1)
    return sample_df


In [90]:
CONFIG_FILEPATH = '../model_config/classifiers.test_params_small.yaml'

def class_for_name(class_path):
    c = getattr(importlib.import_module(class_path.rsplit('.',1)[0]), class_path.rsplit('.',1)[1])()
    return c

def format_model_config(yaml):
    param_grid = []
    for model in yaml:
        clf = class_for_name(yaml[model]['class'])
        params = {}
        for param in yaml[model]['params']:
            key = 'clf__{0}'.format(param)
            params[key] = yaml[model]['params'][param]
        params['clf'] = [clf]
        param_grid.append(params)
    
    return param_grid

with open(CONFIG_FILEPATH) as filestream:
    model_config = yaml.load(filestream)

param_grid = format_model_config(model_config)
scoring = ('roc_auc','f1','accuracy','recall','precision')
pipe = Pipeline([('clf', DummyEstimator())])

print(param_grid)
best_models = {}

for model in param_grid:
    gs = GridSearchCV(pipe, [model], scoring=scoring, n_jobs=32, pre_dispatch=64, refit='roc_auc')
    gs.fit(features_train, labels_train)
    best_models[str(model['clf'][0].__class__.__name__)] = gs
    #cv_results = pd.DataFrame(gs.cv_results_)

[{'clf__penalty': ['l2'], 'clf__C': [0.1, 1], 'clf': [LogisticRegression()]}, {'clf': [BernoulliNB()]}]


In [92]:
best_models

{'LogisticRegression': GridSearchCV(estimator=Pipeline(steps=[('clf', DummyEstimator())]), n_jobs=32,
              param_grid=[{'clf': [LogisticRegression(C=0.1)],
                           'clf__C': [0.1, 1], 'clf__penalty': ['l2']}],
              pre_dispatch=64, refit='roc_auc',
              scoring=('roc_auc', 'f1', 'accuracy', 'recall', 'precision')),
 'BernoulliNB': GridSearchCV(estimator=Pipeline(steps=[('clf', DummyEstimator())]), n_jobs=32,
              param_grid=[{'clf': [BernoulliNB()]}], pre_dispatch=64,
              refit='roc_auc',
              scoring=('roc_auc', 'f1', 'accuracy', 'recall', 'precision'))}

In [81]:
def remove_uncertain_predictions(proba, labels, threshold=.5):
    df = pd.DataFrame({
        'proba_neg':proba.T[0],
        'proba_pos':proba.T[1],
        'labels':labels
    })
    #print(df)
    df = df.loc[(df['proba_pos'] > threshold) | ((df['proba_neg'] > threshold))]
    return np.vstack([df['proba_neg'].values,df['proba_pos'].values]).T, df['labels'].values, df

proba = gs.predict_proba(features_test)
t_proba, t_labels, _ = remove_uncertain_predictions(proba,labels_test,.5)
print(len(t_proba))
print(results_to_table(t_proba,t_labels))
t_proba, t_labels, df = remove_uncertain_predictions(proba,labels_test,.8)
print(len(t_proba))
print(results_to_table(t_proba,t_labels))
print(len(labels_test))
print(results_to_table(proba,labels_test))

21036
   accuracy  precision  specificity    recall        f1  average_precision  \
0  0.642755   0.595956     0.669042  0.502844  0.545455           0.511595   

   roc_auc  
0  0.68894  
2663
   accuracy  precision  specificity    recall        f1  average_precision  \
0  0.855051   0.622642      0.85977  0.082707  0.146018           0.188936   

    roc_auc  
0  0.576421  
21036
   accuracy  precision  specificity    recall        f1  average_precision  \
0  0.642755   0.595956     0.669042  0.502844  0.545455           0.511595   

   roc_auc  
0  0.68894  


In [75]:
print(len(df.loc[df['labels'] == -1]))
print(len(df.loc[df['proba_neg'].subtract(df['proba_pos']) > 0]))
print(len(df.loc[df['labels'] == 1]))
print(len(df.loc[df['proba_neg'].subtract(df['proba_pos']) < 0]))

2264
2610
399
53


In [60]:
t_labels

array([-1, -1, -1, ..., -1, -1, -1])

In [19]:
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__C,param_clf__penalty,params,split0_test_roc_auc,split1_test_roc_auc,...,std_test_recall,rank_test_recall,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,split4_test_precision,mean_test_precision,std_test_precision,rank_test_precision
0,0.844636,0.002922,0.032518,0.000766,LogisticRegression(C=0.1),0.1,l2,"{'clf': LogisticRegression(C=0.1), 'clf__C': 0...",0.681778,0.685509,...,0.008898,2,0.584574,0.587385,0.600471,0.587094,0.592099,0.590324,0.005626,1
1,0.937751,0.062996,0.029165,0.002166,LogisticRegression(C=0.1),1.0,l2,"{'clf': LogisticRegression(C=0.1), 'clf__C': 1...",0.681722,0.685535,...,0.008658,1,0.582915,0.585505,0.6,0.586859,0.590598,0.589175,0.005954,2


In [53]:
N = 50

classifier_list = [
    LogisticRegression(
        random_state=RANDOM_STATE,
        max_iter=1000),
    GradientBoostingClassifier(
        random_state=RANDOM_STATE,
        n_estimators=320,
        max_features='log2'),
    RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=100,
        max_features='log2'),
    SVC(
        random_state=RANDOM_STATE
    )
]

sample_list = []
results_df = pd.DataFrame()

for cls in classifier_list:
    results, preds, cls = train_and_validate(
        cls,
        features_train,
        labels_train,
        features_test,
        labels_test
    )
    sample = get_results_sample(
        df,
        preds,
        i_test,
        labels_test,
        n=N
    )
    results['classifier'] = str(cls.__class__.__name__)
    results_df = results_df.append(results)
    sample_list.append(sample)

In [54]:
results_df

Unnamed: 0,balanced_accuracy,precision,recall,f1,average_precision,roc_auc,classifier
0,0.624093,0.593832,0.504628,0.545608,0.510826,0.624093,LogisticRegression
0,0.642559,0.608393,0.546448,0.575759,0.525791,0.642559,GradientBoostingClassifier
0,0.6257,0.600647,0.496822,0.543823,0.512904,0.6257,RandomForestClassifier
0,0.650304,0.61137,0.569644,0.58977,0.531711,0.650304,SVC
