In [84]:
import json
import yaml
import importlib
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score, average_precision_score, roc_auc_score
from sklearn.pipeline import Pipeline
from scipy.sparse import load_npz

In [3]:
class DummyEstimator(BaseEstimator):
    def fit(self): pass
    def score(self): pass

In [69]:
METRICS = {
    'balanced_accuracy':balanced_accuracy_score,
    'precision':precision_score,
    'recall':recall_score,
    'f1':f1_score,
    'average_precision':average_precision_score,
    'roc_auc':roc_auc_score
}

def generate_sample(df,n,balance=True):
    if balance and n:
        pos = df.loc[df['aft_net_sign_helpful'] > 0].sample(int(n/2))
        neg = df.loc[df['aft_net_sign_helpful'] < 0].sample(int(n/2))
        sample = pos.append(neg)
    elif n:
        sample = df.loc[df['aft_net_sign_helpful'] != 0].sample(n)
    else:
        sample = df.loc[df['aft_net_sign_helpful'] != 0]
    return sample

def proba_to_preds(probability_list,threshold=.5):
    preds = []
    for proba in probability_list:
        if proba[1] > threshold:
            preds.append(1)
        else:
            preds.append(-1)
    return preds

def results_to_table(proba,truth):
    result_dict = results_to_json(proba,truth)
    return pd.DataFrame([result_dict])
    #return results

def results_to_json(proba,truth):
    preds = proba_to_preds(proba)
    results_dict = {}
    for metric in METRICS:
        if metric == 'roc_auc':
            results_dict[metric] = METRICS[metric](truth,proba[:, 1])
        else:
            results_dict[metric] = METRICS[metric](truth,preds)
    return results_dict

In [5]:
#sparse_matrix_path = '/Users/klogg/dev/aft-classification/datasets/vectorized/vectorized_bow_100k_rating_2021-03-30.npz'
#feature_path = '/Users/klogg/dev/aft-classification/datasets/vectorized/vectorized_bow_100k_rating_2021-03-30.json'

feature_path_w2v = '/Users/klogg/dev/aft-classification/datasets/vectorized/vectorized_w2v_rating_2021-03-31.json'

def train_and_test_split(feature_path,sparse_matrix_path=None):
    with open(feature_path,'r') as filestream:
        df = pd.DataFrame(json.load(filestream))

    df = generate_sample(df, None)
    df = df.reset_index()
    
    if sparse_matrix_path:
        with open(sparse_matrix_path,'rb') as filestream:
            features = load_npz(filestream)
            
    else:
        features = pd.DataFrame(df['feature_vector'].values.tolist()).to_numpy()

    labels = df['aft_net_sign_helpful'].to_numpy()
    indicies = np.arange(len(labels))

    features_train, features_test, labels_train, labels_test, i_train, i_test = train_test_split(
        features,
        labels,
        indicies,
        test_size = .2,
        stratify = labels,
        shuffle = True,
        random_state = 1
    )
    
    return features_train, features_test, labels_train, labels_test, i_train, i_test

features_train, features_test, labels_train, labels_test, i_train, i_test = train_and_test_split(feature_path_w2v)

In [6]:
RANDOM_STATE = 0

def train_and_validate(cls, features_train,labels_train,features_test,labels_test):
    cls.fit(
        features_train,
        labels_train,
    )

    preds = cls.predict(features_test)
    results = results_to_table(preds,labels_test)
    return results, preds, cls

def get_results_sample(df,preds,i_test,labels_test,n):
    result_df = pd.DataFrame({
        'i':i_test,
        'preds':preds,
        'labels':labels_test
    })

    df = df.merge(result_df,
                 how='left',
                 left_index=True,
                 right_on='i')

    false_pos = result_df.loc[(result_df['preds'] == 1) & (result_df['labels'] == -1)]['i'].values
    false_neg = result_df.loc[(result_df['preds'] == -1) & (result_df['labels'] == 1)]['i'].values
    true_pos = result_df.loc[(result_df['preds'] == 1) & (result_df['labels'] == 1)]['i'].values
    true_neg = result_df.loc[(result_df['preds'] == -1) & (result_df['labels'] == -1)]['i'].values
    
    sample_df = df.iloc[false_pos].sample(n).append(
        df.iloc[false_neg].sample(n)).append(
        df.iloc[true_pos].sample(n)).append(
        df.iloc[true_neg].sample(n))

    sample_df = sample_df.sample(frac=1)
    return sample_df


In [110]:
CONFIG_FILEPATH = '../model_config/classifiers.test_params.yaml'

def class_for_name(class_path):
    c = getattr(importlib.import_module(class_path.rsplit('.',1)[0]), class_path.rsplit('.',1)[1])()
    return c

def format_model_config(yaml):
    param_grid = []
    for model in yaml:
        clf = class_for_name(yaml[model]['class'])
        params = {}
        for param in yaml[model]['params']:
            key = 'clf__{0}'.format(param)
            params[key] = yaml[model]['params'][param]
        params['clf'] = [clf]
        param_grid.append(params)
    
    return param_grid

with open(CONFIG_FILEPATH) as filestream:
    model_config = yaml.load(filestream)

param_grid = format_model_config(model_config)
scoring = ('roc_auc','f1','accuracy','recall','precision')
pipe = Pipeline([('clf', DummyEstimator())])
    
gs = GridSearchCV(pipe, param_grid, scoring=scoring, n_jobs=4, pre_dispatch=8, refit='roc_auc')
gs.fit(features_train, labels_train)
cv_results = pd.DataFrame(gs.cv_results_)

KeyboardInterrupt: 

In [70]:
proba = gs.predict_proba(features_test)
results = results_to_table(proba,labels_test)

In [71]:
results

Unnamed: 0,balanced_accuracy,precision,recall,f1,average_precision,roc_auc
0,0.624107,0.593807,0.50474,0.545663,0.510832,0.689074


In [53]:
N = 50

classifier_list = [
    LogisticRegression(
        random_state=RANDOM_STATE,
        max_iter=1000),
    GradientBoostingClassifier(
        random_state=RANDOM_STATE,
        n_estimators=320,
        max_features='log2'),
    RandomForestClassifier(
        random_state=RANDOM_STATE,
        n_estimators=100,
        max_features='log2'),
    SVC(
        random_state=RANDOM_STATE
    )
]

sample_list = []
results_df = pd.DataFrame()

for cls in classifier_list:
    results, preds, cls = train_and_validate(
        cls,
        features_train,
        labels_train,
        features_test,
        labels_test
    )
    sample = get_results_sample(
        df,
        preds,
        i_test,
        labels_test,
        n=N
    )
    results['classifier'] = str(cls.__class__.__name__)
    results_df = results_df.append(results)
    sample_list.append(sample)

In [54]:
results_df

Unnamed: 0,balanced_accuracy,precision,recall,f1,average_precision,roc_auc,classifier
0,0.624093,0.593832,0.504628,0.545608,0.510826,0.624093,LogisticRegression
0,0.642559,0.608393,0.546448,0.575759,0.525791,0.642559,GradientBoostingClassifier
0,0.6257,0.600647,0.496822,0.543823,0.512904,0.6257,RandomForestClassifier
0,0.650304,0.61137,0.569644,0.58977,0.531711,0.650304,SVC
