# 4 Predicting Small Trigger Classes

## Import Packages

In [1]:
import pandas as pd
import numpy as np
np.random.seed(99)
RANDOM_STATE = 99
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from spacy.lang.en.stop_words import STOP_WORDS
import time

## Define Model Functions

In [2]:
def get_reshaped_lemmatized():
    # Import the CSV file
    df = pd.read_csv('../data/reshaped_lemmatized.csv').drop(columns = ('Unnamed: 0'), axis = 1)
    return df

In [3]:
def downsampling_data_set(df, threshold):
    #Total sum per row: 
    downsampling_set = df
    downsampling_set.loc[:,'Total'] = downsampling_set.sum(axis=1)

    # select only Sentences with 1 or 2 triggers
    downsampling_set = downsampling_set[downsampling_set['Total'].isin([1,2])]

    
    # isolate the trigger columns to sample from
    n = threshold
    trigger_cols = downsampling_set.drop(['Document', 'Sentence', 'SentenceLemmas', 'SentenceTokens'], axis=1).sum(axis=0)
    trigger_cols = trigger_cols.where(lambda x: x > n).dropna()
    trigger_cols = [t for t in list(trigger_cols.index) if t not in ['Total', 'nontrigger', 'unspecified']]
    nontrigger_cols = ['nontrigger']
    
    # randomly sample n rows from the selected trigger columns without replacement - samples is the training set
    init = True

    for col in trigger_cols:
        temp_col = downsampling_set[downsampling_set[col] == 1]
        sampled_col = resample(temp_col, replace = False, n_samples = n, random_state = RANDOM_STATE)
        if init:
            samples = sampled_col
            init = False
        else:
            samples = pd.concat([samples,sampled_col])
            
    n_unspecified = samples.shape[0] # Prepare to randomaly collect nontrigger data of an equivalent size
    for col in nontrigger_cols:
        temp_col = downsampling_set[downsampling_set[col] == 1]
        nontrigger_sampled_col = resample(temp_col, replace = False, n_samples = n_unspecified, random_state = RANDOM_STATE)
        samples = pd.concat([samples, nontrigger_sampled_col])
        
    # remove these rows from the main data set - select index and remove by index
    rmv_index = list(samples.index)
    filtered = df.drop(rmv_index, axis='index') # This will become our Test Set
    
    # make 'is trigger' column
    samples['istrigger'] = np.where(samples['nontrigger'] > 0, 0, 1)
    filtered['istrigger'] = np.where(filtered['nontrigger'] > 0, 0, 1)
    
    # Check which trigger types were included in the training set
#    in_train_set = (downsampling_set.drop(['Document', 'Sentence', 'Total', 'unspecified'], axis=1).sum(axis=0) > n).to_frame()
    
    return samples, filtered # Samples will be the Training Set, Filtered will be the test set

In [4]:
# Incorporate Stopwords
def get_stopwords():
    # might need space
    short_stopwords = ['the', 'to', 'of', 'be', 'and', 'in', 'a', 'marriott']
    short_stopwords2 = ['the', 'and', 'a', 'to', 'it', 'be', 'for', 'with', 'that', 'marriott']
    stopwords = list(STOP_WORDS) + ['marriott'];

    return short_stopwords, short_stopwords2, stopwords

In [5]:
# function to split data for each target column (trigger type)
def run_model(df, threshold):
    downsampling_data_set(df, threshold)
    short_stopwords, short_stopwords2, stopwords = get_stopwords()
    
    X_train = samples['SentenceLemmas']
    y_train = samples['istrigger']
    X_test = filtered['SentenceLemmas']
    y_test = filtered['istrigger']

    y_train = y_train.astype('int')
    y_test = y_test.astype('int')

    
    train_index = samples.index
    test_index = filtered.index
    
    pipe_cvec = Pipeline([('cvec', CountVectorizer()), ('lr', LogisticRegression(solver = 'liblinear', random_state = RANDOM_STATE))]) 
    cvec_params = {
        'cvec__ngram_range': [(1,2), (1,3), (1,4), (1,5)],
        'cvec__stop_words': [short_stopwords, short_stopwords2, stopwords],  
        'cvec__max_features': [100, 200, 400, 600, 1000],
        'cvec__min_df': [2],
        'cvec__max_df': [.99],
        }

    gs_cvec = GridSearchCV(pipe_cvec, param_grid = cvec_params, cv = 3, scoring = 'roc_auc')

    # Start the timer.
    t0 = time.time()

    results_cvec = gs_cvec.fit(X_train, y_train)
    return results_cvec, X_train, y_train, X_test, y_test, train_index, test_index

    print(f'Seconds elapsed for fitting: {(time.time() - t0):.3f}')
    print(f'Training score is {results_cvec.score(X_train, y_train):.3f}')
    print(f'Test score is {results_cvec.score(X_test, y_test):.3f}')
    
    # How many seconds elapsed.

In [6]:
def misclassification(results_cvec, X_train, y_train, X_test, y_test, train_index, test_index, filtered):
    best_model = results_cvec.best_estimator_
    preds = best_model.predict(X_test)
    pred_proba = [i[1] for i in results_cvec.predict_proba(X_test)]
    pred_df = pd.DataFrame({'true_values': y_test,
                        'pred_probs':pred_proba})
    result_cols = ['index', 'prediction', 'actual', 'model_input']
    results = pd.DataFrame({'index': list(test_index),'prediction': list(preds), 'actual': list(y_test), 'model_input': list(X_test)})
    results.set_index('index', inplace = True)
    misclassified = results[results['prediction'] != results['actual']]
    misclassified = misclassified.merge(df, how = 'left', left_index = True, right_index = True)
    misclassified = misclassified[['prediction', 'actual', 'model_input', 'Document', 'Sentence',
       'loan_default', 'aggregate_dscr_fall', 'dscr_fall', 'unspecified',
       'debt_yield_fall', 'aggregate_debt_yield_fall', 'mezzanine_default',
       'tenant_failure', 'mezzanine_outstanding', 'operator_termination', 'bankruptcy', 'sponsor_termination', 'renovations', 'nontrigger', 'sff', 'delayed_repayment']]
    full_test_set = filtered.drop(['Document', 'Sentence', 'Total', 'istrigger', 'SentenceTokens', 'SentenceLemmas'], axis = 1).sum(axis = 0).to_frame()
    misclassified_test_set = misclassified.drop(['prediction', 'actual', 'Document', 'Sentence', 'model_input'], axis=1).sum(axis=0).to_frame()
    misclassified_results = full_test_set.merge(misclassified_test_set, left_index = True, right_index = True)
    misclassified_results.rename(columns = {'0_x': 'full_test_set', '0_y': 'num_misclassified'}, inplace = True)
    misclassified_results['percent_misclassified'] = 100 * misclassified_results['num_misclassified'] / misclassified_results['full_test_set']
    misclassified_results['percent_misclassified'] = misclassified_results['percent_misclassified'].round(1)
    misclassified_results = misclassified_results.merge(in_train_set, left_index = True, right_index = True)
    misclassified_results.rename(columns = {0: 'in_train_set'}, inplace = True)
    misclassified_results['in_train_set'] = misclassified_results['in_train_set'].map({True: 'yes', False: 'no'})
    return misclassified_results

## Perform modeling steps

In [7]:
df = get_reshaped_lemmatized()
samples, filtered = downsampling_data_set(df, 10)
results_cvec, X_train, y_train, X_test, y_test, train_index, test_index = run_model(df, 10)
misclassified_results = misclassification(results_cvec, X_train, y_train, X_test, y_test, train_index, test_index, filtered)

NameError: name 'in_train_set' is not defined