# Notebook 2: Modeling of full dataset

## Overview

This notebook takes in the reshaped data, with lemmatized sentences from the prior 'Data Cleaning and NLP Preprocessing' notebook & fully models the dataset, returning a dataframe of all modeling inputs, hyperparameters and results.

Steps performed by the included functions:

- import the dataset that has been preprocessed with Spacy and reshaped to have each document-sentence represented only once 
- list trigger types contained in the current dataset
- train-test-split the preprocessed dataset & return a dictionary of processed and split data that is ready for modeling
- create stopword lists
- run modeling via logistic or random forest regression methods, per a user-defined dictionary (with defaults set as logistic regression) - modeling is performed via gridsearch, and a dataframe is returned, including all modeling inputs, hyperparameters and results.

## Import packages

In [1]:
import pandas as pd
import numpy as np
np.random.seed(99)
RANDOM_STATE = 99

from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

import time
import pickle

## Defining Modeling Functions

In [2]:
def get_reshaped_lemmatized():
    # Import the CSV file containing the reshaped data set
    df = pd.read_csv('../data/reshaped_lemmatized.csv')
    return df

In [3]:
def list_triggers(df):
    '''This function lists the trigger types that exist in the current data set'''
    triggers = list(df.columns.drop(['Document', 'Sentence', 'SentenceTokens', 'SentenceLemmas', 'Unnamed: 0']))
    return triggers

In [4]:
# function to split data for each target column (trigger type)
def get_split_data(df, target_info):
    # extract target name
    target = target_info['target']
    model_input  = target_info['model_input']
    
    # create X, Y
    X = df[model_input]
    y = df[target]
    indices = df.index

    print(f"Number of distinct labeled {target} document/sentence combinations within the full data set: {y.value_counts()[1]}")   
    
    y = y.astype('int')

    # run test, train split
    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, indices, test_size = 0.3, stratify = y, random_state = RANDOM_STATE)

    
    # create output dictionary
    split_data = {}
    split_data['X_train'] = X_train
    split_data['X_test'] = X_test
    split_data['y_train'] = y_train
    split_data['y_test'] = y_test
    split_data['indices_train'] = indices_train
    split_data['indices_test'] = indices_test
    
    # Return the train-test-split data in a dictionary form
    return split_data

In [5]:
# Incorporate Stopwords
def get_stopwords():
    '''This function includes the creation/usage of various Stopword lists, which can be modified as needed.'''
    short_stopwords = ['the', 'and', 'a', 'to', 'it', 'in', 'be', 'for', 'with', 'that', 'marriott', 'facebook', 'an', 'if', 'have', 'than', 'of', 'at', 'by', 'as', 'that', 'will', 'or', 'on', 'ii', 'iii', 'iv', 'v', 'vi']
    stopwords = list(STOP_WORDS) + ['marriott', 'facebook','ii', 'iii', 'iv', 'v', 'vi']
 
    return short_stopwords, stopwords

In [6]:
# run the model with the chosen model and metric
def run_model(split_data, target_info):
    '''This function takes in an entry from the user-defined target dictionary and 
    carries out the modeling for that Trigger type accordingly, with a GridSearch set of 
    hyperparameters defined below'''
    # get stopwords
    short_stopwords, stopwords = get_stopwords()
    
    # set pipeline according to the selected model:
    if target_info['model'] == 'lr':
        # Define CVEC + Logistic Regression Pipeline
        pipe = Pipeline([('cvec', CountVectorizer()), ('lr', LogisticRegression(solver = 'liblinear', random_state = RANDOM_STATE))])
        params = {
            'cvec__ngram_range': [(1,2), (1,3), (1,4), (1,5), (1,6), (1,7)],
            'cvec__stop_words': [short_stopwords, stopwords],  
            'cvec__max_features': [100, 200, 400, 600, 1000],
            'cvec__min_df': [2],
            'cvec__max_df': [.99],
            }

    elif target_info['model'] == 'rf':
        # Define CVEC + Logistic Regression Pipeline
        pipe = Pipeline([('cvec', CountVectorizer()), ('rf', RandomForestClassifier(random_state = RANDOM_STATE, n_jobs = 2))])
        params = {
            'cvec__ngram_range': [(1,2), (1,3), (1,4), (1,5)],
            'cvec__stop_words': [short_stopwords, stopwords],  
            'cvec__max_features': [100, 200, 400, 800],
            'cvec__min_df': [2],
            'cvec__max_df': [.99],
            'rf__max_depth': [4,5, 6],
            'rf__min_samples_split': [2,3],
            'rf__min_samples_leaf': [10, 12]
            }

    else:
        print('Error: did not specify model')

    # define pipeline
    gs_model = GridSearchCV(pipe, param_grid = params, cv = 3, scoring = target_info['metric'])

    # Start the timer.
    t0 = time.time()

    # extract X_train and y_train
    X_train = split_data['X_train']
    y_train = split_data['y_train']
    
    # run pipeline
    model_result = gs_model.fit(X_train, y_train)

    print(f"Seconds elapsed for fitting: {(time.time() - t0):.3f}") # How many seconds elapsed.   
    return model_result

## Perform all modeling steps for all targets: user inputs modeling parameters

In [7]:
# Review list of Trigger types in the current data set. 
# As needed, Additions can be made to the target_dict below
df = get_reshaped_lemmatized()
list_triggers(df)

['loan_default',
 'aggregate_dscr_fall',
 'dscr_fall',
 'unspecified',
 'debt_yield_fall',
 'aggregate_debt_yield_fall',
 'mezzanine_default',
 'tenant_failure',
 'mezzanine_outstanding',
 'operator_termination',
 'bankruptcy',
 'sponsor_termination',
 'renovations',
 'nontrigger',
 'sff',
 'delayed_repayment']

__Note__: In advance, users may update the modeling details/parameters within the the target_dict below. Defaults are set for logistic regression modeling.

Options:
- __'model'__:  'lr' or 'rf' (logistic regression or random forest)
- __'metric'__:  any of the standard sklearn classification metric options, such as 'roc_auc', 'accuracy', 'f1' (https://scikit-learn.org/stable/modules/model_evaluation.html)
- __'model_input"__: 'Sentence' (original text), 'SentenceTokens' (tokenized but not lemmatized), 'SentenceLemmas' (lemmatized)

In [8]:
# Control of the main project

# define dictionary of targets contains: tag, model, metric, input
target_dict = {}
target_dict['nontrigger'] = {'target': 'nontrigger', 'model': 'lr', 'metric': 'roc_auc', 'model_input': 'SentenceLemmas'}
target_dict['loan_default'] = {'target': 'loan_default', 'model': 'lr', 'metric':'roc_auc', 'model_input':'SentenceLemmas'}
target_dict['unspecified'] = {'target': 'unspecified', 'model': 'lr', 'metric':'roc_auc', 'model_input':'SentenceLemmas'}
target_dict['debt_yield_fall'] = {'target': 'debt_yield_fall', 'model': 'lr', 'metric': 'roc_auc', 'model_input': 'SentenceLemmas'}
target_dict['mezzanine_default'] = {'target': 'mezzanine_default', 'model': 'lr', 'metric': 'roc_auc', 'model_input': 'SentenceLemmas'}
target_dict['bankruptcy'] = {'target': 'bankruptcy', 'model': 'lr', 'metric': 'roc_auc', 'model_input': 'SentenceLemmas'}
target_dict['tenant_failure'] = {'target': 'tenant_failure', 'model': 'lr', 'metric': 'roc_auc', 'model_input': 'SentenceLemmas'}
target_dict['renovations'] = {'target': 'renovations', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}
target_dict['aggregate_debt_yield_fall'] = {'target': 'aggregate_debt_yield_fall', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}
target_dict['dscr_fall'] = {'target': 'dscr_fall', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}
target_dict['operator_termination'] = {'target': 'operator_termination', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}
target_dict['sponsor_termination'] = {'target': 'sponsor_termination', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}
target_dict['sff'] = {'target': 'sff', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}
target_dict['mezzanine_outstanding'] = {'target': 'mezzanine_outstanding', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}
target_dict['aggregate_dscr_fall'] = {'target': 'aggregate_dscr_fall', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}
target_dict['delayed_repayment'] = {'target': 'delayed_repayment', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}


# set output_dict - will contain target + output of calculations
output_dict = {}

# get data
df = get_reshaped_lemmatized()

# run for each model definition
for k,v in target_dict.items():
    
    print(f"{target_dict[k]['target'].replace('_', ' ').title()}: creating Train-Test split")
    # get split data
    split_data = get_split_data(df, v)
    
    print(f"Model fit in progress: {target_dict[k]}")
    # run model
    model_result = run_model(split_data, v)
    
    # make the output dictionary
    output_dict[k] = v
    output_dict[k]['split_data'] = split_data
    output_dict[k]['model_result'] = model_result

    
    print(f"Best fit parameters: {model_result.best_params_}")
    print(f"Best fit 3-fold cross validation score: {model_result.best_score_:.3f}")
    print(f"{target_dict[k]['target'].replace('_', ' ').title()} {target_dict[k]['metric']} Train score: {model_result.score(split_data['X_train'], split_data['y_train']):.3f}")
    print(f"{target_dict[k]['target'].replace('_', ' ').title()} {target_dict[k]['metric']} Test score: {model_result.score(split_data['X_test'], split_data['y_test']):.3f}")
    print("\n")
    
    output_dict[k]['best_params'] = model_result.best_params_
    output_dict[k]['count_deduplicated'] = df[target_dict[k]['target']].value_counts()[1]
    output_dict[k]['test_data_count'] = output_dict[k]['split_data']['y_test'].value_counts()[1]
    output_dict[k]['best_crossval_score'] = model_result.best_score_
    output_dict[k]['train_score'] = model_result.score(split_data['X_train'], split_data['y_train'])
    output_dict[k]['test_score'] = model_result.score(split_data['X_test'], split_data['y_test'])                                                   
    
    '''
    outfile = open(f"../data/models/{target_dict[k]['target']}_pickle", 'wb')
    pickle.dump(model_result, outfile)
    outfile.close()
    '''
    

full_output_dict = [output_dict[key] for key in output_dict.keys()]
results_df = pd.DataFrame.from_dict(full_output_dict)

# Export summary table
results_df.to_csv(f'../data/exported_data/results_df_fulldata.csv')

print("Modeling complete!")


Nontrigger: creating Train-Test split
Number of distinct labeled nontrigger document/sentence combinations within the full data set: 1030
Model fit in progress: {'target': 'nontrigger', 'model': 'lr', 'metric': 'roc_auc', 'model_input': 'SentenceLemmas'}
Seconds elapsed for fitting: 64.337
Best fit parameters: {'cvec__max_df': 0.99, 'cvec__max_features': 1000, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 3), 'cvec__stop_words': ['be', 'someone', 'unless', 'during', 'everyone', 'which', 'those', 'rather', 'give', 'since', 'therefore', 'may', 'anyone', 'if', 'nine', 'a', 'own', 'there', 'now', 'twelve', 'as', 'eleven', 'thereby', 'off', 'everywhere', 'upon', 'thus', 'anyhow', 'wherever', 'under', 'whenever', 'fifteen', 'myself', 'seems', 'thence', 'thereafter', 'whereby', 'has', 'my', 'noone', 'among', 'him', 'can', 'no', 'nor', 'than', 'all', 'sometime', 'former', 'to', 'herein', 'became', 'into', 'part', 'you', 'herself', 'the', 'while', 'i', 'should', 'ever', 'nevertheless', 'and', 'or

Debt Yield Fall roc_auc Test score: 0.998


Mezzanine Default: creating Train-Test split
Number of distinct labeled mezzanine_default document/sentence combinations within the full data set: 72
Model fit in progress: {'target': 'mezzanine_default', 'model': 'lr', 'metric': 'roc_auc', 'model_input': 'SentenceLemmas'}
Seconds elapsed for fitting: 60.227
Best fit parameters: {'cvec__max_df': 0.99, 'cvec__max_features': 100, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': ['be', 'someone', 'unless', 'during', 'everyone', 'which', 'those', 'rather', 'give', 'since', 'therefore', 'may', 'anyone', 'if', 'nine', 'a', 'own', 'there', 'now', 'twelve', 'as', 'eleven', 'thereby', 'off', 'everywhere', 'upon', 'thus', 'anyhow', 'wherever', 'under', 'whenever', 'fifteen', 'myself', 'seems', 'thence', 'thereafter', 'whereby', 'has', 'my', 'noone', 'among', 'him', 'can', 'no', 'nor', 'than', 'all', 'sometime', 'former', 'to', 'herein', 'became', 'into', 'part', 'you', 'herself', 'th

Aggregate Debt Yield Fall: creating Train-Test split
Number of distinct labeled aggregate_debt_yield_fall document/sentence combinations within the full data set: 19
Model fit in progress: {'target': 'aggregate_debt_yield_fall', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}
Seconds elapsed for fitting: 60.089
Best fit parameters: {'cvec__max_df': 0.99, 'cvec__max_features': 200, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 3), 'cvec__stop_words': ['be', 'someone', 'unless', 'during', 'everyone', 'which', 'those', 'rather', 'give', 'since', 'therefore', 'may', 'anyone', 'if', 'nine', 'a', 'own', 'there', 'now', 'twelve', 'as', 'eleven', 'thereby', 'off', 'everywhere', 'upon', 'thus', 'anyhow', 'wherever', 'under', 'whenever', 'fifteen', 'myself', 'seems', 'thence', 'thereafter', 'whereby', 'has', 'my', 'noone', 'among', 'him', 'can', 'no', 'nor', 'than', 'all', 'sometime', 'former', 'to', 'herein', 'became', 'into', 'part', 'you', 'herself', 'the', 'while', 'i', 'should

Operator Termination f1 Test score: 0.800


Sponsor Termination: creating Train-Test split
Number of distinct labeled sponsor_termination document/sentence combinations within the full data set: 14
Model fit in progress: {'target': 'sponsor_termination', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}
Seconds elapsed for fitting: 59.476
Best fit parameters: {'cvec__max_df': 0.99, 'cvec__max_features': 1000, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': ['the', 'and', 'a', 'to', 'it', 'in', 'be', 'for', 'with', 'that', 'marriott', 'facebook', 'an', 'if', 'have', 'than', 'of', 'at', 'by', 'as', 'that', 'will', 'or', 'on', 'ii', 'iii', 'iv', 'v', 'vi']}
Best fit 3-fold cross validation score: 0.457
Sponsor Termination f1 Train score: 1.000
Sponsor Termination f1 Test score: 0.857


Sff: creating Train-Test split
Number of distinct labeled sff document/sentence combinations within the full data set: 9
Model fit in progress: {'target': 'sff', 'model': '

Delayed Repayment: creating Train-Test split
Number of distinct labeled delayed_repayment document/sentence combinations within the full data set: 3
Model fit in progress: {'target': 'delayed_repayment', 'model': 'lr', 'metric': 'f1', 'model_input': 'SentenceLemmas'}




Seconds elapsed for fitting: 60.026
Best fit parameters: {'cvec__max_df': 0.99, 'cvec__max_features': 100, 'cvec__min_df': 2, 'cvec__ngram_range': (1, 2), 'cvec__stop_words': ['the', 'and', 'a', 'to', 'it', 'in', 'be', 'for', 'with', 'that', 'marriott', 'facebook', 'an', 'if', 'have', 'than', 'of', 'at', 'by', 'as', 'that', 'will', 'or', 'on', 'ii', 'iii', 'iv', 'v', 'vi']}
Best fit 3-fold cross validation score: 0.000
Delayed Repayment f1 Train score: 1.000
Delayed Repayment f1 Test score: 0.000


Modeling complete!


In [9]:
# Show results of modeling (already exported as 'results_df_fulldata.csv')
results_df


Unnamed: 0,target,model,metric,model_input,split_data,model_result,best_params,count_deduplicated,test_data_count,best_crossval_score,train_score,test_score
0,nontrigger,lr,roc_auc,SentenceLemmas,{'X_train': [' trigger period mean a period a ...,"GridSearchCV(cv=3,\n estimator=Pip...","{'cvec__max_df': 0.99, 'cvec__max_features': 1...",1030,309,0.986805,0.999823,0.985161
1,loan_default,lr,roc_auc,SentenceLemmas,{'X_train': ['upon the occurrence of a lockbox...,"GridSearchCV(cv=3,\n estimator=Pip...","{'cvec__max_df': 0.99, 'cvec__max_features': 4...",553,166,0.980591,0.99917,0.993301
2,unspecified,lr,roc_auc,SentenceLemmas,{'X_train': ['any fund remain in the reserve a...,"GridSearchCV(cv=3,\n estimator=Pip...","{'cvec__max_df': 0.99, 'cvec__max_features': 1...",498,149,0.96624,0.999926,0.962905
3,debt_yield_fall,lr,roc_auc,SentenceLemmas,{'X_train': [' approve operating expense mean ...,"GridSearchCV(cv=3,\n estimator=Pip...","{'cvec__max_df': 0.99, 'cvec__max_features': 2...",188,56,0.976543,0.999969,0.997569
4,mezzanine_default,lr,roc_auc,SentenceLemmas,{'X_train': ['borrower hereby represent and wa...,"GridSearchCV(cv=3,\n estimator=Pip...","{'cvec__max_df': 0.99, 'cvec__max_features': 1...",72,22,0.949633,1.0,0.99485
5,bankruptcy,lr,roc_auc,SentenceLemmas,{'X_train': ['hard lockbox and spring cash man...,"GridSearchCV(cv=3,\n estimator=Pip...","{'cvec__max_df': 0.99, 'cvec__max_features': 2...",44,13,0.826654,0.999897,0.994841
6,tenant_failure,lr,roc_auc,SentenceLemmas,{'X_train': ['follow the occurrence and prior ...,"GridSearchCV(cv=3,\n estimator=Pip...","{'cvec__max_df': 0.99, 'cvec__max_features': 4...",74,22,0.955315,1.0,0.999931
7,renovations,lr,f1,SentenceLemmas,{'X_train': ['see description of the mortgage ...,"GridSearchCV(cv=3,\n estimator=Pip...","{'cvec__max_df': 0.99, 'cvec__max_features': 1...",26,8,0.755556,0.888889,0.933333
8,aggregate_debt_yield_fall,lr,f1,SentenceLemmas,{'X_train': ['the mortgage lender will make di...,"GridSearchCV(cv=3,\n estimator=Pip...","{'cvec__max_df': 0.99, 'cvec__max_features': 2...",19,6,0.555556,1.0,0.909091
9,dscr_fall,lr,f1,SentenceLemmas,{'X_train': ['upon the write request of mortga...,"GridSearchCV(cv=3,\n estimator=Pip...","{'cvec__max_df': 0.99, 'cvec__max_features': 1...",23,7,0.616667,1.0,0.8


In [10]:
# Pickle the model details generated by this notebook, to be accessible for later analysis in other notebooks

path = '../data/exported_data/'
outfile = open(path + 'results_df.pkl', 'wb')
pickle.dump(results_df, outfile)
outfile.close()

outfile = open(path + 'full_output_dict.pkl', 'wb')
pickle.dump(full_output_dict, outfile)
outfile.close()

