# Claira Project: Mortgage Contracts

## Import Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import string
import numpy as np
import gzip
import seaborn as sns
np.random.seed(99)
RANDOM_STATE = 99
import datetime
pd.set_option('display.max_rows', 151)


# Import vectorizing and modeling tools in preparation for modeling steps
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.feature_extraction import text, stop_words
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
import time

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import spacy
import en_core_web_lg
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import roc_curve, auc

# Import CountVectorizer and TFIDFVectorizer from feature_extraction.text.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## Import Data

In [None]:
# Import the CSV file
# raw_df = pd.read_csv('../data/sasb_cash_trap_triggers.csv')

In [None]:
# Import the CSV file
raw_df = pd.read_csv('../data/sasb_cash_trap_triggers_06_09_20.csv')

In [None]:
# Review the first few rows of the data set
raw_df.head()

In [None]:
# Check size of the data set
raw_df.shape

In [None]:
raw_df = raw_df[['Document', 'Sentence', 'Trigger', 'Multiclass']]

In [None]:
# Check for null values
raw_df.isnull().sum()

In [None]:
# NOTE(review): dropna() returns a new DataFrame and its result is not assigned here,
# so raw_df is left unchanged and the shape printed below still includes rows with nulls.
# Confirm whether rows with nulls are meant to be dropped (and in which columns) before
# turning this into an assignment.
raw_df.dropna()
raw_df.shape

In [None]:
raw_df.head()

## Data Cleaning

In [None]:
# Review number of Trigger types
raw_df['Trigger'].value_counts()

In [None]:
# Convert each category to Title format (to remove discrepancies based on capitalization)
raw_df['Trigger'] = raw_df['Trigger'].str.title()

In [None]:
raw_df['Trigger'] = raw_df['Trigger'].str.replace('Falll', 'Fall')

In [None]:
raw_df.loc[raw_df['Trigger'] == 'Aggregate Debt Yield', 'Trigger'] = 'Aggregate Debt Yield Fall'

In [None]:
# Strip dollar signs from the sentences. The pattern is a raw string so that `\$`
# reaches the regex engine as an escaped literal "$" instead of being an invalid
# Python string escape.
raw_df['Sentence'] = raw_df['Sentence'].replace({r'\$': ''}, regex=True)

In [None]:
# Review clean Trigger Types
raw_df['Trigger'].value_counts()

In [None]:
# Rename first column (document title)
# raw_df = raw_df.rename(columns = {'0': 'Document'})

In [None]:
# Drop Multiclass column for now 
raw_df = raw_df.drop('Multiclass', axis = 1)
raw_df.head()

In [None]:
raw_df.shape

In [None]:
# Count the number of distinct documents
num_docs = raw_df['Document'].nunique()
print(f'{num_docs} unique documents have been included in the data set')

In [None]:
# Review the distinct documents and the number of Trigger events identified for each
raw_df['Document'].value_counts()

In [None]:
# Summarize the trigger counts for the existing Document set
max_triggers = raw_df['Document'].value_counts().max()
min_triggers = raw_df['Document'].value_counts().min()
print(f'The {num_docs} Documents have tag counts that range from {min_triggers} to {max_triggers}.')

In [None]:
# Check for duplicate rows
raw_df['is_duplicate'] = raw_df.duplicated(subset = None)
raw_df['is_duplicate'].value_counts()

In [None]:
# Summing the boolean flags counts the rows marked as duplicates
num_duplicated_rows = raw_df['is_duplicate'].sum()
print(f'There are {num_duplicated_rows} row duplicates in the data set.')

In [None]:
# This removes duplicate rows, but those rows are in fact legitimate in the document
raw_df = raw_df.drop('is_duplicate', axis = 1)
raw_df = raw_df.drop_duplicates()

In [None]:
raw_df.shape

In [None]:
raw_df.head()

In [None]:
# Review new number of Trigger types
raw_df['Trigger'].value_counts()

In [None]:
trigger_order = raw_df['Trigger'].value_counts().sort_values(ascending = False).index

fig = plt.gcf()
fig.set_size_inches(8, 8)
ax = sns.countplot(y=raw_df['Trigger'], data=raw_df, orient = 'h', order = trigger_order)
ax.set_title('Number of Trigger Types');

for p in ax.patches:
    ax.annotate(int(p.get_width()),((p.get_x() + p.get_width()), p.get_y()), xytext=(17, -15),fontsize=9,textcoords='offset points', horizontalalignment='right')
    
#https://stackoverflow.com/questions/50190409/how-to-annotate-horizontal-seaborn-countplots

In [None]:
# Review new distinct documents and the number of Trigger events identified for each
raw_df['Document'].value_counts()

In [None]:
raw_df.dtypes

## Reshape the Data

In this section, we'll reshape the data, such that each sentence is represented once, with separate categorization columns for each Trigger type

In [None]:
# function to reshape the dataframe such that the triggers are 0/1 columns. Remove duplicates
def reshape_trigger_representation(dataframe):
    """Reshape a long trigger table into one row per sentence with 0/1 trigger columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'Document', 'Sentence' and 'Trigger'. Each row
        tags one sentence with one trigger type; a sentence may appear on
        several rows. Rows whose 'Trigger' is null contribute a sentence but
        no trigger column.

    Returns
    -------
    pandas.DataFrame
        The input without the 'Trigger' column and without duplicate rows, plus
        one nullable-integer ('Int64') column per trigger type. Column names
        are the trigger names lower-cased with spaces replaced by underscores;
        a value of 1 means the sentence carries that trigger, 0 means it does not.
    """
    key_columns = ['Document', 'Sentence']

    # Group the trigger types by their output column name (first-appearance order).
    # Null triggers are skipped; types that normalise to the same name share a column.
    types_by_column = {}
    for trigger_type in dataframe['Trigger'].dropna().unique():
        column = str(trigger_type).lower().replace(' ', '_')
        types_by_column.setdefault(column, []).append(trigger_type)

    # One row per distinct sentence (no duplicates)
    reshaped = dataframe.drop('Trigger', axis=1).drop_duplicates().reset_index(drop=True)

    for column, trigger_types in types_by_column.items():
        # Distinct sentences carrying this trigger. Only the join keys are kept and
        # duplicates are dropped so the left join below can never multiply rows.
        flagged = (
            dataframe.loc[dataframe['Trigger'].isin(trigger_types), key_columns]
            .drop_duplicates()
            .assign(**{column: 1})
            .astype({column: 'Int64'})
        )
        reshaped = reshaped.merge(flagged, how='left', on=key_columns)

    # Sentences without a given trigger come out of the join as missing -> 0.
    # Only the trigger columns are filled, so nulls in the text columns are not turned into 0.
    flag_columns = list(types_by_column)
    if flag_columns:
        reshaped[flag_columns] = reshaped[flag_columns].fillna(0)

    return reshaped

In [None]:
# Reshape our original dataframe
df = reshape_trigger_representation(raw_df)

In [None]:
df.head(5)

In [None]:
# Export reshaped to CSV file
#df.to_csv('../data/reshaped_06_04_20.csv')

In [None]:
# Create a dataframe showing the document tagging details (removing sentence details).
# numeric_only=True restricts the per-document totals to the numeric trigger columns,
# so the Sentence text is never summed and the row totals computed next stay numeric.
doc_view = df.groupby('Document').sum(numeric_only=True)


In [None]:
doc_view['sum'] = doc_view.sum(axis = 1)
doc_view

In [None]:
# Export reshaped to CSV file
#doc_view.to_csv('../data/doc_view_06_04_20.csv')

In [None]:
# Isolate documents that don't have any Trigger tags (only nontriggers)
no_trigger_tags = list(doc_view.loc[doc_view['sum'] == doc_view['nontrigger']].index)
no_trigger_tags # Save document names to a list

In [None]:
# we will remove any documents that have 0 Cash Trap Trigger clauses within the data set

to_remove = df[df['Document'].isin(no_trigger_tags)]
df = df.drop(to_remove.index)

In [None]:
df.shape

## Feature Engineering

In [None]:
# Create a column with length of Sentence
df['sentence_char_count'] = df['Sentence'].str.len()

In [None]:
df.head()

## Natural Language Processing of the sentence column

In [None]:
# Load the small English spaCy model without the dependency parser.
# `disable` is given as a list of pipeline component names rather than a bare string.
nlp = spacy.load('en_core_web_sm', disable=['parser'])

In [None]:
# Review a tokenized sample Sentence
[token.text for token in nlp(df['Sentence'][2])]

In [None]:
# Review a lemmatized sample Sentence
[token.lemma_ for token in nlp(df['Sentence'][2])]

In [None]:
def tokenize(text):
    """Run `text` through the spaCy pipeline `nlp` and return the token texts joined by single spaces."""
    return ' '.join(token.text for token in nlp(text))

df['SentenceTokens'] = df['Sentence'].apply(tokenize)

In [None]:
def lemmatize(text):
    """Run `text` through the spaCy pipeline `nlp` and return the token lemmas joined by single spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

df['SentenceLemmas'] = df['Sentence'].apply(lemmatize)

In [65]:
doc = nlp(df['SentenceTokens'][68])

print(doc)



The Mortgage Loan Documents require the lender to maintain the Cash Management Account , and the Borrower to remit , or cause to be remitted funds into the operating account maintained by the Hotel Operator for disbursement into the Cash Management Account and subsequent deposit into the following reserve accounts , each of which are subaccounts of the Cash Management Account : an   account for all amounts payable in respect of taxes and insurance premiums ( the “ Basic Carrying Costs Escrow Account ” ) which is , ( i ) on the Origination Date , in an amount equal to the sum of 1,083,335 in respect of taxes , and ( ii ) on each Mortgage Loan Payment Date , an amount equal to the sum of ( a ) 1/12 of projected annual taxes , and ( b ) 1/12 of projected annual insurance premiums ; provided , that to the extent that the insurance required to be maintained by the Borrower is effected under a blanket policy maintained by Marriott International , Inc. or its subsidiaries and a Marriott Manag

In [66]:
for lemma in doc:
    print(lemma,  lemma.pos_, lemma.tag_, lemma.dep_, lemma.shape_,  lemma.is_stop)

The DET DT  Xxx False
Mortgage PROPN NNP  Xxxxx False
Loan PROPN NNP  Xxxx False
Documents PROPN NNPS  Xxxxx False
require VERB VBP  xxxx False
the DET DT  xxx True
lender NOUN NN  xxxx False
to PART TO  xx True
maintain VERB VB  xxxx False
the DET DT  xxx True
Cash PROPN NNP  Xxxx False
Management PROPN NNP  Xxxxx False
Account PROPN NNP  Xxxxx False
, PUNCT ,  , False
and CCONJ CC  xxx True
the DET DT  xxx True
Borrower PROPN NNP  Xxxxx False
to PART TO  xx True
remit VERB VB  xxxx False
, PUNCT ,  , False
or CCONJ CC  xx True
cause VERB VB  xxxx False
to PART TO  xx True
be VERB VB  xx True
remitted VERB VBN  xxxx False
funds NOUN NNS  xxxx False
into ADP IN  xxxx True
the DET DT  xxx True
operating NOUN NN  xxxx False
account NOUN NN  xxxx False
maintained VERB VBN  xxxx False
by ADP IN  xx True
the DET DT  xxx True
Hotel PROPN NNP  Xxxxx False
Operator PROPN NNP  Xxxxx False
for ADP IN  xxx True
disbursement NOUN NN  xxxx False
into ADP IN  xxxx True
the DET DT  xxx True
Cash PROP

and CCONJ CC  xxx True
the DET DT  xxx True
Hotel PROPN NNP  Xxxxx False
Operator PROPN NNP  Xxxxx False
is VERB VBZ  xx True
reserving VERB VBG  xxxx False
funds NOUN NNS  xxxx False
pursuant ADJ JJ  xxxx False
to ADP IN  xx True
the DET DT  xxx True
terms NOUN NNS  xxxx False
of ADP IN  xx True
the DET DT  xxx True
Hotel PROPN NNP  Xxxxx False
Operating PROPN NNP  Xxxxx False
Agreement PROPN NNP  Xxxxx False
, PUNCT ,  , False
the DET DT  xxx True
Borrower PROPN NNP  Xxxxx False
is VERB VBZ  xx True
not ADV RB  xxx True
required VERB VBN  xxxx False
to PART TO  xx True
reserve VERB VB  xxxx False
amounts NOUN NNS  xxxx False
in ADP IN  xx True
the DET DT  xxx True
Seasonality PROPN NNP  Xxxxx False
Reserve PROPN NNP  Xxxxx False
Account PROPN NNP  Xxxxx False
, PUNCT ,  , False
as ADP IN  xx True
described VERB VBN  xxxx False
under ADP IN  xxxx True
“ PUNCT ``  “ False
Description NOUN NN  Xxxxx False
of ADP IN  xx True
the DET DT  xxx True
Mortgage PROPN NNP  Xxxxx False
Loan PROPN

In [60]:
df[df['SentenceLemmas'].str.contains('marriott')]

Unnamed: 0,Document,Sentence,loan_default,aggregate_dscr_fall,dscr_fall,unspecified,debt_yield_fall,aggregate_debt_yield_fall,mezzanine_default,tenant_failure,mezzanine_outstanding,operator_termination,bankruptcy,sponsor_termination,renovations,nontrigger,sff,sentence_char_count,SentenceTokens,SentenceLemmas
62,20160510_02B0XU_Prospectus_SD00000000231272338...,"On each Mortgage Loan Payment Date, other than...",1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2663,"On each Mortgage Loan Payment Date , other tha...","on each mortgage loan payment date , other tha..."
67,20160510_02B0XU_Prospectus_SD00000000231272338...,“Lockbox Period” means any time when the Mortg...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,236,“ Lockbox Period ” means any time when the Mor...,""" lockbox period "" mean any time when the mort..."
68,20160510_02B0XU_Prospectus_SD00000000231272338...,The Mortgage Loan Documents require the lender...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3608,The Mortgage Loan Documents require the lender...,the mortgage loan documents require the lender...
81,20160510_02B0XU_Prospectus_SD00000000231272338...,During any time when the Mortgage Loan is outs...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,372,During any time when the Mortgage Loan is outs...,during any time when the mortgage loan be outs...
83,20160510_02B0XU_Prospectus_SD00000000231272338...,"On each Mortgage Loan Payment Date, other than...",1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2418,"On each Mortgage Loan Payment Date , other tha...","on each mortgage loan payment date , other tha..."
84,20160510_02B0XU_Prospectus_SD00000000231272338...,In the event no Marriott Management Period or ...,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,266,In the event no Marriott Management Period or ...,in the event no marriott management period or ...
347,20190108_02IFF1_Prospectus_SD00000000257045846...,"Mortgage Lender and Borrower agree that, provi...",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1725,"Mortgage Lender and Borrower agree that , prov...","mortgage lender and borrower agree that , prov..."
798,20160510_02B0XU_Prospectus_SD00000000231272338...,For so long as a Marriott Management Period or...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,320,For so long as a Marriott Management Period or...,for so long as a marriott management period or...
804,20160510_02B0XU_Prospectus_SD00000000231272338...,“Marriott Management Period” means any period ...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,285,“ Marriott Management Period ” means any perio...,""" marriott management period "" mean any period..."
809,20160510_02B0XU_Prospectus_SD00000000231272338...,"On each Mortgage Loan Payment Date, the Borrow...",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1053,"On each Mortgage Loan Payment Date , the Borro...","on each mortgage loan payment date , the borro..."


In [None]:
df.head()

In [None]:
# Join the tokens of all sentences into one space-separated string
all_sentence_tokens = ' '.join(df['SentenceTokens'])
all_sentence_tokens[:75]

In [None]:
# Join the lemmas of all sentences into one space-separated string
all_sentence_lemmas = ' '.join(df['SentenceLemmas'])
all_sentence_lemmas[:75]

In [None]:
# Split the combined strings on single spaces to get flat lists of tokens and lemmas
sentence_token_list = all_sentence_tokens.split(' ')
sentence_lemma_list = all_sentence_lemmas.split(' ')

In [None]:
token_df = pd.Series(sentence_token_list)
lemma_df = pd.Series(sentence_lemma_list)

In [None]:
# Plotting the raw token count
token_series = token_df.value_counts().head(15).sort_values(ascending = True)
ax = token_series.plot.barh(figsize = (6,6))
ax.set_xlabel('count')
ax.set_ylabel('token')
ax.set_title('Sentences: Raw Token Count');

In [None]:
# Plotting the raw lemma count
lemma_series = lemma_df.value_counts().head(15).sort_values(ascending = True)
ax = lemma_series.plot.barh(figsize = (6,6))
ax.set_xlabel('count')
ax.set_ylabel('lemma')
ax.set_title('Sentences: Raw Lemma Count');

In [None]:
# Remove punctuation, maintain letters and numbers ('#' and apostrophes are kept too).
# The pattern is a regex character class, so regex=True is passed explicitly rather
# than relying on the pandas default, which differs between versions.
df['SentenceLemmas_nopunc'] = df['SentenceLemmas'].str.replace(r"[^a-zA-Z0-9#']", " ", regex=True)

In [None]:
# Flatten every SentenceLemmas_nopunc entry into one list of lemmas,
# discarding the empty strings left between consecutive spaces
all_lemmas_nopunc = ' '.join(df['SentenceLemmas_nopunc'])
lemma_list_nopunc = [lemma for lemma in all_lemmas_nopunc.split(' ') if lemma != '']
lemma_list_nopunc[:10]

In [None]:
# Create a data series containing the list of words
lemma_df_nopunc = pd.Series(lemma_list_nopunc)
lemma_series_nopunc = lemma_df_nopunc.value_counts().head(15).sort_values(ascending = True)
ax = lemma_series_nopunc.plot.barh(figsize = (6,6))
ax.set_xlabel('count')
ax.set_ylabel('lemma')
ax.set_title('Sentences: Lemma Count, no punctuation');

In [None]:
df['SentenceLemmas_nopunc'].head()

In [None]:
# Collapse runs of two or more whitespace characters into a single space.
# Raw string keeps `\s` a regex escape; regex=True is explicit so the pattern is not
# treated as literal text under a different pandas default.
df['SentenceLemmas_nopunc'] = df['SentenceLemmas_nopunc'].str.replace(r'\s{2,}', ' ', regex=True)

In [None]:
df['SentenceLemmas_nopunc'].head()

## Preprocessing and Preparation for Modeling

In [None]:
short_stopwords = ['the', 'to', 'of', 'be', 'and', 'in', 'a']
short_stopwords2 = ['the', 'and', 'a', 'to', 'it', 'be', 'for', 'with', 'that']

In [None]:
# Look at original stopword list
stopwords = list(STOP_WORDS)
stopwords

## Train/Test Split

In [None]:
df.head()

In [None]:
# Choose the trigger type to predict (currently: bankruptcy)

target = 'bankruptcy'


X = df['SentenceLemmas_nopunc']
y = df[target]
indices = df.index

y = y.astype('int')

In [None]:
y.dtypes

In [None]:
# Checking shape of X and y 
print(f'X shape is {X.shape}')
print(f'y shape is {y.shape}')

In [None]:
y.value_counts()

In [None]:
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X, y, indices, test_size = 0.3, stratify = y, random_state = RANDOM_STATE)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print(indices_train.shape, indices_test.shape)

In [None]:
X.head()

In [None]:
X_train.shape[0]

In [None]:
y_train.shape[0]

## Modeling

### CountVectorizer + Logistic Regression GridSearch and modeling

In [None]:
# Define CVEC + Logistic Regression Pipeline
pipe_cvec = Pipeline([('cvec', CountVectorizer()), ('lr', LogisticRegression(solver = 'liblinear', random_state = RANDOM_STATE))])
cvec_params = {
    'cvec__ngram_range': [(1,2), (1,3), (1,4), (1,5), (1,6), (1,7), (1,8)],
    'cvec__stop_words': [None, short_stopwords, short_stopwords2, stopwords],  
    'cvec__max_features': [100, 200, 400, 600, 1000],
    'cvec__min_df': [2],
    'cvec__max_df': [.99],
    }

gs_cvec = GridSearchCV(pipe_cvec, param_grid = cvec_params, cv = 3, scoring = 'f1')

# Start the timer.
t0 = time.time()

results_cvec = gs_cvec.fit(X_train, y_train)

print(f'Seconds elapsed for fitting: {(time.time() - t0):.3f}') # How many seconds elapsed.



In [None]:
t0 = time.time()
# The grid search was built with scoring='f1', so results_cvec.score() reports F1.
# Cross-validation is scored with the same metric so the three numbers are comparable
# (without `scoring`, cross_val_score would use the pipeline's default score instead).
print(f'Training score is {results_cvec.score(X_train, y_train):.3f}')
print(f'Test score is {results_cvec.score(X_test, y_test):.3f}')
cv_f1 = cross_val_score(results_cvec.best_estimator_, X, y, cv=3, scoring='f1').mean()
print(f'Cross Validation score is {cv_f1:.3f}')
print(f'Seconds elapsed for score calculation: {(time.time() - t0):.3f}') # How many seconds elapsed.

In [None]:
print(f'Best Score:{(results_cvec.best_score_):.3f}')
print(f'Best Parameters :{results_cvec.best_params_}')

### 5: CountVectorizer + Random Forest

In [None]:
'''
# Define CVEC + Logistic Regression Pipeline
pipe_rf = Pipeline([('cvec', CountVectorizer()), ('rf', RandomForestClassifier(random_state = RANDOM_STATE, n_jobs = 2))])
rf_params = {
    'cvec__ngram_range': [(1,2), (1,3), (1,4), (1,5)],
    'cvec__stop_words': [None, short_stopwords, short_stopwords2],  
    'cvec__max_features': [100, 200, 400, 800],
    'cvec__min_df': [2],
    'cvec__max_df': [.99],
    'rf__max_depth': [4,5, 6],
    'rf__min_samples_split': [2,3],
    'rf__min_samples_leaf': [10, 12]
    }

gs_rf = GridSearchCV(pipe_rf, param_grid = rf_params, cv = 3, scoring = 'f1')

# Start the timer.
t0 = time.time()

results_rf = gs_rf.fit(X_train, y_train)

print(f'Seconds elapsed for fitting: {(time.time() - t0):.3f}') # How many seconds elapsed.

'''

In [None]:
'''
t0 = time.time()
print(f'Training score is {results_rf.score(X_train, y_train):.3f}')
print(f'Test score is {results_rf.score(X_test, y_test):.3f}')
print(f'Cross Validation score is {cross_val_score(results_rf.best_estimator_, X, y, cv = 3).mean():.3f}')
print(f'Seconds elapsed for score calculation: {(time.time() - t0):.3f}') # How many seconds elapsed.
'''

In [None]:
'''
print(f'Best Score: {results_rf.best_score_}')
print(f'Best Parameters: {results_rf.best_params_}')
'''

## Model Selection
- In this section we compare the Train and Test scores across the various models

In [None]:
# Check Training Scores across all models
# results_cvec.score() would report F1 (the grid search uses scoring='f1'), so the
# accuracy printed here comes from the refit pipeline's own .score() instead.
# The baseline is the share of class 0 in the training labels.
print(f'Baseline Score:                                {y_train.value_counts(normalize=True)[0]:.3f}')
print(f'CountVectorizer + LogisticRegression Accuracy: {results_cvec.best_estimator_.score(X_train, y_train):.3f}')
# print(f'TfidfVectorizer + LogisticRegression Accuracy: {results_tfidf.score(X_train, y_train):.3f}')
# print(f'CountVectorizer + Multinomial Naive Bayes:     {results_mnb.score(X_train, y_train):.3f}')
# print(f'TfidfVectorizer + Gaussian Naive Bayes:        {results_gnb.score(X_train, y_train):.3f}')
# print(f'CountVectorizer + Random Forest Accuracy:      {results_rf.score(X_train, y_train):.3f}')

In [None]:
# Check Testing Scores across all models
# results_cvec.score() would report F1 (the grid search uses scoring='f1'), so the
# accuracy printed here comes from the refit pipeline's own .score() instead.
# The baseline is the share of class 0 in the test labels.
print(f'Baseline Score:                                {y_test.value_counts(normalize=True)[0]:.3f}')
print(f'CountVectorizer + LogisticRegression Accuracy: {results_cvec.best_estimator_.score(X_test, y_test):.3f}')
# print(f'TfidfVectorizer + LogisticRegression Accuracy: {results_tfidf.score(X_test, y_test):.3f}')
# print(f'CountVectorizer + Multinomial Naive Bayes:     {results_mnb.score(X_test, y_test):.3f}')
# print(f'TfidfVectorizer + Gaussian Naive Bayes:        {results_gnb.score(X_test, y_test):.3f}')
# print(f'CountVectorizer + Random Forest Accuracy:      {results_rf.score(X_test, y_test):.3f}')

## Model Evaluation - Logistic Regression

In [None]:
# Isolate the individual words and their coefficients
# Feature names (i.e., words in the Sentences):
vectorizer = results_cvec.best_estimator_.named_steps['cvec']
# Newer scikit-learn releases only provide get_feature_names_out();
# fall back to get_feature_names() on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    names = list(vectorizer.get_feature_names_out())
else:
    names = vectorizer.get_feature_names()

# classifier (betas):
classifier = results_cvec.best_estimator_.named_steps['lr']

# https://stackoverflow.com/questions/43856280/return-coefficients-from-pipeline-object-in-sklearn

In [None]:
# Extract coefficients from the classifier defined above
coef_cvec = np.array(classifier.coef_).tolist()[0]
coef_cvec[:10] #Look at 10 coefficients

In [None]:
# Create zipped list of the word names with their corresponding beta coefficients
cvec_top_words = list(zip(names, coef_cvec))

In [None]:
# Create final dataframe of words with their corresponding coefficients
df_cvec_coefs = pd.DataFrame(cvec_top_words).rename(columns = {0: 'word', 1: 'coef'}).sort_values(by = 'coef', ascending = True)

In [None]:
# Top differentiating words and phrases for this Trigger type
df_cvec_coefs.tail(20).sort_values('coef', ascending = False)

In [None]:
# Display top opposing words and phrases for this Trigger type
df_cvec_coefs.head(20)

In [None]:
def coef_plot(category):
    '''
    Plot the 10 largest and 10 smallest values of `category` found in
    `df_cvec_coefs` as a horizontal bar chart, one bar per `word`.

    `category` is the name of the column to rank and plot (e.g. 'coef').
    The chart title is built from the module-level `target`.
    '''
    # Rank once, largest value first, then take both ends of the ranking
    ranked = df_cvec_coefs.sort_values(by=category, ascending=False)
    largest = ranked.head(10)   # the 10 largest values
    smallest = ranked.tail(10)  # the 10 smallest values
    extremes = pd.concat([largest, smallest], axis = 0)

    # Draw the bar chart
    plt.figure(figsize=(10, 8))
    pretty_target = target.replace("_", " ").title()
    plt.title(f'Feature Coefficients for {pretty_target}', fontsize=25)
    sns.set_style("darkgrid")
    sns.barplot(data=extremes, x=category, y='word', orient='h', palette = 'PuBuGn_d')
    plt.xlabel('coefficient', fontsize=15)
    plt.ylabel('feature', fontsize=15)
    plt.tick_params(labelsize=15)
coef_plot('coef')

## Calculating predictions and Confusion Matrix

In [None]:
# Here, we isolate our best model & make predictions based on our test data
best_model = results_cvec.best_estimator_
preds = best_model.predict(X_test)

In [None]:
# Checking our predictions
preds[:10]

In [None]:
## Create a confusion matrix
from sklearn.metrics import confusion_matrix
cm_test = confusion_matrix(y_test, preds)
print('This is a confusion matrix for our test data vs predictions:')
print(cm_test)

In [None]:
# Converting our confusion matrix into a dataframe
cm_test = pd.DataFrame(cm_test, columns=['Predicted Negative','Predicted Positive'], 
                       index=['Actual Negative','Actual Positive'])

In [None]:
cm_test

In [None]:
# Plot our confusion matrix
plt.figure(figsize = (6,6))
ax = plt.subplot()
sns.heatmap(cm_test, 
            annot=True, 
            ax = ax, 
            fmt='g', 
            cbar=False,
            cmap="Blues"); #annot=True to annotate cells

# labels, title and ticks
ax.set_title('Confusion Matrix - Trigger Prediction', size=16)
ax.set_xlabel('Predicted', size=14)
ax.set_ylabel('Actual', size=14)
ax.xaxis.set_ticklabels(['Negative', 'Positive'])
ax.yaxis.set_ticklabels(['Negative', 'Positive']);

In [None]:
# Setting up probabilities for the ROC curve
pred_proba = results_cvec.predict_proba(X_test)
preds = results_cvec.predict(X_test)

In [None]:
fpr, tpr, _= roc_curve(y_test, pred_proba[:,1])
roc_auc = auc(fpr, tpr)
# Plot of a ROC curve for a specific class
plt.figure(figsize = (8,8))
plt.plot(fpr, tpr, label='ROC curve (area = %0.4f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--', label='baseline')
plt.xlim([-0.01, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate', fontsize =20)
plt.ylabel('True Positive Rate', fontsize = 20)
plt.title('Receiver Operating Characteristic Curve', fontsize=18)
plt.legend(loc="lower right");

In [None]:
# To allow us to better read the Sentences
pd.set_option('display.max_colwidth', None)

In [None]:
result_cols = ['index', 'prediction', 'actual', 'model_input']
results = pd.DataFrame({'index': list(indices_test),'prediction': list(preds), 'actual': list(y_test), 'model_input': list(X_test)})

In [None]:
# set index as index column
results.set_index('index', inplace = True)
results.head()

In [None]:
misclassified = results[results['prediction'] != results['actual']]


In [None]:
misclassified = misclassified.merge(df, how = 'left', left_index = True, right_index = True)

In [None]:
misclassified = misclassified[['prediction', 'actual', 'model_input', 'Document', 'Sentence',
       'loan_default', 'aggregate_dscr_fall', 'dscr_fall', 'unspecified',
       'debt_yield_fall', 'aggregate_debt_yield_fall', 'mezzanine_default',
       'tenant_failure', 'mezzanine_outstanding', 'operator_termination',
       'bankruptcy', 'sponsor_termination', 'renovations', 'nontrigger']]

In [None]:
misclassified.head()

In [None]:
# misclassified['prediction'].astype('Int64')
# misclassified['actual'].astype('Int64')

In [None]:
print(f'We incorrectly predicted the trigger type for {misclassified.shape[0]} sentences.')

In [None]:
# Show the split of the misclassified predictions
misclassified['prediction'].value_counts()

## Review the Incorrect Predictions

### Misclassifications: wrongly predicted to be the trigger category

In [None]:
misclassified_as_true = misclassified.loc[misclassified['prediction'] == 1]
misclassified_as_true

In [None]:
misclassified_as_true_summary = misclassified_as_true[['prediction', 'actual', 'loan_default', 'aggregate_dscr_fall', 'dscr_fall', 'unspecified',
       'debt_yield_fall', 'aggregate_debt_yield_fall', 'mezzanine_default',
       'tenant_failure', 'mezzanine_outstanding', 'operator_termination',
       'bankruptcy', 'sponsor_termination', 'renovations', 'nontrigger']]


In [None]:
misclassified_as_true_count = misclassified_as_true_summary.shape[0]
print(f'{misclassified_as_true_count} sentences were predicted to be the trigger category, but in fact were not this category.')

In [None]:
# Append a 'sum' row holding the column totals.
# Copy first so the row is written to an independent frame rather than to a selection
# of `misclassified_as_true`, and total only the summary columns (the free-text
# columns of `misclassified_as_true` are not part of the summary).
misclassified_as_true_summary = misclassified_as_true_summary.copy()
misclassified_as_true_summary.loc['sum', :] = misclassified_as_true[misclassified_as_true_summary.columns].sum(axis = 0)


In [None]:
misclassified_as_true_summary

### Misclassifications: wrongly predicted NOT to be the trigger category

In [None]:
misclassified_as_false = misclassified.loc[misclassified['prediction'] == 0]
misclassified_as_false

In [None]:
misclassified_as_false_summary = misclassified_as_false[['prediction', 'actual', 'loan_default', 'aggregate_dscr_fall', 'dscr_fall', 'unspecified',
       'debt_yield_fall', 'aggregate_debt_yield_fall', 'mezzanine_default',
       'tenant_failure', 'mezzanine_outstanding', 'operator_termination',
       'bankruptcy', 'sponsor_termination', 'renovations', 'nontrigger']]


In [None]:
misclassified_as_false_count = misclassified_as_false.shape[0]
print(f'{misclassified_as_false_count} sentences were predicted not to be the trigger category, but in fact were this trigger category.')

In [None]:
# Append a 'sum' row holding the column totals.
# Copy first so the row is written to an independent frame rather than to a selection
# of `misclassified_as_false`, and total only the summary columns (the free-text
# columns of `misclassified_as_false` are not part of the summary).
misclassified_as_false_summary = misclassified_as_false_summary.copy()
misclassified_as_false_summary.loc['sum', :] = misclassified_as_false[misclassified_as_false_summary.columns].sum(axis = 0)
misclassified_as_false_summary

### Export Misclassification summaries

In [None]:
# Export to CSV file
#misclassified_as_false_summary.to_csv(f'../data/misclassified/{target}_misclassified_as_false_06_04_20.csv')
#misclassified_as_true_summary.to_csv(f'../data/misclassified/{target}_misclassified_as_true_06_04_20.csv')