# SVM Bag-of-Words Model
This notebook includes the final CV runs of the SVM model for predicting CFPB Complaint Outcomes.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


import logging
import os

# Lift pandas' display truncation so wide/long frames render in full in cell outputs.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Project directories, resolved relative to the parent of the current working directory.
# os.path.join(..., "") keeps a trailing separator, so the constants can still be
# concatenated directly with a file name (e.g. PATH_PROC + "narr_df_02.csv") while
# working on any OS instead of only with Windows-style backslashes.
PATH_PARENT = os.path.dirname(os.getcwd())
PATH_RAW = os.path.join(PATH_PARENT, "data_raw", "")
PATH_PROC = os.path.join(PATH_PARENT, "data_processed", "")
# Output directory for pickled models; the last cell writes to PATH_MODEL + filename.
# NOTE(review): the folder name "models" is an assumption - adjust to the project layout.
PATH_MODEL = os.path.join(PATH_PARENT, "models", "")

# Import Data + Prep for Models

In [3]:
# Column dtypes applied while parsing the processed complaints file.
# The keys must match the CSV header exactly (underscore-separated, no "?"), as shown
# by data_raw.head(); the previous space/hyphen spellings matched no column, so those
# fields were never loaded as categoricals.
csv_dtypes = {
    "Product": "category",
    "Sub_product": "category",
    "Issue": "category",
    "Sub_issue": "category",
    "Company_public_response": "category",
    "Consumer_consent_provided": "category",
    "Consumer_disputed": "category",
    "Submitted_via": "category",
    "Tags": "category",
    "State": "category",
    "Timely_response": "category",
    "Mulvaney_Dir": "int",
    "nar_wordct": "int",
    "nar_charct": "int",
    "nar_numerics": "int",
}
# NOTE(review): if a mixed-type DtypeWarning persists after this fix, check ZIP_code
# (values such as "220XX" appear alongside blanks) and consider typing it as str.
data_raw = pd.read_csv(PATH_PROC + "narr_df_02.csv", dtype=csv_dtypes)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
# Sanity check: (rows, columns) of the loaded frame, followed by its first rows.
n_rows, n_cols = data_raw.shape
print((n_rows, n_cols))
data_raw.head(5)

(374772, 29)


Unnamed: 0,Date_received,Product,Sub_product,Issue,Sub_issue,Consumer_complaint_narrative,Company_public_response,Company,State,ZIP_code,Tags,Consumer_consent_provided,Submitted_via,Date_sent_to_company,Company_response_to_consumer,Timely_response,Consumer_disputed,Complaint_ID,Trump_Admin,Mulvaney_Dir,New_Product,narrative_clean,nar_wordct,nar_charct,nar_numerics,narrative_lemma,narrative_clean_token,narrative_lemma_token,company_clean
0,2019-03-06,Debt collection,Credit card debt,False statements or representation,"Impersonated attorney, law enforcement, or gov...","On XXXX XXXX,2019 my friend got a call from XX...",Company has responded to the consumer and the ...,SYNCHRONY FINANCIAL,CA,,,Consent provided,Web,2019-03-07,Closed with explanation,Yes,,3171169,1,1,Debt collection,on 2019 my friend got a call from at becau...,212,1063,8,2019 friend get contact lawyer behalf fr...,"['on', '2019', 'my', 'friend', 'got', 'a', 'ca...","['2019', 'friend', 'get', 'contact', 'lawyer',...",synchrony financial
1,2019-03-05,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,There are many mistakes appear in my report wi...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,VA,220XX,,Consent provided,Web,2019-03-05,Closed with explanation,Yes,,3169888,1,1,"Credit reporting, credit repair services, or o...",there are many mistakes appear in my report wi...,11,68,0,mistake appear report understanding,"['there', 'are', 'many', 'mistakes', 'appear',...","['mistake', 'appear', 'report', 'understanding']",experian information solutions inc
2,2019-03-05,Debt collection,Private student loan debt,Attempts to collect debt not owed,Debt is not yours,I HAVE DISPUTED THIS ACCOUNT AND YOU HAVE FAIL...,Company has responded to the consumer and the ...,"Ability Recovery Services, LLC",OK,731XX,,Consent provided,Web,2019-03-05,Closed with explanation,Yes,,3170239,1,1,Debt collection,i have disputed this account and you have fail...,67,407,1,dispute account fail provide validation requir...,"['i', 'have', 'disputed', 'this', 'account', '...","['dispute', 'account', 'fail', 'provide', 'val...",ability recovery services llc
3,2019-03-05,Debt collection,Other debt,Attempts to collect debt not owed,Debt was paid,This debit was paid however is has been on my ...,Company believes it acted appropriately as aut...,Source Receivables Management LLC,NY,,,Consent provided,Web,2019-03-05,Closed with explanation,Yes,,3170244,1,1,Debt collection,this debit was paid however is has been on my ...,15,68,1,debit pay credit 3 year,"['this', 'debit', 'was', 'paid', 'however', 'i...","['debit', 'pay', 'credit', '3', 'year']",source receivables management llc
4,2019-03-04,Debt collection,Other debt,Written notification about debt,Didn't receive notice of right to dispute,I tried to apply for a car loan and discovered...,Company believes it acted appropriately as aut...,"Waypoint Resource Group, LLC",NM,871XX,,Consent provided,Web,2019-03-08,Closed with explanation,Yes,,3168736,1,1,Debt collection,i tried to apply for a car loan and discovered...,65,336,0,try apply car loan discover bill collection ...,"['i', 'tried', 'to', 'apply', 'for', 'a', 'car...","['try', 'apply', 'car', 'loan', 'discover', 'b...",waypoint resource group llc


In [5]:
# Build the modelling frame in one idempotent chain: one narrative string and one
# binary label per complaint. Resulting columns: ['label', 'company_narrative_clean'].
text = (
    data_raw[['narrative_lemma', 'Company_response_to_consumer']]
    # Complaints without a recorded company response cannot be labelled.
    .dropna(subset=['Company_response_to_consumer'])
    .assign(
        # 1 = "Closed with monetary relief", 0 = any other response.
        label=lambda df: np.where(
            df['Company_response_to_consumer'] == "Closed with monetary relief", 1, 0
        ),
        # Only the lemmatised narrative is used (no company text is combined in, despite
        # the column name). Missing narratives become empty strings: a bare astype(str)
        # would turn NaN into the literal token "nan" and feed it to the vectorizer.
        company_narrative_clean=lambda df: df['narrative_lemma'].fillna("").astype(str),
    )
    .drop(columns=['Company_response_to_consumer', 'narrative_lemma'])
)


text.head()

Unnamed: 0,label,company_narrative_clean
0,0,2019 friend get contact lawyer behalf fr...
1,0,mistake appear report understanding
2,0,dispute account fail provide validation requir...
3,0,debit pay credit 3 year
4,0,try apply car loan discover bill collection ...


In [6]:
# Class balance of the labelled set (1 = closed with monetary relief, 0 = otherwise).
# For quick experiments, subsample first: text = text.sample(n=20000, random_state=123)
label_counts = text['label'].value_counts()
label_counts

0    353452
1     21316
Name: label, dtype: int64

In [7]:
"""Binary - Upsample"""

# Hold out 20% of the complaints for testing; only the training part is rebalanced.
tr, ts = train_test_split(text.copy(), test_size=0.2, random_state=123)

# label == 0 is the majority class, label == 1 (monetary relief) the minority class.
train_majority = tr[tr.label == 0]
train_minority = tr[tr.label == 1]
len_majority = len(train_majority)

# Draw minority rows with replacement until both classes have the same row count.
train_minority_up = resample(train_minority, replace=True, n_samples=len_majority, random_state=0)
tr_up = pd.concat([train_majority, train_minority_up])
assert (tr_up['label'].value_counts() == len_majority).all(), "upsampled training set is not balanced"

# NOTE(review): the grid search below cross-validates on this upsampled frame, so its
# folds are 50/50 balanced and duplicated minority rows can sit on both sides of a
# fold split. CV scores are therefore not comparable to metrics on the untouched `ts`.
X_tr_up = tr_up.drop(columns=['label'])
y_tr_up = tr_up['label'].copy()

# The test split keeps its natural class distribution.
X_ts_up = ts.drop(columns=['label'])
y_ts_up = ts['label'].copy()

In [8]:
# Extract the narrative text and labels as arrays for scikit-learn.
sentences_train = X_tr_up['company_narrative_clean'].values
sentences_test = X_ts_up['company_narrative_clean'].values
y_train = y_tr_up.values
y_test = y_ts_up.values

# Hyperparameter Tuning

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Bag-of-words pipeline: 1- to 3-gram counts -> TF-IDF weighting -> linear classifier
# trained with SGD (hinge loss, L2 penalty). tol=None is passed so the number of
# epochs is governed by max_iter, which is tuned in the grid search.
pipeline_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', random_state=42, tol=None)),
]
text_clf = Pipeline(steps=pipeline_steps)

In [14]:
from sklearn.model_selection import GridSearchCV
# Search space for the grid search; keys follow the "<pipeline step>__<parameter>" form.
parameters = dict(
    vect__max_df=[0.95, 0.99],     # upper document-frequency cut-off for n-grams
    vect__min_df=[0.0001, 0.001],  # lower document-frequency cut-off for n-grams
    clf__alpha=(1e-3, 1e-4),       # regularisation strength of the SGD classifier
    clf__max_iter=[50, 500],       # number of SGD epochs
)

In [15]:
# Metrics recorded for every CV fold: display name -> scikit-learn scorer string.
scoring = dict(
    f1='f1',
    accuracy='accuracy',
    precision='precision',
    recall='recall',
    AUC='roc_auc',
)

In [16]:
%%time
# 5-fold grid search over `parameters` on all cores; every metric in `scoring` is
# recorded (train and validation), and the best-F1 candidate is refit on the full
# training data.
gs_svm = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    scoring=scoring,
    refit='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
).fit(sentences_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 93.6min finished


Wall time: 1h 36min 13s


In [58]:
# Tabulate the per-candidate CV results, best F1 rank first.
cv_results = gs_svm.cv_results_
gs_svm_cv_df = pd.DataFrame(cv_results)
gs_svm_cv_df.sort_values(by='rank_test_f1', ascending=True)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_clf__max_iter,param_vect__max_df,param_vect__min_df,params,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,split3_test_accuracy,split4_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,split4_train_accuracy,mean_train_accuracy,std_train_accuracy,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,split4_test_precision,mean_test_precision,std_test_precision,rank_test_precision,split0_train_precision,split1_train_precision,split2_train_precision,split3_train_precision,split4_train_precision,mean_train_precision,std_train_precision,split0_test_recall,split1_test_recall,split2_test_recall,split3_test_recall,split4_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,split3_train_recall,split4_train_recall,mean_train_recall,std_train_recall,split0_test_AUC,split1_test_AUC,split2_test_AUC,split3_test_AUC,split4_test_AUC,mean_test_AUC,std_test_AUC,rank_test_AUC,split0_train_AUC,split1_train_AUC,split2_train_AUC,split3_train_AUC,split4_train_AUC,mean_train_AUC,std_train_AUC
8,366.805255,4.48758,122.190378,1.661367,0.0001,50,0.95,0.0001,"{'clf__alpha': 0.0001, 'clf__max_iter': 50, 'v...",0.868391,0.869149,0.867329,0.869045,0.868819,0.868547,0.000662,1,0.870458,0.871222,0.871429,0.871342,0.871197,0.871129,0.000346,0.863908,0.864306,0.86282,0.864217,0.863934,0.863837,0.000532,1,0.865879,0.866627,0.866777,0.866704,0.866554,0.866508,0.000323,0.840693,0.839209,0.839725,0.839202,0.838704,0.839507,0.000675,3,0.841728,0.842201,0.842029,0.842047,0.841904,0.841982,0.000158,0.897977,0.901303,0.89681,0.901089,0.901178,0.899671,0.001898,1,0.901218,0.902314,0.902956,0.902748,0.902602,0.902367,0.000611,0.925746,0.9251,0.925053,0.925964,0.926124,0.925597,0.000442,3,0.927909,0.928241,0.928154,0.928148,0.927987,0.928088,0.000121
10,345.357878,7.563345,124.881182,2.862496,0.0001,50,0.99,0.0001,"{'clf__alpha': 0.0001, 'clf__max_iter': 50, 'v...",0.868391,0.869149,0.867329,0.869045,0.868819,0.868547,0.000662,1,0.870458,0.871222,0.871429,0.871342,0.871197,0.871129,0.000346,0.863908,0.864306,0.86282,0.864217,0.863934,0.863837,0.000532,1,0.865879,0.866627,0.866777,0.866704,0.866554,0.866508,0.000323,0.840693,0.839209,0.839725,0.839202,0.838704,0.839507,0.000675,3,0.841728,0.842201,0.842029,0.842047,0.841904,0.841982,0.000158,0.897977,0.901303,0.89681,0.901089,0.901178,0.899671,0.001898,1,0.901218,0.902314,0.902956,0.902748,0.902602,0.902367,0.000611,0.925746,0.9251,0.925053,0.925964,0.926124,0.925597,0.000442,3,0.927909,0.928241,0.928154,0.928148,0.927987,0.928088,0.000121
12,590.586146,20.476332,122.848817,0.364024,0.0001,500,0.95,0.0001,"{'clf__alpha': 0.0001, 'clf__max_iter': 500, '...",0.868401,0.869156,0.867278,0.868978,0.868757,0.868514,0.000667,3,0.870453,0.871235,0.871396,0.871347,0.871123,0.871111,0.000342,0.863926,0.864315,0.862775,0.864155,0.863881,0.86381,0.000541,3,0.865879,0.866642,0.866744,0.866715,0.86649,0.866494,0.00032,0.840743,0.839223,0.839723,0.839184,0.838711,0.839517,0.000692,1,0.841753,0.842226,0.842002,0.842084,0.841908,0.841995,0.00016,0.897941,0.901303,0.896704,0.900966,0.901036,0.89959,0.001896,3,0.901178,0.902314,0.902916,0.902717,0.902438,0.902313,0.000605,0.925744,0.925099,0.92506,0.925965,0.926124,0.925599,0.000441,1,0.92791,0.928238,0.928167,0.92815,0.927989,0.928091,0.000122
14,656.140341,6.50543,127.681491,2.260658,0.0001,500,0.99,0.0001,"{'clf__alpha': 0.0001, 'clf__max_iter': 500, '...",0.868401,0.869156,0.867278,0.868978,0.868757,0.868514,0.000667,3,0.870453,0.871235,0.871396,0.871347,0.871123,0.871111,0.000342,0.863926,0.864315,0.862775,0.864155,0.863881,0.86381,0.000541,3,0.865879,0.866642,0.866744,0.866715,0.86649,0.866494,0.00032,0.840743,0.839223,0.839723,0.839184,0.838711,0.839517,0.000692,1,0.841753,0.842226,0.842002,0.842084,0.841908,0.841995,0.00016,0.897941,0.901303,0.896704,0.900966,0.901036,0.89959,0.001896,3,0.901178,0.902314,0.902916,0.902717,0.902438,0.902313,0.000605,0.925744,0.925099,0.92506,0.925965,0.926124,0.925599,0.000441,1,0.92791,0.928238,0.928167,0.92815,0.927989,0.928091,0.000122
13,509.777816,19.838121,114.930397,1.593149,0.0001,500,0.95,0.001,"{'clf__alpha': 0.0001, 'clf__max_iter': 500, '...",0.86234,0.861844,0.860042,0.861922,0.862591,0.861748,0.000896,5,0.863103,0.863322,0.863709,0.863473,0.863142,0.86335,0.000223,0.858196,0.857374,0.855834,0.857443,0.858107,0.857391,0.000847,5,0.858766,0.859069,0.859396,0.859157,0.858821,0.859042,0.00023,0.837851,0.835661,0.835652,0.835669,0.836164,0.8362,0.000848,5,0.837388,0.838027,0.838004,0.837802,0.837511,0.837747,0.000258,0.888303,0.88972,0.885898,0.889877,0.890744,0.888909,0.001697,5,0.890448,0.890191,0.891041,0.890766,0.890391,0.890567,0.0003,0.922202,0.921342,0.921242,0.921888,0.922659,0.921867,0.00053,7,0.923267,0.923582,0.923618,0.923481,0.923281,0.923446,0.000147
15,530.220525,14.593473,97.482436,7.793037,0.0001,500,0.99,0.001,"{'clf__alpha': 0.0001, 'clf__max_iter': 500, '...",0.86234,0.861844,0.860042,0.861922,0.862591,0.861748,0.000896,5,0.863103,0.863322,0.863709,0.863473,0.863142,0.86335,0.000223,0.858196,0.857374,0.855834,0.857443,0.858107,0.857391,0.000847,5,0.858766,0.859069,0.859396,0.859157,0.858821,0.859042,0.00023,0.837851,0.835661,0.835652,0.835669,0.836164,0.8362,0.000848,5,0.837388,0.838027,0.838004,0.837802,0.837511,0.837747,0.000258,0.888303,0.88972,0.885898,0.889877,0.890744,0.888909,0.001697,5,0.890448,0.890191,0.891041,0.890766,0.890391,0.890567,0.0003,0.922202,0.921342,0.921242,0.921888,0.922659,0.921867,0.00053,7,0.923267,0.923582,0.923618,0.923481,0.923281,0.923446,0.000147
9,339.485278,13.228865,112.49172,1.19529,0.0001,50,0.95,0.001,"{'clf__alpha': 0.0001, 'clf__max_iter': 50, 'v...",0.862263,0.861861,0.859993,0.861876,0.862605,0.86172,0.000906,7,0.863105,0.863291,0.863618,0.863425,0.863156,0.863319,0.000187,0.858099,0.857392,0.855799,0.857399,0.858107,0.857359,0.000842,7,0.858752,0.859038,0.859323,0.859106,0.858826,0.859009,0.000204,0.837675,0.835678,0.835686,0.835645,0.836097,0.836156,0.000777,7,0.837305,0.838001,0.838033,0.837747,0.837471,0.837711,0.000287,0.888339,0.889738,0.885757,0.889807,0.89085,0.888898,0.001762,7,0.890545,0.890156,0.890815,0.890727,0.890466,0.890542,0.00023,0.922201,0.921353,0.921238,0.921895,0.922665,0.92187,0.000531,5,0.923268,0.923593,0.923614,0.92349,0.923285,0.92345,0.000148
11,346.104875,6.223093,116.5746,1.057806,0.0001,50,0.99,0.001,"{'clf__alpha': 0.0001, 'clf__max_iter': 50, 'v...",0.862263,0.861861,0.859993,0.861876,0.862605,0.86172,0.000906,7,0.863105,0.863291,0.863618,0.863425,0.863156,0.863319,0.000187,0.858099,0.857392,0.855799,0.857399,0.858107,0.857359,0.000842,7,0.858752,0.859038,0.859323,0.859106,0.858826,0.859009,0.000204,0.837675,0.835678,0.835686,0.835645,0.836097,0.836156,0.000777,7,0.837305,0.838001,0.838033,0.837747,0.837471,0.837711,0.000287,0.888339,0.889738,0.885757,0.889807,0.89085,0.888898,0.001762,7,0.890545,0.890156,0.890815,0.890727,0.890466,0.890542,0.00023,0.922201,0.921353,0.921238,0.921895,0.922665,0.92187,0.000531,5,0.923268,0.923593,0.923614,0.92349,0.923285,0.92345,0.000148
1,484.940992,9.443367,116.155907,0.810969,0.001,50,0.95,0.001,"{'clf__alpha': 0.001, 'clf__max_iter': 50, 've...",0.838797,0.838965,0.838431,0.839883,0.839847,0.839184,0.000582,9,0.839379,0.83956,0.83974,0.839397,0.839748,0.839565,0.00016,0.836064,0.835905,0.835568,0.836576,0.836735,0.836169,0.000431,9,0.836403,0.836565,0.836693,0.836441,0.836667,0.836554,0.000116,0.825038,0.823614,0.824082,0.823223,0.824135,0.824018,0.000609,9,0.824388,0.824451,0.824362,0.824498,0.824198,0.824379,0.000102,0.853022,0.854899,0.853288,0.857231,0.85617,0.854922,0.001623,9,0.854924,0.855233,0.855702,0.854844,0.855897,0.85532,0.000417,0.904373,0.90391,0.9033,0.904206,0.904608,0.904079,0.000451,11,0.90427,0.90461,0.904747,0.904403,0.904404,0.904487,0.00017
3,389.143402,49.697015,117.692399,1.738052,0.001,50,0.99,0.001,"{'clf__alpha': 0.001, 'clf__max_iter': 50, 've...",0.838797,0.838965,0.838431,0.839883,0.839847,0.839184,0.000582,9,0.839379,0.83956,0.83974,0.839397,0.839748,0.839565,0.00016,0.836064,0.835905,0.835568,0.836576,0.836735,0.836169,0.000431,9,0.836403,0.836565,0.836693,0.836441,0.836667,0.836554,0.000116,0.825038,0.823614,0.824082,0.823223,0.824135,0.824018,0.000609,9,0.824388,0.824451,0.824362,0.824498,0.824198,0.824379,0.000102,0.853022,0.854899,0.853288,0.857231,0.85617,0.854922,0.001623,9,0.854924,0.855233,0.855702,0.854844,0.855897,0.85532,0.000417,0.904373,0.90391,0.9033,0.904206,0.904608,0.904079,0.000451,11,0.90427,0.90461,0.904747,0.904403,0.904404,0.904487,0.00017


In [18]:
%%time
# Evaluate the refit grid-search model on the held-out (non-upsampled) test split.
y_pred_gs_svm = gs_svm.predict(sentences_test)
test_report = classification_report(y_test, y_pred_gs_svm)
test_confusion = confusion_matrix(y_test, y_pred_gs_svm)

print('\n Classification Report\n')
print(test_report)
print(test_confusion)


 Classification Report

              precision    recall  f1-score   support

           0       0.99      0.83      0.90     70721
           1       0.23      0.86      0.36      4233

    accuracy                           0.83     74954
   macro avg       0.61      0.85      0.63     74954
weighted avg       0.95      0.83      0.87     74954

[[58573 12148]
 [  579  3654]]
Wall time: 12.1 s


In [19]:
# Hyperparameter combination selected by the grid search (refit metric: F1).
best_params = gs_svm.best_params_
best_params

{'clf__alpha': 0.0001,
 'clf__max_iter': 50,
 'vect__max_df': 0.95,
 'vect__min_df': 0.0001}

In [20]:
import pickle
# Persist the fitted grid search object to disk with pickle.
# PATH_MODEL must be defined by an earlier cell before this one runs.
filename = 'gs_svm.sav'
model_path = PATH_MODEL + filename
# Create the target folder if needed so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
# The context manager guarantees the file handle is flushed and closed.
with open(model_path, 'wb') as model_file:
    pickle.dump(gs_svm, model_file)