In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('consumer_complaints.csv')

In [3]:
data.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,U.S. Bancorp,CA,95993,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511074
1,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,Wells Fargo & Company,CA,91104,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511080
2,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,,,Wells Fargo & Company,NY,11764,,,Postal mail,09/18/2013,Closed with explanation,Yes,No,510473
3,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,,,"Navient Solutions, Inc.",MD,21402,,,Email,08/30/2013,Closed with explanation,Yes,Yes,510326
4,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,,,Resurgent Capital Services L.P.,GA,30106,,,Web,08/30/2013,Closed with explanation,Yes,Yes,511067


In [4]:
data.shape

(555957, 18)

In [5]:
data.isna().sum()

date_received                        0
product                              0
sub_product                     158322
issue                                0
sub_issue                       343335
consumer_complaint_narrative    489151
company_public_response         470833
company                              0
state                             4887
zipcode                           4505
tags                            477998
consumer_consent_provided       432499
submitted_via                        0
date_sent_to_company                 0
company_response_to_consumer         0
timely_response                      0
consumer_disputed?                   0
complaint_id                         0
dtype: int64

In [31]:
new_df = data[['product', 'issue', 'sub_issue', 'submitted_via', 'consumer_complaint_narrative', 'timely_response', 'consumer_disputed?']]

In [32]:
new_df.head()

Unnamed: 0,product,issue,sub_issue,submitted_via,consumer_complaint_narrative,timely_response,consumer_disputed?
0,Mortgage,"Loan modification,collection,foreclosure",,Referral,,Yes,Yes
1,Mortgage,"Loan servicing, payments, escrow account",,Referral,,Yes,Yes
2,Credit reporting,Incorrect information on credit report,Account status,Postal mail,,Yes,No
3,Student loan,Repaying your loan,Repaying your loan,Email,,Yes,Yes
4,Debt collection,False statements or representation,Attempted to collect wrong amount,Web,,Yes,Yes


In [33]:
new_df.shape

(555957, 7)

In [34]:
for i in list(new_df.columns.values):
    print(list((i, new_df[str(i)].nunique())))

['product', 11]
['issue', 95]
['sub_issue', 68]
['submitted_via', 6]
['consumer_complaint_narrative', 65646]
['timely_response', 2]
['consumer_disputed?', 2]


In [35]:
new_df.isna().sum()

product                              0
issue                                0
sub_issue                       343335
submitted_via                        0
consumer_complaint_narrative    489151
timely_response                      0
consumer_disputed?                   0
dtype: int64

In [37]:
new_df['consumer_complaint_narrative'].fillna('missing', inplace = True)
new_df['sub_issue'].fillna('missing', inplace = True)


In [38]:
new_df.isna().sum()

product                         0
issue                           0
sub_issue                       0
submitted_via                   0
consumer_complaint_narrative    0
timely_response                 0
consumer_disputed?              0
dtype: int64

In [39]:
for i in list(new_df.columns.values):
    print(list((i, new_df[str(i)].nunique())))

['product', 11]
['issue', 95]
['sub_issue', 69]
['submitted_via', 6]
['consumer_complaint_narrative', 65647]
['timely_response', 2]
['consumer_disputed?', 2]


In [67]:
import re
stop = stopwords.words('english')
lemma = WordNetLemmatizer()


new_df['consumer_complaint_narrative'] = new_df['consumer_complaint_narrative'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
new_df['consumer_complaint_narrative'] = new_df['consumer_complaint_narrative'].apply(lambda x: re.sub('[^\w\s][0-9][!"$%&\'()*,./:;<=>?@[\\]^_`{|}~]', '', x))
new_df['consumer_complaint_narrative'] = new_df['consumer_complaint_narrative'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
new_df['consumer_complaint_narrative'] = new_df['consumer_complaint_narrative'].apply(lambda x: ' '.join(lemma.lemmatize(word) for word in x.split()))

In [68]:
new_df['sub_issue'] = new_df['sub_issue'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
new_df['sub_issue'] = new_df['sub_issue'].apply(lambda x: re.sub('[^\w\s][0-9][!"$%&\'()*,./:;<=>?@[\\]^_`{|}~]', '', x))
new_df['sub_issue'] = new_df['sub_issue'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
new_df['sub_issue'] = new_df['sub_issue'].apply(lambda x: ' '.join(lemma.lemmatize(word) for word in x.split()))

In [69]:
new_df['issue'] = new_df['issue'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
new_df['issue'] = new_df['issue'].apply(lambda x: re.sub('[^\w\s][0-9][!"$%&\'()*,./:;<=>?@[\\]^_`{|}~]', '', x))
new_df['issue'] = new_df['issue'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
new_df['issue'] = new_df['issue'].apply(lambda x: ' '.join(lemma.lemmatize(word) for word in x.split()))

In [70]:
new_df.head()

Unnamed: 0,product,issue,sub_issue,submitted_via,consumer_complaint_narrative,timely_response,consumer_disputed?
0,Mortgage,"loan modification,collection,foreclosure",missing,Referral,missing,Yes,Yes
1,Mortgage,"loan servicing, payments, escrow account",missing,Referral,missing,Yes,Yes
2,Credit reporting,incorrect information credit report,account status,Postal mail,missing,Yes,No
3,Student loan,repaying loan,repaying loan,Email,missing,Yes,Yes
4,Debt collection,false statement representation,attempted collect wrong amount,Web,missing,Yes,Yes


In [80]:
ccn = new_df['consumer_complaint_narrative']
si = new_df['sub_issue']
issue = new_df['issue']

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='word', max_df=5000, min_df=20, stop_words='english', max_features=100)

***TFIDF for consumer_complaint_narrative***

In [81]:
tfidf.fit(ccn)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=5000, max_features=100, min_df=20,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [82]:
ccn_tfidf = tfidf.transform(ccn)

In [83]:
ccn_tfidf=pd.DataFrame(ccn_tfidf.toarray(),
                             columns=tfidf.get_feature_names())

In [65]:
ccn_tfidf.shape

(555957, 100)

In [84]:
ccn_tfidf.head()

Unnamed: 0,15,30,action,advised,ago,agreement,america,application,applied,ask,...,supervisor,taken,tax,tell,transaction,transfer,use,used,ve,years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


***TFIDF for Sub-Issue***

In [85]:
tfidf.fit(si)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=5000, max_features=100, min_df=20,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [86]:
si_tfidf = tfidf.transform(si)

In [87]:
si_tfidf=pd.DataFrame(si_tfidf.toarray(),
                             columns=tfidf.get_feature_names())

In [88]:
si_tfidf.shape

(555957, 100)

In [89]:
si_tfidf.head()

Unnamed: 0,8am,9pm,abusive,acct,action,agree,alert,annual,apply,arrest,...,suit,talked,temporarily,terms,theft,took,trouble,unable,used,written
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


***TFIDF for Issue***

In [90]:
tfidf.fit(issue)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=5000, max_features=100, min_df=20,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [91]:
issue_tfidf = tfidf.transform(issue)

In [92]:
issue_tfidf=pd.DataFrame(issue_tfidf.toarray(),
                             columns=tfidf.get_feature_names())

In [93]:
issue_tfidf.shape

(555957, 85)

In [94]:
issue_tfidf.head()

Unnamed: 0,acct,adding,advance,advertising,amt,applied,apply,apr,arbitration,atm,...,transfer,unauthorized,underwriting,unexpected,unsolicited,use,using,vehicle,workout,wrong
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


***Creating Dummy Variables***

In [97]:
dummies = pd.get_dummies(new_df[['product','submitted_via', 'timely_response']], drop_first=True)

In [98]:
dummies.shape

(555957, 16)

In [101]:
new_df['consumer_disputed?'] = np.where(new_df['consumer_disputed?'] == 'Yes', 1,0)

In [102]:
new_df.head()

Unnamed: 0,product,issue,sub_issue,submitted_via,consumer_complaint_narrative,timely_response,consumer_disputed?
0,Mortgage,"loan modification,collection,foreclosure",missing,Referral,missing,Yes,1
1,Mortgage,"loan servicing, payments, escrow account",missing,Referral,missing,Yes,1
2,Credit reporting,incorrect information credit report,account status,Postal mail,missing,Yes,0
3,Student loan,repaying loan,repaying loan,Email,missing,Yes,1
4,Debt collection,false statement representation,attempted collect wrong amount,Web,missing,Yes,1


In [103]:
df = pd.concat([dummies, ccn_tfidf, si_tfidf, issue_tfidf, new_df['consumer_disputed?']], axis = 1)

In [104]:
df.head()

Unnamed: 0,product_Consumer Loan,product_Credit card,product_Credit reporting,product_Debt collection,product_Money transfers,product_Mortgage,product_Other financial service,product_Payday loan,product_Prepaid card,product_Student loan,...,unauthorized,underwriting,unexpected,unsolicited,use,using,vehicle,workout,wrong,consumer_disputed?
0,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0,0,0,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0,0,0,0,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [105]:
df.shape

(555957, 302)

In [110]:
df.to_csv('preprocessed_data.csv', index=False)