# Societe Generale Complaint Tracker

In [59]:
import pandas as panda

from sklearn.model_selection import learning_curve, train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report, \
    confusion_matrix, f1_score, roc_curve, roc_auc_score
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


from matplotlib import pyplot as plot
import seaborn as sns


from numpy import bincount, linspace, mean, std, arange, squeeze

import itertools, time, datetime

import warnings
warnings.simplefilter('ignore')

%matplotlib inline

In [1]:
# Relative locations of the competition CSV files: test set, training set and sample submission.
test_data_path, train_data_path, sample_submission_path = (
    'dataset/test.csv',
    'dataset/train.csv',
    'dataset/sample_submission.csv',
)


In [8]:
train_data = panda.read_csv(train_data_path)

In [10]:
train_data_shape = train_data.shape
train_data_columns = train_data.columns.tolist()

In [11]:
train_data_columns

['Complaint-ID',
 'Date-received',
 'Transaction-Type',
 'Complaint-reason',
 'Company-response',
 'Date-sent-to-company',
 'Complaint-Status',
 'Consumer-disputes',
 'Consumer-complaint-summary']

In [16]:
# Normalise every column label (lower-case, surrounding whitespace removed) so the
# names are easier to type in the rest of the notebook.
train_data.columns = train_data.columns.map(lambda label: label.lower().strip())

list(train_data.columns)

['complaint-id',
 'date-received',
 'transaction-type',
 'complaint-reason',
 'company-response',
 'date-sent-to-company',
 'complaint-status',
 'consumer-disputes',
 'consumer-complaint-summary']

In [17]:
target_column = 'complaint-status'

In [18]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43266 entries, 0 to 43265
Data columns (total 9 columns):
complaint-id                  43266 non-null object
date-received                 43266 non-null object
transaction-type              43266 non-null object
complaint-reason              43266 non-null object
company-response              20760 non-null object
date-sent-to-company          43266 non-null object
complaint-status              43266 non-null object
consumer-disputes             35568 non-null object
consumer-complaint-summary    43266 non-null object
dtypes: object(9)
memory usage: 3.0+ MB


In [19]:
train_data.describe(include='all')

Unnamed: 0,complaint-id,date-received,transaction-type,complaint-reason,company-response,date-sent-to-company,complaint-status,consumer-disputes,consumer-complaint-summary
count,43266,43266,43266,43266,20760,43266,43266,35568,43266
unique,43266,920,18,150,10,930,5,2,43022
top,Tr-17718,1/19/2017,Debt collection,Incorrect information on credit report,Company has responded to the consumer and the ...,1/19/2017,Closed with explanation,No,I am filing this complaint because Experian ha...
freq,1,226,10422,4466,10928,182,34300,27663,13


### Let's work on the date columns first.

##### Date Received

<li> Check for empty values </li>
<li> Convert into the number of days elapsed between that date and today </li> <br>

In [41]:
train_data[['date-received']].isnull().sum()

date-received    0
dtype: int64

In [39]:
train_data['date-received'].value_counts()

1/19/2017     226
1/20/2017     175
4/26/2017     133
4/25/2017     125
9/8/2017      125
9/9/2017      113
1/24/2017     111
1/23/2017     110
7/18/2017     108
5/9/2017      106
5/17/2017     101
9/20/2016     100
1/25/2017     100
7/12/2017      97
6/6/2017       95
5/23/2017      95
5/10/2017      94
2/15/2017      94
7/17/2017      93
5/2/2017       91
1/26/2017      90
5/16/2017      90
9/28/2016      90
4/24/2017      89
3/29/2017      89
4/28/2017      89
8/4/2017       89
7/13/2017      89
7/7/2017       89
7/27/2017      89
             ... 
8/23/2017      10
8/9/2017       10
9/2/2017       10
8/6/2017       10
6/21/2015       9
9/4/2016        9
8/25/2017       9
1/3/2016        9
9/12/2017       8
12/25/2016      8
9/11/2017       8
9/24/2017       7
12/25/2015      7
8/26/2017       6
9/4/2017        6
11/3/2017       6
9/3/2017        5
9/25/2017       5
9/10/2017       5
9/20/2017       4
9/19/2017       4
9/29/2017       4
9/16/2017       3
8/27/2017       2
8/29/2017 

In [31]:
from dateutil import relativedelta


def get_days_passed(given, reference=None):
    """Return the number of whole days between ``given`` and a reference moment.

    Parameters
    ----------
    given : str
        Date formatted as month/day/year, e.g. ``'11/11/2015'``.
    reference : datetime.datetime, optional
        Moment to measure up to. When omitted, the current local time is used,
        which is what a one-argument call has always done. Pass a fixed value
        to keep the derived feature identical from one run to the next.

    Returns
    -------
    int
        ``(reference - given).days``; negative when ``given`` lies after
        ``reference``.

    Raises
    ------
    ValueError
        If ``given`` does not match the ``%m/%d/%Y`` format.
    """
    if reference is None:
        # Default kept for backward compatibility; note that it makes the
        # result depend on the day the notebook is executed.
        reference = datetime.datetime.now()

    parsed = datetime.datetime.strptime(given, '%m/%d/%Y')

    return (reference - parsed).days

In [33]:
# Age of each complaint in days, derived from its 'date-received' string.
train_data['days_passed_since_complaint_received'] = train_data['date-received'].apply(get_days_passed)

In [37]:

##lets compare the first 10 rows to check
train_data[['date-received', 'days_passed_since_complaint_received']].head(10)

Unnamed: 0,date-received,days_passed_since_complaint_received
0,11/11/2015,1154
1,7/7/2015,1281
2,5/7/2015,1342
3,11/12/2016,787
4,9/29/2016,831
5,8/2/2016,889
6,3/26/2017,653
7,10/15/2016,815
8,1/18/2016,1086
9,8/17/2015,1240


In [38]:
train_data[['date-received', 'days_passed_since_complaint_received']].tail(10)

## seems about right: the more recent the date received, the fewer days have passed

Unnamed: 0,date-received,days_passed_since_complaint_received
43256,9/28/2015,1198
43257,4/28/2015,1351
43258,4/25/2017,623
43259,11/23/2016,776
43260,5/10/2017,608
43261,7/28/2017,529
43262,1/23/2017,715
43263,3/9/2017,670
43264,7/15/2017,542
43265,6/8/2017,579


##### Date Sent to Company

<li> Check for empty values </li>
<li> Convert into the number of days elapsed between that date and today </li> <br>

In [40]:
train_data['date-sent-to-company'].isnull().sum()

0

In [42]:
train_data['date-sent-to-company'].value_counts()

1/19/2017     182
1/20/2017     156
4/26/2017     124
5/3/2017      119
5/17/2017     119
4/25/2017     116
9/8/2017      115
9/9/2017      113
1/25/2017     113
1/24/2017     112
11/13/2015    110
1/18/2017     109
7/18/2017     109
9/20/2016     108
2/15/2017     106
7/12/2017     104
4/28/2017     103
7/17/2017     102
6/12/2017     102
5/10/2017     101
6/6/2017      101
1/27/2017     101
5/23/2017     100
3/22/2017      99
7/7/2017       99
5/4/2017       97
1/23/2017      97
6/21/2017      97
3/29/2017      97
5/31/2017      96
             ... 
8/26/2017       6
12/25/2015      6
6/14/2015       6
8/29/2017       6
11/3/2017       6
6/7/2015        6
5/17/2015       6
6/21/2015       5
9/25/2017       5
9/4/2017        5
9/10/2017       5
5/10/2015       5
9/13/2017       5
9/20/2017       4
9/29/2017       4
9/16/2017       3
9/14/2017       3
9/28/2017       2
8/27/2017       2
9/21/2017       2
9/22/2017       2
11/2/2017       2
10/11/2017      2
10/15/2017      1
10/10/2017

In [43]:
# Days elapsed since each complaint was forwarded to the company, from 'date-sent-to-company'.
train_data['days_since_complaint_sent_to_company'] = train_data['date-sent-to-company'].apply(get_days_passed)

In [44]:
train_data.head()

Unnamed: 0,complaint-id,date-received,transaction-type,complaint-reason,company-response,date-sent-to-company,complaint-status,consumer-disputes,consumer-complaint-summary,days_passed_since_complaint_received,days_since_complaint_sent_to_company
0,Tr-1,11/11/2015,Mortgage,"Loan servicing, payments, escrow account",,11/11/2015,Closed with explanation,Yes,"Seterus, Inc a déposé un faux rapport auprès d...",1154,1154
1,Tr-2,7/7/2015,Credit reporting,Incorrect information on credit report,Company chooses not to provide a public response,7/7/2015,Closed with non-monetary relief,No,XX / XX / XXXX La requête en faillite n ° XXXX...,1281,1281
2,Tr-3,5/7/2015,Bank account or service,Using a debit or ATM card,,5/7/2015,Closed with explanation,No,"El XXXX / XXXX / 15, estaba preparando el vuel...",1342,1342
3,Tr-4,11/12/2016,Debt collection,Cont'd attempts collect debt not owed,Company believes it acted appropriately as aut...,11/12/2016,Closed with explanation,No,"The loan was paid in XXXX XXXX. In XXXX, 4 yea...",787,787
4,Tr-5,9/29/2016,Credit card,Payoff process,Company has responded to the consumer and the ...,9/29/2016,Closed with explanation,No,J'ai obtenu un compte de crédit de soins pour ...,831,831


In [45]:
## Show the complaints whose received day count differs from the sent-to-company day count

train_data.loc[
    train_data['days_passed_since_complaint_received'].ne(
        train_data['days_since_complaint_sent_to_company']
    )
]

Unnamed: 0,complaint-id,date-received,transaction-type,complaint-reason,company-response,date-sent-to-company,complaint-status,consumer-disputes,consumer-complaint-summary,days_passed_since_complaint_received,days_since_complaint_sent_to_company
5,Tr-6,8/2/2016,Mortgage,"Loan modification,collection,foreclosure",,8/3/2016,Closed with explanation,Yes,The owner of my original mortgage filed for ba...,889,888
10,Tr-11,1/10/2016,Mortgage,"Loan servicing, payments, escrow account",,1/15/2016,Closed with explanation,No,"My mortgage provider, United Wholesale Mortgag...",1094,1089
19,Tr-20,8/31/2016,Debt collection,Cont'd attempts collect debt not owed,,9/7/2016,Closed with explanation,No,"Cuando disputé con las agencias de crédito, so...",860,853
31,Tr-32,5/26/2015,Mortgage,Settlement process and costs,Company chooses not to provide a public response,5/29/2015,Closed with explanation,Yes,We request a simplified refinancing XXXX XXXX ...,1323,1320
33,Tr-34,3/2/2017,Student loan,Dealing with my lender or servicer,,3/6/2017,Closed with explanation,No,I reviewed my credit report and noticed 3 late...,677,673
34,Tr-35,8/8/2016,Debt collection,Cont'd attempts collect debt not owed,,8/10/2016,Closed with explanation,No,J'ai déposé un différend avec XXXX concernant ...,883,881
37,Tr-38,11/21/2015,Debt collection,Cont'd attempts collect debt not owed,,12/29/2015,Untimely response,No,My husband and I had a credit card debt and th...,1144,1106
45,Tr-46,10/28/2015,Payday loan,Charged fees or interest I didn't expect,,10/30/2015,Closed with explanation,No,He tenido un préstamo con Rise Credit por más ...,1168,1166
46,Tr-47,4/16/2016,Student loan,Dealing with my lender or servicer,,4/20/2016,Closed with explanation,No,"XX / XX / XXXX, I submitted my tax documents f...",997,993
51,Tr-52,7/26/2015,Mortgage,"Loan servicing, payments, escrow account",Company believes it acted appropriately as aut...,7/29/2015,Closed with non-monetary relief,No,"Here we go again. Obviously, Greetree does not...",1262,1259


A sizeable share of complaints were sent to the company on a different day from the one on which they were received.
<br><br>
Now that we have dealt with the date columns, we will move on to the remaining columns, most of which are categorical.

<br>
We are going to ignore the consumer-complaint-summary column for now. Let's check our scores without it.

##### Transaction Type

<li> Check for empty values </li>
<li> Convert to numerical values </li>

In [48]:
train_data['transaction-type'].value_counts()

Debt collection                                                                 10422
Mortgage                                                                         7950
Credit reporting                                                                 6706
Credit card                                                                      4119
Bank account or service                                                          3264
Credit reporting, credit repair services, or other personal consumer reports     3169
Student loan                                                                     2874
Consumer Loan                                                                    2033
Credit card or prepaid card                                                       719
Checking or savings account                                                       496
Payday loan                                                                       363
Money transfers                                       

In [51]:
train_data['transaction-type'].isnull().sum()

0

In [71]:
# Learn the transaction-type vocabulary and convert the column to integer codes in one step.
le = LabelEncoder()
transformed_transaction_types = le.fit_transform(train_data['transaction-type'].values)

# The fitted classes are listed in code order (code i corresponds to classes_[i]).
print(le.classes_)


['Bank account or service' 'Checking or savings account' 'Consumer Loan'
 'Credit card' 'Credit card or prepaid card' 'Credit reporting'
 'Credit reporting, credit repair services, or other personal consumer reports'
 'Debt collection' 'Money transfer, virtual currency, or money service'
 'Money transfers' 'Mortgage' 'Other financial service' 'Payday loan'
 'Payday loan, title loan, or personal loan' 'Prepaid card' 'Student loan'
 'Vehicle loan or lease' 'Virtual currency']


In [80]:
# One-hot encode the integer transaction-type codes. OneHotEncoder expects a 2-D
# (n_samples, n_features) input, so the 1-D code vector is turned into a single column.
# reshape(-1, 1) is used instead of indexing with np.newaxis because this notebook
# never binds the name `np` (only selected numpy functions are imported), which made
# the original line fail with a NameError on a fresh kernel.
transformed = OneHotEncoder().fit_transform(transformed_transaction_types.reshape(-1, 1))

print(transformed.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [84]:
# Dense indicator matrix as a DataFrame, one column per transaction type,
# labelled with the encoder's class names.
tt = panda.DataFrame(transformed.toarray(), columns=le.classes_)
tt

Unnamed: 0,Bank account or service,Checking or savings account,Consumer Loan,Credit card,Credit card or prepaid card,Credit reporting,"Credit reporting, credit repair services, or other personal consumer reports",Debt collection,"Money transfer, virtual currency, or money service",Money transfers,Mortgage,Other financial service,Payday loan,"Payday loan, title loan, or personal loan",Prepaid card,Student loan,Vehicle loan or lease,Virtual currency
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
# Append the transaction-type indicator columns to the right of the original columns.
# concat with axis=1 aligns rows on their index labels.
# NOTE(review): this relies on train_data and tt sharing the same index — confirm if rows are ever filtered or reordered upstream.
train_data_transaction_type_encoded = panda.concat([train_data,tt], axis =1)

In [93]:
## in order to not run out of memory we will delete the tt object

del tt
del le

In [87]:
train_data_transaction_type_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43266 entries, 0 to 43265
Data columns (total 29 columns):
complaint-id                                                                    43266 non-null object
date-received                                                                   43266 non-null object
transaction-type                                                                43266 non-null object
complaint-reason                                                                43266 non-null object
company-response                                                                20760 non-null object
date-sent-to-company                                                            43266 non-null object
complaint-status                                                                43266 non-null object
consumer-disputes                                                               35568 non-null object
consumer-complaint-summary                                                 

##### Complaint Reason

<li> Check for empty values </li>
<li> Convert to numerical value </li>

In [88]:
train_data_transaction_type_encoded['complaint-reason'].isnull().sum()

0

In [90]:
train_data_transaction_type_encoded['complaint-reason'].value_counts()
## that adds up to 150 different values in total

Incorrect information on credit report                                              4466
Cont'd attempts collect debt not owed                                               3755
Loan servicing, payments, escrow account                                            3216
Loan modification,collection,foreclosure                                            2385
Dealing with my lender or servicer                                                  1683
Disclosure verification of debt                                                     1625
Incorrect information on your report                                                1587
Communication tactics                                                               1487
Account opening, closing, or management                                             1423
Credit reporting company's investigation                                            1249
Managing the loan or lease                                                          1027
False statements or r

#### IMPORTANT: We will attempt to bin these values later, since business acumen is required to do so. By binning I mean that values such as managing line of credit, incorrect line of credit and getting line of credit can all be marked as LOC.

In [95]:
# Encode 'complaint-reason' the same way as 'transaction-type': integer codes first,
# then one indicator column per distinct reason.
le = LabelEncoder().fit(train_data_transaction_type_encoded['complaint-reason'].values)
transformed_values  = le.transform(train_data_transaction_type_encoded['complaint-reason'].values)

# OneHotEncoder needs a 2-D column. reshape(-1, 1) provides it without relying on the
# name `np`, which this notebook never imports (the original np.newaxis indexing
# raised a NameError on a fresh kernel).
transformed = OneHotEncoder().fit_transform(transformed_values.reshape(-1, 1))

# Label the indicator columns with the encoder's class names (code i -> classes_[i]).
tt = panda.DataFrame(transformed.toarray())
tt.columns = le.classes_

tt

Unnamed: 0,APR or interest rate,"Account opening, closing, or management",Account terms and changes,Adding money,Advertising,Advertising and marketing,"Advertising and marketing, including promotional offers","Advertising, marketing or disclosures",Application processing delay,"Application, originator, mortgage broker",...,Unauthorized transactions or other transaction problem,Unauthorized transactions/trans. issues,Unexpected or other fees,Unexpected/Other fees,Unsolicited issuance of credit card,Using a debit or ATM card,Vehicle was repossessed or sold the vehicle,"Was approved for a loan, but didn't receive the money",Written notification about debt,Wrong amount charged or received
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
train_data_two_encoded = panda.concat([train_data_transaction_type_encoded,tt],axis = 1)

train_data_two_encoded

Unnamed: 0,complaint-id,date-received,transaction-type,complaint-reason,company-response,date-sent-to-company,complaint-status,consumer-disputes,consumer-complaint-summary,days_passed_since_complaint_received,...,Unauthorized transactions or other transaction problem,Unauthorized transactions/trans. issues,Unexpected or other fees,Unexpected/Other fees,Unsolicited issuance of credit card,Using a debit or ATM card,Vehicle was repossessed or sold the vehicle,"Was approved for a loan, but didn't receive the money",Written notification about debt,Wrong amount charged or received
0,Tr-1,11/11/2015,Mortgage,"Loan servicing, payments, escrow account",,11/11/2015,Closed with explanation,Yes,"Seterus, Inc a déposé un faux rapport auprès d...",1154,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Tr-2,7/7/2015,Credit reporting,Incorrect information on credit report,Company chooses not to provide a public response,7/7/2015,Closed with non-monetary relief,No,XX / XX / XXXX La requête en faillite n ° XXXX...,1281,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Tr-3,5/7/2015,Bank account or service,Using a debit or ATM card,,5/7/2015,Closed with explanation,No,"El XXXX / XXXX / 15, estaba preparando el vuel...",1342,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,Tr-4,11/12/2016,Debt collection,Cont'd attempts collect debt not owed,Company believes it acted appropriately as aut...,11/12/2016,Closed with explanation,No,"The loan was paid in XXXX XXXX. In XXXX, 4 yea...",787,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Tr-5,9/29/2016,Credit card,Payoff process,Company has responded to the consumer and the ...,9/29/2016,Closed with explanation,No,J'ai obtenu un compte de crédit de soins pour ...,831,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Tr-6,8/2/2016,Mortgage,"Loan modification,collection,foreclosure",,8/3/2016,Closed with explanation,Yes,The owner of my original mortgage filed for ba...,889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Tr-7,3/26/2017,Credit reporting,Incorrect information on credit report,,3/26/2017,Closed with explanation,No,J'ai été victime d'une fraude d'identité et j'...,653,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Tr-8,10/15/2016,Bank account or service,Problems caused by my funds being low,Company has responded to the consumer and the ...,10/15/2016,Closed with explanation,No,"Je suis en train de faire faillite et, par con...",815,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Tr-9,1/18/2016,Debt collection,Cont'd attempts collect debt not owed,,1/18/2016,Closed with explanation,Yes,Una agencia de cobranza me hizo adulterar de q...,1086,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Tr-10,8/17/2015,Credit reporting,Incorrect information on credit report,Company chooses not to provide a public response,8/17/2015,Closed with non-monetary relief,No,"Le XXXX / XXXX / 2015, j'ai reçu une réponse d...",1240,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
del tt
del le

##### Company Response

<li> Check for empty values </li>
<li> Convert to numerical value </li>

In [100]:
train_data_two_encoded['company-response'].isnull().sum()

## There are quite a few null values; we will see what we can do about them.
## Plan: check the complaint status of the rows whose company response is NaN.
## If the status is the same throughout, we can replace NaN with a made-up placeholder value;
## otherwise we will replace it with the most frequently appearing response.

22506

In [101]:
train_data_two_encoded['company-response'].value_counts()

Company has responded to the consumer and the CFPB and chooses not to provide a public response                            10928
Company chooses not to provide a public response                                                                            4322
Company believes it acted appropriately as authorized by contract or law                                                    3811
Company believes the complaint is the result of a misunderstanding                                                           387
Company disputes the facts presented in the complaint                                                                        379
Company believes complaint is the result of an isolated error                                                                302
Company believes complaint caused principally by actions of third party outside the control or direction of the company      300
Company can't verify or dispute the facts in the complaint                                       

In [112]:
# Rows with a missing company response, reduced to the two columns under inspection.
empty_response = train_data_two_encoded.loc[
    train_data_two_encoded['company-response'].isnull(),
    ['company-response', 'complaint-status'],
]

empty_response

Unnamed: 0,company-response,complaint-status
0,,Closed with explanation
2,,Closed with explanation
5,,Closed with explanation
6,,Closed with explanation
8,,Closed with explanation
10,,Closed with explanation
11,,Closed with explanation
14,,Closed
17,,Closed with explanation
19,,Closed with explanation


In [113]:
empty_response['complaint-status'].value_counts()

Closed with explanation            18696
Closed with non-monetary relief     1863
Closed with monetary relief         1171
Closed                               455
Untimely response                    321
Name: complaint-status, dtype: int64

#### Since most of these statuses are some form of closed, we will choose a categorical value to replace the NaN values in company-response. Let's get the most frequently appearing company response among complaints whose status matches one of those above (excluding the NaN responses).

In [132]:
# Number of rows per complaint status among the rows with a missing company response.
a = empty_response['complaint-status'].value_counts()

# Company responses of every row whose complaint status is one of those statuses.
# NOTE(review): the counts shown for empty_response list five statuses and describe()
# reports five unique statuses overall, so this filter appears to keep every row; the
# tally below would then simply be the overall response distribution — confirm.
t =train_data_two_encoded[train_data_two_encoded['complaint-status'].isin(a.index.tolist())][['company-response']]

# value_counts() ignores NaN by default, so only the non-missing responses are tallied.
t['company-response'].value_counts()

Company has responded to the consumer and the CFPB and chooses not to provide a public response                            10928
Company chooses not to provide a public response                                                                            4322
Company believes it acted appropriately as authorized by contract or law                                                    3811
Company believes the complaint is the result of a misunderstanding                                                           387
Company disputes the facts presented in the complaint                                                                        379
Company believes complaint is the result of an isolated error                                                                302
Company believes complaint caused principally by actions of third party outside the control or direction of the company      300
Company can't verify or dispute the facts in the complaint                                       

In [134]:
# value_counts() sorts by descending frequency, so the first index label is the most common response.
most_appearing = t['company-response'].value_counts().index[0]

In [135]:
del t, empty_response, a


<br> We will replace the empty values in the company-response column with its most frequently appearing value.

In [136]:
# Fill the missing company responses with the most frequent response.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on the selected Series: that chained in-place form acts on an intermediate object,
# emits a FutureWarning in pandas 2.x and leaves the frame unchanged under Copy-on-Write.
train_data_two_encoded['company-response'] = train_data_two_encoded['company-response'].fillna(most_appearing)

# Sanity check: no missing responses should remain.
train_data_two_encoded['company-response'].isnull().sum()

0

### At this point we are done with our feature engineering/data wrangling, except for the complaint-summary column, which we will handle later.