## Merging Complaint Reason and Preparing Test/Train Data

In [2]:
import pandas as panda

from sklearn.model_selection import learning_curve, train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report, \
    confusion_matrix, f1_score, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

from matplotlib import pyplot as plot
import seaborn as sns


from numpy import bincount, linspace, mean, std, arange, squeeze

import itertools, time, datetime

import warnings
warnings.simplefilter('ignore')

%matplotlib inline

In [3]:

test_data_path ='dataset/test.csv'
train_data_path = 'dataset/train.csv'
sample_submission_path = 'dataset/sample_submission.csv'
test_complaint_reason_path = 'dataset/test_data_complaint_reason.csv'
train_complaint_reason_path = 'dataset/train_data_complaint_reason.csv'

In [4]:
test_data = panda.read_csv(test_data_path)
train_data = panda.read_csv(train_data_path)
test_data_complaint_reason = panda.read_csv(test_complaint_reason_path)
train_data_complaint_reason = panda.read_csv(train_complaint_reason_path)

In [5]:
test_data.shape, train_data.shape, test_data_complaint_reason.shape, train_data_complaint_reason.shape

((18543, 8), (43266, 9), (18543, 3), (43266, 3))

In [6]:
train_data.columns = [i.lower().strip().replace('-','_') for i in train_data.columns.tolist()]
train_data.columns.tolist()

['complaint_id',
 'date_received',
 'transaction_type',
 'complaint_reason',
 'company_response',
 'date_sent_to_company',
 'complaint_status',
 'consumer_disputes',
 'consumer_complaint_summary']

In [7]:
test_data.columns = [i.lower().strip().replace('-','_') for i in test_data.columns.tolist()]
test_data.columns.tolist()

['complaint_id',
 'date_received',
 'transaction_type',
 'complaint_reason',
 'company_response',
 'date_sent_to_company',
 'consumer_disputes',
 'consumer_complaint_summary']

In [10]:
test_data_complaint_reason.columns.tolist(),train_data_complaint_reason.columns.tolist()

(['Unnamed: 0', 'complaint_reason', 'complaint_reason_encoded'],
 ['Unnamed: 0', 'complaint_reason', 'complaint_reason_encoded'])

In [11]:
test_data_complaint_reason.drop(['Unnamed: 0'], axis =1, inplace = True)
test_data_complaint_reason.columns.tolist()

['complaint_reason', 'complaint_reason_encoded']

In [12]:
train_data_complaint_reason.drop(['Unnamed: 0'], axis =1, inplace = True)
train_data_complaint_reason.columns.tolist()

['complaint_reason', 'complaint_reason_encoded']

In [17]:
test_data_merged = panda.concat([test_data, test_data_complaint_reason], axis =1 ) #test_data.merge(test_data_complaint_reason, how ='inner', on='complaint_reason')

In [19]:
test_data_merged.shape, test_data.shape

((18543, 10), (18543, 8))

In [20]:
test_data_merged.head(5)

Unnamed: 0,complaint_id,date_received,transaction_type,complaint_reason,company_response,date_sent_to_company,consumer_disputes,consumer_complaint_summary,complaint_reason.1,complaint_reason_encoded
0,Te-1,8/18/2016,Bank account or service,"Account opening, closing, or management",Company has responded to the consumer and the ...,8/18/2016,No,XXXX / XXXX / 16 I called Citibank to open a c...,"Account opening, closing, or management",18
1,Te-2,4/18/2016,Debt collection,Communication tactics,Company believes it acted appropriately as aut...,4/20/2016,No,I'm struggling financially. I called and I off...,Communication tactics,24
2,Te-3,3/23/2016,Credit reporting,Incorrect information on credit report,,3/23/2016,No,"In XXXX of 2015, an automatic payment was conf...",Incorrect information on credit report,13
3,Te-4,6/26/2017,Student loan,Dealing with your lender or servicer,,6/26/2017,,"I submitted a request to XXXX, which is my cur...",Dealing with your lender or servicer,1
4,Te-5,5/13/2016,Credit reporting,Incorrect information on credit report,Company has responded to the consumer and the ...,5/13/2016,No,A state tax lien was filed against me XXXX / X...,Incorrect information on credit report,13


In [22]:
train_data_merged = panda.concat([train_data, train_data_complaint_reason], axis =1)

train_data_merged.shape, train_data.shape

((43266, 11), (43266, 9))

In [23]:
from dateutil import relativedelta


def get_days_passed( given):
    
    current = datetime.datetime.now()
    given = datetime.datetime.strptime(given, '%m/%d/%Y')
    
    return (current-given).days

In [24]:
train_data_merged['days_passed_since_complaint_received'] = train_data_merged['date_received'].apply(lambda x : get_days_passed(x))

In [25]:
test_data_merged['days_passed_since_complaint_received'] = test_data_merged['date_received'].apply(lambda x : get_days_passed(x))

In [28]:
train_data_merged['days_since_complaint_sent_to_company'] = train_data_merged['date_sent_to_company'].apply(lambda x: get_days_passed(x))

In [29]:
test_data_merged['days_since_complaint_sent_to_company'] = test_data_merged['date_sent_to_company'].apply(lambda x: get_days_passed(x))

In [30]:
le = LabelEncoder().fit(train_data_merged['transaction_type'].values)

print(le.classes_)

transformed_transaction_types = le.transform(train_data_merged['transaction_type'].values)


['Bank account or service' 'Checking or savings account' 'Consumer Loan'
 'Credit card' 'Credit card or prepaid card' 'Credit reporting'
 'Credit reporting, credit repair services, or other personal consumer reports'
 'Debt collection' 'Money transfer, virtual currency, or money service'
 'Money transfers' 'Mortgage' 'Other financial service' 'Payday loan'
 'Payday loan, title loan, or personal loan' 'Prepaid card' 'Student loan'
 'Vehicle loan or lease' 'Virtual currency']


In [31]:
transformed = OneHotEncoder().fit_transform(transformed_transaction_types[:,np.newaxis])

print(transformed.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [33]:
tt = panda.DataFrame(transformed.toarray())
tt.columns = le.classes_
tt.head()

Unnamed: 0,Bank account or service,Checking or savings account,Consumer Loan,Credit card,Credit card or prepaid card,Credit reporting,"Credit reporting, credit repair services, or other personal consumer reports",Debt collection,"Money transfer, virtual currency, or money service",Money transfers,Mortgage,Other financial service,Payday loan,"Payday loan, title loan, or personal loan",Prepaid card,Student loan,Vehicle loan or lease,Virtual currency
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
train_data_transaction_type_encoded = panda.concat([train_data_merged,tt], axis =1)

In [35]:
train_data_transaction_type_encoded.head()

Unnamed: 0,complaint_id,date_received,transaction_type,complaint_reason,company_response,date_sent_to_company,complaint_status,consumer_disputes,consumer_complaint_summary,complaint_reason.1,...,"Money transfer, virtual currency, or money service",Money transfers,Mortgage,Other financial service,Payday loan,"Payday loan, title loan, or personal loan",Prepaid card,Student loan,Vehicle loan or lease,Virtual currency
0,Tr-1,11/11/2015,Mortgage,"Loan servicing, payments, escrow account",,11/11/2015,Closed with explanation,Yes,"Seterus, Inc a déposé un faux rapport auprès d...","Loan servicing, payments, escrow account",...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Tr-2,7/7/2015,Credit reporting,Incorrect information on credit report,Company chooses not to provide a public response,7/7/2015,Closed with non-monetary relief,No,XX / XX / XXXX La requête en faillite n ° XXXX...,Incorrect information on credit report,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Tr-3,5/7/2015,Bank account or service,Using a debit or ATM card,,5/7/2015,Closed with explanation,No,"El XXXX / XXXX / 15, estaba preparando el vuel...",Using a debit or ATM card,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Tr-4,11/12/2016,Debt collection,Cont'd attempts collect debt not owed,Company believes it acted appropriately as aut...,11/12/2016,Closed with explanation,No,"The loan was paid in XXXX XXXX. In XXXX, 4 yea...",Cont'd attempts collect debt not owed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Tr-5,9/29/2016,Credit card,Payoff process,Company has responded to the consumer and the ...,9/29/2016,Closed with explanation,No,J'ai obtenu un compte de crédit de soins pour ...,Payoff process,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
del tt, le

In [37]:
le = LabelEncoder().fit(test_data_merged['transaction_type'].values)


transformed_transaction_types = le.transform(test_data_merged['transaction_type'].values)


transformed = OneHotEncoder().fit_transform(transformed_transaction_types[:,np.newaxis])

# print(transformed.toarray())

tt = panda.DataFrame(transformed.toarray())
tt.columns = le.classes_
# tt.head()

test_data_transaction_type_encoded = panda.concat([test_data_merged,tt], axis =1)

test_data_transaction_type_encoded.head()

Unnamed: 0,complaint_id,date_received,transaction_type,complaint_reason,company_response,date_sent_to_company,consumer_disputes,consumer_complaint_summary,complaint_reason.1,complaint_reason_encoded,...,"Money transfer, virtual currency, or money service",Money transfers,Mortgage,Other financial service,Payday loan,"Payday loan, title loan, or personal loan",Prepaid card,Student loan,Vehicle loan or lease,Virtual currency
0,Te-1,8/18/2016,Bank account or service,"Account opening, closing, or management",Company has responded to the consumer and the ...,8/18/2016,No,XXXX / XXXX / 16 I called Citibank to open a c...,"Account opening, closing, or management",18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Te-2,4/18/2016,Debt collection,Communication tactics,Company believes it acted appropriately as aut...,4/20/2016,No,I'm struggling financially. I called and I off...,Communication tactics,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Te-3,3/23/2016,Credit reporting,Incorrect information on credit report,,3/23/2016,No,"In XXXX of 2015, an automatic payment was conf...",Incorrect information on credit report,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Te-4,6/26/2017,Student loan,Dealing with your lender or servicer,,6/26/2017,,"I submitted a request to XXXX, which is my cur...",Dealing with your lender or servicer,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Te-5,5/13/2016,Credit reporting,Incorrect information on credit report,Company has responded to the consumer and the ...,5/13/2016,No,A state tax lien was filed against me XXXX / X...,Incorrect information on credit report,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
del tt, le

In [42]:
le = LabelEncoder().fit(test_data_transaction_type_encoded['complaint_reason_encoded'].values)


transformed_transaction_types = le.transform(test_data_transaction_type_encoded['complaint_reason_encoded'].values)


transformed = OneHotEncoder().fit_transform(transformed_transaction_types[:,np.newaxis])

# print(transformed.toarray())

tt = panda.DataFrame(transformed.toarray())
tt.columns = le.classes_
# tt.head()

test_data_complaint_reason_encoded = panda.concat([test_data_transaction_type_encoded,tt], axis =1)

del tt, le
test_data_complaint_reason_encoded.columns.tolist()

['complaint_id',
 'date_received',
 'transaction_type',
 'complaint_reason',
 'company_response',
 'date_sent_to_company',
 'consumer_disputes',
 'consumer_complaint_summary',
 'complaint_reason',
 'complaint_reason_encoded',
 'days_passed_since_complaint_received',
 'days_since_complaint_sent_to_company',
 'Bank account or service',
 'Checking or savings account',
 'Consumer Loan',
 'Credit card',
 'Credit card or prepaid card',
 'Credit reporting',
 'Credit reporting, credit repair services, or other personal consumer reports',
 'Debt collection',
 'Money transfer, virtual currency, or money service',
 'Money transfers',
 'Mortgage',
 'Other financial service',
 'Payday loan',
 'Payday loan, title loan, or personal loan',
 'Prepaid card',
 'Student loan',
 'Vehicle loan or lease',
 'Virtual currency',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24]

In [43]:
test_data_complaint_reason_encoded.head()

Unnamed: 0,complaint_id,date_received,transaction_type,complaint_reason,company_response,date_sent_to_company,consumer_disputes,consumer_complaint_summary,complaint_reason.1,complaint_reason_encoded,...,15,16,17,18,19,20,21,22,23,24
0,Te-1,8/18/2016,Bank account or service,"Account opening, closing, or management",Company has responded to the consumer and the ...,8/18/2016,No,XXXX / XXXX / 16 I called Citibank to open a c...,"Account opening, closing, or management",18,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Te-2,4/18/2016,Debt collection,Communication tactics,Company believes it acted appropriately as aut...,4/20/2016,No,I'm struggling financially. I called and I off...,Communication tactics,24,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Te-3,3/23/2016,Credit reporting,Incorrect information on credit report,,3/23/2016,No,"In XXXX of 2015, an automatic payment was conf...",Incorrect information on credit report,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Te-4,6/26/2017,Student loan,Dealing with your lender or servicer,,6/26/2017,,"I submitted a request to XXXX, which is my cur...",Dealing with your lender or servicer,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Te-5,5/13/2016,Credit reporting,Incorrect information on credit report,Company has responded to the consumer and the ...,5/13/2016,No,A state tax lien was filed against me XXXX / X...,Incorrect information on credit report,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
le = LabelEncoder().fit(train_data_transaction_type_encoded['complaint_reason_encoded'].values)


transformed_transaction_types = le.transform(train_data_transaction_type_encoded['complaint_reason_encoded'].values)


transformed = OneHotEncoder().fit_transform(transformed_transaction_types[:,np.newaxis])

# print(transformed.toarray())

tt = panda.DataFrame(transformed.toarray())
tt.columns = le.classes_
# tt.head()

train_data_complaint_reason_encoded = panda.concat([train_data_transaction_type_encoded,tt], axis =1)

del tt, le
train_data_complaint_reason_encoded.head()

Unnamed: 0,complaint_id,date_received,transaction_type,complaint_reason,company_response,date_sent_to_company,complaint_status,consumer_disputes,consumer_complaint_summary,complaint_reason.1,...,15,16,17,18,19,20,21,22,23,24
0,Tr-1,11/11/2015,Mortgage,"Loan servicing, payments, escrow account",,11/11/2015,Closed with explanation,Yes,"Seterus, Inc a déposé un faux rapport auprès d...","Loan servicing, payments, escrow account",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Tr-2,7/7/2015,Credit reporting,Incorrect information on credit report,Company chooses not to provide a public response,7/7/2015,Closed with non-monetary relief,No,XX / XX / XXXX La requête en faillite n ° XXXX...,Incorrect information on credit report,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Tr-3,5/7/2015,Bank account or service,Using a debit or ATM card,,5/7/2015,Closed with explanation,No,"El XXXX / XXXX / 15, estaba preparando el vuel...",Using a debit or ATM card,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Tr-4,11/12/2016,Debt collection,Cont'd attempts collect debt not owed,Company believes it acted appropriately as aut...,11/12/2016,Closed with explanation,No,"The loan was paid in XXXX XXXX. In XXXX, 4 yea...",Cont'd attempts collect debt not owed,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Tr-5,9/29/2016,Credit card,Payoff process,Company has responded to the consumer and the ...,9/29/2016,Closed with explanation,No,J'ai obtenu un compte de crédit de soins pour ...,Payoff process,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [46]:
train_data_complaint_reason_encoded['company_response'].isnull().sum()

## there are quite a few null values.. we will see what we can do about it.
## check the complaint status for the ones which has nan in company response
## if status is same throughout-- we can replace nan with some made up status
## else we will replace with the most appearing response

22506

In [47]:
train_data_complaint_reason_encoded['company_response'].value_counts()

Company has responded to the consumer and the CFPB and chooses not to provide a public response                            10928
Company chooses not to provide a public response                                                                            4322
Company believes it acted appropriately as authorized by contract or law                                                    3811
Company believes the complaint is the result of a misunderstanding                                                           387
Company disputes the facts presented in the complaint                                                                        379
Company believes complaint is the result of an isolated error                                                                302
Company believes complaint caused principally by actions of third party outside the control or direction of the company      300
Company can't verify or dispute the facts in the complaint                                       

In [66]:
empty_response = train_data_complaint_reason_encoded[train_data_complaint_reason_encoded['company_response'].isnull()][['company_response','complaint_status']]


In [67]:
empty_response['complaint_status'].value_counts()

Closed with explanation            18696
Closed with non-monetary relief     1863
Closed with monetary relief         1171
Closed                               455
Untimely response                    321
Name: complaint_status, dtype: int64

#### Since most status of the are closed..we will decide on categorical variable to replace NaN values for company-response. Lets get the most appearing company response for complaint status similar to the one above (except the NaN ones)

In [68]:
a = empty_response['complaint_status'].value_counts()

t =train_data_complaint_reason_encoded[train_data_complaint_reason_encoded['complaint_status'].isin(a.index.tolist())][['company_response']]

t['company_response'].value_counts()

Company has responded to the consumer and the CFPB and chooses not to provide a public response                            10928
Company chooses not to provide a public response                                                                            4322
Company believes it acted appropriately as authorized by contract or law                                                    3811
Company believes the complaint is the result of a misunderstanding                                                           387
Company disputes the facts presented in the complaint                                                                        379
Company believes complaint is the result of an isolated error                                                                302
Company believes complaint caused principally by actions of third party outside the control or direction of the company      300
Company can't verify or dispute the facts in the complaint                                       

In [69]:
most_appearing = t['company_response'].value_counts().index.tolist()[0]
most_appearing

'Company has responded to the consumer and the CFPB and chooses not to provide a public response'

In [52]:
del t, empty_response, a


<br> We will replace the empty value with the most appearing value for the column company-response

In [80]:
train_data_complaint_reason_encoded.company_response = train_data_complaint_reason_encoded['company_response'].fillna(value = most_appearing)
# train_data_complaint_reason_encoded


In [81]:
train_data_complaint_reason_encoded.columns.tolist()

['complaint_id',
 'date_received',
 'transaction_type',
 'complaint_reason',
 'company_response',
 'date_sent_to_company',
 'complaint_status',
 'consumer_disputes',
 'consumer_complaint_summary',
 'complaint_reason',
 'complaint_reason_encoded',
 'days_passed_since_complaint_received',
 'days_since_complaint_sent_to_company',
 'Bank account or service',
 'Checking or savings account',
 'Consumer Loan',
 'Credit card',
 'Credit card or prepaid card',
 'Credit reporting',
 'Credit reporting, credit repair services, or other personal consumer reports',
 'Debt collection',
 'Money transfer, virtual currency, or money service',
 'Money transfers',
 'Mortgage',
 'Other financial service',
 'Payday loan',
 'Payday loan, title loan, or personal loan',
 'Prepaid card',
 'Student loan',
 'Vehicle loan or lease',
 'Virtual currency',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24]

In [82]:
train_data_complaint_reason_encoded['company_response'].isnull().sum()

0

In [83]:
test_data_complaint_reason_encoded['company_response'].isnull().sum()

9701

In [84]:
test_data_complaint_reason_encoded.company_response = test_data_complaint_reason_encoded['company_response'].fillna(value = most_appearing)
# train_data_complaint_reason_encoded


In [85]:
test_data_complaint_reason_encoded['company_response'].isnull().sum()

0

In [89]:
len(set(test_data_complaint_reason_encoded['company_response'].values.tolist())) ,len(set(train_data_complaint_reason_encoded['company_response'].values.tolist()))

(10, 10)

In [90]:
le = LabelEncoder().fit(train_data_complaint_reason_encoded['company_response'].values)


transformed_transaction_types = le.transform(train_data_complaint_reason_encoded['company_response'].values)


transformed = OneHotEncoder().fit_transform(transformed_transaction_types[:,np.newaxis])

# print(transformed.toarray())

tt = panda.DataFrame(transformed.toarray())
tt.columns = le.classes_
# tt.head()

train_data_company_response_encoded = panda.concat([train_data_complaint_reason_encoded,tt], axis =1)

del tt, le
train_data_company_response_encoded.head()

Unnamed: 0,complaint_id,date_received,transaction_type,complaint_reason,company_response,date_sent_to_company,complaint_status,consumer_disputes,consumer_complaint_summary,complaint_reason.1,...,Company believes complaint caused principally by actions of third party outside the control or direction of the company,Company believes complaint is the result of an isolated error,Company believes complaint relates to a discontinued policy or procedure,Company believes complaint represents an opportunity for improvement to better serve consumers,Company believes it acted appropriately as authorized by contract or law,Company believes the complaint is the result of a misunderstanding,Company can't verify or dispute the facts in the complaint,Company chooses not to provide a public response,Company disputes the facts presented in the complaint,Company has responded to the consumer and the CFPB and chooses not to provide a public response
0,Tr-1,11/11/2015,Mortgage,"Loan servicing, payments, escrow account",Company has responded to the consumer and the ...,11/11/2015,Closed with explanation,Yes,"Seterus, Inc a déposé un faux rapport auprès d...","Loan servicing, payments, escrow account",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Tr-2,7/7/2015,Credit reporting,Incorrect information on credit report,Company chooses not to provide a public response,7/7/2015,Closed with non-monetary relief,No,XX / XX / XXXX La requête en faillite n ° XXXX...,Incorrect information on credit report,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Tr-3,5/7/2015,Bank account or service,Using a debit or ATM card,Company has responded to the consumer and the ...,5/7/2015,Closed with explanation,No,"El XXXX / XXXX / 15, estaba preparando el vuel...",Using a debit or ATM card,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Tr-4,11/12/2016,Debt collection,Cont'd attempts collect debt not owed,Company believes it acted appropriately as aut...,11/12/2016,Closed with explanation,No,"The loan was paid in XXXX XXXX. In XXXX, 4 yea...",Cont'd attempts collect debt not owed,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,Tr-5,9/29/2016,Credit card,Payoff process,Company has responded to the consumer and the ...,9/29/2016,Closed with explanation,No,J'ai obtenu un compte de crédit de soins pour ...,Payoff process,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [91]:
le = LabelEncoder().fit(test_data_complaint_reason_encoded['company_response'].values)


transformed_transaction_types = le.transform(test_data_complaint_reason_encoded['company_response'].values)


transformed = OneHotEncoder().fit_transform(transformed_transaction_types[:,np.newaxis])

# print(transformed.toarray())

tt = panda.DataFrame(transformed.toarray())
tt.columns = le.classes_
# tt.head()

test_data_company_response_encoded = panda.concat([test_data_complaint_reason_encoded,tt], axis =1)

del tt, le
test_data_company_response_encoded.head()

Unnamed: 0,complaint_id,date_received,transaction_type,complaint_reason,company_response,date_sent_to_company,consumer_disputes,consumer_complaint_summary,complaint_reason.1,complaint_reason_encoded,...,Company believes complaint caused principally by actions of third party outside the control or direction of the company,Company believes complaint is the result of an isolated error,Company believes complaint relates to a discontinued policy or procedure,Company believes complaint represents an opportunity for improvement to better serve consumers,Company believes it acted appropriately as authorized by contract or law,Company believes the complaint is the result of a misunderstanding,Company can't verify or dispute the facts in the complaint,Company chooses not to provide a public response,Company disputes the facts presented in the complaint,Company has responded to the consumer and the CFPB and chooses not to provide a public response
0,Te-1,8/18/2016,Bank account or service,"Account opening, closing, or management",Company has responded to the consumer and the ...,8/18/2016,No,XXXX / XXXX / 16 I called Citibank to open a c...,"Account opening, closing, or management",18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Te-2,4/18/2016,Debt collection,Communication tactics,Company believes it acted appropriately as aut...,4/20/2016,No,I'm struggling financially. I called and I off...,Communication tactics,24,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,Te-3,3/23/2016,Credit reporting,Incorrect information on credit report,Company has responded to the consumer and the ...,3/23/2016,No,"In XXXX of 2015, an automatic payment was conf...",Incorrect information on credit report,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Te-4,6/26/2017,Student loan,Dealing with your lender or servicer,Company has responded to the consumer and the ...,6/26/2017,,"I submitted a request to XXXX, which is my cur...",Dealing with your lender or servicer,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Te-5,5/13/2016,Credit reporting,Incorrect information on credit report,Company has responded to the consumer and the ...,5/13/2016,No,A state tax lien was filed against me XXXX / X...,Incorrect information on credit report,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [94]:

print(train_data_company_response_encoded['consumer_disputes'].isnull().sum())
## dere are close to 8k empty values..for now we will replace it will most appearing value

# train_data_two_encoded['consumer-disputes'].value_counts()

most_appearing = train_data_company_response_encoded['consumer_disputes'].value_counts().idxmax(axis=1)
print(most_appearing)

train_data_company_response_encoded['consumer_disputes'].fillna(value = most_appearing, inplace = True)

print(train_data_company_response_encoded['consumer_disputes'].isnull().sum())

train_data_company_response_encoded['encoded_consumer_disputes']= train_data_company_response_encoded['consumer_disputes'].apply(lambda x: 1 if x=='Yes' else 0)

7698
No
7698


In [95]:
train_data_company_response_encoded.head()

Unnamed: 0,complaint_id,date_received,transaction_type,complaint_reason,company_response,date_sent_to_company,complaint_status,consumer_disputes,consumer_complaint_summary,complaint_reason.1,...,Company believes complaint is the result of an isolated error,Company believes complaint relates to a discontinued policy or procedure,Company believes complaint represents an opportunity for improvement to better serve consumers,Company believes it acted appropriately as authorized by contract or law,Company believes the complaint is the result of a misunderstanding,Company can't verify or dispute the facts in the complaint,Company chooses not to provide a public response,Company disputes the facts presented in the complaint,Company has responded to the consumer and the CFPB and chooses not to provide a public response,encoded_consumer_disputes
0,Tr-1,11/11/2015,Mortgage,"Loan servicing, payments, escrow account",Company has responded to the consumer and the ...,11/11/2015,Closed with explanation,Yes,"Seterus, Inc a déposé un faux rapport auprès d...","Loan servicing, payments, escrow account",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,Tr-2,7/7/2015,Credit reporting,Incorrect information on credit report,Company chooses not to provide a public response,7/7/2015,Closed with non-monetary relief,No,XX / XX / XXXX La requête en faillite n ° XXXX...,Incorrect information on credit report,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,Tr-3,5/7/2015,Bank account or service,Using a debit or ATM card,Company has responded to the consumer and the ...,5/7/2015,Closed with explanation,No,"El XXXX / XXXX / 15, estaba preparando el vuel...",Using a debit or ATM card,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,Tr-4,11/12/2016,Debt collection,Cont'd attempts collect debt not owed,Company believes it acted appropriately as aut...,11/12/2016,Closed with explanation,No,"The loan was paid in XXXX XXXX. In XXXX, 4 yea...",Cont'd attempts collect debt not owed,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
4,Tr-5,9/29/2016,Credit card,Payoff process,Company has responded to the consumer and the ...,9/29/2016,Closed with explanation,No,J'ai obtenu un compte de crédit de soins pour ...,Payoff process,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [96]:

print(test_data_company_response_encoded['consumer_disputes'].isnull().sum())
## dere are close to 8k empty values..for now we will replace it will most appearing value

# train_data_two_encoded['consumer-disputes'].value_counts()

most_appearing = test_data_company_response_encoded['consumer_disputes'].value_counts().idxmax(axis=1)
print(most_appearing)

test_data_company_response_encoded['consumer_disputes'].fillna(value = most_appearing, inplace = True)

print(test_data_company_response_encoded['consumer_disputes'].isnull().sum())

test_data_company_response_encoded['encoded_consumer_disputes']= test_data_company_response_encoded['consumer_disputes'].apply(lambda x: 1 if x=='Yes' else 0)


test_data_company_response_encoded.head()

3304
No
3304


Unnamed: 0,complaint_id,date_received,transaction_type,complaint_reason,company_response,date_sent_to_company,consumer_disputes,consumer_complaint_summary,complaint_reason.1,complaint_reason_encoded,...,Company believes complaint is the result of an isolated error,Company believes complaint relates to a discontinued policy or procedure,Company believes complaint represents an opportunity for improvement to better serve consumers,Company believes it acted appropriately as authorized by contract or law,Company believes the complaint is the result of a misunderstanding,Company can't verify or dispute the facts in the complaint,Company chooses not to provide a public response,Company disputes the facts presented in the complaint,Company has responded to the consumer and the CFPB and chooses not to provide a public response,encoded_consumer_disputes
0,Te-1,8/18/2016,Bank account or service,"Account opening, closing, or management",Company has responded to the consumer and the ...,8/18/2016,No,XXXX / XXXX / 16 I called Citibank to open a c...,"Account opening, closing, or management",18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,Te-2,4/18/2016,Debt collection,Communication tactics,Company believes it acted appropriately as aut...,4/20/2016,No,I'm struggling financially. I called and I off...,Communication tactics,24,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,Te-3,3/23/2016,Credit reporting,Incorrect information on credit report,Company has responded to the consumer and the ...,3/23/2016,No,"In XXXX of 2015, an automatic payment was conf...",Incorrect information on credit report,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,Te-4,6/26/2017,Student loan,Dealing with your lender or servicer,Company has responded to the consumer and the ...,6/26/2017,,"I submitted a request to XXXX, which is my cur...",Dealing with your lender or servicer,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,Te-5,5/13/2016,Credit reporting,Incorrect information on credit report,Company has responded to the consumer and the ...,5/13/2016,No,A state tax lien was filed against me XXXX / X...,Incorrect information on credit report,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [97]:
#lets apply encoding on our target columns

target_dict = {
    
    'Closed with explanation':1,
    'Closed with non-monetary relief':2,
    'Closed with monetary relief':3,
    'Closed':4,
    'Untimely response':5
    
}
train_data_company_response_encoded['y'] = train_data_company_response_encoded['complaint_status'].apply(lambda x: target_dict.get(x))

In [99]:
train_data_company_response_encoded.to_csv('dataset/train_all_merged.csv')

In [101]:
test_data_company_response_encoded.to_csv('dataset/test_all_merged.csv')