## Merging Complaint Reason and Preparing Test/Train Data

In [1]:
import pandas as panda

from sklearn.model_selection import learning_curve, train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report, \
    confusion_matrix, f1_score, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

from matplotlib import pyplot as plot
import seaborn as sns


from numpy import bincount, linspace, mean, std, arange, squeeze

import itertools, time, datetime

import warnings
warnings.simplefilter('ignore')

%matplotlib inline

In [2]:

# Every input CSV sits in the same relative folder; build each path from it.
_dataset_dir = 'dataset'

test_data_path = _dataset_dir + '/test.csv'
train_data_path = _dataset_dir + '/train.csv'
sample_submission_path = _dataset_dir + '/sample_submission.csv'
test_complaint_reason_path = _dataset_dir + '/test_data_complaint_reason.csv'
train_complaint_reason_path = _dataset_dir + '/train_data_complaint_reason.csv'

In [3]:
# Read the four raw CSVs in one pass; each target name on the left pairs
# positionally with the path at the same position on the right.
(
    test_data,
    train_data,
    test_data_complaint_reason,
    train_data_complaint_reason,
) = [
    panda.read_csv(csv_path)
    for csv_path in (
        test_data_path,
        train_data_path,
        test_complaint_reason_path,
        train_complaint_reason_path,
    )
]

In [4]:
# Display the (rows, columns) shape of each of the four raw tables.
test_data.shape, train_data.shape, test_data_complaint_reason.shape, train_data_complaint_reason.shape

((18543, 8), (43266, 9), (18543, 4), (43266, 4))

In [5]:
# Normalise the training column labels: lower-case, trim surrounding
# whitespace, and turn hyphens into underscores.
train_data.columns = list(
    map(lambda label: label.lower().strip().replace('-', '_'), train_data.columns)
)
list(train_data.columns)

['complaint_id',
 'date_received',
 'transaction_type',
 'complaint_reason',
 'company_response',
 'date_sent_to_company',
 'complaint_status',
 'consumer_disputes',
 'consumer_complaint_summary']

In [6]:
# Apply the same label clean-up to the test frame: lower-case, strip
# whitespace, replace '-' with '_'.
test_data.columns = list(
    map(lambda label: label.lower().strip().replace('-', '_'), test_data.columns)
)
list(test_data.columns)

['complaint_id',
 'date_received',
 'transaction_type',
 'complaint_reason',
 'company_response',
 'date_sent_to_company',
 'consumer_disputes',
 'consumer_complaint_summary']

In [7]:
# Show the column labels of the test and train complaint-reason tables side by side.
test_data_complaint_reason.columns.tolist(),train_data_complaint_reason.columns.tolist()

(['Unnamed: 0',
  'complaint_id',
  'complaint_reason',
  'complaint_reason_encoded'],
 ['Unnamed: 0',
  'complaint_id',
  'complaint_reason',
  'complaint_reason_encoded'])

In [8]:
# Remove the 'Unnamed: 0' column that came in with the CSV.  Rebinding the
# name (rather than inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# label is already gone.
test_data_complaint_reason = test_data_complaint_reason.drop(
    ['Unnamed: 0'], axis=1, errors='ignore'
)
test_data_complaint_reason.columns.tolist()

['complaint_id', 'complaint_reason', 'complaint_reason_encoded']

In [9]:
# Remove the 'Unnamed: 0' column that came in with the CSV.  Rebinding the
# name (rather than inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# label is already gone.
train_data_complaint_reason = train_data_complaint_reason.drop(
    ['Unnamed: 0'], axis=1, errors='ignore'
)
train_data_complaint_reason.columns.tolist()

['complaint_id', 'complaint_reason', 'complaint_reason_encoded']

In [10]:
# Attach the raw test columns to the complaint-reason table, pairing rows on
# complaint_id (inner join: only ids present in both tables are kept).
test_data_merged = panda.merge(
    test_data_complaint_reason,
    test_data,
    on='complaint_id',
    how='inner',
)

In [11]:
# Compare the merged test frame's shape with the raw test frame's shape.
test_data_merged.shape, test_data.shape

((18543, 10), (18543, 8))

In [12]:
# Preview the first five rows of the merged test frame.
test_data_merged.head(5)

Unnamed: 0,complaint_id,complaint_reason_x,complaint_reason_encoded,date_received,transaction_type,complaint_reason_y,company_response,date_sent_to_company,consumer_disputes,consumer_complaint_summary
0,Te-1,"Account opening, closing, or management",5,8/18/2016,Bank account or service,"Account opening, closing, or management",Company has responded to the consumer and the ...,8/18/2016,No,XXXX / XXXX / 16 I called Citibank to open a c...
1,Te-2,Communication tactics,24,4/18/2016,Debt collection,Communication tactics,Company believes it acted appropriately as aut...,4/20/2016,No,I'm struggling financially. I called and I off...
2,Te-3,Incorrect information on credit report,13,3/23/2016,Credit reporting,Incorrect information on credit report,,3/23/2016,No,"In XXXX of 2015, an automatic payment was conf..."
3,Te-4,Dealing with your lender or servicer,4,6/26/2017,Student loan,Dealing with your lender or servicer,,6/26/2017,,"I submitted a request to XXXX, which is my cur..."
4,Te-5,Incorrect information on credit report,13,5/13/2016,Credit reporting,Incorrect information on credit report,Company has responded to the consumer and the ...,5/13/2016,No,A state tax lien was filed against me XXXX / X...


In [13]:
# Attach the raw training columns to the complaint-reason table, pairing rows
# on complaint_id (inner join), then show both shapes for comparison.
train_data_merged = panda.merge(
    train_data_complaint_reason,
    train_data,
    on='complaint_id',
    how='inner',
)
train_data_merged.shape, train_data.shape

((43266, 11), (43266, 9))

In [14]:
from dateutil import relativedelta


def get_days_passed(given, reference=None):
    """Return the number of whole days between a date string and a reference.

    Parameters
    ----------
    given : str
        Date formatted as month/day/year (``'%m/%d/%Y'``), e.g. ``'8/18/2016'``.
    reference : datetime.datetime, optional
        Moment to measure up to.  When omitted, the current local time is
        used (the original behaviour), which makes the result depend on when
        the cell is executed; pass a fixed value for reproducible features.

    Returns
    -------
    int
        ``(reference - given).days``; negative when ``given`` lies after
        ``reference``.

    Raises
    ------
    ValueError
        If ``given`` does not match the ``'%m/%d/%Y'`` format.
    """
    if reference is None:
        reference = datetime.datetime.now()

    parsed = datetime.datetime.strptime(given, '%m/%d/%Y')

    return (reference - parsed).days

In [15]:
# Day count for every training complaint, computed from its date_received string.
train_data_merged['days_passed_since_complaint_received'] = (
    train_data_merged['date_received'].apply(get_days_passed)
)

In [16]:
# Day count for every test complaint, computed from its date_received string.
test_data_merged['days_passed_since_complaint_received'] = (
    test_data_merged['date_received'].apply(get_days_passed)
)

In [None]:
# Day count for every training complaint, computed from its date_sent_to_company string.
train_data_merged['days_since_complaint_sent_to_company'] = (
    train_data_merged['date_sent_to_company'].apply(get_days_passed)
)

In [None]:
# Day count for every test complaint, computed from its date_sent_to_company string.
test_data_merged['days_since_complaint_sent_to_company'] = (
    test_data_merged['date_sent_to_company'].apply(get_days_passed)
)

In [None]:
# Learn one integer code per training transaction_type and encode the column
# in the same call.
le = LabelEncoder()
transformed_transaction_types = le.fit_transform(train_data_merged['transaction_type'].values)

# The learned labels; a label's position in this array is its integer code.
print(le.classes_)


In [None]:
# One-hot encode the integer codes; reshape(-1, 1) turns the 1-D code vector
# into an (n, 1) column before it is handed to the encoder.
transformed = OneHotEncoder().fit_transform(transformed_transaction_types.reshape(-1, 1))

print(transformed.toarray())

In [None]:
# Dense one-hot matrix as a DataFrame, one column per transaction-type label.
tt = panda.DataFrame(transformed.toarray(), columns=le.classes_)
tt.head()

In [None]:
# Append the one-hot columns to the right of the merged training frame.
train_data_transaction_type_encoded = panda.concat(
    (train_data_merged, tt), axis='columns'
)

In [None]:
# Preview the training frame with the transaction-type one-hot columns appended.
train_data_transaction_type_encoded.head()

In [None]:
# Remove the temporary one-hot frame and encoder; both names are bound again
# by the next encoding step.
del tt, le

In [None]:
# Fit the encoder on the TRAINING transaction types so that the test one-hot
# columns are exactly the training columns (same labels, same order).
# Fitting on the test values alone yields a different column set whenever the
# two files do not contain identical categories.
le = LabelEncoder().fit(train_data_merged['transaction_type'].values)

# One indicator column per training category.  A category that only occurs in
# the test file is dropped by the reindex (those rows become all zeros); a
# training category absent from the test file becomes an all-zero column.
# The float cast matches the dtype of the dense one-hot matrix used before.
tt = (
    panda.get_dummies(test_data_merged['transaction_type'])
    .reindex(columns=le.classes_, fill_value=0)
    .astype(float)
)

test_data_transaction_type_encoded = panda.concat([test_data_merged, tt], axis=1)

test_data_transaction_type_encoded.head()

In [None]:
# Remove the temporary one-hot frame and encoder; both names are bound again
# by the next encoding step.
del tt, le

In [None]:
# Take the set of complaint-reason codes from the TRAINING frame so the test
# one-hot columns line up with the training ones (same labels, same order).
# Fitting on the test codes alone gives a different column set whenever a
# code is missing from one of the two files.
# NOTE(review): this assumes a given complaint_reason_encoded value means the
# same reason in both files - confirm against the notebook that produced them.
le = LabelEncoder().fit(train_data_transaction_type_encoded['complaint_reason_encoded'].values)

# One indicator column per training code.  Codes seen only in the test file
# are dropped by the reindex; codes absent from the test file become all-zero
# columns.  The float cast matches the dtype of the dense one-hot matrix used
# before.
tt = (
    panda.get_dummies(test_data_transaction_type_encoded['complaint_reason_encoded'])
    .reindex(columns=le.classes_, fill_value=0)
    .astype(float)
)

test_data_complaint_reason_encoded = panda.concat([test_data_transaction_type_encoded, tt], axis=1)

del tt, le
test_data_complaint_reason_encoded.columns.tolist()

In [None]:
# Preview the test frame with the complaint-reason one-hot columns appended.
test_data_complaint_reason_encoded.head()

In [None]:
# Re-index the training complaint-reason codes to consecutive integers and
# keep the fitted encoder for the column labels.
le = LabelEncoder()
transformed_transaction_types = le.fit_transform(
    train_data_transaction_type_encoded['complaint_reason_encoded'].values
)

# One-hot encode the (n, 1) column of codes.
transformed = OneHotEncoder().fit_transform(transformed_transaction_types.reshape(-1, 1))

# Dense indicator frame, one column per original code.
tt = panda.DataFrame(transformed.toarray(), columns=le.classes_)

train_data_complaint_reason_encoded = panda.concat(
    (train_data_transaction_type_encoded, tt), axis='columns'
)

del tt, le
train_data_complaint_reason_encoded.head()

In [None]:
# Count the missing company_response values in the training frame.
train_data_complaint_reason_encoded['company_response'].isnull().sum()

## There are quite a few null values; let's see what we can do about them.
## Check the complaint status of the rows that have NaN in company response:
## if the status is the same throughout, we can replace NaN with some made-up value;
## otherwise we will replace it with the most frequent response.

In [None]:
# Frequency of each company_response value in the training frame.
train_data_complaint_reason_encoded['company_response'].value_counts()

In [None]:
# Rows whose company_response is missing, reduced to the response and status columns.
empty_response = train_data_complaint_reason_encoded.loc[
    train_data_complaint_reason_encoded['company_response'].isnull(),
    ['company_response', 'complaint_status'],
]


In [None]:
# Distribution of complaint_status among the rows with a missing company_response.
empty_response['complaint_status'].value_counts()

#### Since most of these statuses are closed, we will pick a categorical value to replace the NaN values in company-response. Let's get the most frequent company response among complaints whose status matches one of those above (ignoring the NaN responses).

In [None]:
# Statuses that occur among the rows with a missing response.
a = empty_response['complaint_status'].value_counts()

# company_response of every training row whose status is one of those statuses.
t = train_data_complaint_reason_encoded.loc[
    train_data_complaint_reason_encoded['complaint_status'].isin(a.index),
    ['company_response'],
]

t['company_response'].value_counts()

In [None]:
# value_counts() is sorted by descending frequency, so the first index label
# is the most frequent response.
most_appearing = t['company_response'].value_counts().index[0]
most_appearing

In [None]:
# Remove the temporaries used to pick the replacement response.
del t, empty_response, a


<br> We will replace the empty values in the company-response column with its most frequent value.

In [None]:
# Fill the missing training responses with the most frequent response chosen above.
train_data_complaint_reason_encoded['company_response'] = (
    train_data_complaint_reason_encoded['company_response'].fillna(most_appearing)
)


In [None]:
# List the training frame's column labels.
train_data_complaint_reason_encoded.columns.tolist()

In [None]:
# Re-count missing company_response values in the training frame after the fill.
train_data_complaint_reason_encoded['company_response'].isnull().sum()

In [None]:
# Count the missing company_response values in the test frame.
test_data_complaint_reason_encoded['company_response'].isnull().sum()

In [None]:
# Fill the missing test responses with the same replacement value that was
# picked from the training data.
test_data_complaint_reason_encoded['company_response'] = (
    test_data_complaint_reason_encoded['company_response'].fillna(most_appearing)
)


In [None]:
# Re-count missing company_response values in the test frame after the fill.
test_data_complaint_reason_encoded['company_response'].isnull().sum()

In [None]:
# Number of distinct company_response values in the test frame and in the
# training frame, shown as a (test, train) pair.
len(set(test_data_complaint_reason_encoded['company_response'].values.tolist())) ,len(set(train_data_complaint_reason_encoded['company_response'].values.tolist()))

In [None]:
# Learn one integer code per training company_response and encode the column
# in the same call.
le = LabelEncoder()
transformed_transaction_types = le.fit_transform(
    train_data_complaint_reason_encoded['company_response'].values
)

# One-hot encode the (n, 1) column of codes.
transformed = OneHotEncoder().fit_transform(transformed_transaction_types.reshape(-1, 1))

# Dense indicator frame, one column per response label.
tt = panda.DataFrame(transformed.toarray(), columns=le.classes_)

train_data_company_response_encoded = panda.concat(
    (train_data_complaint_reason_encoded, tt), axis='columns'
)

del tt, le
train_data_company_response_encoded.head()

In [None]:
# Take the response labels from the TRAINING frame so the test one-hot
# columns line up with the training ones (same labels, same order).  Fitting
# on the test values alone gives a different column set whenever the two
# files do not contain exactly the same responses.
le = LabelEncoder().fit(train_data_complaint_reason_encoded['company_response'].values)

# One indicator column per training response.  Responses seen only in the
# test file are dropped by the reindex (those rows become all zeros);
# responses absent from the test file become all-zero columns.  The float
# cast matches the dtype of the dense one-hot matrix used before.
tt = (
    panda.get_dummies(test_data_complaint_reason_encoded['company_response'])
    .reindex(columns=le.classes_, fill_value=0)
    .astype(float)
)

test_data_company_response_encoded = panda.concat([test_data_complaint_reason_encoded, tt], axis=1)

del tt, le
test_data_company_response_encoded.head()

In [None]:

# Missing consumer_disputes values before imputation (the original author's
# note puts this at close to 8k); for now they are replaced with the most
# frequent value.
print(train_data_company_response_encoded['consumer_disputes'].isnull().sum())

# value_counts() returns a one-dimensional Series, so idxmax is called without
# an axis: passing axis=1 raises on pandas 1.x and later.
most_appearing = train_data_company_response_encoded['consumer_disputes'].value_counts().idxmax()
print(most_appearing)

# Assign the filled column back instead of fillna(inplace=True) on a column
# selection; the in-place form is chained assignment and leaves the frame
# unchanged when pandas Copy-on-Write is active.
train_data_company_response_encoded['consumer_disputes'] = (
    train_data_company_response_encoded['consumer_disputes'].fillna(most_appearing)
)

print(train_data_company_response_encoded['consumer_disputes'].isnull().sum())

# Binary flag: 1 where the value is exactly 'Yes', 0 for anything else.
train_data_company_response_encoded['encoded_consumer_disputes'] = (
    train_data_company_response_encoded['consumer_disputes'].eq('Yes').astype('int64')
)

In [None]:
# Preview the training frame after the consumer_disputes imputation and encoding.
train_data_company_response_encoded.head()

In [None]:

# Missing consumer_disputes values in the test frame before imputation.
print(test_data_company_response_encoded['consumer_disputes'].isnull().sum())

# Use the most frequent value of the TRAINING column as the replacement, the
# same way the company_response gaps in the test frame are filled with a
# training-derived value; the test frame's own statistics are not used.
# idxmax is called without an axis because value_counts() is one-dimensional
# (axis=1 raises on pandas 1.x and later).
most_appearing = train_data_company_response_encoded['consumer_disputes'].value_counts().idxmax()
print(most_appearing)

# Assign the filled column back instead of fillna(inplace=True) on a column
# selection; the in-place form is chained assignment and leaves the frame
# unchanged when pandas Copy-on-Write is active.
test_data_company_response_encoded['consumer_disputes'] = (
    test_data_company_response_encoded['consumer_disputes'].fillna(most_appearing)
)

print(test_data_company_response_encoded['consumer_disputes'].isnull().sum())

# Binary flag: 1 where the value is exactly 'Yes', 0 for anything else.
test_data_company_response_encoded['encoded_consumer_disputes'] = (
    test_data_company_response_encoded['consumer_disputes'].eq('Yes').astype('int64')
)


test_data_company_response_encoded.head()

In [None]:
# Encode the target column: each complaint_status label gets the integer at
# its position in the tuple below.
target_dict = {
    status: code
    for code, status in enumerate((
        'Closed with explanation',
        'Closed with non-monetary relief',
        'Closed with monetary relief',
        'Closed',
        'Untimely response',
    ))
}

# dict.get yields None for any status that is not a key of target_dict.
train_data_company_response_encoded['y'] = (
    train_data_company_response_encoded['complaint_status'].apply(target_dict.get)
)

In [None]:
# Show the column labels of the raw training frame as an array.
train_data.columns.values

In [None]:
# Add an int64 version of both date columns to the raw train and test frames
# (the dates are parsed through DatetimeIndex, then cast to np.int64).
# NOTE(review): these columns are written to train_data / test_data, not to
# the *_company_response_encoded frames that are saved to CSV afterwards -
# confirm that is intended.
for frame in (train_data, test_data):
    for date_column in ('date_received', 'date_sent_to_company'):
        frame['time_series_' + date_column] = (
            panda.DatetimeIndex(frame[date_column]).astype(np.int64)
        )

In [None]:
# Write the fully encoded training frame to CSV using to_csv's defaults.
# NOTE(review): with the default settings the row index is presumably written
# as an unnamed first column (compare the 'Unnamed: 0' column dropped from the
# complaint-reason CSVs earlier) - confirm downstream readers expect it.
train_data_company_response_encoded.to_csv('dataset/train_all_merged_1.csv')

In [None]:
# Write the fully encoded test frame to CSV using to_csv's defaults.
# NOTE(review): with the default settings the row index is presumably written
# as an unnamed first column (compare the 'Unnamed: 0' column dropped from the
# complaint-reason CSVs earlier) - confirm downstream readers expect it.
test_data_company_response_encoded.to_csv('dataset/test_all_merged_1.csv')