# Societe Generale Complaint Tracker - Categorical Field Analysis

In [1]:
import pandas as panda

from sklearn.model_selection import learning_curve, train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report, \
    confusion_matrix, f1_score, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

from matplotlib import pyplot as plot
import seaborn as sns


from numpy import bincount, linspace, mean, std, arange, squeeze

import itertools, time, datetime

import warnings
warnings.simplefilter('ignore')

%matplotlib inline

In [2]:
test_data_path ='dataset/test.csv'
train_data_path = 'dataset/train.csv'
sample_submission_path = 'dataset/sample_submission.csv'


In [3]:
train_data = panda.read_csv(train_data_path)
test_data = panda.read_csv(test_data_path)

In [4]:
train_data.columns = [i.lower().replace('-','_') for i in train_data.columns.tolist()]

test_data.columns = [i.lower().replace('-','_') for i in test_data.columns.tolist()]

In [5]:
len(train_data['transaction_type'].value_counts()) == len(test_data['transaction_type'].value_counts())

True

In [6]:
train_data_transaction_types = set(train_data.transaction_type.values.tolist())
test_data_transaction_types = set(test_data.transaction_type.values.tolist())
train_data_transaction_types - test_data_transaction_types, test_data_transaction_types-train_data_transaction_types

(set(), set())

In [7]:
len(train_data['consumer_disputes'].value_counts()) == len(test_data['consumer_disputes'].value_counts())

True

In [8]:
train_data_consumer_disputes_types = set(train_data.consumer_disputes.values.tolist())
test_data_consumer_disputes_types = set(test_data.consumer_disputes.values.tolist())

train_data_consumer_disputes_types -test_data_consumer_disputes_types, test_data_consumer_disputes_types-train_data_consumer_disputes_types

(set(), set())

In [9]:
len(train_data['company_response'].value_counts()) == len(test_data['company_response'].value_counts())

True

In [10]:
train_data_company_response_types = set(train_data.company_response.values.tolist())
test_data_company_response_types = set(test_data.company_response.values.tolist())

train_data_company_response_types -test_data_company_response_types, test_data_company_response_types-train_data_company_response_types

(set(), set())

In [11]:
len(train_data['complaint_reason'].value_counts()) == len(test_data['complaint_reason'].value_counts())

## this is the problem.. number of unique variables in complaint reason in train and test set differs..so
## when we perform one hot encoding it actually causes issues

False

In [12]:
print(len(train_data['complaint_reason'].value_counts()),len(test_data['complaint_reason'].value_counts()))

150 147


In [13]:
train_data_complaint_reason_types = set(train_data.complaint_reason.values.tolist())
test_data_complaint_reason_types = set(test_data.complaint_reason.values.tolist())

train_data_complaint_reason_types -test_data_complaint_reason_types, test_data_complaint_reason_types-train_data_complaint_reason_types

({'Account terms and changes',
  'Advertising',
  'Incorrect exchange rate',
  'Problem with an overdraft',
  "Was approved for a loan, but didn't receive the money"},
 {"Can't stop withdrawals from your bank account",
  'Problem with cash advance'})

#### "Can't stop withdrawals from your bank account", 'Problem with cash advance 
-- these values are present in test set and not present in train data. We cannot predict for values which we have not trained in. which means we can safely drop these values from the training as well as test data

### we may have to apply proper bining and categorization on complaint reason category field

In [14]:
train_data_complaint_reason_types

{'APR or interest rate',
 'Account opening, closing, or management',
 'Account terms and changes',
 'Adding money',
 'Advertising',
 'Advertising and marketing',
 'Advertising and marketing, including promotional offers',
 'Advertising, marketing or disclosures',
 'Application processing delay',
 'Application, originator, mortgage broker',
 'Applied for loan/did not receive money',
 'Applying for a mortgage',
 'Applying for a mortgage or refinancing an existing mortgage',
 'Arbitration',
 'Attempts to collect debt not owed',
 'Balance transfer',
 'Balance transfer fee',
 'Bankruptcy',
 'Billing disputes',
 'Billing statement',
 "Can't contact lender",
 "Can't contact lender or servicer",
 "Can't repay my loan",
 "Can't stop charges to bank account",
 'Cash advance',
 'Cash advance fee',
 'Charged bank acct wrong day or amt',
 "Charged fees or interest I didn't expect",
 "Charged fees or interest you didn't expect",
 'Closing an account',
 'Closing on a mortgage',
 'Closing your account

In [15]:
test_data_complaint_reason_types

{'APR or interest rate',
 'Account opening, closing, or management',
 'Adding money',
 'Advertising and marketing',
 'Advertising and marketing, including promotional offers',
 'Advertising, marketing or disclosures',
 'Application processing delay',
 'Application, originator, mortgage broker',
 'Applied for loan/did not receive money',
 'Applying for a mortgage',
 'Applying for a mortgage or refinancing an existing mortgage',
 'Arbitration',
 'Attempts to collect debt not owed',
 'Balance transfer',
 'Balance transfer fee',
 'Bankruptcy',
 'Billing disputes',
 'Billing statement',
 "Can't contact lender",
 "Can't contact lender or servicer",
 "Can't repay my loan",
 "Can't stop charges to bank account",
 "Can't stop withdrawals from your bank account",
 'Cash advance',
 'Cash advance fee',
 'Charged bank acct wrong day or amt',
 "Charged fees or interest I didn't expect",
 "Charged fees or interest you didn't expect",
 'Closing an account',
 'Closing on a mortgage',
 'Closing your acc

In [16]:
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess # converts into lowercase tokens
from gensim import corpora

from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
import gensim

np.random.seed(2018)


In [17]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer('english')
    lemetized =  [stemmer.stem(WordNetLemmatizer().lemmatize(i, pos='v')) for i in text]
    if len(lemetized) == 0:
        print('this gave me 0 text..it removed everything ', text)
    
    return lemetized

def stop_words_removed(text):
    removed =  [word for word in text if word not in STOPWORDS]
    if len(removed) == 0: # for a complaint reason like Others, stopword was removing this. we need others
        print('This removed all my words: ', text)
        removed = text
    
    return removed


In [18]:
tokenized_complaint_reason_types = [simple_preprocess(word) for word in train_data_complaint_reason_types]

cleaned_complaint_reason_types = [stop_words_removed(word) for word in tokenized_complaint_reason_types ]

lemmetized_complaint_reason_types = [lemmatize_stemming(word) for word in cleaned_complaint_reason_types]


This removed all my words:  ['other']


In [22]:
dictionary = corpora.Dictionary(lemmetized_complaint_reason_types)
corpus = [dictionary.doc2bow(word) for word in lemmetized_complaint_reason_types]


In [23]:
topic_count = 25

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = topic_count, id2word=dictionary, passes=15)
topics = ldamodel.print_topics()

for topic in topics:
    print('found topic: ',topic)


found topic:  (22, '0.162*"debt" + 0.123*"transfer" + 0.083*"owe" + 0.083*"collect" + 0.083*"attempt" + 0.083*"balanc" + 0.042*"cont" + 0.042*"disclosur" + 0.042*"verif" + 0.002*"fee"')
found topic:  (24, '0.163*"wrong" + 0.083*"day" + 0.083*"bank" + 0.082*"charg" + 0.042*"account" + 0.042*"amt" + 0.042*"acct" + 0.042*"steal" + 0.042*"take" + 0.042*"lose"')
found topic:  (11, '0.076*"problem" + 0.076*"bankruptci" + 0.076*"arbitr" + 0.076*"conveni" + 0.076*"check" + 0.076*"pay" + 0.076*"unabl" + 0.003*"servic" + 0.003*"end" + 0.003*"issuanc"')
found topic:  (20, '0.279*"loan" + 0.205*"leas" + 0.164*"get" + 0.042*"excess" + 0.042*"take" + 0.002*"problem" + 0.002*"overdraft" + 0.002*"fee" + 0.002*"advertis" + 0.002*"threaten"')
found topic:  (16, '0.127*"disclosur" + 0.064*"miss" + 0.064*"threaten" + 0.064*"contact" + 0.064*"share" + 0.064*"info" + 0.064*"improp" + 0.033*"take" + 0.033*"illeg" + 0.033*"action"')
found topic:  (23, '0.089*"apr" + 0.089*"rate" + 0.089*"secur" + 0.088*"fraud

In [24]:
def get_topic_count_phrase_belongs_to(lda_model, dictionary, phrase):
    
    bow = lemmatize_stemming(stop_words_removed(simple_preprocess(phrase))) 
    # print(bow)
    a = lda_model.get_document_topics(dictionary.doc2bow(bow))
    arr = np.array(a)
    t1,t2 = arr.max(axis=0)
    return int(t1)

In [25]:
train_data['complaint_reason_encoded'] = train_data.complaint_reason.apply(lambda x: get_topic_count_phrase_belongs_to(ldamodel,dictionary, x))

This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed

This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed

In [26]:
train_data.complaint_reason_encoded.value_counts()

24    9129
13    6397
22    6156
8     5469
5     4084
7     1973
4     1704
16    1586
20    1243
18    1155
2     1020
14     649
10     580
0      440
11     426
21     426
19     324
9      123
12     117
3       79
17      71
23      61
6       39
15      15
Name: complaint_reason_encoded, dtype: int64

In [27]:
train_data[['complaint_reason', 'complaint_reason_encoded']]

Unnamed: 0,complaint_reason,complaint_reason_encoded
0,"Loan servicing, payments, escrow account",8
1,Incorrect information on credit report,13
2,Using a debit or ATM card,21
3,Cont'd attempts collect debt not owed,22
4,Payoff process,24
5,"Loan modification,collection,foreclosure",5
6,Incorrect information on credit report,13
7,Problems caused by my funds being low,14
8,Cont'd attempts collect debt not owed,22
9,Incorrect information on credit report,13


In [28]:
test_data['complaint_reason_encoded'] = test_data.complaint_reason.apply(lambda x: get_topic_count_phrase_belongs_to(ldamodel,dictionary, x))

This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed

In [29]:
test_data.complaint_reason_encoded.value_counts()

24    3819
13    2927
22    2683
8     2236
5     1642
7      881
16     726
4      694
20     532
18     518
2      407
14     285
10     232
11     207
0      202
21     187
19     155
9       57
12      46
17      33
3       27
23      20
6       19
15       8
Name: complaint_reason_encoded, dtype: int64

In [31]:
test_data.head()

Unnamed: 0,complaint_id,date_received,transaction_type,complaint_reason,company_response,date_sent_to_company,consumer_disputes,consumer_complaint_summary,complaint_reason_encoded
0,Te-1,8/18/2016,Bank account or service,"Account opening, closing, or management",Company has responded to the consumer and the ...,8/18/2016,No,XXXX / XXXX / 16 I called Citibank to open a c...,5
1,Te-2,4/18/2016,Debt collection,Communication tactics,Company believes it acted appropriately as aut...,4/20/2016,No,I'm struggling financially. I called and I off...,24
2,Te-3,3/23/2016,Credit reporting,Incorrect information on credit report,,3/23/2016,No,"In XXXX of 2015, an automatic payment was conf...",13
3,Te-4,6/26/2017,Student loan,Dealing with your lender or servicer,,6/26/2017,,"I submitted a request to XXXX, which is my cur...",4
4,Te-5,5/13/2016,Credit reporting,Incorrect information on credit report,Company has responded to the consumer and the ...,5/13/2016,No,A state tax lien was filed against me XXXX / X...,13


In [32]:
train_data[['complaint_id','complaint_reason','complaint_reason_encoded']].to_csv('dataset/train_data_complaint_reason.csv')

In [33]:
test_data[['complaint_id','complaint_reason','complaint_reason_encoded']].to_csv('dataset/test_data_complaint_reason.csv')