# Societe Generale Complaint Tracker - Categorical Field Analysis

In [1]:
import pandas as panda

from sklearn.model_selection import learning_curve, train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_absolute_error, classification_report, \
    confusion_matrix, f1_score, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

from matplotlib import pyplot as plot
import seaborn as sns


from numpy import bincount, linspace, mean, std, arange, squeeze

import itertools, time, datetime

import warnings
warnings.simplefilter('ignore')

%matplotlib inline

In [2]:
test_data_path ='dataset/test.csv'
train_data_path = 'dataset/train.csv'
sample_submission_path = 'dataset/sample_submission.csv'


In [3]:
train_data = panda.read_csv(train_data_path)
test_data = panda.read_csv(test_data_path)

In [4]:
train_data.columns = [i.lower().replace('-','_') for i in train_data.columns.tolist()]

test_data.columns = [i.lower().replace('-','_') for i in test_data.columns.tolist()]

In [5]:
len(train_data['transaction_type'].value_counts()) == len(test_data['transaction_type'].value_counts())

True

In [6]:
train_data_transaction_types = set(train_data.transaction_type.values.tolist())
test_data_transaction_types = set(test_data.transaction_type.values.tolist())
train_data_transaction_types - test_data_transaction_types, test_data_transaction_types-train_data_transaction_types

(set(), set())

In [7]:
len(train_data['consumer_disputes'].value_counts()) == len(test_data['consumer_disputes'].value_counts())

True

In [8]:
train_data_consumer_disputes_types = set(train_data.consumer_disputes.values.tolist())
test_data_consumer_disputes_types = set(test_data.consumer_disputes.values.tolist())

train_data_consumer_disputes_types -test_data_consumer_disputes_types, test_data_consumer_disputes_types-train_data_consumer_disputes_types

(set(), set())

In [20]:
len(train_data['company_response'].value_counts()) == len(test_data['company_response'].value_counts())

True

In [9]:
train_data_company_response_types = set(train_data.company_response.values.tolist())
test_data_company_response_types = set(test_data.company_response.values.tolist())

train_data_company_response_types -test_data_company_response_types, test_data_company_response_types-train_data_company_response_types

(set(), set())

In [10]:
len(train_data['complaint_reason'].value_counts()) == len(test_data['complaint_reason'].value_counts())

## this is the problem.. number of unique variables in complaint reason in train and test set differs..so
## when we perform one hot encoding it actually causes issues

False

In [11]:
print(len(train_data['complaint_reason'].value_counts()),len(test_data['complaint_reason'].value_counts()))

150 147


In [12]:
train_data_complaint_reason_types = set(train_data.complaint_reason.values.tolist())
test_data_complaint_reason_types = set(test_data.complaint_reason.values.tolist())

train_data_complaint_reason_types -test_data_complaint_reason_types, test_data_complaint_reason_types-train_data_complaint_reason_types

({'Account terms and changes',
  'Advertising',
  'Incorrect exchange rate',
  'Problem with an overdraft',
  "Was approved for a loan, but didn't receive the money"},
 {"Can't stop withdrawals from your bank account",
  'Problem with cash advance'})

#### "Can't stop withdrawals from your bank account", 'Problem with cash advance 
-- these values are present in test set and not present in train data. We cannot predict for values which we have not trained in. which means we can safely drop these values from the training as well as test data

### we may have to apply proper bining and categorization on complaint reason category field

In [24]:
train_data_complaint_reason_types

{'APR or interest rate',
 'Account opening, closing, or management',
 'Account terms and changes',
 'Adding money',
 'Advertising',
 'Advertising and marketing',
 'Advertising and marketing, including promotional offers',
 'Advertising, marketing or disclosures',
 'Application processing delay',
 'Application, originator, mortgage broker',
 'Applied for loan/did not receive money',
 'Applying for a mortgage',
 'Applying for a mortgage or refinancing an existing mortgage',
 'Arbitration',
 'Attempts to collect debt not owed',
 'Balance transfer',
 'Balance transfer fee',
 'Bankruptcy',
 'Billing disputes',
 'Billing statement',
 "Can't contact lender",
 "Can't contact lender or servicer",
 "Can't repay my loan",
 "Can't stop charges to bank account",
 'Cash advance',
 'Cash advance fee',
 'Charged bank acct wrong day or amt',
 "Charged fees or interest I didn't expect",
 "Charged fees or interest you didn't expect",
 'Closing an account',
 'Closing on a mortgage',
 'Closing your account

In [25]:
test_data_complaint_reason_types

{'APR or interest rate',
 'Account opening, closing, or management',
 'Adding money',
 'Advertising and marketing',
 'Advertising and marketing, including promotional offers',
 'Advertising, marketing or disclosures',
 'Application processing delay',
 'Application, originator, mortgage broker',
 'Applied for loan/did not receive money',
 'Applying for a mortgage',
 'Applying for a mortgage or refinancing an existing mortgage',
 'Arbitration',
 'Attempts to collect debt not owed',
 'Balance transfer',
 'Balance transfer fee',
 'Bankruptcy',
 'Billing disputes',
 'Billing statement',
 "Can't contact lender",
 "Can't contact lender or servicer",
 "Can't repay my loan",
 "Can't stop charges to bank account",
 "Can't stop withdrawals from your bank account",
 'Cash advance',
 'Cash advance fee',
 'Charged bank acct wrong day or amt',
 "Charged fees or interest I didn't expect",
 "Charged fees or interest you didn't expect",
 'Closing an account',
 'Closing on a mortgage',
 'Closing your acc

In [13]:
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess # converts into lowercase tokens
from gensim import corpora

from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk
import gensim

np.random.seed(2018)


In [14]:
def lemmatize_stemming(text):
    stemmer = SnowballStemmer('english')
    lemetized =  [stemmer.stem(WordNetLemmatizer().lemmatize(i, pos='v')) for i in text]
    if len(lemetized) == 0:
        print('this gave me 0 text..it removed everything ', text)
    
    return lemetized

def stop_words_removed(text):
    removed =  [word for word in text if word not in STOPWORDS]
    if len(removed) == 0: # for a complaint reason like Others, stopword was removing this. we need others
        print('This removed all my words: ', text)
        removed = text
    
    return removed


In [15]:
tokenized_complaint_reason_types = [simple_preprocess(word) for word in train_data_complaint_reason_types]

cleaned_complaint_reason_types = [stop_words_removed(word) for word in tokenized_complaint_reason_types ]

lemmetized_complaint_reason_types = [lemmatize_stemming(word) for word in cleaned_complaint_reason_types]


This removed all my words:  ['other']


In [16]:
dictionary = corpora.Dictionary(lemmetized_complaint_reason_types)
corpus = [dictionary.doc2bow(word) for word in lemmetized_complaint_reason_types]


In [17]:
topic_count = 25

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = topic_count, id2word=dictionary, passes=15)
topics = ldamodel.print_topics()

for topic in topics:
    print('found topic: ',topic)


found topic:  (22, '0.058*"share" + 0.058*"improp" + 0.058*"process" + 0.058*"payoff" + 0.058*"contact" + 0.058*"info" + 0.030*"charg" + 0.030*"wrong" + 0.030*"bank" + 0.030*"end"')
found topic:  (24, '0.110*"loan" + 0.084*"account" + 0.084*"chang" + 0.084*"repay" + 0.043*"credit" + 0.043*"limit" + 0.043*"steal" + 0.043*"check" + 0.043*"lose" + 0.043*"payment"')
found topic:  (11, '0.065*"loan" + 0.065*"collect" + 0.065*"foreclosur" + 0.065*"modif" + 0.065*"payment" + 0.065*"troubl" + 0.065*"process" + 0.065*"wallet" + 0.065*"mobil" + 0.010*"close"')
found topic:  (20, '0.138*"issu" + 0.104*"transact" + 0.104*"card" + 0.070*"protect" + 0.070*"credit" + 0.070*"get" + 0.037*"problem" + 0.035*"rate" + 0.035*"apr" + 0.035*"tran"')
found topic:  (16, '0.099*"statement" + 0.080*"debt" + 0.080*"notif" + 0.080*"represent" + 0.080*"fals" + 0.080*"write" + 0.003*"get" + 0.003*"problem" + 0.003*"custom" + 0.003*"loan"')
found topic:  (23, '0.154*"problem" + 0.103*"purchas" + 0.053*"featur" + 0.05

In [24]:
def get_topic_count_phrase_belongs_to(lda_model, dictionary, phrase):
    
    bow = lemmatize_stemming(stop_words_removed(simple_preprocess(phrase))) 
    # print(bow)
    a = lda_model.get_document_topics(dictionary.doc2bow(bow))
    arr = np.array(a)
    t1,t2 = arr.max(axis=0)
    return int(t1)

In [25]:
train_data['complaint_reason_encoded'] = train_data.complaint_reason.apply(lambda x: get_topic_count_phrase_belongs_to(ldamodel,dictionary, x))

This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed

This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed

In [29]:
train_data.complaint_reason_encoded.value_counts()

24    12336
13     7471
17     3755
18     3238
14     3088
11     2620
8      2083
1      1551
16     1051
0       948
23      880
22      853
5       698
10      673
3       505
7       369
20      366
6       227
15      189
21      107
12      100
9        78
4        37
19       36
2         7
Name: complaint_reason_encoded, dtype: int64

In [26]:
train_data[['complaint_reason', 'complaint_reason_encoded']]

Unnamed: 0,complaint_reason,complaint_reason_encoded
0,"Loan servicing, payments, escrow account",24
1,Incorrect information on credit report,13
2,Using a debit or ATM card,3
3,Cont'd attempts collect debt not owed,17
4,Payoff process,24
5,"Loan modification,collection,foreclosure",11
6,Incorrect information on credit report,13
7,Problems caused by my funds being low,10
8,Cont'd attempts collect debt not owed,17
9,Incorrect information on credit report,13


In [30]:
test_data['complaint_reason_encoded'] = test_data.complaint_reason.apply(lambda x: get_topic_count_phrase_belongs_to(ldamodel,dictionary, x))

This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed all my words:  ['other']
This removed

In [31]:
test_data.complaint_reason_encoded.value_counts()

24    5172
13    3343
17    1685
18    1379
14    1285
11    1060
8      867
1      641
16     431
23     400
22     378
0      378
10     310
5      285
3      211
7      165
20     164
15     115
6      114
21      42
9       42
12      39
4       19
19      14
2        4
Name: complaint_reason_encoded, dtype: int64

In [None]:
train_data[['complaint_reason','complaint_reason_encoded']].to_csv('dataset/train_data_complaint_reason.csv')

In [None]:
tr_data[['complaint_reason','complaint_reason_encoded']].to_csv('dataset/train_data_complaint_reason.csv')