In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import csv
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import sys
!{sys.executable} -m pip install summa
from summa import keywords
from summa import summarizer

  from ._conv import register_converters as _register_converters




In [2]:
complaints = pd.read_csv("us-consumer-finance-complaint-database/\
consumer_complaints.csv",usecols=[1,5],\
                         dtype={'consumer_complaint_narrative': object})
print(complaints.shape)
complaints.dropna(inplace=True)
complaints = complaints.reset_index(drop=True)
print(complaints.shape)

(555957, 2)
(66806, 2)


In [3]:
complaints.head()

Unnamed: 0,product,consumer_complaint_narrative
0,Debt collection,XXXX has claimed I owe them {$27.00} for XXXX ...
1,Consumer Loan,Due to inconsistencies in the amount owed that...
2,Mortgage,In XX/XX/XXXX my wages that I earned at my job...
3,Mortgage,I have an open and current mortgage with Chase...
4,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...


In [4]:
from collections import Counter
counts = Counter(complaints['product'])
# Counter({'Debt collection': 17552, 'Mortgage': 14919, 'Credit reporting': 12526, 'Credit card': 7929, 'Bank account or service': 5711, 'Consumer Loan': 3678, 'Student loan': 2128, 'Prepaid card': 861, 'Payday loan': 726, 'Money transfers': 666, 'Other financial service': 110})

In [5]:
for complaint in complaints[complaints['product'] == 'Debt collection'].consumer_complaint_narrative.sample(3, random_state=1):
    print(complaint)
    print('________________________________________')

Had 4 phone calls in one day to my cell phone about debt collecting. 
They are asking to talk to a XXXX XXXX ... ... Not me ... .Never heard of him. They got the wrong number! I keep explaining to them you got the wrong number and they get very rude! 

________________________________________
My sister provided Hyundai Motor Finance my phone # while hers was not working. I received a call from their XXXX number and when advised my sister was not available and asked who was calling. Female declined to identify herself or her company. I advised that the cell phone being called belongs to me and they no longer have my permission to dial my number manually or via their automated dialer. Female then hung up on me. My sister took care of the past due payment ( was just an oversight ) and we assumed everything was good. payment rec by HMF on XXXX/XXXX/15. On XXXX/XXXX/15 I recevied another call from HMF. I had my sister call back and they advised her account current and no record of call. Adv

In [6]:
labels = complaints['product'].unique().tolist()
train = complaints.copy()
print(labels)
print(train.head())
#train = train.reset_index(drop=True)
for i in range(len(train)):
    train.loc[i,'product_num'] = labels.index(train.loc[i,'product'])
y = train['product_num'].copy()
print(train.head())
train = train.drop(['product_num','product'],1)

['Debt collection', 'Consumer Loan', 'Mortgage', 'Credit card', 'Credit reporting', 'Student loan', 'Bank account or service', 'Payday loan', 'Money transfers', 'Other financial service', 'Prepaid card']
           product                       consumer_complaint_narrative
0  Debt collection  XXXX has claimed I owe them {$27.00} for XXXX ...
1    Consumer Loan  Due to inconsistencies in the amount owed that...
2         Mortgage  In XX/XX/XXXX my wages that I earned at my job...
3         Mortgage  I have an open and current mortgage with Chase...
4         Mortgage  XXXX was submitted XX/XX/XXXX. At the time I s...
           product                       consumer_complaint_narrative  \
0  Debt collection  XXXX has claimed I owe them {$27.00} for XXXX ...   
1    Consumer Loan  Due to inconsistencies in the amount owed that...   
2         Mortgage  In XX/XX/XXXX my wages that I earned at my job...   
3         Mortgage  I have an open and current mortgage with Chase...   
4         M

In [7]:
train.head()

Unnamed: 0,consumer_complaint_narrative
0,XXXX has claimed I owe them {$27.00} for XXXX ...
1,Due to inconsistencies in the amount owed that...
2,In XX/XX/XXXX my wages that I earned at my job...
3,I have an open and current mortgage with Chase...
4,XXXX was submitted XX/XX/XXXX. At the time I s...


In [8]:
y.head()

0    0.0
1    1.0
2    2.0
3    2.0
4    2.0
Name: product_num, dtype: float64

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train,y, test_size=0.4)
#X_train = X_train.reset_index(drop=True)
#y_train = y_train.reset_index(drop=True)

In [10]:
X_train.head()

Unnamed: 0,consumer_complaint_narrative
36678,I applied over the phone for a mortgage with B...
20113,1. I obtained loan of {$15000.00} on Credit Ca...
52799,XXXX XXXX XXXX XXXX XXXX XXXX XXXX VA XXXX SS ...
49657,XXXX UniversityXXXX XXXXI was a University stu...
3141,This is not a duplicate rather a corrected ver...


In [11]:
y_train.head()

36678    2.0
20113    3.0
52799    4.0
49657    5.0
3141     2.0
Name: product_num, dtype: float64

In [12]:
categories = y_train.unique()

count_vector = CountVectorizer()
X_train_count = count_vector.fit_transform(X_train['consumer_complaint_narrative'])

X_train_count.shape

count_vector.vocabulary_.get(u'movie')

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_count)

X_train_tf = tf_transformer.transform(X_train_count)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)

# Linear Support Vector Classification

In [13]:
lsvc = LinearSVC().fit(X_train_count,y_train)

In [14]:
X_test_count = count_vector.transform(X_test['consumer_complaint_narrative'])
X_test_tfidf = tfidf_transformer.transform(X_test_count)
ans = lsvc.predict(X_test_tfidf)

In [15]:
accuracy_score(y_test,ans)

0.5602290162032706

# Summarizing the text

In [16]:
print((complaints['consumer_complaint_narrative'][4]))
print("\n\nSummary:\n\n")

print(summarizer.\
      summarize((complaints['consumer_complaint_narrative'][4])))

XXXX was submitted XX/XX/XXXX. At the time I submitted this complaint, I had dealt with Rushmore Mortgage directly endeavoring to get them to stop the continuous daily calls I was receiving trying to collect on a mortgage for which I was not responsible due to bankruptcy. They denied having knowledge of the bankruptcy, even though I had spoken with them about it repeatedly and had written them repeatedly referencing the bankruptcy requesting them to cease the pursuit, they continued to do so. When they were unable to trick me into paying, force me into paying in retaliation they placed reported to my credit bureaus a past due mortgage amount that had been discharged in Federal Court. On XX/XX/XXXX Rushmore responded the referenced complaint indicating that they would remove the reporting from my bureau, yet it is still there now in XX/XX/XXXX. I would like them to remove it immediately and send me a letter indicating that it should not have been there in the first place and they are go