## Complaints Classifier

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Import dataset
To get the dataset click [here](https://www.kaggle.com/cfpb/us-consumer-finance-complaints)

In [6]:
data = pd.read_csv('consumer_complaints.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
data.head()

Unnamed: 0,date_received,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company_public_response,company,state,zipcode,tags,consumer_consent_provided,submitted_via,date_sent_to_company,company_response_to_consumer,timely_response,consumer_disputed?,complaint_id
0,08/30/2013,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,U.S. Bancorp,CA,95993,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511074
1,08/30/2013,Mortgage,Other mortgage,"Loan servicing, payments, escrow account",,,,Wells Fargo & Company,CA,91104,,,Referral,09/03/2013,Closed with explanation,Yes,Yes,511080
2,08/30/2013,Credit reporting,,Incorrect information on credit report,Account status,,,Wells Fargo & Company,NY,11764,,,Postal mail,09/18/2013,Closed with explanation,Yes,No,510473
3,08/30/2013,Student loan,Non-federal student loan,Repaying your loan,Repaying your loan,,,"Navient Solutions, Inc.",MD,21402,,,Email,08/30/2013,Closed with explanation,Yes,Yes,510326
4,08/30/2013,Debt collection,Credit card,False statements or representation,Attempted to collect wrong amount,,,Resurgent Capital Services L.P.,GA,30106,,,Web,08/30/2013,Closed with explanation,Yes,Yes,511067


In [8]:
data = data[['product', 'consumer_complaint_narrative']]
data.isnull().sum()

product                              0
consumer_complaint_narrative    489151
dtype: int64

In [9]:
data = data.dropna()
data.head()

Unnamed: 0,product,consumer_complaint_narrative
190126,Debt collection,XXXX has claimed I owe them {$27.00} for XXXX ...
190135,Consumer Loan,Due to inconsistencies in the amount owed that...
190155,Mortgage,In XX/XX/XXXX my wages that I earned at my job...
190207,Mortgage,I have an open and current mortgage with Chase...
190208,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...


In [10]:
prod_categ = list(data['product'].unique())
print(len(prod_categ))
prod_category = []
for prod in data['product']:
    prod_category.append(prod_categ.index(prod))

11


In [11]:
data['prod_category'] = prod_category
data.head()

Unnamed: 0,product,consumer_complaint_narrative,prod_category
190126,Debt collection,XXXX has claimed I owe them {$27.00} for XXXX ...,0
190135,Consumer Loan,Due to inconsistencies in the amount owed that...,1
190155,Mortgage,In XX/XX/XXXX my wages that I earned at my job...,2
190207,Mortgage,I have an open and current mortgage with Chase...,2
190208,Mortgage,XXXX was submitted XX/XX/XXXX. At the time I s...,2


### Using TF-IDF Vectorizer to convert categorical data to numerical.

In [12]:
tfidf = TfidfVectorizer(sublinear_tf=True, norm='l2', min_df=5, stop_words='english', ngram_range=(1,2))

In [13]:
X = tfidf.fit_transform(data.consumer_complaint_narrative)
y = data.prod_category
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred)*100)

Accuracy:  87.04535249214189


In [15]:
y_pred

array([0, 2, 3, ..., 0, 4, 4])