In [1]:
import numpy as np
import pandas as pd
import datetime as dt

In [3]:
# Text
import nltk, re, string
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Make sure the NLTK data packages used further down are installed locally:
# 'punkt' for the sentence/word tokenizers and 'stopwords' for the English
# stop-word list. The last call's return value is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Jiyoung
[nltk_data]     Sim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Jiyoung
[nltk_data]     Sim\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
# Load the raw complaints export, asking pandas to parse both date columns on read.
df = pd.read_csv(
    'Consumer_Complaints.csv',
    parse_dates=['Date received', 'Date sent to company'],
)

In [37]:
# Reduce 'Date received' to plain datetime.date values so it can be compared
# against dt.date(...) cut-offs in later cells.
# Going through pd.to_datetime keeps the cell safe to re-run: once the column
# already holds date objects, a bare `.dt` accessor raises AttributeError.
df['Date received'] = pd.to_datetime(df['Date received']).dt.date

In [76]:
# Fold the 'I do not know' sub-product into 'Other debt'.
is_unknown = df['Sub-product'].eq('I do not know')
df.loc[is_unknown, 'Sub-product'] = 'Other debt'

In [119]:
# For these products the sub-product is simply set to the product name itself.
for product_name in ('Credit reporting', 'Credit card', 'Payday loan'):
    df.loc[df['Product'] == product_name, 'Sub-product'] = product_name

In [120]:
# Sub-product labels that appear on complaints received after 1 Nov 2018.
received_after_cutoff = df['Date received'] > dt.date(2018, 11, 1)
subproducts = df.loc[received_after_cutoff, 'Sub-product'].unique()
len(subproducts)

47

In [7]:
# Keep only complaints whose sub-product is one of the labels in `subproducts`.
# Series.isin performs the membership test in one vectorised pass instead of a
# Python-level `x in array` scan for every row. Rows with a missing sub-product
# are excluded explicitly; the dropna cell that follows would remove them anyway.
in_scope = df['Sub-product'].isin(subproducts) & df['Sub-product'].notna()

# .copy() gives df2 its own data, so the column assignments and dropna in later
# cells do not operate on a slice of df (avoids SettingWithCopyWarning).
df2 = df.loc[in_scope, ['Sub-product', 'Consumer complaint narrative']].copy()
df2.head()

Unnamed: 0,Sub-product,Consumer complaint narrative
0,General-purpose credit card or charge card,CITI BANK in his response to XXXX asked me idi...
1,Other debt,THIS ACCOUNT IS A RESULT OF IDENTITY THEFT.
2,Other debt,Pro Collect has sent documents that dont match...
3,Medical debt,On XX/XX/2019 I sent PlusFour Inc a certified ...
4,Credit card debt,Client Services is reporting negative informat...


In [8]:
# Drop every row that has a missing value in any column (i.e. complaints without
# a sub-product or without a narrative).
# `axis` is passed by keyword because pandas >= 2.0 rejects positional arguments
# to dropna; rebinding instead of inplace=True avoids mutating a slice of df.
df2 = df2.dropna(axis=0)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode each sub-product name as an integer class id in a new 'labels' column.
# The fitted encoder stays available as `le` for later use.
le = LabelEncoder()
df2['labels'] = le.fit_transform(df2['Sub-product'])
df2.head()

Unnamed: 0,Sub-product,Consumer complaint narrative,labels
0,General-purpose credit card or charge card,CITI BANK in his response to XXXX asked me idi...,14
1,Other debt,THIS ACCOUNT IS A RESULT OF IDENTITY THEFT.,28
2,Other debt,Pro Collect has sent documents that dont match...,28
3,Medical debt,On XX/XX/2019 I sent PlusFour Inc a certified ...,23
4,Credit card debt,Client Services is reporting negative informat...,5


In [10]:
### NEXT STEP: USE LEMMATIZER INSTEAD OF STEMMER TO IMPROVE THE RESULT

stemmer = PorterStemmer()

# Built once up front: stopwords.words('english') returns a full list on every
# call, so evaluating it per token repeats that work and then scans the list
# linearly. A frozenset gives the same membership answers with O(1) lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_HAS_LETTER = re.compile('[a-zA-Z]')
_PUNCTUATION = re.compile(r'[^\w\s]')


def tokenize_and_stem(text):
    """Lower-case, tokenize, filter and stem one complaint narrative.

    Steps:
      1. lower-case the text and split it into sentences, then word tokens;
      2. keep only tokens that contain at least one ASCII letter and are not
         English stop words (this drops purely numeric tokens and raw
         punctuation);
      3. strip every remaining non-word, non-whitespace character from each
         kept token;
      4. reduce each token to its Porter stem.

    Parameters
    ----------
    text : str
        Raw narrative text.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    tokens = [
        word
        for sent in nltk.sent_tokenize(text.lower())
        for word in nltk.word_tokenize(sent)
    ]
    # The stop-word test runs on the raw token, before punctuation is stripped,
    # exactly as before; only the lookup structure changed.
    kept = [
        _PUNCTUATION.sub('', token)
        for token in tokens
        if _HAS_LETTER.search(token) and token not in _ENGLISH_STOPWORDS
    ]
    return [stemmer.stem(token) for token in kept]

In [12]:
# Run the tokenizer/stemmer over every narrative; each cell of the new column
# holds that narrative's list of stems.
df2['tokenized narrative'] = df2['Consumer complaint narrative'].map(tokenize_and_stem)

In [13]:
# Preview the first rows to check the new 'tokenized narrative' column.
df2.head()

Unnamed: 0,Sub-product,Consumer complaint narrative,labels,tokenized narrative
0,General-purpose credit card or charge card,CITI BANK in his response to XXXX asked me idi...,14,"[citi, bank, respons, xxxx, ask, idiot, questi..."
1,Other debt,THIS ACCOUNT IS A RESULT OF IDENTITY THEFT.,28,"[account, result, ident, theft]"
2,Other debt,Pro Collect has sent documents that dont match...,28,"[pro, collect, sent, document, dont, match, mo..."
3,Medical debt,On XX/XX/2019 I sent PlusFour Inc a certified ...,23,"[xxxx2019, sent, plusfour, inc, certifi, lette..."
4,Credit card debt,Client Services is reporting negative informat...,5,"[client, servic, report, neg, inform, credit, ..."


In [109]:
import re

# Runs of two or more 'x' are the redaction masks left in the narratives
# (e.g. 'xxxx', 'xxxx2019'). The previous pattern '[xx]+' was a character class
# and therefore deleted EVERY 'x', corrupting ordinary stems ('tax' -> 'ta').
_MASK_RUN = re.compile(r'x{2,}')
# Delimiters of a stringified Python list: quotes, whitespace, brackets, commas.
_LIST_REPR_DELIMS = re.compile(r"['\s\[\],]")


def _clean_stemmed_tokens(tokens):
    """Return one space-separated string of stems with redaction masks removed.

    `tokens` may be a real list of stems, or the string form of such a list
    (which is what the column contains after a CSV round trip). Tokens that
    become empty once the mask is removed are dropped.
    """
    if isinstance(tokens, str):
        tokens = _LIST_REPR_DELIMS.split(tokens)
    unmasked = (_MASK_RUN.sub('', token) for token in tokens)
    return ' '.join(filter(None, unmasked))


df2['stemmed narrative'] = df2['tokenized narrative'].apply(_clean_stemmed_tokens)

In [142]:
# Preview the result of the 'stemmed narrative' clean-up.
# NOTE(review): in the saved output 'tokenized narrative' prints as quoted list
# text, which suggests df2 had been reloaded from CSV at this point - confirm.
df2.head()

Unnamed: 0,Sub-product,Consumer complaint narrative,labels,tokenized narrative,stemmed narrative
0,General-purpose credit card or charge card,CITI BANK in his response to XXXX asked me idi...,14,"['citi', 'bank', 'respons', 'xxxx', 'ask', 'id...",citi bank respons ask idiot question inapplic ...
1,Other debt,THIS ACCOUNT IS A RESULT OF IDENTITY THEFT.,28,"['account', 'result', 'ident', 'theft']",account result ident theft
2,Other debt,Pro Collect has sent documents that dont match...,28,"['pro', 'collect', 'sent', 'document', 'dont',...",pro collect sent document dont match move date...
3,Medical debt,On XX/XX/2019 I sent PlusFour Inc a certified ...,23,"['xxxx2019', 'sent', 'plusfour', 'inc', 'certi...",2019 sent plusfour inc certifi letter via usp ...
4,Credit card debt,Client Services is reporting negative informat...,5,"['client', 'servic', 'report', 'neg', 'inform'...",client servic report neg inform credit report ...


In [5]:
# Join each narrative's tokens into a single space-separated string.
# NOTE(review): this assigns the same 'stemmed narrative' column that the regex
# clean-up cell above also writes, so whichever cell runs last wins - confirm
# which version is intended.
df2['stemmed narrative'] = df2['tokenized narrative'].map(' '.join)

In [117]:
# Persist the processed frame. The target file is 'df2.csv', so the frame
# written must be df2 (labels + tokenized/stemmed narratives); writing `df`
# here saved the unprocessed complaints table under that name instead.
df2.to_csv('df2.csv', index=False)