# Import and inspect data

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
import string

In [2]:
df = pd.read_csv('../project_data/complaints_1.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162421 entries, 0 to 162420
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Date received                 162421 non-null  object 
 1   Product                       162421 non-null  object 
 2   Sub-product                   162421 non-null  object 
 3   Issue                         162421 non-null  object 
 4   Sub-issue                     162421 non-null  object 
 5   Consumer complaint narrative  162421 non-null  object 
 6   Company public response       162421 non-null  object 
 7   Company                       162421 non-null  object 
 8   State                         162421 non-null  object 
 9   ZIP code                      162421 non-null  object 
 10  Tags                          162421 non-null  object 
 11  Consumer consent provided?    162421 non-null  object 
 12  Submitted via                 162421 non-nul

In [4]:
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,7/15/2020,Credit card or prepaid card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,"-- -- -- -- -- 1. ) XXXX XXXX, XXXX a purchase...",,CAPITAL ONE FINANCIAL CORPORATION,FL,,,Consent provided,Web,7/15/2020,Closed with monetary relief,Yes,,3745924
1,6/18/2020,Credit card or prepaid card,Store credit card,Trouble using your card,Credit card company won't increase or decrease...,-- -- -- -- -- Forwarded message -- -- -- -- -...,,Alliance Data Card Services,MD,212XX,,Consent provided,Web,6/18/2020,Closed with non-monetary relief,Yes,,3705859
2,4/26/2020,Checking or savings account,Checking account,Managing an account,Problem using a debit or ATM card,-- -- - Forwarded Message -- -- - From : XXXX ...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,CA,910XX,,Consent provided,Web,4/26/2020,Closed with monetary relief,Yes,,3624519
3,11/10/2020,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information is missing that should be on the r...,"-- -- - XXXX, XXXX, XX/XX/2020 Payment Histori...",Company has responded to the consumer and the ...,Specialized Loan Servicing Holdings LLC,CA,,,Consent provided,Web,11/10/2020,Closed with explanation,Yes,,3945441
4,11/10/2020,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information is missing that should be on the r...,"-- -- - XXXX, XXXX, XX/XX/2020 Payment Histori...",Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",CA,,,Consent provided,Web,11/10/2020,Closed with explanation,Yes,,3945445


### Isolate relevant columns

In [5]:
df = df[['Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative']]

In [6]:
df = df.rename(columns={"Product": "product", "Sub-product": "subproduct", "Issue": "issue", "Sub-issue": "subissue", "Consumer complaint narrative": "narrative"})

In [7]:
df['product'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    91179
Debt collection                                                                 23150
Credit card or prepaid card                                                     15566
Mortgage                                                                        11626
Checking or savings account                                                      8934
Money transfer, virtual currency, or money service                               4602
Vehicle loan or lease                                                            3524
Payday loan, title loan, or personal loan                                        1979
Student loan                                                                     1861
Name: product, dtype: int64

In [8]:
df['subproduct'].value_counts()

Credit reporting                              90041
General-purpose credit card or charge card    11097
Conventional home mortgage                     7411
Checking account                               7239
Other debt                                     6566
Credit card debt                               5241
I do not know                                  4836
Medical debt                                   3861
Loan                                           2896
Store credit card                              2284
FHA mortgage                                   1887
Mobile or digital wallet                       1782
Government benefit card                        1377
Domestic (US) money transfer                   1352
Federal student loan servicing                 1194
VA mortgage                                    1133
Auto debt                                       979
Installment loan                                882
Other personal consumer report                  828
Other bankin

In [9]:
df['issue'].value_counts()

Incorrect information on your report                                                54347
Problem with a credit reporting company's investigation into an existing problem    30084
Attempts to collect debt not owed                                                   11489
Trouble during payment process                                                       5407
Improper use of your report                                                          5220
                                                                                    ...  
Incorrect exchange rate                                                                 4
Property was damaged or destroyed property                                              3
Was approved for a loan, but didn't receive money                                       2
Property was sold                                                                       1
Problem with an overdraft                                                               1
Name: issu

In [10]:
df['subissue'].value_counts()

Information belongs to someone else                            33331
None                                                           19753
Their investigation did not fix an error on your report        14294
Account status incorrect                                        7952
Account information incorrect                                   7116
                                                               ...  
Account sold or transferred to another company                     4
Problem with a check written from your prepaid card account        3
Threatened to turn you in to immigration or deport you             2
Problem with fees or penalties                                     2
Qualified for a better loan than the one offered                   1
Name: subissue, Length: 162, dtype: int64

Except for 'product', it doesn't seem there's enough data to train on. Maybe I could do subproduct, too, but I'd want to eliminate those with the lower value counts.

Note for subproduct, a high number is "I don't know." I could maybe classify those. 

In 'subissue', the second highest is 'None'. Also, there seems to be a lot of overlap between the categories, finer points that some consumers may not make. Example: "Debt is not yours" vs. "Debt was result of identity theft".

# Prepare Text

## Inspect first row

In [11]:
print(df.iloc[0])
text = df['narrative'][0]
text

product                             Credit card or prepaid card
subproduct           General-purpose credit card or charge card
issue           Problem with a purchase shown on your statement
subissue      Credit card company isn't resolving a dispute ...
narrative     -- -- -- -- -- 1. ) XXXX XXXX, XXXX a purchase...
Name: 0, dtype: object


'-- -- -- -- -- 1. ) XXXX XXXX, XXXX a purchase with EIO.com ( order # XXXX ) with 2 day shipping for the amount of {$57.00}. \n\n-- -- -- -- -- -2. ) Did not receive the product after a week, so I sent a followup email ; with the exact verbiage : " I paid for two days shipping and have not received my order \'\' on XXXX XXXX. \n\n-- -- -- -- -- 3. ) Company Responded on on XXXX XXXX : " Im sorry to inform you that due to unusually high order volume, your order has not shipped and XXXX not be for several weeks. XXXX has been out of stock since early XXXX due to high demand, although they are continuing to take orders and guaranteeing that we will receive orders we place with them in due time. As such we have masks on order but we do not have an exact shipping date from XXXX right now, however we can guarantee we will ship as soon as soon as XXXX delivers product to us. We have been getting small shipments and shipping those on a first come, first served basis, so we appreciate your pat

## Process Data

### Function to tokenize data and remove stopwords

In [2]:
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``']

In [5]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
list(string.punctuation)

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [6]:
list('abc')

['a', 'b', 'c']

In [13]:
def process_narrative(narrative):
    tokens = nltk.word_tokenize(narrative)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return stopwords_removed  

### Inspect and process first narrative 

In [14]:
text_words = process_narrative(text)
text_words[0:10]

['--', '--', '--', '--', '--', '1', 'xxxx', 'xxxx', 'xxxx', 'purchase']

In [15]:
# Add to stopwords list

stopwords_list += ['--', 'xxxx']

In [16]:
# Check out word counts

text_words = process_narrative(text)

word_counts = {}
for i in range(len(text_words)):
    word_counts[text_words[i]] = text_words.count(text_words[i])
word_counts

{'1': 1,
 'purchase': 5,
 'eio.com': 1,
 'order': 10,
 '2': 2,
 'day': 1,
 'shipping': 5,
 'amount': 4,
 '57.00': 6,
 '-2': 1,
 'receive': 2,
 'product': 2,
 'week': 1,
 'sent': 2,
 'followup': 1,
 'email': 2,
 'exact': 2,
 'verbiage': 1,
 'paid': 1,
 'two': 1,
 'days': 1,
 'received': 1,
 '3': 1,
 'company': 1,
 'responded': 1,
 'im': 1,
 'sorry': 1,
 'inform': 1,
 'due': 3,
 'unusually': 1,
 'high': 2,
 'volume': 1,
 'shipped': 1,
 'several': 1,
 'weeks': 1,
 'stock': 2,
 'since': 1,
 'early': 1,
 'demand': 1,
 'although': 1,
 'continuing': 1,
 'take': 1,
 'orders': 3,
 'guaranteeing': 1,
 'place': 2,
 'time': 1,
 'masks': 1,
 'date': 1,
 'right': 1,
 'however': 1,
 'guarantee': 1,
 'ship': 1,
 'soon': 2,
 'delivers': 1,
 'us': 2,
 'getting': 1,
 'small': 1,
 'shipments': 1,
 'first': 2,
 'come': 1,
 'served': 1,
 'basis': 1,
 'appreciate': 1,
 'patience': 1,
 'fulfill': 1,
 'quickly': 1,
 'recommend': 1,
 'keeping': 1,
 'lose': 1,
 'line': 1,
 'cancel': 3,
 'distributor': 1,
 'momen

It seems there are a lot of numbers. Update function to get rid of numbers from the list.  

Note: this function also gets rid of strings with punctuation in it like 'xx/xx/xxxx' or "n't".

In [17]:
def process_narrative(narrative):
    tokens = nltk.word_tokenize(narrative)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    
    # adding line to remove all tokens with numbers and punctuation
    stopwords_punc_and_numbers_removed = [word for word in stopwords_removed if word.isalpha()]
    
    return stopwords_punc_and_numbers_removed  

In [18]:
# Redoing processing with updated function
text_words = process_narrative(text)

### Make dictionary of word counts

In [19]:
FreqDist(text_words)

FreqDist({'order': 10, 'purchase': 5, 'shipping': 5, 'purchased': 5, 'amount': 4, 'canceled': 4, 'called': 4, 'transaction': 4, 'case': 4, 'due': 3, ...})

In [20]:
type(FreqDist(text_words))

nltk.probability.FreqDist

In [21]:
FreqDist(text_words).most_common(10)

[('order', 10),
 ('purchase', 5),
 ('shipping', 5),
 ('purchased', 5),
 ('amount', 4),
 ('canceled', 4),
 ('called', 4),
 ('transaction', 4),
 ('case', 4),
 ('due', 3)]

Note how calling `most_common()` creates a list of tuples.

In [22]:
FreqDist(text_words).plot(10)

<Figure size 640x480 with 1 Axes>

<matplotlib.axes._subplots.AxesSubplot at 0x1b9216bd6a0>

### Trying process on the next two rows

#### df.iloc[1]

In [23]:
df.iloc[1][0:4]

product                             Credit card or prepaid card
subproduct                                    Store credit card
issue                                   Trouble using your card
subissue      Credit card company won't increase or decrease...
Name: 1, dtype: object

In [24]:
text = df['narrative'][1]
text

'-- -- -- -- -- Forwarded message -- -- -- -- - From : XXXX XXXX XXXX Date : Tue, XX/XX/XXXXat XXXX XXXX Subject : Please Investigate Comenity Bank Retailers card scam To : XXXX Sent from my XXXX Hello my name is XXXX XXXX, I am being scammed by Comenity bank a credit card provider for companies The Childrens place, New York & Co. , Forever 21 and Victoria Secret. My original credit from XXXX was {$500.00} Comenity bank then lowers my limit to {$300.00} and began to charge overage fees along with late fees. I then began to pay close attention to my other cards to find that my limits were also changed on them as well incurring overages and late fees. \nI reached out to the company Comenity bank they stated that they would change my credit limit to its original limits but did not. I reached out to them again and told them I will not summit any payment until my accounts are corrected. Comenity bank credit cards has impacted my credit scores plummeted to a negative status. Im currently pay

In [25]:
text_words = process_narrative(text)

In [26]:
FreqDist(text_words).most_common(30)

[('credit', 7),
 ('fees', 6),
 ('comenity', 5),
 ('bank', 5),
 ('company', 4),
 ('late', 3),
 ('limits', 3),
 ('im', 3),
 ('card', 2),
 ('name', 2),
 ('original', 2),
 ('limit', 2),
 ('began', 2),
 ('overage', 2),
 ('cards', 2),
 ('well', 2),
 ('reached', 2),
 ('accounts', 2),
 ('due', 2),
 ('forwarded', 1),
 ('message', 1),
 ('date', 1),
 ('tue', 1),
 ('subject', 1),
 ('please', 1),
 ('investigate', 1),
 ('retailers', 1),
 ('scam', 1),
 ('sent', 1),
 ('hello', 1)]

#### df.iloc[2]

In [27]:
df.iloc[2][0:4]

product             Checking or savings account
subproduct                     Checking account
issue                       Managing an account
subissue      Problem using a debit or ATM card
Name: 2, dtype: object

In [28]:
text = df['narrative'][2]
text_words = process_narrative(text)
FreqDist(text_words).most_common(30)

[('wells', 7),
 ('fargo', 7),
 ('clearly', 5),
 ('fraudulent', 5),
 ('account', 4),
 ('legal', 3),
 ('agencies', 3),
 ('branch', 3),
 ('cards', 3),
 ('damages', 3),
 ('small', 3),
 ('claims', 3),
 ('lawsuit', 3),
 ('payment', 2),
 ('actually', 2),
 ('court', 2),
 ('charges', 2),
 ('debit', 2),
 ('required', 2),
 ('take', 2),
 ('business', 2),
 ('consumer', 2),
 ('protection', 2),
 ('immediately', 2),
 ('also', 2),
 ('punitive', 2),
 ('conduct', 2),
 ('forwarded', 1),
 ('message', 1),
 ('cc', 1)]

In [29]:
df.iloc[2]

product                             Checking or savings account
subproduct                                     Checking account
issue                                       Managing an account
subissue                      Problem using a debit or ATM card
narrative     -- -- - Forwarded Message -- -- - From : XXXX ...
Name: 2, dtype: object

# Combine categories and create new dataframes

## Inspect 

In [30]:
# Inspect products again
df['product'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    91179
Debt collection                                                                 23150
Credit card or prepaid card                                                     15566
Mortgage                                                                        11626
Checking or savings account                                                      8934
Money transfer, virtual currency, or money service                               4602
Vehicle loan or lease                                                            3524
Payday loan, title loan, or personal loan                                        1979
Student loan                                                                     1861
Name: product, dtype: int64

What is "Money transfer, virtual currency, or money service"?

In [31]:
df[df['product'] == "Money transfer, virtual currency, or money service"].head(10)

Unnamed: 0,product,subproduct,issue,subissue,narrative
34,"Money transfer, virtual currency, or money ser...",Mobile or digital wallet,Unauthorized transactions or other transaction...,,- negative {$4800.00} XX/XX/2020 Transfer Load...
84,"Money transfer, virtual currency, or money ser...",Domestic (US) money transfer,Money was not available when promised,,""" I want you to call me not email me about thi..."
126,"Money transfer, virtual currency, or money ser...",Virtual currency,Unexpected or other fees,,"( 1 ) In the month of XXXX, I made 9 conversio..."
467,"Money transfer, virtual currency, or money ser...",Mobile or digital wallet,"Managing, opening, or closing your mobile wall...",,"( XXXX # XXXX, # XXXX, # XXXX, # XXXX, # XXXX..."
597,"Money transfer, virtual currency, or money ser...",Mobile or digital wallet,"Managing, opening, or closing your mobile wall...",,**BEWARE** VENMO Has horrible customer service...
633,"Money transfer, virtual currency, or money ser...",Mobile or digital wallet,"Managing, opening, or closing your mobile wall...",,*I am able to prove identity with my state ide...
635,"Money transfer, virtual currency, or money ser...",Domestic (US) money transfer,Fraud or scam,,*THIS IS A FOLLOW UP TO MY PREVIOUS CLAIM TO B...
642,"Money transfer, virtual currency, or money ser...",International money transfer,Other transaction problem,,". Date : XXXX, 2020. Amount : Total XXXX USD. ..."
668,"Money transfer, virtual currency, or money ser...",Domestic (US) money transfer,Other transaction problem,,".On XX/XX/XXXX, ( not sure if this ' the right..."
695,"Money transfer, virtual currency, or money ser...",Mobile or digital wallet,Unauthorized transactions or other transaction...,,`Y'all send the cash-out deposit to the wrong ...


Seems to be about Venmo, digital transactions, international transfers, etc. It's a bit of it's own thing. I'll keep it for now, but there are only 4,602 entries. But I'll fold into "checking and savings" in general, which has only 9,000.

## Combine categories

**Tasks**

- Rename "credit_reporting"  
- Rename "debt_collection"  
- Rename "credit_card"
- Rename "mortgage"
- Combine "checking" and "money transfer" into "retail_banking"
- Combine the loans into "loans"

In [32]:
df['product'].value_counts()

Credit reporting, credit repair services, or other personal consumer reports    91179
Debt collection                                                                 23150
Credit card or prepaid card                                                     15566
Mortgage                                                                        11626
Checking or savings account                                                      8934
Money transfer, virtual currency, or money service                               4602
Vehicle loan or lease                                                            3524
Payday loan, title loan, or personal loan                                        1979
Student loan                                                                     1861
Name: product, dtype: int64

In [33]:
df['product'].replace({'Credit reporting, credit repair services, or other personal consumer reports': 'credit_reporting',
                       'Debt collection': 'debt_collection',
                       'Credit card or prepaid card': 'credit_card',
                       'Mortgage': 'mortgage',
                       'Checking or savings account': 'retail_banking',
                       'Money transfer, virtual currency, or money service': 'retail_banking',
                       'Vehicle loan or lease': 'loans',
                       'Payday loan, title loan, or personal loan': 'loans',
                       'Student loan': 'loans'}, inplace=True)

In [34]:
df['product'].value_counts()

credit_reporting    91179
debt_collection     23150
credit_card         15566
retail_banking      13536
mortgage            11626
loans                7364
Name: product, dtype: int64

Mortgage and loans are the smallest. Since they're both types of loans, I'll combine them.

In [35]:
df['product'].replace({'mortgage': 'mortgages_and_loans',
                       'loans': 'mortgages_and_loans'}, inplace=True)
df['product'].value_counts()

credit_reporting       91179
debt_collection        23150
mortgages_and_loans    18990
credit_card            15566
retail_banking         13536
Name: product, dtype: int64

## Create new dataframes

In [61]:
credit_reporting_df = df[df['product'] == 'credit_reporting']
debt_collection_df = df[df['product'] == 'debt_collection']
mortgages_and_loans_df = df[df['product'] == 'mortgages_and_loans']
credit_card_df = df[df['product'] == 'credit_card']
retail_banking_df = df[df['product'] == 'retail_banking']

## Concatenate all the narratives into a single string per class

In [76]:
credit_reporting_df.head()

Unnamed: 0,product,subproduct,issue,subissue,narrative
3,credit_reporting,Credit reporting,Incorrect information on your report,Information is missing that should be on the r...,"-- -- - XXXX, XXXX, XX/XX/2020 Payment Histori..."
4,credit_reporting,Credit reporting,Incorrect information on your report,Information is missing that should be on the r...,"-- -- - XXXX, XXXX, XX/XX/2020 Payment Histori..."
5,credit_reporting,Credit reporting,Incorrect information on your report,Information is missing that should be on the r...,"-- -- - XXXX, XXXX, XX/XX/2020 Payment Histori..."
6,credit_reporting,Credit reporting,Unable to get your credit report or credit score,Problem getting your free annual credit report,- - ______XXXX XXXX XXXX XXXX XXXX _____ XXXX...
7,credit_reporting,Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,-- - USDOE/XXXX - This account is being repor...


In [137]:
def concat_narratives(df):
    # concat narratives
    narr = ''
    for i in range(len(df)):
        narr += df.iloc[i]['narrative']
    print('Finished Concatenation')
    return narr

In [138]:
credit_reporting_text = concat_narratives(credit_reporting_df)
credit_reporting_text_processed = process_narrative(credit_reporting_text)

Finished Concatenation


In [139]:
debt_collection_text = concat_narratives(debt_collection_df)
debt_collection_text_processed = process_narrative(debt_collection_text)

Finished Concatenation


In [140]:
mortgages_and_loans_text = concat_narratives(mortgages_and_loans_df)
mortgages_and_loans_text_processed = process_narrative(mortgages_and_loans_text)

Finished Concatenation


In [141]:
credit_card_text = concat_narratives(credit_card_df)
credit_card_text_processed = process_narrative(credit_card_text)

Finished Concatenation


In [142]:
retail_banking_text = concat_narratives(retail_banking_df)
retail_banking_text_processed = process_narrative(retail_banking_text)

Finished Concatenation


### Saving the text files

In [151]:
text_file = open('../project_data/credit_reporting_text.txt', 'w')
text_file.write(credit_reporting_text)
text_file.close()

In [152]:
text_file = open('../project_data/debt_collection_text.txt', 'w')
text_file.write(debt_collection_text)
text_file.close()

In [158]:
text_file = open('../project_data/mortgages_and_loans_text.txt', 'w')
mortgages_and_loans_text = mortgages_and_loans_text.replace('\x82', '')
text_file.write(mortgages_and_loans_text)
text_file.close()

In [159]:
text_file = open('../project_data/credit_card_text.txt', 'w')
text_file.write(credit_card_text)
text_file.close()

In [161]:
text_file = open('../project_data/retail_banking_text.txt', 'w')
text_file.write(retail_banking_text)
text_file.close()

### Saving the processed text (lists) files

In [167]:
temp = pd.DataFrame(credit_reporting_text_processed)
temp.to_csv('../project_data/credit_reporting_text_processed.csv')

In [169]:
temp = pd.DataFrame(debt_collection_text_processed)
temp.to_csv('../project_data/debt_collection_text_processed.csv')

In [170]:
temp = pd.DataFrame(mortgages_and_loans_text_processed)
temp.to_csv('../project_data/mortgages_and_loans_text_processed.csv')

In [171]:
temp = pd.DataFrame(credit_card_text_processed)
temp.to_csv('../project_data/credit_card_text_processed.csv')

In [172]:
temp = pd.DataFrame(retail_banking_text_processed)
temp.to_csv('../project_data/retail_banking_text_processed.csv')

## Check `FreqDist()`

In [173]:
FreqDist(debt_collection_text_processed).most_common(30)

[('debt', 40246),
 ('credit', 38719),
 ('account', 29301),
 ('collection', 17936),
 ('company', 17088),
 ('report', 16215),
 ('information', 16006),
 ('received', 11725),
 ('letter', 10413),
 ('never', 10049),
 ('reporting', 9986),
 ('would', 9690),
 ('sent', 9636),
 ('payment', 8160),
 ('called', 8094),
 ('amount', 8045),
 ('also', 7968),
 ('told', 7784),
 ('call', 7477),
 ('pay', 7475),
 ('provide', 7354),
 ('paid', 7249),
 ('agency', 7229),
 ('time', 6998),
 ('number', 6847),
 ('phone', 6826),
 ('please', 6585),
 ('consumer', 6425),
 ('original', 6134),
 ('said', 6117)]

In [143]:
FreqDist(credit_reporting_text_processed).most_common(30)

[('credit', 222999),
 ('report', 122959),
 ('account', 113703),
 ('information', 106416),
 ('reporting', 79776),
 ('accounts', 70117),
 ('consumer', 46023),
 ('dispute', 34859),
 ('identity', 33370),
 ('please', 31687),
 ('remove', 29415),
 ('days', 29375),
 ('also', 28482),
 ('sent', 28283),
 ('payment', 27931),
 ('received', 27686),
 ('file', 27113),
 ('date', 26520),
 ('never', 26385),
 ('inaccurate', 26365),
 ('bureaus', 26349),
 ('collection', 26290),
 ('theft', 26049),
 ('items', 25810),
 ('debt', 25767),
 ('letter', 25513),
 ('fraudulent', 25264),
 ('balance', 25215),
 ('late', 25053),
 ('reported', 24957)]

In [144]:
FreqDist(credit_card_text_processed).most_common(30)

[('card', 34158),
 ('credit', 32460),
 ('account', 25410),
 ('bank', 13341),
 ('would', 13000),
 ('payment', 12410),
 ('one', 11079),
 ('received', 10176),
 ('told', 10093),
 ('called', 9987),
 ('time', 8810),
 ('never', 8088),
 ('balance', 7953),
 ('back', 7860),
 ('due', 7395),
 ('made', 7297),
 ('dispute', 7181),
 ('said', 7090),
 ('charge', 7038),
 ('call', 6924),
 ('get', 6914),
 ('charges', 6687),
 ('late', 6541),
 ('company', 6486),
 ('could', 6251),
 ('pay', 6060),
 ('customer', 5927),
 ('interest', 5856),
 ('information', 5725),
 ('closed', 5617)]

In [174]:
FreqDist(retail_banking_text_processed).most_common(30)

[('account', 40415),
 ('bank', 24655),
 ('money', 15640),
 ('would', 10493),
 ('told', 9679),
 ('funds', 8462),
 ('back', 7777),
 ('check', 7750),
 ('called', 7733),
 ('received', 7272),
 ('card', 7146),
 ('said', 6495),
 ('get', 6199),
 ('days', 6008),
 ('sent', 5853),
 ('time', 5826),
 ('one', 5714),
 ('paypal', 5601),
 ('checking', 5494),
 ('call', 5475),
 ('could', 5449),
 ('number', 5386),
 ('chase', 5091),
 ('never', 4961),
 ('information', 4945),
 ('phone', 4937),
 ('email', 4896),
 ('transaction', 4843),
 ('also', 4648),
 ('transfer', 4573)]

In [175]:
FreqDist(mortgages_and_loans_text_processed).most_common(30)

[('loan', 38616),
 ('payment', 29808),
 ('mortgage', 25916),
 ('would', 22588),
 ('account', 17651),
 ('payments', 17272),
 ('told', 15634),
 ('credit', 15599),
 ('received', 14280),
 ('company', 13216),
 ('time', 12870),
 ('called', 12308),
 ('due', 11972),
 ('bank', 11396),
 ('pay', 11113),
 ('get', 10975),
 ('call', 10799),
 ('back', 10706),
 ('paid', 10602),
 ('sent', 9950),
 ('amount', 9737),
 ('made', 9054),
 ('said', 8990),
 ('home', 8927),
 ('information', 8890),
 ('never', 8811),
 ('could', 8432),
 ('one', 8303),
 ('interest', 8063),
 ('also', 7991)]