# Import libraries and load text

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
import string
import plotly
import plotly.graph_objects as go
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Prepping dataframe as done in 1_EDA.ipynb

# Map the verbose product labels onto five shorter class names
# (several original labels share one class).
product_label_map = {
    'Credit reporting, credit repair services, or other personal consumer reports': 'credit_reporting',
    'Debt collection': 'debt_collection',
    'Credit card or prepaid card': 'credit_card',
    'Mortgage': 'mortgages_and_loans',
    'Checking or savings account': 'retail_banking',
    'Money transfer, virtual currency, or money service': 'retail_banking',
    'Vehicle loan or lease': 'mortgages_and_loans',
    'Payday loan, title loan, or personal loan': 'mortgages_and_loans',
    'Student loan': 'mortgages_and_loans',
}

df = pd.read_csv('../project_data/complaints_1.csv')
df = df[['Product', 'Consumer complaint narrative']]
df = df.rename(columns={"Product": "product", "Consumer complaint narrative": "narrative"})
# Assign the result back instead of calling replace(inplace=True) on the
# selected column: the in-place form is a chained assignment that pandas warns
# about and that no longer updates `df` once Copy-on-Write is enabled.
df['product'] = df['product'].replace(product_label_map)

In [3]:
# Preview the first rows to check the renamed columns and relabelled products
df.head()

Unnamed: 0,product,narrative
0,credit_card,"-- -- -- -- -- 1. ) XXXX XXXX, XXXX a purchase..."
1,credit_card,-- -- -- -- -- Forwarded message -- -- -- -- -...
2,retail_banking,-- -- - Forwarded Message -- -- - From : XXXX ...
3,credit_reporting,"-- -- - XXXX, XXXX, XX/XX/2020 Payment Histori..."
4,credit_reporting,"-- -- - XXXX, XXXX, XX/XX/2020 Payment Histori..."


In [3]:
# Show the full raw narrative of the row labelled 3 (single row/column lookup)
df.loc[3, 'narrative']

'-- -- - XXXX, XXXX, XX/XX/2020 Payment Histories Missing On My Credit Report -- -- Specialized Loan Servicing ( SLS ) has made the mistake to put my account under forbearance in XXXX 2020, without my authorization or knowledge ( As a matter of fact, I have automatic payment setup. In each month \'s XXXX, the monthly mortgage was paid in full ). \n\nI have noticed this issue ( my account was marked as " forbearance \'\' ) in XX/XX/2020 on my credit report ( when I tried to get a new home loan from another new bank ), I have contacted them immediately and asked them to fix this error and provide me a letter ( details please see below asks ). The " forbearance \'\' issue itself seemed to be fixed by now. However, on my credit report, the payment histories for XXXX, XXXX, XX/XX/2020 were all missing. My new bank will not be able to approve my new loan because of these issues of missing payment histories. \n\nI have contacted Specialized Loan Servicing more than 20 times since XX/XX/XXXX b

In [5]:
# Number of rows (complaints) in the dataframe
len(df)

162421

# Define the stopword list and helper functions to tokenize, remove stopwords, and lemmatize narratives

In [6]:
# Tokens to discard: NLTK's English stopwords, each punctuation character from
# string.punctuation, a few multi-character quote/ellipsis tokens, and the
# '--' and 'xxxx' filler tokens that appear in the narratives.
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "''", '""', '...', '``',
    '--', 'xxxx',
]

In [7]:
# function to tokenize data and remove stopwords
def process_narrative(narrative):
    """Tokenize one narrative and keep only its lower-cased content words.

    Args:
        narrative: Raw complaint text. Any non-string value (for example a
            missing narrative stored as NaN) is treated as having no words.

    Returns:
        A list of lower-cased tokens, in their original order, that are not
        in ``stopwords_list`` and consist only of alphabetic characters (so
        tokens containing digits or punctuation are dropped).
    """
    # word_tokenize only accepts text, so short-circuit on anything else
    # instead of raising part-way through a long run.
    if not isinstance(narrative, str):
        return []

    # Set membership is constant-time; scanning the list for every token is not.
    stopword_set = set(stopwords_list)

    # Lower-case each token once and reuse it for both the filter and the result.
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(narrative))

    return [
        token
        for token in lowered_tokens
        if token not in stopword_set and token.isalpha()
    ]


# function to concat words (used in function below)
def concat_words(list_of_words):
    """Join words into a single space-separated string.

    Args:
        list_of_words: Iterable of strings to join, in order.

    Returns:
        The words separated by single spaces, with leading and trailing
        whitespace stripped. An empty input gives ``''``.
    """
    # str.join builds the result in one pass; the final strip keeps the same
    # output as before when the first or last word is empty or whitespace.
    return ' '.join(list_of_words).strip()

# function to lemmatize words and merge each complaint into a single space-separated string

lemm = WordNetLemmatizer()

def make_lemma_and_concat(list_of_words):
    """Lemmatize each word and merge the results into one string.

    Args:
        list_of_words: List of word tokens; NaN entries are skipped.

    Returns:
        The lemmatized words joined by single spaces (via ``concat_words``).
    """
    # Drop NaN entries by value: an identity test against np.nan misses NaNs
    # that are not that exact object (e.g. float('nan')).
    lemmatized_list = [
        lemm.lemmatize(word)
        for word in list_of_words
        if pd.notna(word)
    ]

    # make the list into a single string with the words separated by ' '
    return concat_words(lemmatized_list)

# Prepare dataframe for modeling

In [8]:
# Collect the cleaned narratives in a list and assign the column once.
# Writing through df['narrative'].loc[i] is chained assignment: pandas warns
# about it and, with Copy-on-Write, the write never reaches `df`.
processed_narratives = []
for i, raw_narrative in enumerate(df['narrative']):
    processed_narr = process_narrative(raw_narrative)
    processed_narratives.append(make_lemma_and_concat(processed_narr))
    # progress marker every 3000 rows
    if i % 3000 == 0:
        print(f'Finished line number {i}')
df['narrative'] = processed_narratives
df.head()

Finished line number 0
Finished line number 3000
Finished line number 6000
Finished line number 9000
Finished line number 12000
Finished line number 15000
Finished line number 18000
Finished line number 21000
Finished line number 24000
Finished line number 27000
Finished line number 30000
Finished line number 33000
Finished line number 36000
Finished line number 39000
Finished line number 42000
Finished line number 45000
Finished line number 48000
Finished line number 51000
Finished line number 54000
Finished line number 57000
Finished line number 60000
Finished line number 63000
Finished line number 66000
Finished line number 69000
Finished line number 72000
Finished line number 75000
Finished line number 78000
Finished line number 81000
Finished line number 84000
Finished line number 87000
Finished line number 90000
Finished line number 93000
Finished line number 96000
Finished line number 99000
Finished line number 102000
Finished line number 105000
Finished line number 108000
Finis

Unnamed: 0,product,narrative
0,credit_card,purchase order day shipping amount receive pro...
1,credit_card,forwarded message date tue subject please inve...
2,retail_banking,forwarded message cc sent friday pdt subject f...
3,credit_reporting,payment history missing credit report speciali...
4,credit_reporting,payment history missing credit report made mis...


# Save dataframe as csv for use in other notebooks

In [9]:
# Write the processed dataframe to disk; with to_csv's defaults the index is
# saved as well, as an unnamed first column.
processed_csv_path = '../project_data/complaints_processed.csv'
df.to_csv(processed_csv_path)