In [1]:
import pandas as pd
import requests

# Get data from API

## Check dates of original dataset

In [2]:
# Load the previously collected complaints so their date range can be inspected.
df_orig = pd.read_csv('../project_data/complaints_1.csv')

In [3]:
# Inspect the Python type of the first 'Date received' value.
type(df_orig['Date received'][0])

str

Dates are in string format. Convert to datetime.

In [4]:
# Reassign instead of using inplace=True: the rename returns a new frame, so
# the cell reads as "new value from old value".
df_orig = df_orig.rename(columns={'Date received': 'date'})

# Parse the date strings into pandas datetimes.
df_orig['date'] = pd.to_datetime(df_orig['date'])

In [5]:
# Earliest complaint date in the original dataset.
min(df_orig['date'])

Timestamp('2020-03-17 00:00:00')

In [6]:
# Latest complaint date in the original dataset.
max(df_orig['date'])

Timestamp('2021-03-01 00:00:00')

The most recent complaint in the original dataset was received on 2021-03-01, so the API query below starts from 2021-03-02.

## Use API

Submit GET request with parameters

In [31]:
# CFPB consumer-complaint search endpoint.
url = 'https://www.consumerfinance.gov/data-research/consumer-complaints/search/api/v1/'

# Ask only for complaints received from 2021-03-02 onward that include a
# written narrative, limited to 20 results.
parameters = {'date_received_min': '2021-03-02',
              'has_narrative': True,
              'size': 20
}
# Without a timeout, requests waits indefinitely if the server never answers.
r = requests.get(url, params=parameters, timeout=30)
# Stop here on an HTTP error status rather than failing later while parsing the body.
r.raise_for_status()

Parse the JSON response into a Python dictionary

In [32]:
# Decode the JSON body of the response.
data = r.json()

In [47]:
# Look up the list of search hits once instead of on every iteration
hits = data['hits']['hits']

# Gather the product label and the complaint text of every hit
product_list = [hit['_source']['product'] for hit in hits]
narrative_list = [hit['_source']['complaint_what_happened'] for hit in hits]

# Column-oriented dictionary: one list per field
new_dict = {'product': product_list, 'narrative': narrative_list}

In [50]:
# Build a DataFrame with one column per key of new_dict ('product', 'narrative').
df = pd.DataFrame.from_dict(new_dict)

# Clean Data

## Consolidate Products

In [51]:
# consolidate products into the classes used so far
product_classes = {
    'Credit reporting, credit repair services, or other personal consumer reports': 'credit_reporting',
    'Debt collection': 'debt_collection',
    'Credit card or prepaid card': 'credit_card',
    'Mortgage': 'mortgages_and_loans',
    'Checking or savings account': 'retail_banking',
    'Money transfer, virtual currency, or money service': 'retail_banking',
    'Vehicle loan or lease': 'mortgages_and_loans',
    'Payday loan, title loan, or personal loan': 'mortgages_and_loans',
    'Student loan': 'mortgages_and_loans',
}

# Assign the result back to the column. Calling replace(..., inplace=True) on
# df['product'] is chained assignment: pandas warns about it, and with
# Copy-on-Write enabled it leaves df unchanged.
df['product'] = df['product'].replace(product_classes)

In [52]:
# Preview the first rows after consolidating the product labels.
df.head()

Unnamed: 0,product,narrative
0,credit_reporting,I have determined that there are some creditor...
1,credit_reporting,"On XX/XX/2020, I initially bought some great g..."
2,credit_reporting,I sent a letter plus I did a dispute but they ...
3,retail_banking,Wells Fargo failed to provide outstanding reli...
4,credit_reporting,The department of education is now showing the...


In [53]:
# List the distinct product labels that remain after consolidation.
df['product'].unique()

array(['credit_reporting', 'retail_banking', 'mortgages_and_loans',
       'debt_collection', 'credit_card'], dtype=object)

## Loop through narratives to remove stopwords, tokenize and lemmatize

In [58]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [55]:
# Tokens to drop besides NLTK's English stopwords and single punctuation
# characters: multi-character quote/ellipsis/dash tokens, plus 'xxxx'
# (presumably the redaction placeholder in the narratives; confirm against the raw text).
extra_tokens = ["''", '""', '...', '``', '--', 'xxxx']
stopwords_list = stopwords.words('english') + list(string.punctuation) + extra_tokens

In [56]:
# function to tokenize data and remove stopwords
def process_narrative(narrative):
    """Tokenize a narrative and keep lowercase, alphabetic, non-stopword tokens.

    Args:
        narrative: Complaint text to tokenize.

    Returns:
        list of str: Lowercased tokens, in their original order, that are not
        in the module-level ``stopwords_list`` and consist only of letters.
    """
    kept_tokens = []
    for raw_token in nltk.word_tokenize(narrative):
        token = raw_token.lower()
        # isalpha() rejects any token that contains a digit or punctuation
        if token not in stopwords_list and token.isalpha():
            kept_tokens.append(token)
    return kept_tokens


# function to concat words (used in function below)
def concat_words(list_of_words):
    """Join words into a single space-separated string.

    Args:
        list_of_words: Iterable of strings.

    Returns:
        str: The words separated by single spaces, with leading and trailing
        whitespace stripped; an empty string for empty input.

    Raises:
        TypeError: If any element is not a string.
    """
    # str.join builds the result in one pass; the strip keeps the original
    # behavior of trimming whitespace at both ends.
    return ' '.join(list_of_words).strip()

# function to lemmatize words and merge each complaint into a single space-separated string

lemm = WordNetLemmatizer()

def make_lemma_and_concat(list_of_words):
    """Lemmatize each word and merge the results into one string.

    Args:
        list_of_words: Iterable of word tokens.

    Returns:
        str: The lemmatized words as returned by ``concat_words``.
    """
    # Identity test: only the np.nan object itself is dropped
    lemmas = [lemm.lemmatize(word) for word in list_of_words if word is not np.nan]

    # Merge the lemmas into one string
    return concat_words(lemmas)

In [59]:
# Collect the cleaned narratives and assign the column once.
# Writing through df['narrative'].loc[i] is chained assignment (pandas warns,
# and with Copy-on-Write df is not updated), and it assumes the index labels
# are 0..len(df)-1.
cleaned_narratives = []
for i, narrative in enumerate(df['narrative']):
    processed_narr = process_narrative(narrative)
    cleaned_narratives.append(make_lemma_and_concat(processed_narr))
    # Progress message every 3000 rows
    if i % 3000 == 0:
        print(f'Finished line number {i}')

# A plain list is assigned by position, so the row order is preserved.
df['narrative'] = cleaned_narratives
df.head()

Finished line number 0


Unnamed: 0,product,narrative
0,credit_reporting,determined creditor listed credit report belon...
1,credit_reporting,initially bought great gear love thought paid ...
2,credit_reporting,sent letter plus dispute keep putting address ...
3,retail_banking,well fargo failed provide outstanding relief a...
4,credit_reporting,department education showing account open past...


In [60]:
# Display the frame with the processed narratives.
df

Unnamed: 0,product,narrative
0,credit_reporting,determined creditor listed credit report belon...
1,credit_reporting,initially bought great gear love thought paid ...
2,credit_reporting,sent letter plus dispute keep putting address ...
3,retail_banking,well fargo failed provide outstanding relief a...
4,credit_reporting,department education showing account open past...
5,credit_reporting,well fargo continuously report late payment ac...
6,retail_banking,bank account open knowledge opening fraudulent...
7,credit_reporting,signed paypal credit given line credit worth t...
8,mortgages_and_loans,cfpb filing official complaint regarding mortg...
9,credit_reporting,account paid sent keep reporting owe called ge...
