# Consumer data preprocessing

# 1. Loading the data

In [1]:
import pandas as pd
import numpy as np
# Read the raw complaints export into memory; the file is expected to sit
# next to this notebook.
df = pd.read_csv(filepath_or_buffer='Consumer_Complaints.csv')
# Preview the first five rows to get a feel for the columns.
df.head(5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,03/12/2014,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,M&T BANK CORPORATION,MI,48382,,,Referral,03/17/2014,Closed with explanation,Yes,No,759217
1,10/01/2016,Credit reporting,,Incorrect information on credit report,Account status,I have outdated information on my credit repor...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AL,352XX,,Consent provided,Web,10/05/2016,Closed with explanation,Yes,No,2141773
2,10/17/2016,Consumer Loan,Vehicle loan,Managing the loan or lease,,I purchased a new car on XXXX XXXX. The car de...,,"CITIZENS FINANCIAL GROUP, INC.",PA,177XX,Older American,Consent provided,Web,10/20/2016,Closed with explanation,Yes,No,2163100
3,06/08/2014,Credit card,,Bankruptcy,,,,AMERICAN EXPRESS COMPANY,ID,83854,Older American,,Web,06/10/2014,Closed with explanation,Yes,Yes,885638
4,09/13/2014,Debt collection,Credit card,Communication tactics,Frequent or repeated calls,,,"CITIBANK, N.A.",VA,23233,,,Web,09/13/2014,Closed with explanation,Yes,Yes,1027760


Let's look at the shape of the data. Many of the rows have an empty `Consumer complaint narrative`, so let's drop those rows and check the shape again.

In [2]:
# (rows, columns) of the raw data, before any filtering.
df.shape

(1009102, 18)

In [3]:
# Keep only complaints that actually come with a free-text narrative;
# rows where that column is missing are discarded.
df = df.dropna(subset=['Consumer complaint narrative'])

In [4]:
# (rows, columns) after dropping the rows without a narrative.
df.shape

(270363, 18)

In [5]:
# Per-column dtypes and non-null counts for the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270363 entries, 1 to 1009101
Data columns (total 18 columns):
Date received                   270363 non-null object
Product                         270363 non-null object
Sub-product                     218180 non-null object
Issue                           270363 non-null object
Sub-issue                       172634 non-null object
Consumer complaint narrative    270363 non-null object
Company public response         131924 non-null object
Company                         270363 non-null object
State                           266461 non-null object
ZIP code                        265162 non-null object
Tags                            46203 non-null object
Consumer consent provided?      270363 non-null object
Submitted via                   270363 non-null object
Date sent to company            270363 non-null object
Company response to consumer    270362 non-null object
Timely response?                270363 non-null object
Consumer 

In [6]:
# Only the label (Product) and the free-text narrative are carried forward;
# every other column is left behind in `df`.
col = [
    'Product',
    'Consumer complaint narrative',
]
df_smaller = df.loc[:, col]

In [7]:
# Confirm that only the two selected columns remain and check their non-null counts.
df_smaller.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270363 entries, 1 to 1009101
Data columns (total 2 columns):
Product                         270363 non-null object
Consumer complaint narrative    270363 non-null object
dtypes: object(2)
memory usage: 6.2+ MB


In [8]:
# Number of complaints per product category (most frequent first).
df_smaller["Product"].value_counts()

Debt collection                                                                 61587
Credit reporting, credit repair services, or other personal consumer reports    45971
Mortgage                                                                        43104
Credit reporting                                                                31593
Credit card                                                                     18842
Student loan                                                                    16404
Bank account or service                                                         14887
Credit card or prepaid card                                                      9980
Consumer Loan                                                                    9474
Checking or savings account                                                      6077
Money transfer, virtual currency, or money service                               2831
Vehicle loan or lease                                 

In [39]:
import random

# Seed used for every random draw in this cell.
SEED = 123

# Kept from the original cell. Note that this only seeds Python's built-in
# `random` module; pandas' `DataFrame.sample` does NOT use it, which is why
# `random_state` is passed explicitly below. Without that, the sample changes
# on every run.
random.seed(SEED)


def _take_product(frame, product, n=None, seed=SEED):
    """Return the rows of `frame` whose `Product` column equals `product`.

    Parameters
    ----------
    frame : DataFrame with a `Product` column.
    product : exact product label to select.
    n : number of rows to draw at random (without replacement). When None,
        all matching rows are returned in their original order.
    seed : value forwarded to `DataFrame.sample(random_state=...)` so the
        draw is reproducible.

    Raises
    ------
    ValueError
        Raised by `DataFrame.sample` when `n` exceeds the number of matching rows.
    """
    rows = frame.loc[frame["Product"] == product, :]
    if n is None:
        return rows
    return rows.sample(n=n, random_state=seed)


# Sample sizes add up, together with the two unsampled products, to 60,000 rows.
df_part1 = _take_product(df_smaller, "Student loan", 11404)
df_part2 = _take_product(df_smaller, "Credit card", 9540)
df_part3 = _take_product(df_smaller, "Bank account or service", 8309)
# The next two products are kept in full (no down-sampling).
df_part4 = _take_product(df_smaller, "Consumer Loan")
df_part5 = _take_product(df_smaller, "Checking or savings account")
df_part6 = _take_product(df_smaller, "Mortgage", 8332)
df_part7 = _take_product(df_smaller, "Credit reporting", 6864)

In [40]:
# Stack the seven per-product frames on top of each other (row-wise);
# the original row labels are carried over unchanged.
df_final = pd.concat(
    (
        df_part1,
        df_part2,
        df_part3,
        df_part4,
        df_part5,
        df_part6,
        df_part7,
    ),
    axis="index",
)

In [41]:
# Class distribution of the combined frame: rows per product.
df_final["Product"].value_counts()

Student loan                   11404
Credit card                     9540
Consumer Loan                   9474
Mortgage                        8332
Bank account or service         8309
Credit reporting                6864
Checking or savings account     6077
Name: Product, dtype: int64

In [42]:
# Check the total row count and the non-null counts of both columns before exporting.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60000 entries, 856553 to 788280
Data columns (total 2 columns):
Product                         60000 non-null object
Consumer complaint narrative    60000 non-null object
dtypes: object(2)
memory usage: 1.4+ MB


In [43]:
# Write the combined frame to disk; the row index is not written to the file.
df_final.to_csv(
    path_or_buf='Bank_complaints.csv',
    index=False,
)

# SOURCES

https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Consumer_complaints.ipynb

https://catalog.data.gov/dataset/consumer-complaint-database