### Data Exploration

In [1]:
import zipfile
import pandas as pd

# Location of the zipped dataset and the name of the CSV member stored inside it
zip_path = '../data_comp.zip'
csv_filename = 'consumer_complaints.csv'

# Read the CSV member straight from the archive through a file-like handle;
# both the archive and the member handle are closed when the block exits
with zipfile.ZipFile(zip_path) as archive, archive.open(csv_filename) as csv_member:
    df = pd.read_csv(csv_member)

# Preview the first few rows of the dataframe
df.head()

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [2]:
# Print a structural summary of the dataframe (index range, column names,
# non-null counts, dtypes and memory usage) so that columns with missing
# values can be spotted before any cleaning is done
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162421 entries, 0 to 162420
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  162421 non-null  int64 
 1   product     162421 non-null  object
 2   narrative   162411 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.7+ MB


In [3]:
# Drop the 'Unnamed: 0' column, which is not needed for the analysis.
# errors='ignore' keeps this cell safe to re-run: `df` is rebound here, so
# without it a second execution raises KeyError because the column is
# already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Drop rows whose 'narrative' value is missing
df = df.dropna(subset=['narrative'])

# Re-print the summary to confirm the remaining columns and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162411 entries, 0 to 162420
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   product    162411 non-null  object
 1   narrative  162411 non-null  object
dtypes: object(2)
memory usage: 3.7+ MB


In [4]:
# Flag every row that repeats an earlier (product, narrative) pair,
# then count how many rows were flagged
repeat_mask = df.duplicated(subset=['product', 'narrative'])
duplicates = repeat_mask.sum()

duplicates

37735

In [5]:
# Show the 'narrative' text untruncated (this option stays in effect for later cells too)
pd.set_option('display.max_colwidth', None)

# Columns that together define what counts as a duplicate
key_columns = ['product', 'narrative']

# keep=False marks every member of a duplicate group, including the first occurrence
in_duplicate_group = df.duplicated(subset=key_columns, keep=False)
duplicate_rows = df.loc[in_duplicate_group]

# Display the duplicated rows, restricted to the key columns
duplicate_rows[key_columns]

Unnamed: 0,product,narrative
31,credit_reporting,name last four s account number writing dispute late payment reported impacted hospital long time lost job spoke advised payment deferred get credit alert reported past due violation care act late payment need removed pay account full correct credit report remove erroinious late fee told provided protection care act failed adhere offered one called received written notice term deferred plan rely rep told thought protected need make right comply care act fix credit reporting remove late fee call done account negative told going forebarace would put account credit limit illegal credit ruined udapp violation care act violation
32,credit_reporting,name last four s account number writing dispute late payment reported impacted hospital long time lost job spoke advised payment deferred get credit alert reported past due violation care act late payment need removed pay account full correct credit report remove erroinious late fee told provided protection care act failed adhere offered one called received written notice term deferred plan rely rep told thought protected need make right comply care act fix credit reporting remove late fee call done account negative told going forebarace would put account credit limit illegal credit ruined udapp violation care act violation
42,credit_reporting,open account acct opened balance account acct opened balance account closed account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account
44,credit_reporting,open account acct opened balance account acct opened balance account closed account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account acct opened balance account
68,credit_reporting,hello name trying reach several time get problem fixed receipted response far wrong adresses employment information exist credit account year addition also derogatory information appear credit report belong business address employment information need removed info priority expedite reported ar ar
...,...,...
162415,debt_collection,name
162416,debt_collection,name
162417,credit_card,name
162418,debt_collection,name
