# This is a learning project. It aims to have hands on experience in NLP

### Trying to build a model that predicts which Product a customer complaint concerns
### Will work with [Customer Complaint Database](https://www.kaggle.com/datasets/selener/consumer-complaint-database) from Kaggle

In [1]:
import pandas as pd
import numpy as np
import os
import json
#import shutil

#import tensorflow as tf
#import tensorflow_hub as hub
#import tensorflow_text as text

#from official.nlp import optimization  # to create AdamW optimizer

#import matplotlib.pyplot as plt

In [2]:
# Share of NaN values above which a column is stored sparsely; also the
# distinct-value ratio below which an object column becomes categorical.
SPARSITY_THRESHOLD = 0.5


def _smallest_fitting_dtype(min_val, max_val, candidates):
    """Return the first dtype in *candidates* whose range covers [min_val, max_val], else None."""
    for candidate, limits in candidates:
        if limits.min <= min_val and max_val <= limits.max:
            return candidate
    return None


def optimize(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce the memory footprint of *df* by converting columns to leaner dtypes.

    The frame is modified in place and the same object is returned.

    * ``float64`` / ``int64`` columns: a column whose NaN share exceeds
      ``SPARSITY_THRESHOLD`` becomes a sparse array. Any other column is
      downcast to the smallest of float16/float32 (resp. int8/int16/int32)
      whose range covers both the column minimum and maximum; a column that
      fits none of them is left untouched. Float targets are chosen by range
      only, so precision may be reduced.
    * ``object`` columns: a mostly-NaN column becomes a sparse array.
      Otherwise the column is parsed as datetimes; if parsing raises
      ``ValueError`` and the ratio of distinct values to rows is below
      ``SPARSITY_THRESHOLD``, the column is converted to ``category``.
    """
    # Candidates are ordered from narrowest to widest so the first fit wins.
    float_types = [(t, np.finfo(t)) for t in (np.float16, np.float32)]
    int_types = [(t, np.iinfo(t)) for t in (np.int8, np.int16, np.int32)]

    for dtype, candidates in (('float64', float_types), ('int64', int_types)):
        selected_columns = df.select_dtypes(include=[dtype])
        # Both ends of the value range matter: checking only the maximum lets
        # large negative values wrap around (ints) or turn into -inf (floats).
        min_values = selected_columns.min()
        max_values = selected_columns.max()

        for col in selected_columns.columns:
            if df[col].isna().mean() > SPARSITY_THRESHOLD:  # If mostly NaN values
                df[col] = pd.arrays.SparseArray(df[col])
                continue

            min_val, max_val = min_values[col], max_values[col]
            if pd.isna(max_val):  # skip columns with only NaNs
                continue

            target = _smallest_fitting_dtype(min_val, max_val, candidates)
            if target is not None:
                df[col] = df[col].astype(target)

    for col in df.select_dtypes(include=['object']).columns:
        if df[col].isna().mean() > SPARSITY_THRESHOLD:  # If mostly NaN values
            df[col] = pd.arrays.SparseArray(df[col])
            continue

        try:
            df[col] = pd.to_datetime(df[col])
        except ValueError:
            # Not a date column: use a categorical when values repeat often enough.
            num_unique_values = len(df[col].unique())
            num_total_values = len(df[col])
            if num_unique_values / num_total_values < SPARSITY_THRESHOLD:
                df[col] = df[col].astype('category')

    return df


### The ``optimize()`` function is taken from the notebook [memory-usage-optimization-poc.ipynb](https://github.com/harunugurlu/mobile-action-internship/blob/main/Data%20Science/PoC/Memory%20Usage%20Optimization/memory-usage-optimization-poc.ipynb)

In [3]:
# Path to the raw complaints CSV (despite the name, this is a file path, not a directory).
data_dir = 'data/rows.csv'

In [4]:
# Load the whole complaints CSV into a DataFrame.
customer_tickets = pd.read_csv(data_dir)

  customer_tickets = pd.read_csv(data_dir)


In [5]:
# Preview the first rows of the raw data.
customer_tickets.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,05/10/2019,Checking or savings account,Checking account,Managing an account,Problem using a debit or ATM card,,,NAVY FEDERAL CREDIT UNION,FL,328XX,Older American,,Web,05/10/2019,In progress,Yes,,3238275
1,05/10/2019,Checking or savings account,Other banking product or service,Managing an account,Deposits and withdrawals,,,BOEING EMPLOYEES CREDIT UNION,WA,98204,,,Referral,05/10/2019,Closed with explanation,Yes,,3238228
2,05/10/2019,Debt collection,Payday loan debt,Communication tactics,Frequent or repeated calls,,,CURO Intermediate Holdings,TX,751XX,,,Web,05/10/2019,Closed with explanation,Yes,,3237964
3,05/10/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Old information reappears or never goes away,,,Ad Astra Recovery Services Inc,LA,708XX,,,Web,05/10/2019,Closed with explanation,Yes,,3238479
4,05/10/2019,Checking or savings account,Checking account,Managing an account,Banking errors,,,ALLY FINANCIAL INC.,AZ,85205,,,Postal mail,05/10/2019,In progress,Yes,,3238460


Will use the ``Consumer complaint narrative`` to predict the ``Product`` 

In [6]:
# (rows, columns) of the raw data.
print(customer_tickets.shape)

(1282355, 18)


In [7]:
# memory_usage="deep" also counts the memory held by the Python objects in object columns.
customer_tickets.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1282355 entries, 0 to 1282354
Data columns (total 18 columns):
 #   Column                        Non-Null Count    Dtype 
---  ------                        --------------    ----- 
 0   Date received                 1282355 non-null  object
 1   Product                       1282355 non-null  object
 2   Sub-product                   1047189 non-null  object
 3   Issue                         1282355 non-null  object
 4   Sub-issue                     751169 non-null   object
 5   Consumer complaint narrative  383564 non-null   object
 6   Company public response       449082 non-null   object
 7   Company                       1282355 non-null  object
 8   State                         1262955 non-null  object
 9   ZIP code                      1167057 non-null  object
 10  Tags                          175643 non-null   object
 11  Consumer consent provided?    690654 non-null   object
 12  Submitted via                 1282355 non-

The dataset is very large (1m+ rows, 1.7gb memory usage), let's shrink it down

In [8]:
# optimize() converts the columns of the frame it is given in place and returns that same frame.
customer_tickets = optimize(customer_tickets)

In [9]:
# Re-check dtypes and memory usage after the conversion.
customer_tickets.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1282355 entries, 0 to 1282354
Data columns (total 18 columns):
 #   Column                        Non-Null Count    Dtype              
---  ------                        --------------    -----              
 0   Date received                 1282355 non-null  datetime64[ns]     
 1   Product                       1282355 non-null  category           
 2   Sub-product                   1047189 non-null  category           
 3   Issue                         1282355 non-null  category           
 4   Sub-issue                     751169 non-null   category           
 5   Consumer complaint narrative  383564 non-null   Sparse[object, nan]
 6   Company public response       449082 non-null   Sparse[object, nan]
 7   Company                       1282355 non-null  category           
 8   State                         1262955 non-null  category           
 9   ZIP code                      1167057 non-null  category           
 10  Tags  

Inspecting the features of interest (``Consumer complaint narrative`` and ``Product``)

In [10]:
# count() ignores missing values, so this is the number of complaints that have a narrative.
print("Num of non na ticket descriptions", customer_tickets['Consumer complaint narrative'].count())
# Number of distinct target classes.
print("Num of unique products:", customer_tickets['Product'].nunique())

Num of non na ticket descriptions 383564
Num of unique products: 18


We have over __1m__ rows, but fewer than half of them include a consumer complaint narrative. Luckily, this number is still large (380k+) and adequate for use with __BERT__, so we can simply drop the rows that have no narrative. Before doing that, however, it is good practice to check whether the missing ``Consumer complaint narrative`` values are correlated with their respective ``Product``.

In [11]:
# Getting the rows with NaN values for 'Consumer complaint narrative'
# (boolean-mask selection; the original row index labels are kept).
invalid_tickets = customer_tickets[customer_tickets['Consumer complaint narrative'].isnull()]

In [12]:
# How many rows have no narrative.
invalid_tickets.shape

(898791, 18)

In [13]:
# Keep only the two columns of interest; the narrative column is all-NaN here by construction.
products_complaints = invalid_tickets[['Product', 'Consumer complaint narrative']]

In [14]:
# Distribution of products among the complaints without a narrative.
products_complaints['Product'].value_counts()

Mortgage                                                                        225111
Debt collection                                                                 158163
Credit reporting, credit repair services, or other personal consumer reports    133600
Credit reporting                                                                108844
Bank account or service                                                          71321
Credit card                                                                      70352
Student loan                                                                     29875
Checking or savings account                                                      27760
Credit card or prepaid card                                                      26274
Consumer Loan                                                                    22131
Vehicle loan or lease                                                             5632
Money transfer, virtual currency, or money 

There seems to be no correlation between missing narrative values and product values: the missing narratives are not concentrated in one particular product. Strictly speaking, we should also take other features such as ``Sub-product``, ``Issue``, ``Sub-issue`` etc. into account, but we ignore them here for simplicity.

Get the "valid" tickets with consumer complaint narratives.

In [15]:
# Rows that do have a narrative. .copy() makes this an independent frame, so the
# column assignments done on it later do not act on a slice of customer_tickets.
valid_tickets = customer_tickets[customer_tickets['Consumer complaint narrative'].notna()].copy()

In [16]:
# Size and first rows of the tickets that have a narrative.
print(valid_tickets.shape)
valid_tickets.head()

(383564, 18)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
29904,2019-03-23,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,The Summer of XX/XX/2018 I was denied a mortga...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",IL,,,Consent provided,Web,2019-03-23,Closed with explanation,Yes,,3189109
30629,2019-03-22,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,There are many mistakes appear in my report wi...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",VA,220XX,,Consent provided,Web,2019-03-22,Closed with explanation,Yes,,3187982
30735,2019-03-22,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,There are many mistakes appear in my report wi...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",TX,770XX,,Consent provided,Web,2019-03-22,Closed with explanation,Yes,,3187954
30795,2019-03-22,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,There are many mistakes appear in my report wi...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",TX,787XX,,Consent provided,Web,2019-03-22,Closed with explanation,Yes,,3188091
30807,2019-03-22,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account information incorrect,There are many mistakes appear in my report wi...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",CA,951XX,,Consent provided,Web,2019-03-22,Closed with explanation,Yes,,3188119


In [17]:
# Dtypes and deep memory usage of the filtered frame.
valid_tickets.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383564 entries, 29904 to 912553
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype              
---  ------                        --------------   -----              
 0   Date received                 383564 non-null  datetime64[ns]     
 1   Product                       383564 non-null  category           
 2   Sub-product                   331391 non-null  category           
 3   Issue                         383564 non-null  category           
 4   Sub-issue                     269891 non-null  category           
 5   Consumer complaint narrative  383564 non-null  Sparse[object, nan]
 6   Company public response       182680 non-null  Sparse[object, nan]
 7   Company                       383564 non-null  category           
 8   State                         382178 non-null  category           
 9   ZIP code                      294787 non-null  category           
 10  Tags            

We converted the dtype of ``Consumer complaint narrative`` to a Sparse array because it had lots of missing values. But in our ``valid_tickets`` DataFrame, ``Consumer complaint narrative`` no longer has any missing values. Therefore, we should cast it back to a dense array.

In [18]:
# Replace the column outright with its dense form. Plain column assignment swaps in the
# new array, whereas `.loc[:, col] = ...` may try to write into the existing Sparse
# array in place (pandas >= 2.0) and so keep the Sparse dtype.
valid_tickets['Consumer complaint narrative'] = valid_tickets['Consumer complaint narrative'].sparse.to_dense()

Should check the other Sparse array columns as well.

In [19]:
# Confirm the narrative column's dtype and see which columns are still Sparse.
valid_tickets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383564 entries, 29904 to 912553
Data columns (total 18 columns):
 #   Column                        Non-Null Count   Dtype              
---  ------                        --------------   -----              
 0   Date received                 383564 non-null  datetime64[ns]     
 1   Product                       383564 non-null  category           
 2   Sub-product                   331391 non-null  category           
 3   Issue                         383564 non-null  category           
 4   Sub-issue                     269891 non-null  category           
 5   Consumer complaint narrative  383564 non-null  object             
 6   Company public response       182680 non-null  Sparse[object, nan]
 7   Company                       383564 non-null  category           
 8   State                         382178 non-null  category           
 9   ZIP code                      294787 non-null  category           
 10  Tags            

Finally, we can start training BERT

#### Bert expects the data in a certain form:
- __Column 0:__ An ID for the row
- __Column 1:__ The label for the row (should be an int — class labels: 0,1,2,3 etc) This column represents the class or category of the ticket (the "target" that the model is trying to predict).
- __Column 2:__ A column of the same letter for all rows — this is a throw-away column that we need to include because BERT expects it.  It doesn't have any impact on the task.
- __Column 3:__ The text examples we want to classify. This is the data that BERT will use to learn how to classify tickets.

It also expects the data to be in a TSV (tab-separated values) file.

In [20]:
# Positions 0..n-1, one per valid ticket.
# NOTE(review): not referenced by any later cell visible here — confirm whether it is still needed.
indices = range(valid_tickets['Consumer complaint narrative'].size)

In [21]:
# Getting product and complaint pairs
# NOTE(review): only previewed in the visible cells; the BERT frame is built from valid_tickets directly.
product_complaint_pairs = valid_tickets[['Product', 'Consumer complaint narrative']]

In [22]:
# Preview the (label, text) pairs.
product_complaint_pairs.head()

Unnamed: 0,Product,Consumer complaint narrative
29904,"Credit reporting, credit repair services, or o...",The Summer of XX/XX/2018 I was denied a mortga...
30629,"Credit reporting, credit repair services, or o...",There are many mistakes appear in my report wi...
30735,"Credit reporting, credit repair services, or o...",There are many mistakes appear in my report wi...
30795,"Credit reporting, credit repair services, or o...",There are many mistakes appear in my report wi...
30807,"Credit reporting, credit repair services, or o...",There are many mistakes appear in my report wi...


Mapping the Products to integers.

In [23]:
from sklearn.preprocessing import LabelEncoder

# Learn the Product -> integer mapping first; `le` keeps the fitted encoder
# so the integer labels can be mapped back to product names later.
le = LabelEncoder().fit(valid_tickets['Product'])

# Apply the learned mapping and store the integer label next to the original one.
valid_tickets['Product_encoded'] = le.transform(valid_tickets['Product'])

In [24]:
# Columns in the order BERT expects: row id, integer label, constant filler, text.
# The scalar 'a' is broadcast to every row when the DataFrame is built from this dict.
data = {'ID': range(len(valid_tickets)), 'Product': valid_tickets['Product_encoded'], 'Throw away': 'a', 'Complaint': valid_tickets['Consumer complaint narrative']}

In [25]:
# Assemble the BERT input frame from the column dict.
df_bert = pd.DataFrame(data=data)

In [26]:
# Preview the BERT input frame.
df_bert.head()

Unnamed: 0,ID,Product,Throw away,Complaint
29904,0,6,a,The Summer of XX/XX/2018 I was denied a mortga...
30629,1,6,a,There are many mistakes appear in my report wi...
30735,2,6,a,There are many mistakes appear in my report wi...
30795,3,6,a,There are many mistakes appear in my report wi...
30807,4,6,a,There are many mistakes appear in my report wi...


We are going to need to create three separate files, called ``train.tsv`` ``dev.tsv`` and ``test.tsv``.  In ``train.tsv`` and ``dev.tsv`` we will have all the 4 columns while in ``test.tsv`` we will only keep 2 of the columns, i.e., id for the row and the text we want to classify.

In [36]:
# 80% for train and 20% for dev (validation)
from sklearn.model_selection import train_test_split

# A fixed seed makes the split — and every file derived from it — reproducible across runs.
df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.2, random_state=42)

In [40]:
# Dtypes and non-null counts of each split.
df_bert_train.info()
df_bert_dev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306851 entries, 535070 to 727636
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   ID          306851 non-null  int64 
 1   Product     306851 non-null  int32 
 2   Throw away  306851 non-null  object
 3   Complaint   306851 non-null  object
dtypes: int32(1), int64(1), object(2)
memory usage: 10.5+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 76713 entries, 791545 to 549509
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          76713 non-null  int64 
 1   Product     76713 non-null  int32 
 2   Throw away  76713 non-null  object
 3   Complaint   76713 non-null  object
dtypes: int32(1), int64(1), object(2)
memory usage: 2.6+ MB


In [45]:
# Size and first rows of the training split.
print(df_bert_train.shape)
df_bert_train.head()

(306851, 4)


Unnamed: 0,ID,Product,Throw away,Complaint
535070,228705,2,a,In XXXX of 2016 my vehicle was total lost in a...
258345,96195,7,a,"Asked for verification of debt, received paper..."
306580,118235,6,a,I have reached out to All credit bureaus some ...
417036,171469,7,a,This complaint is against XXXX XXXX XXXX locat...
478547,201596,6,a,I have some hard inquiries on my XXXX and ...


In [44]:
# Size and first rows of the dev split.
print(df_bert_dev.shape)
df_bert_dev.head()

(76713, 4)


Unnamed: 0,ID,Product,Throw away,Complaint
791545,334454,3,a,I once had a credit card from capital one whic...
475636,200161,7,a,The company claims i owed them and i have no k...
684613,291773,0,a,I accidently forgot to transfer the money I wa...
348325,138423,7,a,XXXX XXXX XXXX XXXX is attempting to collect a...
385682,156318,10,a,I could not access my account online and the p...


In [46]:
# Dump the training split with to_csv defaults — presumably for manual inspection.
df_bert_train.to_csv('out.csv')

In [51]:
# Test frame for BERT: just the row ID and the complaint text, copied from df_bert.
# NOTE(review): this takes every row of df_bert, so it overlaps the train/dev splits.
df_bert_test = df_bert[['ID', 'Complaint']].copy()

In [52]:
# Size and first rows of the test frame.
print(df_bert_test.shape)
df_bert_test.head()

(383564, 2)


Unnamed: 0,ID,Complaint
29904,0,The Summer of XX/XX/2018 I was denied a mortga...
30629,1,There are many mistakes appear in my report wi...
30735,2,There are many mistakes appear in my report wi...
30795,3,There are many mistakes appear in my report wi...
30807,4,There are many mistakes appear in my report wi...


In [47]:
import csv

# Sanity check of the exported training file: report every row with fewer than 4 fields.
# newline='' hands line-ending handling to the csv module, as its documentation requires.
# NOTE(review): absolute, machine-specific path — confirm it is the file the export step writes.
with open('C:/Users/harun/bert/data/train.tsv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    for i, line in enumerate(reader):
        if len(line) < 4:
            print(f"Line {i} has less than 4 values: {line}")


In [48]:
import csv

# Stricter check of the exported training file: report every row that does not have
# exactly 4 fields, and echo the first 10 rows for manual inspection.
# newline='' hands line-ending handling to the csv module, as its documentation requires.
# NOTE(review): absolute, machine-specific path — confirm it is the file the export step writes.
with open('C:/Users/harun/bert/data/train.tsv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    for i, line in enumerate(reader):
        # Check if line length is not equal to 4.
        if len(line) != 4:
            print(f"Line {i} has {len(line)} values: {line}")

        # Print out the first 10 lines for manual inspection.
        if i < 10:
            print(f"Line {i}: {line}")


Line 0: ['170289', '1', 'a', 'To whom it may concern : I would like to file a complaint against Webster Bank and more specifically, the fees that the bank imposes for insufficient funds. Currently Webster Bank charges {$37.00} for insufficient funds per bank transaction. The bank fails to indicate the time when each transaction occurs. The bank only tells the customer the date of the transaction. In my case, I was charged four {$37.00} fees for transactions that the bank claimed, I did not have funds to cover. See attached bank statement from XX/XX/XXXX until XX/XX/XXXX. I was charged {$37.00} for four bank transactions that occurred on XX/XX/XXXX ( one ), and XX/XX/XXXX ( three ). I note that my balance never went to {$0.00}, and I deposited close to {$3000.00} ( {$380.00} plus {$2600.00} ) on or aboutXX/XX/XXXXI called the bank and asked the bank to refund at least 3 of the {$37.00} insufficient fee charges. The bank refused to refund any of the {$37.00} insufficient fee charges. I w

Some ticket descriptions contain tabs. Therefore, when we use ``\t`` as the delimiter in pandas' ``to_csv`` method, the part of the description after the tab gets split off from the rest of the row. Then, when we run BERT, we get an "index out of range" error.
![Index out of range](img/index-out-of-range.png)

Therefore we need to replace the tabs (and newlines) with a single space in the complaint column.

In [61]:
# Turn every tab and newline in the training text into a single space, in one regex pass.
df_bert_train['Complaint'] = df_bert_train['Complaint'].replace(r'[\t\n]', ' ', regex=True)

In [62]:
# Clean the dev split's own text. Assigning the *train* Series here would align on the
# index, and since the two splits share no index labels every dev complaint would be NaN.
df_bert_dev['Complaint'] = df_bert_dev['Complaint'].replace(r'[\t\n]', ' ', regex=True)

In [63]:
# Clean the test frame's own text. Assigning the *train* Series here would align on the
# index and leave NaN for every test row that is not part of the training split.
df_bert_test['Complaint'] = df_bert_test['Complaint'].replace(r'[\t\n]', ' ', regex=True)

In [65]:
# Re-dump the training split after cleaning — presumably for manual inspection.
df_bert_train.to_csv('out.csv')

In [64]:
# Look up one complaint by its index label to eyeball the cleaned text.
# The label is only present if that row landed in the training split.
df_bert_train['Complaint'][306580]

"I have reached out to All credit bureaus some have refused my request to dispute the unauthorized inquiries on my credit profile.  I have asked them to investigate.  XXXX employees told me they don't investigate disputes, They told me I have to reach out to the creditor to obtain a letter of deletion. After reaching out to the creditors they told me I have to call the credit bureaus and request a dispute. So I recontacted the Credit Bureaus and attempted to get them to honor my request to check out the unauthorized inquiries. They continue to give me the runaround and hanging up in my face 3 times.   Experian started off giving me a run around for my request. Telling me to reach out the creditor. By telling them I already have done this and a threat of contacting CFPB they open disputes. But when I go to the dispute center in Experian it seems as if they doubled up on disputes and missed out on a couple of disputes in the date range of XX/XX/XXXX to XX/XX/XXXX.   XXXX has also given m

In [66]:
# Saving dataframes to .tsv format as required by BERT
# All three files are tab-separated, without the index column and without a header row.
# NOTE(review): some BERT data processors skip the first line of test.tsv as a header —
# confirm whether test.tsv should be written with header=True for the processor in use.
df_bert_train.to_csv('data/train.tsv', sep='\t', index=False, header=False)
df_bert_dev.to_csv('data/dev.tsv', sep='\t', index=False, header=False)
df_bert_test.to_csv('data/test.tsv', sep='\t', index=False, header=False)