In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Load the complaint narratives and their issue labels; the path is resolved
# relative to the kernel's working directory.
complaints = pd.read_csv(filepath_or_buffer='complaints.csv')
complaints

Unnamed: 0,Consumer complaint narrative,Issue
0,My name is XXXX XXXX this complaint is not mad...,Incorrect information on your report
1,I searched on XXXX for XXXXXXXX XXXX and was ...,Fraud or scam
2,I have a particular account that is stating th...,Incorrect information on your report
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report
...,...,...
353427,Collections account I have no knowledge of,Attempts to collect debt not owed
353428,"Dear CFPB Team, The reason for my complaint is...",Attempts to collect debt not owed
353429,FRCA violations : Failing to Follow Debt Dispu...,Attempts to collect debt not owed
353430,"My Father, a XXXX XXXX acquired an HECM rever...",Struggling to pay mortgage


In [4]:
# Class balance: number of complaints recorded under each Issue label.
complaints.loc[:, 'Issue'].value_counts()

Incorrect information on your report    229305
Attempts to collect debt not owed        73163
Communication tactics                    21243
Struggling to pay mortgage               17374
Fraud or scam                            12347
Name: Issue, dtype: int64

In [10]:
# Take an independent copy: a plain assignment would only create a second name
# for the same DataFrame, so the in-place column rewrites applied to
# `complaints_no_x` would also overwrite the raw text in `complaints`.
complaints_no_x = complaints.copy()

In [11]:
# Remove the X-run redaction placeholders from every narrative.
# The pattern matches a whole run of two or more X's instead of fixed 'XX'
# pairs, so an odd-length run such as 'XXX' is removed entirely rather than
# leaving a stray 'X' behind.
complaints_no_x['Consumer complaint narrative'] = complaints_no_x['Consumer complaint narrative'].apply(lambda x: re.sub(r'X{2,}', '', x))
complaints_no_x

Unnamed: 0,Consumer complaint narrative,Issue
0,My name is this complaint is not made in err...,Incorrect information on your report
1,I searched on for and was pointed to a web...,Fraud or scam
2,I have a particular account that is stating th...,Incorrect information on your report
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report
...,...,...
353427,Collections account I have no knowledge of,Attempts to collect debt not owed
353428,"Dear CFPB Team, The reason for my complaint is...",Attempts to collect debt not owed
353429,FRCA violations : Failing to Follow Debt Dispu...,Attempts to collect debt not owed
353430,"My Father, a acquired an HECM reverse mortg...",Struggling to pay mortgage


In [13]:
def _drop_double_slashes(text):
    """Return `text` with every '//' occurrence removed."""
    # Presumably these are the separators left over from redacted dates once
    # the X placeholders are stripped -- verify against the raw data.
    return re.sub(r'//', '', text)


complaints_no_x['Consumer complaint narrative'] = (
    complaints_no_x['Consumer complaint narrative'].map(_drop_double_slashes)
)

In [14]:
# Spot-check the cleaned narrative stored under row label 1.
complaints_no_x.loc[1, 'Consumer complaint narrative']

"I searched on  for    and was pointed to a website I legitimately believed was . The website was  whereas the authentic website is  I proceeded to buy a gun from the fraudulent website and sent my money via  as the spam seller requested, this was a total of {$450.00} on . I received an email stating the transaction was approved and on  I attempted to reach back out to the company in an attempt to give them the contact info of my chosen  dealer, the emails started to bounce back ( there are also photos of this ). This was when I realized that this website was a faked, copycat website. I attempted to report it to my bank immediately when I realized a company I was dealing with was committing fraud and pretending to be a firearms dealer that has been in business for over 80 years and is a very well-known company all over the world. As my bank, I expected them to help protect me. I was told to wait until the money posted and then got a letter stating that because I initiated the transacti

In [15]:
# Hold out 20% of the cleaned narratives for evaluation; the fixed seed keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    complaints_no_x['Consumer complaint narrative'],
    complaints_no_x['Issue'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: the vocabulary is learned from the training split only
# and then reused unchanged on the test split.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# NOTE(review): the solver's default iteration cap is likely too low for a
# sparse count matrix of this size and would stop before convergence; the cap
# is raised here -- confirm no ConvergenceWarning is emitted.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)
# classification_report returns one multi-line string; print it (as the
# per-cluster cell already does) so it renders as a table instead of a repr.
print(classification_report(y_test, y_pred))

                                      precision    recall  f1-score   support

   Attempts to collect debt not owed       0.77      0.64      0.70     14477
               Communication tactics       0.84      0.75      0.79      4160
                       Fraud or scam       0.93      0.84      0.89      2391
Incorrect information on your report       0.89      0.95      0.92     46122
          Struggling to pay mortgage       0.94      0.88      0.91      3537

                            accuracy                           0.87     70687
                           macro avg       0.87      0.81      0.84     70687
                        weighted avg       0.87      0.87      0.87     70687



In [None]:
import umap

# Rebuild the bag-of-words features from X_train / X_test for this experiment.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# UMAP is stochastic; pinning random_state (same seed as the train/test split)
# makes the embedding, and therefore the scores below, repeatable.
# NOTE(review): the embedding size is left at the library default -- confirm
# it carries enough signal for a classifier.
umap_model = umap.UMAP(random_state=42)
X_train_umap = umap_model.fit_transform(X_train_vec)
# Project the test split with the mapping learned on the training split.
X_test_umap = umap_model.transform(X_test_vec)

clf = LogisticRegression()
clf.fit(X_train_umap, y_train)

y_pred = clf.predict(X_test_umap)
# Print the report string so it renders as a table rather than a repr.
print(classification_report(y_test, y_pred))

  self._set_arrayXarray(i, j, x)


In [None]:
import hdbscan
from sklearn.decomposition import TruncatedSVD

# Rebuild the bag-of-words features from X_train / X_test for this experiment.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Cluster on a dense, low-dimensional projection of the count vectors.
# NOTE(review): HDBSCAN's prediction data is presumably built from dense
# input, and the raw vocabulary-sized sparse matrix is impractical to cluster
# directly -- confirm, and tune n_components as needed.
svd = TruncatedSVD(n_components=50, random_state=42)
X_train_dense = svd.fit_transform(X_train_vec)
X_test_dense = svd.transform(X_test_vec)

# HDBSCAN has no .predict(); unseen points are assigned to the fitted clusters
# with hdbscan.approximate_predict, which needs prediction_data=True at fit time.
hdbscan_model = hdbscan.HDBSCAN(prediction_data=True)
hdbscan_model.fit(X_train_dense)
X_train_hdbscan = hdbscan_model.labels_
X_test_hdbscan, _test_strengths = hdbscan.approximate_predict(hdbscan_model, X_test_dense)

# Train and evaluate one classifier per cluster label, using the original
# count vectors of the rows that fall into that cluster.
for cluster_label in sorted(set(X_train_hdbscan)):
    train_mask = X_train_hdbscan == cluster_label
    test_mask = X_test_hdbscan == cluster_label

    X_train_cluster = X_train_vec[train_mask]
    y_train_cluster = y_train[train_mask]
    X_test_cluster = X_test_vec[test_mask]
    y_test_cluster = y_test[test_mask]

    # LogisticRegression cannot be fitted on a single class, and there is
    # nothing to report when no test row landed in this cluster.
    if y_train_cluster.nunique() < 2:
        print(f"Skipping cluster {cluster_label}: training rows contain a single class")
        continue
    if not test_mask.any():
        print(f"Skipping cluster {cluster_label}: no test rows assigned")
        continue

    clf = LogisticRegression()
    clf.fit(X_train_cluster, y_train_cluster)

    y_pred = clf.predict(X_test_cluster)
    print(f"Classification report for cluster {cluster_label}")
    print(classification_report(y_test_cluster, y_pred))

In [None]:
from bertopic import BERTopic

# Human-readable label for each topic id.
# NOTE(review): this assumes BERTopic's topic ids line up with the Issue
# classes in this order -- verify by inspecting the fitted topics before
# trusting `predicted_issue`.
topic_labels = {0: 'Incorrect information on your report',
                1: 'Attempts to collect debt not owed',
                2: 'Communication tactics',
                3: 'Struggling to pay mortgage',
                4: 'Fraud or scam'}

# Ask for as many topics as there are labels.
# NOTE(review): confirm whether nr_topics counts the outlier topic (-1).
model = BERTopic(nr_topics=len(topic_labels))

# fit_transform returns a (topics, probabilities) pair; only the per-document
# topic ids are stored as a column.
topics, _topic_probs = model.fit_transform(complaints['Consumer complaint narrative'].tolist())
complaints['topic'] = topics

# .map leaves NaN for any id without a label (e.g. the outlier topic -1)
# instead of raising KeyError the way a direct dict lookup would.
complaints['predicted_issue'] = complaints['topic'].map(topic_labels)
complaints

In [None]:
# Rohit's stopword filter
# These names are used below but are not imported in the notebook's import cell.
import string

import nltk
from bs4 import BeautifulSoup

# nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Also drop the tokens 'xa' and 'nbsp' and every single lowercase letter.
stopwords.extend(['xa', 'nbsp'] + list(string.ascii_lowercase))

def text_preprocessing(text):
    """Strip markup from `text` and return its lowercase non-stopword tokens.

    The text is parsed with BeautifulSoup (lxml parser) to extract the visible
    text, lowercased, split into runs of the letters a-z, filtered against the
    module-level `stopwords` list, and re-joined with single spaces.

    Args:
        text: the raw document string.

    Returns:
        A single space-separated string of the remaining tokens.
    """
    # `stopwords` stays a list for anything else that uses it; a set is built
    # here so each membership test is O(1) and later additions to the list are
    # still honoured.
    stopword_set = set(stopwords)
    visible_text = BeautifulSoup(text, 'lxml').get_text(separator=' ', strip=True)
    tokens = re.findall(r'[a-z]+', visible_text.lower())
    return ' '.join(token for token in tokens if token not in stopword_set)