# Classifying most frequently reported credit products

Direct link to selected data: https://www.consumerfinance.gov/data-research/consumer-complaints/search/api/v1/?date_received_max=2022-11-03&date_received_min=2011-12-01&field=all&format=csv&has_narrative=true&lens=product&no_aggs=true&product=Credit%20reporting%20or%20other%20personal%20consumer%20reports&product=Debt%20collection&product=Mortgage&product=Credit%20card%20or%20prepaid%20card&product=Checking%20or%20savings%20account&product=Student%20loan&product=Credit%20reporting&product=Money%20transfer%2C%20virtual%20currency%2C%20or%20money%20service&size=524341&sub_lens=sub_product&trend_depth=5&trend_interval=month

Business task - classifying mails and directing them to proper departments of a bank, performing investingations on unlabeled data

## Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os

# Data processing
import nltk
import pickle
import re
import nltk.corpus
import random as rand
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

In [32]:
complaints = pd.read_csv('complaints-22-23.csv')

In [33]:
complaints.columns

Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

In [34]:
complaints = complaints[["Consumer complaint narrative", "Product"]].copy()
complaints['Product'].value_counts()

Product
Debt collection                                       194261
Mortgage                                               98508
Credit card or prepaid card                            83554
Checking or savings account                            55647
Student loan                                           33519
Credit reporting                                       31587
Money transfer, virtual currency, or money service     27265
Name: count, dtype: int64

In [35]:
# Undersampling: Randomly drop rows from majority classes 
# so that we are left with 30'000 rows for each value
most_frequent_products = ['Debt collection',
'Mortgage',
'Credit card or prepaid card',
'Checking or savings account']

# Create an empty DataFrame to store the reduced rows
complaints_reduced = pd.DataFrame(columns=complaints.columns)

# Set the desired number of rows for each distinct value
target_rows = 5000

# Randomly reduce the rows for each distinct value to the target number
for value in most_frequent_products:
    # Filter the DataFrame to select rows with the current value
    filtered_rows = complaints[complaints['Product'] == value]
    
    if len(filtered_rows) > target_rows:
        # Randomly shuffle the rows and keep the first target_rows rows
        shuffled_rows = filtered_rows.sample(frac=1).head(target_rows)
    else:
        # If there are fewer rows than the target, keep all of them
        shuffled_rows = filtered_rows
    
    # Concatenate the selected rows with the reduced DataFrame
    complaints_reduced = pd.concat([complaints_reduced, shuffled_rows])

# Reset the index of the reduced DataFrame
complaints_reduced = complaints_reduced.reset_index(drop=True)
complaints = complaints_reduced

In [36]:
# Display the values
complaints_products = complaints[["Consumer complaint narrative", "Product"]].copy()
complaints_products['Product'].value_counts()

Product
Debt collection                30000
Mortgage                       30000
Credit card or prepaid card    30000
Checking or savings account    30000
Name: count, dtype: int64

In [37]:
# Create a function to assign labels
def assign_label(row):
    if row['Product'] in most_frequent_products:
        return row['Product']
    else:
        return 'Other'

# Apply the function to create the 'Product_label' column
complaints_products['Product_label'] = complaints_products.apply(assign_label, axis=1)
complaints_products = complaints_products.rename(columns = {'Consumer complaint narrative':'Complaint'})

In [None]:
# Text cleaning
# Add 'xxxx' to the list of stopwords
stop = set(stopwords.words('english'))
stop.add('xxxx')


def text_clean(df,column_name):
    df['cleaned_text'] = df[column_name]
    
    # Normalize text
    df['cleaned_text'] = df['cleaned_text'].str.lower()
    
    # Remove unicode chars + all word formations with 'xxxx' inside
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?|(\w*\d*xxxx\d*\w*|xx\/xx\/\w*\d*)"

    df['cleaned_text'] = df['cleaned_text'].apply(lambda x: re.sub(pattern, "", x))
    
    # Remove stopwords
    df['cleaned_text'] = df['cleaned_text'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
    
    # Perfrom lemmatization
    lemmatizer = WordNetLemmatizer()
    df['cleaned_text'] = df['cleaned_text'].apply(lambda x: " ".join(lemmatizer.lemmatize(word) for word in x.split()))
    
    return df.head(5)

text_clean(complaints_products,'Complaint')

In [40]:
# Get the columns that are unique to df2
additional_columns = [col for col in complaints.columns if col not in complaints_products.columns]

# Concatenate the DataFrames, keeping only the unique columns from df2
result_df = pd.concat([complaints_products, complaints[additional_columns]], axis=1)

# Save processed data to csv
result_df.to_csv('processed_complaints.csv', index=False)

In [2]:
# Reread data
complaints_products = pd.read_csv('processed_complaints.csv')

In [10]:
complaints_products

Unnamed: 0,Complaint,Product,Product_label,cleaned_text,Consumer complaint narrative
0,"To whom it may concern, There is an account th...",Debt collection,Debt collection,may concern account opened name without consen...,"To whom it may concern, There is an account th..."
1,"In XXXX , I was stuck on high deductible medi...",Debt collection,Debt collection,stuck high deductible medical plan son incurre...,"In XXXX , I was stuck on high deductible medi..."
2,A company named XXXX XXXX XXXX sent me a lette...,Debt collection,Debt collection,company named sent letter demanding payment un...,A company named XXXX XXXX XXXX sent me a lette...
3,This account was paid by me even though I was ...,Debt collection,Debt collection,account paid even though provided proof owed d...,This account was paid by me even though I was ...
4,THIS DEBT COLLECTOR IS ADVERTISING TO COERCE P...,Debt collection,Debt collection,debt collector advertising coerce payment debt...,THIS DEBT COLLECTOR IS ADVERTISING TO COERCE P...
...,...,...,...,...,...
119995,"Hi, I had fraud committed against my account s...",Checking or savings account,Checking or savings account,hi fraud committed account several time citiba...,"Hi, I had fraud committed against my account s..."
119996,I have a FDIC account in my HSA ( health savin...,Checking or savings account,Checking or savings account,fdic account hsa health saving account optumba...,I have a FDIC account in my HSA ( health savin...
119997,"In XXXX XXXX , I saw an online offer fro m ...",Checking or savings account,Checking or savings account,saw online offer fro citibank th stated open s...,"In XXXX XXXX , I saw an online offer fro m ..."
119998,This compliant is on Netspend deliberate failu...,Checking or savings account,Checking or savings account,compliant netspend deliberate failure apply ad...,This compliant is on Netspend deliberate failu...


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer 

complaints_products = complaints_products.dropna()

# Calculate token counts for each document
tokenizer = RegexpTokenizer(r'\w+') # tokenizer

tf_vectorizer = CountVectorizer(ngram_range = (1, 3), # bigrams and trigrams added
                                max_df = 0.9,
                                min_df = 0.1,
                                tokenizer = tokenizer.tokenize
)

tf = tf_vectorizer.fit_transform(complaints_products["cleaned_text"])
tf_feature_names = tf_vectorizer.get_feature_names_out()



## Decision Tree

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer 
from sklearn.preprocessing import MaxAbsScaler

X = tf  # Feature matrix (TF matrix)
y = complaints_products["Product"]  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MaxAbsScaler() # Scale the features by dividing each feature by the maximum absolute value of that feature.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_grid = {
    'max_depth': [None, 10, 20],   # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}


model = DecisionTreeClassifier()
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='balanced_accuracy')
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [34]:
print(best_params)

{'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10}


In [35]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import label_binarize

# Best model for prediction
y_pred = best_model.predict(X_test_scaled)

print("Best Hyperparameters for Decision Tree:", best_params)
# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the best Decision Tree model on the test set:", accuracy)

# Calculate balanced accuracy
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy of the best Decision Tree model on the test set:", balanced_acc)

# Generate the classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

y_test_bin = label_binarize(y_test, classes=np.unique(y_test))
y_pred_bin = label_binarize(y_pred, classes=np.unique(y_test))

# Calculate ROC AUC using the 'ovr' strategy
roc_auc = roc_auc_score(y_test_bin, y_pred_bin, multi_class='ovr')
print("ROC AUC of the best Decision Tree model on the test set (ovr):", roc_auc)

Best Hyperparameters for Decision Tree: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10}
Accuracy of the best Decision Tree model on the test set: 0.8142083333333333
Balanced Accuracy of the best Decision Tree model on the test set: 0.8137400446906501
Classification Report:
                              precision    recall  f1-score   support

Checking or savings account       0.77      0.81      0.79      5920
Credit card or prepaid card       0.76      0.74      0.75      5926
            Debt collection       0.83      0.83      0.83      6086
                   Mortgage       0.90      0.88      0.89      6068

                   accuracy                           0.81     24000
                  macro avg       0.81      0.81      0.81     24000
               weighted avg       0.81      0.81      0.81     24000

Confusion Matrix:
 [[4794  742  228  156]
 [ 884 4387  534  121]
 [ 300  430 5029  327]
 [ 264  180  293 5331]]
ROC AUC of the best Decision Tree model 

## SLDA

In [11]:
complaints_products["cleaned_text"]

<bound method NDFrame.describe of 0         may concern account opened name without consen...
1         stuck high deductible medical plan son incurre...
2         company named sent letter demanding payment un...
3         account paid even though provided proof owed d...
4         debt collector advertising coerce payment debt...
                                ...                        
119995    hi fraud committed account several time citiba...
119996    fdic account hsa health saving account optumba...
119997    saw online offer fro citibank th stated open s...
119998    compliant netspend deliberate failure apply ad...
119999    deposited two large check received checking ac...
Name: cleaned_text, Length: 120000, dtype: object>

In [4]:
# Preparing X col with bigrams also
from nltk import bigrams

# Form bigrams
complaints_products['cleaned_text'] = complaints_products['cleaned_text'].astype('str')
complaints_products['tokenized_text'] = complaints_products['cleaned_text'].apply(lambda x: x.split(" "))
complaints_products['tokenized_text'] = complaints_products['tokenized_text'].apply(lambda x: x + [' '.join(b) for b in bigrams(x)])

# Assuming 'complaints_products' DataFrame has a 'Product' column
# One-hot encode the 'Product' column
one_hot_encoded = pd.get_dummies(complaints_products['Product'])

# Convert boolean values to integers (0 or 1)
one_hot_encoded = one_hot_encoded.astype(int)

# Concatenate the one-hot encoded columns to the original DataFrame
complaints_products_encoded = pd.concat([complaints_products, one_hot_encoded], axis=1)

In [168]:
import tomotopy as tp 
import math
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Getting X and y
X = complaints_products_encoded['tokenized_text']
y = complaints_products_encoded.iloc[:, -4:]

# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

results = []
skf = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)

# Unique ngrams
unique_tokens_ngrams = list(set(token for tokens in complaints_products['tokenized_text'] for token in tokens))

for k in [10, 30]:  # number of topics
    for min_df in [10, 30]:  # DF of tokens to be removed "from the bottom"
        for rm_top in [0, int(len(unique_tokens_ngrams) * 0.001), int(len(unique_tokens_ngrams) * 0.01)]:  # how many tokens should be removed "from the top"

            print('Parameters:', k, min_df, rm_top)

            balanced_accs = []
            aucs = []

            for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train.iloc[:,0])):
                X_train_a= X_train.iloc[train_index]
                X_val= X_train.iloc[val_index]
                y_train_a = y_train.iloc[train_index].values
                y_val= y_train.iloc[val_index].values

                slda = tp.SLDAModel(k=k,
                                    min_df=min_df,
                                    rm_top=rm_top,
                                    vars=['b','b','b','b'],  # we have four topics
                                    alpha=0.1,
                                    eta=0.01,
                                    mu=0,
                                    nu_sq=1,
                                    glm_param=1,
                                    seed=123
                                    )

                for i in range(len(X_train_a)):
                    slda.add_doc(X_train_a.iloc[i], y=y_train_a[i,:])

                for i in range(0, 100, 20):
                    slda.train(20)

                train_values = list(y_train_a)
                val_values = list(y_val)

                train_estimates = []
                for doc in slda.docs:
                    estimate = slda.estimate(doc)
                    train_estimates.append(estimate)

                val_estimates = []
                for i in range(len(X_val)):
                    slda_val_doc = slda.make_doc(X_val.iloc[i])
                    slda.infer(slda_val_doc)
                    val_estimates.append(slda.estimate(slda_val_doc))

                # Compute balanced accuracy
                balanced_acc_val = balanced_accuracy_score(np.argmax(y_val, axis=1),
                                                           np.argmax(val_estimates, axis=1))
                balanced_accs.append(balanced_acc_val)

                # Compute weighted F1 score
                auc = roc_auc_score(y_val, val_estimates, multi_class='ovr')
                aucs.append(auc)
                
            results.append([k, min_df, rm_top, round(np.mean(aucs), 2)])
            print('Balanced Accuracy (val):', round(np.mean(balanced_accs), 2))
            print('AUC (val):', round(np.mean(aucs), 2))

Parameters: 10 10 0


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.86
AUC (val): 0.96
Parameters: 10 10 2924


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.81
AUC (val): 0.94
Parameters: 10 10 29240


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.69
AUC (val): 0.87
Parameters: 10 30 0


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.86
AUC (val): 0.96
Parameters: 10 30 2924


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.81
AUC (val): 0.94
Parameters: 10 30 29240


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.52
AUC (val): 0.76
Parameters: 30 10 0


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.88
AUC (val): 0.97
Parameters: 30 10 2924


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.85
AUC (val): 0.96
Parameters: 30 10 29240


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.74
AUC (val): 0.91
Parameters: 30 30 0


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.88
AUC (val): 0.97
Parameters: 30 30 2924


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.82
AUC (val): 0.95
Parameters: 30 30 29240


  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)
  slda.train(20)


Balanced Accuracy (val): 0.52
AUC (val): 0.75


In [169]:
print(results)

[[10, 10, 0, 0.96], [10, 10, 2924, 0.94], [10, 10, 29240, 0.87], [10, 30, 0, 0.96], [10, 30, 2924, 0.94], [10, 30, 29240, 0.76], [30, 10, 0, 0.97], [30, 10, 2924, 0.96], [30, 10, 29240, 0.91], [30, 30, 0, 0.97], [30, 30, 2924, 0.95], [30, 30, 29240, 0.75]]


In [6]:
# Best model
import tomotopy as tp 
from tqdm import tqdm
slda = tp.SLDAModel(k = 30, # number of topics
                    min_df = 30, # DF of tokens to be removed "from the bottom"
                    rm_top = 0, # how many tokens should be removed "from the top"
                    vars = ['b','b','b','b'], # indicate linear response variable
                    alpha = 0.1,
                    eta = 0.01,
                    mu = 0,
                    nu_sq = 1,
                    glm_param = 1,
                    seed = 123
                   )
# adds documents to the corpus

# Getting X and y
X = complaints_products_encoded['tokenized_text']
y = complaints_products_encoded.iloc[:, -4:]

# splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

for i in tqdm(range(0, X_train.shape[0])):
    slda.add_doc(X_train.iloc[i], y = y_train.iloc[i,:])
for i in range(0, 200, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, slda.ll_per_word))
    slda.train(20)

100%|██████████| 96000/96000 [00:17<00:00, 5440.44it/s]
  slda.train(20)


Iteration: 0000 LL per word: 0.0
Iteration: 0020 LL per word: -9.419
Iteration: 0040 LL per word: -9.066
Iteration: 0060 LL per word: -8.954
Iteration: 0080 LL per word: -8.898
Iteration: 0100 LL per word: -8.863
Iteration: 0120 LL per word: -8.844
Iteration: 0140 LL per word: -8.832
Iteration: 0160 LL per word: -8.824
Iteration: 0180 LL per word: -8.817


In [181]:
for k in range(slda.k):
    print('Top 10 words of topic #{}:'.format(k))
    for token, weight in slda.get_topic_words(k, top_n = 5):
        print(token, ':', weight)
    print('\n')

Top 10 words of topic #0:
account : 0.16398397088050842
closed : 0.024911701679229736
access : 0.012652751989662647
checking : 0.012523052282631397
close : 0.012429293245077133


Top 10 words of topic #1:
check : 0.06826106458902359
fund : 0.03530901297926903
money : 0.025745557621121407
bank : 0.025311430916190147
account : 0.022151347249746323


Top 10 words of topic #2:
information : 0.02240683324635029
consumer : 0.019022773951292038
credit : 0.01816413179039955
reporting : 0.01652892306447029
account : 0.012349356897175312


Top 10 words of topic #3:
consumer : 0.032327305525541306
15 : 0.019105788320302963
usc : 0.01548374630510807
15 usc : 0.012361587956547737
credit : 0.012204844504594803


Top 10 words of topic #4:
fee : 0.05983734130859375
balance : 0.03777819871902466
charge : 0.03155619651079178
interest : 0.025294827297329903
charged : 0.019571484997868538


Top 10 words of topic #5:
bank : 0.15010620653629303
account : 0.042847611010074615
america : 0.033090926706790924
b

In [46]:
# slda.save('optimal_slda.bin', full=True)
import tomotopy as tp 
slda = tp.SLDAModel().load('optimal_slda.bin')

In [7]:
from sklearn.metrics import balanced_accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Getting X and y
X = complaints_products_encoded['tokenized_text']
y = complaints_products_encoded.iloc[:, -4:]

# splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

test_values = y_test
test_estimates = []

for i in tqdm(range(y_test.shape[0])):
    slda_test_doc = slda.make_doc(X_test.iloc[i])
    slda.infer(slda_test_doc)
    test_estimates.append(slda.estimate(slda_test_doc))

balanced_acc_val = balanced_accuracy_score(np.argmax(test_values, axis=1), np.argmax(test_estimates, axis=1))
# Calculate ROC AUC for each class

roc_auc = roc_auc_score(test_values, test_estimates, multi_class='ovr')
print(balanced_acc_val)
print(roc_auc)

100%|██████████| 24000/24000 [04:26<00:00, 90.16it/s]

0.8760716073680765
0.9694335461723224





In [52]:
import numpy as np

def get_top_four_elements(array_of_arrays):
    result = []

    for inner_array in array_of_arrays:
        absolute_values = np.abs(inner_array)
        sorted_indices = np.argsort(absolute_values)[::-1][:6]
        
        top_three_indices = sorted_indices
        top_three_values = inner_array[top_three_indices]

        result.append({
            'indices': top_three_indices,
            'values': top_three_values
        })

    return result

result = get_top_four_elements(slda.get_regression_coef())

for i, entry in enumerate(result):
    print(f"For inner array {i+1}, top four indices are {entry['indices']} and values are {entry['values']}.")

# 7, 9, 13, 15, 28, 29

For inner array 1, top four indices are [15 29  9  7 13  5] and values are [-23.725807  -15.979474  -13.451655  -12.468449  -12.3006935  12.151494 ].
For inner array 2, top four indices are [15  9 29 28 13  7] and values are [ 23.855532 -17.936125 -16.538229 -14.900158 -13.661053 -12.576155].
For inner array 3, top four indices are [ 7 13 15 29  5  9] and values are [ 18.106134  16.923796 -12.485664 -11.769079 -11.487708 -10.024684].
For inner array 4, top four indices are [ 9 29 28  7 15 20] and values are [ 21.365965  19.198513  16.235477 -15.895102 -14.031957  13.761041].


## Bert Topic

In [56]:
complaints_products['cleaned_text'].apply(lambda x: len(x)).std()

862.3262459682252

In [40]:
from sklearn.model_selection import train_test_split
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import pickle

# Set the random seed for reproducibility
X = complaints_products['cleaned_text'].astype(str)
y = complaints_products['Product']

# Initialize the label encoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Create a 1/100 sample with the same proportions of classes as the original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded.astype(int), test_size=0.2, random_state=123)

# main difference: classifier replaces the clustering step
empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words = True)

topic_model = BERTopic(
    umap_model = empty_dimensionality_model,
    hdbscan_model = clf,
    ctfidf_model = ctfidf_model,
    language = 'english'
)

topics, probs = topic_model.fit_transform(X_train, y=y_train)

In [40]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Set the random seed for reproducibility
X = complaints_products['cleaned_text'].astype(str)
y = complaints_products['Product']

# Initialize the label encoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Create a 1/100 sample with the same proportions of classes as the original dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded.astype(int), test_size=0.2, random_state=123)



In [18]:
mappings = topic_model.topic_mapper_.get_mappings()
print(mappings)

{0: 1, 1: 0, 2: 3, 3: 2}


In [13]:
# save_object(topic_model, 'outputs_new/topic_model.pkl')
# save_object(topics, 'outputs_new/topics.pkl')
# save_object(probs, 'outputs_new/probs.pkl')

with open("outputs_new/topic_model.pkl", "rb") as fp:
     topic_model = pickle.load(fp)
with open("outputs_new/topics.pkl", "rb") as fp:
     topics = pickle.load(fp)
with open("outputs_new/probs.pkl", "rb") as fp:
     probs = pickle.load(fp)


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [15]:
label_encoder.classes_

array(['Checking or savings account', 'Credit card or prepaid card',
       'Debt collection', 'Mortgage'], dtype=object)

In [19]:
# Creating topic representations (of the last batch)

# map input to topics
mappings = topic_model.topic_mapper_.get_mappings()
mappings = {value: label_encoder.classes_[key] for key, value in mappings.items()}

# assign original classes to our topics
df = topic_model.get_topic_info()
df["Class"] = df.Topic.map(mappings)
df[['Topic','Count','Name','Class']]

Unnamed: 0,Topic,Count,Name,Class
0,0,24107,0_card_credit_charge_account,Credit card or prepaid card
1,1,24044,1_account_bank_check_money,Checking or savings account
2,2,24012,2_mortgage_loan_payment_home,Mortgage
3,3,23837,3_debt_collection_credit_report,Debt collection


In [42]:
from tqdm import tqdm

mapping = {0: 1, 1: 0, 2: 3, 3: 2}
test_estimates = []

for doc in tqdm(X_test):
    topic, _ = topic_model.transform(doc)
    mapped_topic = mapping[topic[0]]
    test_estimates.append(mapped_topic)


100%|██████████| 24000/24000 [15:05<00:00, 26.50it/s]


In [44]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

encoded_classes = [0, 1, 2, 3]

# Reshape the array to a column vector
encoded_test = np.array(y_test).reshape(-1, 1)
encoded_pred = np.array(test_estimates).reshape(-1, 1)

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False, categories='auto')

# Fit and transform the data
onehot_encoded_test = onehot_encoder.fit_transform(encoded_test)
onehot_encoded_estimates = onehot_encoder.transform(encoded_pred)


In [45]:
from sklearn.metrics import balanced_accuracy_score, f1_score, roc_auc_score
AUC = roc_auc_score(onehot_encoded_test, onehot_encoded_estimates, multi_class='ovr')
BA_score = balanced_accuracy_score(y_test, test_estimates)
print('AUC (test):', AUC)
print('Balanced Acc (test):', BA_score)

AUC (test): 0.9333254167388831
Balanced Acc (test): 0.8999673731974861


### Bert Model Visualisation

In [None]:
topic_model.visualize_barchart()

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_heatmap()