In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn import naive_bayes #import GaussianNB, MultinomialNB
from sklearn.linear_model import SGDClassifier # simulate the behavior of logistic regression using SGDClassifier(loss='log')
from sklearn.metrics import accuracy_score,balanced_accuracy_score,average_precision_score, classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from collections import Counter
from itertools import product

from imblearn.over_sampling import SMOTE 
import time

# Never truncate long cell contents (e.g. complaint narratives) when displaying frames.
pd.set_option('display.max_colwidth', None)

In [None]:
from IPython.display import display, HTML

# Inject a CSS rule that forces elements with class "container" to full width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [None]:
# Load the trained vectorizer.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load a pickle file that was produced by a trusted run of this project.
with open('tfidf_vectorizer_train_split_33k.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)
# A bare expression in the middle of a cell is never displayed, so print the
# vocabulary size explicitly.
print("Vocabulary size:", len(loaded_vectorizer.get_feature_names_out()))

# Pre-split data sets (train / test / dev).
train_df = pd.read_csv('cfpb_train.csv')
test_df = pd.read_csv('cfpb_test.csv')
dev_df = pd.read_csv('cfpb_dev.csv')
dev_df = pd.read_csv('cfpb_dev.csv')

In [None]:
# Defensive cleaning, applied identically to every split:
#   * the narrative column becomes a plain string column (missing values -> '')
#   * 'debt_collection' is a 0/1 target that is 1 when Product == 'Debt collection'
for split_df in (train_df, test_df, dev_df):
    split_df['Consumer complaint narrative'] = (
        split_df['Consumer complaint narrative'].fillna('').astype(str)
    )

for split_df in (train_df, test_df, dev_df):
    split_df['debt_collection'] = split_df['Product'].eq('Debt collection').astype(int)

In [None]:
%%time
# Vectorize the held-out splits with the already-fitted vectorizer (transform
# only, no refit); the targets are the binary 'debt_collection' columns.
X_test, X_dev = (
    loaded_vectorizer.transform(split['Consumer complaint narrative'])
    for split in (test_df, dev_df)
)
y_test, y_dev = test_df['debt_collection'], dev_df['debt_collection']

### Grid Search Model

In [None]:
%%time
# Search space for the first exploration run.
# 'sample_size' and 'chi2_features' drive the outer loops of the search; the
# 'clf__*' entries are passed to the MultinomialNB constructor.
param_grid = dict(
    sample_size=[1000, 5000, 10000, 25000, 50000, 100000, 150000, 200000, 250000, 300000],
    chi2_features=[500, 1000, 5000, 10000, 15000, 20000, 25000, 30000],
    clf__alpha=[0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
    clf__fit_prior=[True],
    clf__class_prior=[None],
)


def grid_search_model(param_grid, train_df, X_dev, y_dev, random_state=42):
    """Grid-search a chi2 -> SMOTE -> MultinomialNB pipeline, scored by dev-set F1.

    For every ``sample_size`` a subset of ``train_df`` is drawn and vectorized
    with the notebook-level ``loaded_vectorizer``.  For every ``chi2_features``
    value the top-k chi-squared features are selected on that subset, the
    training matrix is balanced with SMOTE and shuffled, and one MultinomialNB
    is fitted per combination of the ``clf__*`` values.

    Parameters
    ----------
    param_grid : dict
        Must contain the list-valued keys 'sample_size', 'chi2_features',
        'clf__alpha', 'clf__fit_prior' and 'clf__class_prior'.
    train_df : pandas.DataFrame
        Must provide the columns 'Consumer complaint narrative' and
        'debt_collection'.
    X_dev : sparse matrix or array
        Dev-set features with the same columns as the vectorizer output; the
        fitted selector is applied to it.
    y_dev : array-like
        Dev-set labels compared against the predictions with ``f1_score``.
    random_state : int or None, default 42
        Seed used for row sampling, SMOTE and shuffling so that a run can be
        reproduced.  Pass ``None`` for unseeded sampling/shuffling.

    Returns
    -------
    (pandas.DataFrame, dict or None)
        One row per evaluated configuration (parameters, 'f1_score' and
        'training_time'), and the parameters of the best-scoring configuration
        (``None`` only when the grid is empty).
    """
    text_column = 'Consumer complaint narrative'
    results = []
    best_params = None
    # Start below any attainable F1 so the first configuration is always recorded.
    best_score = float('-inf')

    for sample_size in param_grid['sample_size']:
        # Sample and vectorize once per sample size: this work does not depend
        # on chi2_features, and reusing the same rows keeps the feature-count
        # comparison free of sampling noise.
        train_df_sample = train_df.sample(sample_size, random_state=random_state)
        X_sample = loaded_vectorizer.transform(train_df_sample[text_column])
        y_sample = train_df_sample['debt_collection']

        for chi2_features in param_grid['chi2_features']:
            # Feature selection is fitted on the training sample only and then
            # applied unchanged to the dev matrix.
            selector = SelectKBest(chi2, k=chi2_features)
            X_train = selector.fit_transform(X_sample, y_sample)
            X_dev_transformed = selector.transform(X_dev)

            # Balance the classes (training data only), then shuffle the rows.
            sm = SMOTE(random_state=random_state)
            X_train, y_train = sm.fit_resample(X_train, y_sample)
            X_train, y_train = shuffle(X_train, y_train, random_state=random_state)

            for alpha, fit_prior, class_prior in product(
                param_grid['clf__alpha'],
                param_grid['clf__fit_prior'],
                param_grid['clf__class_prior']
            ):
                start_time = time.perf_counter()

                clf = naive_bayes.MultinomialNB(
                    alpha=alpha,
                    fit_prior=fit_prior,
                    class_prior=class_prior
                )
                # Train and score the model
                clf.fit(X_train, y_train)
                predicted = clf.predict(X_dev_transformed)
                score = f1_score(y_dev, predicted)
                # Covers fit + dev prediction + scoring, not the fit alone.
                training_time = time.perf_counter() - start_time

                record = {
                    'sample_size': sample_size,
                    'chi2_features': chi2_features,
                    'clf__alpha': alpha,
                    'clf__fit_prior': fit_prior,
                    'clf__class_prior': class_prior,
                    'f1_score': score,
                    'training_time': training_time
                }
                results.append(record)

                if score > best_score:
                    best_score = score
                    # Same keys as the record, minus the score itself.
                    best_params = {
                        key: value for key, value in record.items() if key != 'f1_score'
                    }
                    print(f"New best score:{score} using {best_params}")

    return pd.DataFrame(results), best_params

In [None]:
%%time
# Run the search over param_grid; `df` receives one row per evaluated
# configuration and `best_params` the best-scoring configuration.
df, best_params = grid_search_model(param_grid, train_df, X_dev, y_dev)
print(f'Best parameters: {best_params}')

In [None]:
# Display the grid-search results table (one row per configuration).
df

In [None]:
# Persist the grid-search results; the sensitivity-analysis section reads this file back.
df.to_csv("nb.csv", index=False)

### sensitivity analysis

In [None]:
import altair as alt
# Reload the grid-search results that were written to nb.csv.
df = pd.read_csv("nb.csv")

In [None]:
import altair as alt

def scatter_log_x(data, x_field, x_title, y_field, y_title, chart_title):
    """Return a circle-mark scatter of `y_field` against `x_field` (log-scaled x axis).

    Every chart shares the same tooltip columns, so hovering a point shows the
    full configuration that produced it.
    """
    hover_fields = ['sample_size', 'chi2_features', 'clf__alpha', 'clf__fit_prior',
                    'clf__class_prior', 'f1_score', 'training_time']
    return alt.Chart(data).mark_circle(size=60).encode(
        alt.X(x_field, scale=alt.Scale(type='log'), title=x_title),
        alt.Y(y_field, title=y_title),
        tooltip=hover_fields
    ).properties(
        title=chart_title
    )


# Top row: F1 score against each search dimension.
chart1 = scatter_log_x(df, 'sample_size', "Sample Size (log scale)",
                       'f1_score', "F1 Score", 'F1 Score versus Sample Size')
chart2 = scatter_log_x(df, 'chi2_features', "Chi2 Features (log scale)",
                       'f1_score', "F1 Score", 'F1 Score versus Chi2 Features')
chart3 = scatter_log_x(df, 'clf__alpha', "Alpha (log scale)",
                       'f1_score', "F1 Score", 'F1 Score versus Alpha')

# Bottom row: training time against each search dimension.
chart4 = scatter_log_x(df, 'sample_size', "Sample Size (log scale)",
                       'training_time', "Training Time", 'Training Time versus Sample Size')
chart5 = scatter_log_x(df, 'chi2_features', "Chi2 Features (log scale)",
                       'training_time', "Training Time", 'Training Time versus Chi2 Features')
chart6 = scatter_log_x(df, 'clf__alpha', "Alpha (log scale)",
                       'training_time', "Training Time", 'Training Time versus Alpha')

# Bubble Chart: Adjusted F1 Score versus Sample Size and Chi2 Features
# The F1 score is shifted by 0.4 and scaled by 100 before being mapped to mark size.
df['f1_score_adjusted'] = (df['f1_score'] - 0.4) * 100  # Adjusting the F1 scores

# chart7 is built but currently left out of the displayed layout below.
chart7 = alt.Chart(df).mark_point().encode(
    alt.X('sample_size', scale=alt.Scale(type='log'), title="Sample Size (log scale)"),
    alt.Y('chi2_features', scale=alt.Scale(type='log'), title="Chi2 Features (log scale)"),
    alt.Size('f1_score_adjusted', title="Adjusted F1 Score", scale=alt.Scale(range=[10, 1000])),
    tooltip=['sample_size', 'chi2_features', 'clf__alpha', 'clf__fit_prior', 'clf__class_prior', 'f1_score', 'training_time']
).properties(
    title='Adjusted F1 Score versus Sample Size and Chi2 Features'
)

(chart1 | chart2 | chart3) & (chart4 | chart5 | chart6)# & chart7


In [None]:
def boxplot_log_x(data, x_field, x_title, y_field, y_title, chart_title):
    """Return a box plot of `y_field` grouped along a log-scaled `x_field` axis."""
    hover_fields = ['sample_size', 'chi2_features', 'clf__alpha', 'clf__fit_prior',
                    'clf__class_prior', 'f1_score', 'training_time']
    return alt.Chart(data).mark_boxplot().encode(
        alt.X(x_field, scale=alt.Scale(type='log'), title=x_title),
        alt.Y(y_field, title=y_title),
        tooltip=hover_fields
    ).properties(
        title=chart_title
    )


# Top row: F1 score against each search dimension.
chart1 = boxplot_log_x(df, 'sample_size', "Sample Size (log scale)",
                       'f1_score', "F1 Score", 'F1 Score versus Sample Size')
chart2 = boxplot_log_x(df, 'chi2_features', "Chi2 Features (log scale)",
                       'f1_score', "F1 Score", 'F1 Score versus Chi2 Features')
chart3 = boxplot_log_x(df, 'clf__alpha', "Alpha (log scale)",
                       'f1_score', "F1 Score", 'F1 Score versus Alpha')

# Bottom row: training time against each search dimension.
chart4 = boxplot_log_x(df, 'sample_size', "Sample Size (log scale)",
                       'training_time', "Training Time", 'Training Time versus Sample Size')
chart5 = boxplot_log_x(df, 'chi2_features', "Chi2 Features (log scale)",
                       'training_time', "Training Time", 'Training Time versus Chi2 Features')
chart6 = boxplot_log_x(df, 'clf__alpha', "Alpha (log scale)",
                       'training_time', "Training Time", 'Training Time versus Alpha')

(chart1 | chart2 | chart3) & (chart4 | chart5 | chart6)

In [None]:
def boxplot_linear_x(data, x_field, x_title, y_field, y_title, chart_title):
    """Return a box plot of `y_field` grouped along `x_field` (default x scale)."""
    hover_fields = ['sample_size', 'chi2_features', 'clf__alpha', 'clf__fit_prior',
                    'clf__class_prior', 'f1_score', 'training_time']
    return alt.Chart(data).mark_boxplot().encode(
        alt.X(x_field, title=x_title),
        alt.Y(y_field, title=y_title),
        tooltip=hover_fields
    ).properties(
        title=chart_title
    )


# Top row: F1 score against each search dimension.
chart1 = boxplot_linear_x(df, 'sample_size', "Sample Size",
                          'f1_score', "F1 Score", 'F1 Score versus Sample Size')
chart2 = boxplot_linear_x(df, 'chi2_features', "Chi2 Features",
                          'f1_score', "F1 Score", 'F1 Score versus Chi2 Features')
chart3 = boxplot_linear_x(df, 'clf__alpha', "Alpha",
                          'f1_score', "F1 Score", 'F1 Score versus Alpha')

# Bottom row: training time against each search dimension.
chart4 = boxplot_linear_x(df, 'sample_size', "Sample Size",
                          'training_time', "Training Time", 'Training Time versus Sample Size')
chart5 = boxplot_linear_x(df, 'chi2_features', "Chi2 Features",
                          'training_time', "Training Time", 'Training Time versus Chi2 Features')
chart6 = boxplot_linear_x(df, 'clf__alpha', "Alpha",
                          'training_time', "Training Time", 'Training Time versus Alpha')

(chart1 | chart2 | chart3) & (chart4 | chart5 | chart6)

### NAIVE BAYES

In [None]:
%%time
# Hard-coded configuration for the final model.
# NOTE(review): presumably pasted from a grid-search run's output - confirm it
# still matches the latest search results.
best_params = {'sample_size': 100000, 'chi2_features': 25000, 'clf__alpha': 0.001, 'clf__fit_prior': True, 'clf__class_prior': None, 'training_time': 0.0800180435180664}
sample_size = best_params['sample_size']
chi2_features = best_params['chi2_features']
alpha = best_params['clf__alpha']
fit_prior = best_params['clf__fit_prior']
class_prior = best_params['clf__class_prior']

# Seeded so the training sample, and every model fitted on it, can be reproduced.
train_df_sample = train_df.sample(sample_size, random_state=42).copy()

X_train = loaded_vectorizer.transform(train_df_sample['Consumer complaint narrative'])
y_train = train_df_sample['debt_collection']

In [None]:
%%time
# Keep the `chi2_features` best features by chi-squared score; the selector is
# fitted on the training sample only.
selector = SelectKBest(chi2, k=chi2_features)
X_train = selector.fit_transform(X_train, y_train)
# Transform dev set with the same selector
X_dev_transformed = selector.transform(X_dev)

# Balance the classes by resampling the minority class; seeded for reproducibility.
# NOTE(review): grid_search_model balanced with SMOTE, not random oversampling -
# confirm that using a different resampler for the final model is intentional.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [None]:
%%time
# Multinomial Naive Bayes configured with the chosen smoothing and prior settings.
clf = naive_bayes.MultinomialNB(alpha=alpha, fit_prior=fit_prior, class_prior=class_prior)
# Fit on the selected, oversampled training matrix.
clf.fit(X_train, y_train)

In [None]:
# Score the fitted model on the (oversampled) training matrix.
# The sparse matrix is passed as-is: MultinomialNB accepts sparse input, and
# densifying 100000+ rows x 25000 features with .toarray() wastes a lot of memory.
y_pred = clf.predict(X_train)
y_pred_proba = clf.predict_proba(X_train)

print("Train")
print("accuracy_score",accuracy_score(y_train, y_pred))
print("balanced_accuracy_score",balanced_accuracy_score(y_train, y_pred))
# Average precision summarises the precision-recall curve, so it needs the
# positive-class score (column 1), not the thresholded 0/1 predictions.
print("average_precision_score",average_precision_score(y_train, y_pred_proba[:, 1]))
print(classification_report(y_train, y_pred))

# Class balance of the predictions versus the labels.
c = Counter(y_pred)
print("Prediction", c.most_common(2))
c = Counter(y_train)
print("Ground Truth",c.most_common(2))

prediction = pd.DataFrame(y_pred_proba)
prediction['result'] = y_pred

df = prediction.copy()
df.columns = ['neg', 'pos', 'class']
# Assign by position so the labels cannot be re-aligned on a mismatching index.
df['true'] = np.asarray(y_train)

fig, axs = plt.subplots(ncols=2, figsize=(15, 5)) # Adjust figsize as needed

# Plot Probability Density for Different True Classes
for class_label in df['true'].unique():
    sns.kdeplot(df[df['true'] == class_label]['pos'], label=class_label, ax=axs[0])

axs[0].set_title('Probability Density for Different True Classes')
axs[0].set_xlabel('Probability')
axs[0].set_ylabel('Density')
axs[0].legend(title='True Class')

# Plot Probability Density for Different Predicted Classes
for class_label in df['class'].unique():
    sns.kdeplot(df[df['class'] == class_label]['pos'], label=class_label, ax=axs[1])

axs[1].set_title('Probability Density for Different Predicted Classes')
axs[1].set_xlabel('Probability')
axs[1].set_ylabel('Density')
axs[1].legend(title='Predicted Class')

plt.tight_layout()
plt.show()

In [None]:
# Fixed-seed sample of the dev set used for the failure analysis; capped at the
# dev-set size so a smaller dev file does not make .sample() raise.
failure_analysis = dev_df.sample(min(20000, len(dev_df)), random_state=42).copy()
X = loaded_vectorizer.transform(failure_analysis['Consumer complaint narrative'])
X_transformed = selector.transform(X)
y = failure_analysis['debt_collection']

# Predict on the sparse matrix directly; MultinomialNB accepts sparse input, so
# a dense rows x features copy via .toarray() is unnecessary.
y_pred = clf.predict(X_transformed)
y_pred_proba = clf.predict_proba(X_transformed)
prediction = pd.DataFrame(y_pred_proba)
prediction['result'] = y_pred

# numpy arrays are assigned by position, so these line up with the sampled rows.
failure_analysis['y_pred'] = y_pred
failure_analysis['y_pred_proba'] = y_pred_proba[:, 1]  # positive-class probability

In [None]:
# These metrics are computed on the sample drawn from dev_df, not on the test
# split, so the header says so.
print("Dev")
print("accuracy_score",accuracy_score(y, y_pred))
print("balanced_accuracy_score",balanced_accuracy_score(y, y_pred))
# Average precision needs the positive-class score (column 1), not 0/1 labels.
print("average_precision_score",average_precision_score(y, y_pred_proba[:, 1]))
print(classification_report(y, y_pred))

In [None]:
df = prediction.copy()
df.columns = ['neg', 'pos', 'class']
# `y` still carries the shuffled dev_df index from .sample(), while `df` has a
# fresh RangeIndex. Assigning the Series would align on index labels and
# scramble / NaN-fill the labels, so assign the values by position instead.
df['true'] = np.asarray(y)

fig, axs = plt.subplots(ncols=2, figsize=(15, 5)) # Adjust figsize as needed

# Plot Probability Density for Different True Classes
for class_label in df['true'].unique():
    sns.kdeplot(df[df['true'] == class_label]['pos'], label=class_label, ax=axs[0])

axs[0].set_title('Probability Density for Different True Classes')
axs[0].set_xlabel('Probability')
axs[0].set_ylabel('Density')
axs[0].legend(title='True Class')

# Plot Probability Density for Different Predicted Classes
for class_label in df['class'].unique():
    sns.kdeplot(df[df['class'] == class_label]['pos'], label=class_label, ax=axs[1])

axs[1].set_title('Probability Density for Different Predicted Classes')
axs[1].set_xlabel('Probability')
axs[1].set_ylabel('Density')
axs[1].legend(title='Predicted Class')

plt.tight_layout()
plt.show()

In [None]:
# Look up each sampled complaint's narrative in the full CFPB export.
# Only the two needed columns are read, and the rename is by name rather than
# by position so it does not depend on the column order in the file.
cfpb_df = pd.read_csv(
    '../../data/CFPB with Duplicate Marked.csv',
    usecols=['Consumer complaint narrative', 'Complaint ID'],
).rename(columns={'Consumer complaint narrative': 'Original Complaint'})
cfpb_df = cfpb_df[['Original Complaint', 'Complaint ID']]

# Drop a previously merged column first so re-running this cell does not
# produce 'Original Complaint_x' / 'Original Complaint_y' duplicates.
failure_analysis = (
    failure_analysis
    .drop(columns=['Original Complaint'], errors='ignore')
    .merge(cfpb_df, on='Complaint ID', how='left')
)
failure_analysis.to_csv("failure example.csv", index=False)

In [None]:
# Ten random rows where the model predicted 1 but the label disagrees (false positives).
failure_analysis.loc[
    failure_analysis['y_pred'].ne(failure_analysis['debt_collection'])
    & failure_analysis['y_pred'].eq(1),
    ['Original Complaint', 'debt_collection', 'y_pred', 'y_pred_proba', 'Product', 'Complaint ID'],
].sample(10)

In [None]:
# Ten random rows where the model predicted 0 but the label disagrees (false negatives).
failure_analysis.loc[
    failure_analysis['y_pred'].ne(failure_analysis['debt_collection'])
    & failure_analysis['y_pred'].eq(0),
    ['Original Complaint', 'debt_collection', 'y_pred', 'y_pred_proba', 'Product', 'Complaint ID'],
].sample(10)

In [None]:
# Character length of the original complaint. Rows without a match in the left
# merge have NaN here, which would make len() raise, so they count as length 0.
failure_analysis['narr_length'] = (
    failure_analysis['Original Complaint'].fillna('').astype(str).str.len()
)

In [None]:
# Narrative-length histogram for false positives (predicted 1, label disagrees).
failure_analysis.loc[
    failure_analysis['y_pred'].ne(failure_analysis['debt_collection'])
    & failure_analysis['y_pred'].eq(1),
    'narr_length',
].plot.hist(bins=100, edgecolor='black')

In [None]:
# Narrative-length histogram for false negatives (predicted 0, label disagrees).
failure_analysis.loc[
    failure_analysis['y_pred'].ne(failure_analysis['debt_collection'])
    & failure_analysis['y_pred'].eq(0),
    'narr_length',
].plot.hist(bins=100, edgecolor='black')

In [None]:
import altair as alt
alt.data_transformers.disable_max_rows()

# Every misclassified row of the failure-analysis sample (both error directions).
filtered_data = failure_analysis.loc[
    failure_analysis['y_pred'].ne(failure_analysis['debt_collection'])
]

# Narrative length against the predicted positive-class probability, coloured
# by that same probability.
scatter_plot = (
    alt.Chart(filtered_data)
    .mark_circle(size=60)
    .encode(
        alt.X('narr_length', scale=alt.Scale(domain=(0, 15000))),
        alt.Y('y_pred_proba', scale=alt.Scale(domain=(0, 1))),
        alt.Color('y_pred_proba', scale=alt.Scale(scheme='viridis')),
        tooltip=['narr_length', 'y_pred_proba'],
    )
    .properties(width=500, height=400)
)

# Display the chart as the cell output.
scatter_plot

In [None]:
# Every correctly classified row of the failure-analysis sample.
filtered_data = failure_analysis.loc[
    failure_analysis['y_pred'].eq(failure_analysis['debt_collection'])
]

# Narrative length against the predicted positive-class probability, coloured
# by that same probability.
scatter_plot = (
    alt.Chart(filtered_data)
    .mark_circle(size=60)
    .encode(
        alt.X('narr_length', scale=alt.Scale(domain=(0, 15000))),
        alt.Y('y_pred_proba', scale=alt.Scale(domain=(0, 1))),
        alt.Color('y_pred_proba', scale=alt.Scale(scheme='viridis')),
        tooltip=['narr_length', 'y_pred_proba'],
    )
    .properties(width=500, height=400)
)

# Display the chart as the cell output.
scatter_plot

In [None]:
# Misclassified rows whose predicted probability lies strictly between 0.5 and 1.01.
filtered_data = failure_analysis.loc[
    failure_analysis['y_pred'].ne(failure_analysis['debt_collection'])
    & failure_analysis['y_pred_proba'].gt(0.5)
    & failure_analysis['y_pred_proba'].lt(1.01)
]

# Same scatter as before, zoomed to shorter narratives and the upper probability half.
scatter_plot = (
    alt.Chart(filtered_data)
    .mark_circle(size=60)
    .encode(
        alt.X('narr_length', scale=alt.Scale(domain=(0, 7000))),
        alt.Y('y_pred_proba', scale=alt.Scale(domain=(0.5, 1))),
        alt.Color('y_pred_proba', scale=alt.Scale(scheme='viridis')),
        tooltip=['narr_length', 'y_pred_proba'],
    )
    .properties(width=500, height=400)
)

# Display the chart as the cell output.
scatter_plot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correct predictions with narratives shorter than 3000 characters.
# .copy() gives an independent frame, so adding the bin columns below does not
# write into a slice of failure_analysis (SettingWithCopyWarning).
filtered_data = failure_analysis[(failure_analysis.y_pred == failure_analysis['debt_collection'])&(failure_analysis['narr_length']<3000)].copy()
# Bin narr_length into 50 and y_pred_proba into 20 equal-width intervals.
filtered_data['narr_length_bin'] = pd.cut(filtered_data['narr_length'], bins=50)
filtered_data['y_pred_proba_bin'] = pd.cut(filtered_data['y_pred_proba'], bins=20)

# Create a pivot table for the heatmap: row count per (length bin, probability bin).
# observed=False pins the long-standing default for categorical groupers, which
# newer pandas versions warn is going to change.
heatmap_data = filtered_data.pivot_table(index='narr_length_bin',
                                         columns='y_pred_proba_bin',
                                         aggfunc='size',
                                         observed=False)

# Create the heatmap
plt.figure(figsize=(10,8))
sns.heatmap(heatmap_data, cmap="viridis")
plt.title('Correct Predictions Probability vs Complaint Narrative Length')
plt.ylabel('Narrative Length')
plt.xlabel('Predicted Probability')
plt.show()


In [None]:
# Misclassified rows (false positives AND false negatives) with narratives
# shorter than 3000 characters.
# .copy() gives an independent frame, so adding the bin columns below does not
# write into a slice of failure_analysis (SettingWithCopyWarning).
filtered_data = failure_analysis[(failure_analysis.y_pred != failure_analysis['debt_collection'])&(failure_analysis['narr_length']<3000)].copy()
# Bin narr_length into 50 and y_pred_proba into 20 equal-width intervals.
filtered_data['narr_length_bin'] = pd.cut(filtered_data['narr_length'], bins=50)
filtered_data['y_pred_proba_bin'] = pd.cut(filtered_data['y_pred_proba'], bins=20)

# Create a pivot table for the heatmap: row count per (length bin, probability bin).
# observed=False pins the long-standing default for categorical groupers, which
# newer pandas versions warn is going to change.
heatmap_data = filtered_data.pivot_table(index='narr_length_bin',
                                         columns='y_pred_proba_bin',
                                         aggfunc='size',
                                         observed=False)

# Create the heatmap
plt.figure(figsize=(10,8))
sns.heatmap(heatmap_data, cmap="viridis")
# The filter keeps every wrong prediction, not only false positives, so the
# title names the data that is actually plotted.
plt.title('Misclassified Predictions Probability vs Complaint Narrative Length')
plt.ylabel('Narrative Length')
plt.xlabel('Predicted Probability')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

# Labels and predictions for the failure-analysis sample, read back from the frame.
y_true = failure_analysis['debt_collection']
y_pred = failure_analysis['y_pred']

# Rows of the matrix are actual classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_true, y_pred=y_pred)

print(cm)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()