In [53]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('CFPB with preprocessing')
print(df.columns)
print(df.shape)

Index(['Unnamed: 0', 'Product', 'Issue', 'State', 'ZIP code', 'Complaint ID',
       'Consumer complaint narrative', 'unigram_narr'],
      dtype='object')
(5000, 8)


In [16]:
# create target
df['y_target'] = (df['Product'] == 'Debt collection').astype(int)

# vectorization
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X = vectorizer.fit_transform(df['unigram_narr'].values.astype('U'))
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# calculate Pearson correlations
correlations = X_df.apply(lambda x: pearsonr(x, df['y_target'])[0])

In [47]:
# select features with correlations above the threshold
selected_features = correlations[correlations.abs() > 0.11].index

In [48]:
# note the imbalance
class_counts = df['y_target'].value_counts()
print(class_counts)

0    4064
1     936
Name: y_target, dtype: int64


In [49]:
print("Selected features:", selected_features)

Selected features: Index(['alleg', 'alleg debt', 'bank', 'bill', 'call', 'call from', 'call me',
       'ceas', 'collect', 'collect agenc', 'collect debt', 'collect on',
       'collector', 'compani', 'contact me', 'contract', 'debt', 'debt and',
       'debt collect', 'debt collector', 'debt from', 'debt in', 'debt is',
       'debt owe', 'debt that', 'debt they', 'debt to', 'debt wa', 'fair debt',
       'fdcpa', 'for debt', 'garnish', 'harass', 'harass me', 'hospit', 'inc',
       'llc', 'loan', 'medic', 'medic bill', 'midland', 'mortgag', 'my',
       'not owe', 'of debt', 'on debt', 'owe', 'owe them', 'phone', 'recoveri',
       'stop', 'that owe', 'the bill', 'the collect', 'the debt', 'the origin',
       'them', 'they call', 'thi', 'thi collect', 'thi compani', 'thi debt',
       'threaten', 'to collect', 'to stop', 'valid'],
      dtype='object')


In [55]:
from imblearn.over_sampling import SMOTE
X_selected = X_df[selected_features]
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_selected, df['y_target'])

# splitting and training
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
clf = SVC()
clf.fit(X_train, y_train)

# predict and report
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Classifier accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Classifier accuracy: 0.8683886838868389
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       812
           1       0.89      0.85      0.87       814

    accuracy                           0.87      1626
   macro avg       0.87      0.87      0.87      1626
weighted avg       0.87      0.87      0.87      1626



In [51]:
# if we don't use smote
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, df['y_target'], test_size=0.2, random_state=42)

clf = SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Classifier accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Classifier accuracy: 0.89
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       794
           1       0.79      0.64      0.71       206

    accuracy                           0.89      1000
   macro avg       0.85      0.80      0.82      1000
weighted avg       0.89      0.89      0.89      1000



In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

In [54]:
# including hyper tunning
X_selected = X_df[selected_features]
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_selected, df['y_target'])

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf']
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

best_clf = SVC(**grid_search.best_params_)
best_clf.fit(X_train, y_train)

y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Classifier accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'C': 100, 'gamma': 1, 'kernel': 'rbf'}
Best score: 0.847586471944658
Classifier accuracy: 0.8542435424354243
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       812
           1       0.89      0.81      0.85       814

    accuracy                           0.85      1626
   macro avg       0.86      0.85      0.85      1626
weighted avg       0.86      0.85      0.85      1626



In [56]:
# wider range hyper tunning
X_selected = X_df[selected_features]
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_selected, df['y_target'])

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],  # Regularization parameter
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],  # Kernel coefficient
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],  # Kernel type
    'degree': [2, 3, 4, 5],  # Degree for 'poly' kernel
    'class_weight': [None, 'balanced']  # Class weight
}


grid_search = GridSearchCV(SVC(), param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

best_clf = SVC(**grid_search.best_params_)
best_clf.fit(X_train, y_train)

y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Classifier accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 960 candidates, totalling 4800 fits
Best parameters: {'C': 1000, 'class_weight': None, 'degree': 2, 'gamma': 1, 'kernel': 'rbf'}
Best score: 0.8600429255602199
Classifier accuracy: 0.8690036900369004
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       812
           1       0.90      0.83      0.86       814

    accuracy                           0.87      1626
   macro avg       0.87      0.87      0.87      1626
weighted avg       0.87      0.87      0.87      1626

