## Train SVM Binary Classifiers For Customer Complaints

In [None]:
# Standard libraries
import re
import string
from datetime import datetime

# External libraries: basic utilities and data manipulation
import numpy as np
import pandas as pd

# External libraries: data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# External libraries: natural language processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer

# External libraries: data science and machine learning
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from scipy import spatial
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

# External libraries: other utilities
import pickle

# Setting options and styles
# Inject a CSS rule so the notebook's ".container" element spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Do not truncate long cell contents (e.g. complaint narratives) when pandas renders a DataFrame.
pd.set_option('display.max_colwidth', None)

The following block consists of multiple operations centered around text vectorization and normalization using the GloVe word embeddings. Initially, 50-dimensional and 100-dimensional GloVe embeddings are loaded from respective files into two dictionaries. The script also defines two functions, vectorize_text_50 and vectorize_text_100, which vectorize input text using the 50-dimensional and 100-dimensional embeddings, respectively. For texts that don't have corresponding word embeddings, the functions return zero-vectors of matching dimensions. 

Additionally, a text_normalizer function is introduced that tokenizes a given text, removes specific patterns, converts text to lowercase, and removes tokens that are mere numbers. Lastly, the code loads a CSV file named "CFPB with Duplicate Marked NEW.csv" into a DataFrame, prints its shape, and subsequently drops duplicate rows based on the 'dupi_id' column before printing the updated shape.

In [None]:
# Read the 50-dimensional GloVe vectors into a {token: vector} lookup table
glove_6B_50D_txt_file = "glove_file, something like glove.6B.50d.txt"
embeddings_dict_6B_50D = {}
with open(glove_6B_50D_txt_file, 'r', encoding="utf-8") as f:
    for line in f:
        fields = line.split()
        # The trailing 50 fields are the vector components; everything before them is the
        # token, re-joined with single spaces in case it was split into several fields.
        token_fields, component_fields = fields[:-50], fields[-50:]
        key = ' '.join(token_fields).lower().strip()
        # Tokens are stored lowercased; a later line with the same key overwrites an earlier one.
        embeddings_dict_6B_50D[key] = np.asarray(component_fields, "float32")

# Function to vectorize a given text using 50-dimensional GloVe embeddings
def vectorize_text_50(text):
    """Return the mean 50-dimensional GloVe vector of the known words in ``text``.

    ``text`` is converted with ``str()`` and split on whitespace; words that are
    not keys of ``embeddings_dict_6B_50D`` are ignored. No lowercasing or other
    cleaning is done here, so words are looked up exactly as they appear.

    Args:
        text: The text to embed (any object; ``str(text)`` is used).

    Returns:
        A length-50 NumPy array: the element-wise mean of the matched word
        vectors, or an all-zero vector when no word matches.
    """
    found = []
    for token in str(text).split():
        embedding = embeddings_dict_6B_50D.get(token)
        if embedding is not None:
            found.append(embedding)

    if not found:
        # Nothing matched: fall back to a zero vector of the same length.
        return np.zeros(50)
    return np.mean(found, axis=0)

# Read the 100-dimensional GloVe vectors into a {token: vector} lookup table
glove_6B_100D_txt_file = "glove_file, something like glove.6B.100d.txt"
embeddings_dict_6B_100D = {}
with open(glove_6B_100D_txt_file, 'r', encoding="utf-8") as f:
    for line in f:
        fields = line.split()
        # The trailing 100 fields are the vector components; everything before them is the
        # token, re-joined with single spaces in case it was split into several fields.
        token_fields, component_fields = fields[:-100], fields[-100:]
        key = ' '.join(token_fields).lower().strip()
        # Tokens are stored lowercased; a later line with the same key overwrites an earlier one.
        embeddings_dict_6B_100D[key] = np.asarray(component_fields, "float32")

# Function to vectorize a given text using 100-dimensional GloVe embeddings
def vectorize_text_100(text):
    """Return the mean 100-dimensional GloVe vector of the known words in ``text``.

    ``text`` is converted with ``str()`` and split on whitespace; words that are
    not keys of ``embeddings_dict_6B_100D`` are ignored. No lowercasing or other
    cleaning is done here, so words are looked up exactly as they appear.

    Args:
        text: The text to embed (any object; ``str(text)`` is used).

    Returns:
        A length-100 NumPy array: the element-wise mean of the matched word
        vectors, or an all-zero vector when no word matches.
    """
    found = []
    for token in str(text).split():
        embedding = embeddings_dict_6B_100D.get(token)
        if embedding is not None:
            found.append(embedding)

    if not found:
        # Nothing matched: fall back to a zero vector of the same length.
        return np.zeros(100)
    return np.mean(found, axis=0)

# Function to normalize the text by tokenizing, cleaning up specific patterns and converting to lowercase
def text_normalizer(text):
    """Normalize ``text`` into a space-separated string of cleaned tokens.

    Steps, applied to each token produced by the NLTK ``RegexpTokenizer``:
      1. delete every run of three or more identical word characters
         (e.g. '666' or 'aaa' inside a token; only the run is removed);
      2. lowercase the token and strip surrounding whitespace;
      3. discard tokens that are empty or consist only of digits.

    Args:
        text: The text to normalize. Falsy values (``None``, ``""``, ...) are
            returned unchanged.

    Returns:
        The surviving tokens joined by single spaces, or ``text`` itself when
        it is falsy.
    """
    if not text:
        return text

    # Tokens are either single word characters or word-character spans that may
    # contain apostrophes/hyphens internally; standalone punctuation never matches.
    tokens = RegexpTokenizer(r'\b\w[\w\'-]*\w\b|\w').tokenize(text)

    kept = []
    for token in tokens:
        # re.sub leaves the token untouched when there is no repeated run.
        token = re.sub(r'(\w)\1{2,}', '', token)
        token = token.lower().strip()
        if token and not token.isdigit():
            kept.append(token)

    return ' '.join(kept)

# Read the complaint CSV and report its (rows, columns) as a sanity check
complaint_file = "you complaint file here, should called something like complaints.csv, it hsoul dhave been marked with duplicative complaints"
cfpb_df = pd.read_csv(complaint_file)
print(cfpb_df.shape)
# Keep only the first row for every 'dupi_id' value, then report the size again
cfpb_df = cfpb_df.drop_duplicates(subset=['dupi_id'], keep='first')
print(cfpb_df.shape)

The following block aims to address data imbalance by reclassifying and consolidating product and issue categories in a DataFrame named `cfpb_df`. Two dictionaries, `product_mapping` and `issue_dict`, are defined to map the existing product and issue labels to a more condensed set of categories. These mappings are then applied to the 'Product' and 'Issue' columns of the DataFrame, resulting in two new columns: 'combined_product' and 'combined_issue'. To address missing mappings for 'Issue', any unmapped values are retained as they are. Following the reclassification, the code explores the data distribution by visualizing the distribution of narrative lengths (less than 1000 characters) in a histogram. Furthermore, it calculates and displays the normalized distribution of longer narratives (more than 500 characters) by their reclassified product categories, highlighting imbalances in the consolidated categories.

In [None]:
# Reduce class imbalance by folding the raw 'Product' labels into fewer, broader groups.
# Each consolidated group lists the raw labels that belong to it; product_mapping is the
# flattened {raw label: consolidated group} lookup derived from that table.
_product_groups = {
    "Credit reporting and repair services": [
        "Credit reporting, credit repair services, or other personal consumer reports",
        "Credit reporting",
    ],
    "Credit and prepaid cards": [
        "Credit card or prepaid card",
        "Credit card",
        "Prepaid card",
    ],
    "Banking services": [
        "Checking or savings account",
        "Bank account or service",
    ],
    "Money transfer and virtual currency services": [
        "Money transfer, virtual currency, or money service",
        "Money transfers",
        "Virtual currency",
    ],
    "Loans and leases": [
        "Vehicle loan or lease",
        "Consumer Loan",
    ],
    "Short-term and personal loans": [
        "Payday loan, title loan, or personal loan",
        "Payday loan",
    ],
    # Labels that are kept under their own name.
    "Other financial service": ["Other financial service"],
    "Mortgage": ["Mortgage"],
    "Student loan": ["Student loan"],
    "Debt collection": ["Debt collection"],
}

product_mapping = {
    raw_label: group
    for group, raw_labels in _product_groups.items()
    for raw_label in raw_labels
}

# Consolidated issue groups: each group lists the raw 'Issue' labels folded into it.
# issue_dict (built below) is the flattened {raw label: consolidated group} lookup.
#
# NOTE(review): the original flat literal listed "Balance transfer fee" twice, first under
# "Fee and interest issues" and later under "Unexpected fees issues". In a dict literal the
# later entry silently wins, so the effective mapping was "Unexpected fees issues"; that
# effective value is kept here and the dead entry removed. Please confirm this is the
# intended group (note that "Unexpected or other fees" and "Unexpected/Other fees" also
# land in different groups).
_issue_groups = {
    "Credit report issues": [
        "Improper use of your report",
        "Incorrect information on your report",
        "Problem with a credit reporting company's investigation into an existing problem",
        "Credit reporting company's investigation",
        "Improper use of my credit report",
        "Incorrect information on credit report",
        "Unable to get credit report/credit score",
    ],
    "Fraud and Identity theft issues": [
        "Fraud or scam",
        "Identity theft / Fraud / Embezzlement",
        "Received a loan you didn't apply for",
    ],
    "Account management issues": [
        "Managing an account",
        "Opening an account",
        "Closing your account",
        "Closing an account",
        "Account opening, closing, or management",
        "Managing, opening, or closing your mobile wallet account",
        "Managing, opening, or closing account",
    ],
    "Unjustified debt collection attempts": [
        "Attempts to collect debt not owed",
        "Cont'd attempts collect debt not owed",
    ],
    "Transaction issues": [
        "Problem with a purchase shown on your statement",
        "Problem with a purchase or transfer",
        "Other transaction problem",
        "Unauthorized transactions or other transaction problem",
        "Other transaction issues",
        "Transaction issue",
        "Unauthorized transactions/trans. issues",
    ],
    "Payment struggles": [
        "Struggling to pay your bill",
        "Struggling to pay your loan",
        "Struggling to repay your loan",
        "Can't repay my loan",
        "Problems when you are unable to pay",
    ],
    "Fraud alerts and identity protection issues": [
        "Problem with fraud alerts or security freezes",
        "Identity theft protection or other monitoring services",
        "Credit monitoring or identity theft protection services",
        "Credit monitoring or identity protection",
    ],
    "Improper actions or threats": [
        "Took or threatened to take negative or legal action",
        "Threatened to contact someone or share information improperly",
        "Taking/threatening an illegal action",
        "Improper contact or sharing of info",
    ],
    "Fee and interest issues": [
        "Fees or interest",
        "Charged fees or interest you didn't expect",
        "Unexpected or other fees",
        "Fees",
        "Cash advance fee",
        "Overlimit fee",
    ],
    "Credit and loan acquisition issues": [
        "Getting a credit card",
        "Trouble using your card",
        "Trouble using the card",
        "Getting a line of credit",
        "Shopping for a loan or lease",
        "Shopping for a line of credit",
    ],
    "Customer service issues": [
        "Problem with customer service",
        "Customer service / Customer relations",
        "Customer service/Customer relations",
    ],
    "Communication issues": [
        "Can't contact lender or servicer",
        "Can't contact lender",
        "Communication tactics",
    ],
    "Loan issues": [
        "Problem with the payoff process at the end of the loan",
        "Problems at the end of the loan or lease",
        "Managing the loan or lease",
        "Loan payment wasn't credited to your account",
        "Loan servicing, payments, escrow account",
        "Problem when making payments",
        "Making/receiving payments, sending money",
        "Managing the line of credit",
        "Loan modification,collection,foreclosure",
    ],
    "Advertising and marketing issues": [
        "Advertising and marketing, including promotional offers",
        "Advertising",
        "Advertising and marketing",
        "Confusing or misleading advertising or marketing",
        "Advertising, marketing or disclosures",
    ],
    "Lender issues": [
        "Problem with a lender or other company charging your account",
        "Dealing with your lender or servicer",
        "Can't stop withdrawals from your bank account",
        "Money was taken from your bank account on the wrong day or for the wrong amount",
        "Was approved for a loan, but didn't receive the money",
        "Applied for loan/did not receive money",
        "Was approved for a loan, but didn't receive money",
    ],
    "Product and service issues": [
        "Problem with additional add-on products or services",
        "Other service problem",
        "Other service issues",
    ],
    "Transaction and payment issues": [
        "Problem with cash advance",
        "Charged bank acct wrong day or amt",
        "Payment to acct not credited",
        "Payoff process",
        "Cash advance",
    ],
    "False statements and representation issues": [
        "False statements or representation",
        "Confusing or missing disclosures",
        "Disclosure verification of debt",
        "Incorrect/missing disclosures or info",
        "Disclosures",
        "Incorrect exchange rate",
    ],
    "Mortgage issues": [
        "Applying for a mortgage or refinancing an existing mortgage",
        "Struggling to pay mortgage",
        "Closing on a mortgage",
        "Application, originator, mortgage broker",
    ],
    "Credit issues": [
        "Credit limit changed",
        "Credit decision / Underwriting",
        "Credit card protection / Debt protection",
        "Rewards",
        "Credit determination",
        "Credit line increase/decrease",
        "APR or interest rate",
    ],
    "Vehicle issues": [
        "Vehicle was repossessed or sold the vehicle",
        "Vehicle was damaged or destroyed the vehicle",
        "Lender repossessed or sold the vehicle",
        "Lender damaged or destroyed vehicle",
        "Property was sold",
        "Property was damaged or destroyed property",
        "Lender sold the property",
        "Lender damaged or destroyed property",
    ],
    "Bankruptcy issues": ["Bankruptcy"],
    "Balance transfer issues": ["Balance transfer"],
    "Debit and ATM card issues": ["Using a debit or ATM card"],
    "Privacy issues": ["Privacy"],
    "Workout plan issues": ["Forbearance / Workout plans"],
    "Account sale issues": ["Sale of account"],
    "Money adding issues": [
        "Adding money",
        "Problem adding money",
    ],
    "Account delinquency issues": ["Delinquent account"],
    "Application processing issues": ["Application processing delay"],
    "Arbitration issues": ["Arbitration"],
    "Check issues": [
        "Convenience checks",
        "Lost or stolen check",
    ],
    "Overdraft, savings, or rewards issues": [
        "Overdraft, savings, or rewards features",
        "Overdraft, savings or rewards features",
    ],
    "Unexpected fees issues": [
        "Unexpected/Other fees",
        "Balance transfer fee",
        "Excessive fees",
    ],
    "Other issues": ["Other"],
}

# Flatten the table into the {raw label: group} lookup, refusing duplicate raw labels so a
# label can never again be silently reassigned by a later entry.
issue_dict = {}
for _group, _raw_labels in _issue_groups.items():
    for _raw_label in _raw_labels:
        if _raw_label in issue_dict:
            raise ValueError(
                f"Issue label {_raw_label!r} is assigned to both "
                f"{issue_dict[_raw_label]!r} and {_group!r}"
            )
        issue_dict[_raw_label] = _group


# Derive the consolidated label columns from 'Product' and 'Issue'.
# Products without an entry in product_mapping become NaN; issues without an entry in
# issue_dict keep their original label.
cfpb_df['combined_product'] = cfpb_df['Product'].map(product_mapping)
mapped_issue = cfpb_df['Issue'].map(issue_dict)
cfpb_df['combined_issue'] = mapped_issue.fillna(cfpb_df['Issue'])

# Histogram (100 bins) of narrative length by character, for narratives shorter than 1000
short_narr_len = cfpb_df.loc[cfpb_df['narr_len'] < 1000, 'narr_len']
short_narr_len.plot(kind='hist', bins=100)

# Share of each combined product among the longer narratives (narr_len > 500);
# even with combined products the imbalance is critical
long_narr_products = cfpb_df.loc[cfpb_df['narr_len'] > 500, 'combined_product']
long_narr_products.value_counts(normalize=True)

The code defines three primary functions aimed at supporting SVM-based text classification tasks:

1. `plot_proba_distribution`: This function visualizes the distribution of predicted probabilities across multiple classes.
2. `print_grid_search_svm_results`: For each unique value in a specified column of the dataframe, this function constructs a binary classification problem, balances the classes, then performs a grid search over specified hyperparameters of an SVM to identify optimal parameters. It iteratively trains and evaluates SVM models on resampled data of varying sizes, printing out performance metrics and best parameters for each iteration. It only prints its results and does not return anything.
3. `create_svm_dict`: This function constructs a dictionary of trained SVM models for each unique value in a given dataframe column. It skips training for classes with significant imbalances. For each category, the narratives are vectorized, the dataset is split and balanced, and an SVM model with specified hyperparameters is trained. The trained models are stored in the dictionary, keyed by the unique values of the column.

In essence, the code facilitates the exploration, tuning, and application of SVM classifiers for binary text classification tasks on a per-category basis in a dataframe.

In [None]:
# Function to plot the distribution of predicted probabilities
def plot_proba_distribution(y_pred_proba, classes):
    """Draw one kernel-density curve of predicted probability per class.

    Args:
        y_pred_proba: Predicted probabilities, indexed as ``y_pred_proba[:, i]``,
            i.e. one column per entry of ``classes`` in the same order.
        classes: Iterable of class labels; each one becomes a legend entry.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    for column, label in enumerate(classes):
        class_probabilities = y_pred_proba[:, column]
        sns.kdeplot(class_probabilities, label=label)

    plt.title('Probability distributions of classes')
    plt.xlabel('Probability')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

# Function to perform a grid search on SVM for a given column in the dataframe
def print_grid_search_svm_results(df, column_name, data_sizes=(5000, 10000, 15000, 20000), param_grid=None):
    """Grid-search RBF-SVM hyper-parameters per class of ``column_name`` and print the results.

    For every non-null distinct value of ``df[column_name]`` a one-vs-rest binary
    target is built, 10% of the rows are held out as a test set, the training
    part is balanced with ``RandomUnderSampler`` and, for each requested data
    size, a random subset of the balanced rows is used to run a 5-fold
    ``GridSearchCV`` scored on F1. Test-set metrics, the best parameters and
    every grid point's mean CV score are printed.

    Side effects on ``df`` (pass a copy if that matters): a ``'glove_50_features'``
    column and one ``'is_<value>'`` column per class are added.

    Args:
        df: DataFrame with a ``'clean_narr'`` text column and ``column_name``.
        column_name: Name of the label column to build the binary problems from.
        data_sizes: Training subset sizes to evaluate. Sizes larger than the
            number of balanced training rows are skipped with a message.
        param_grid: Grid passed to ``GridSearchCV``; defaults to the RBF grid
            over ``C`` and ``gamma`` defined below.

    Returns:
        None. All results are printed.

    Raises:
        KeyError: If ``'clean_narr'`` or ``column_name`` is missing from ``df``.
    """
    if param_grid is None:
        param_grid = {'C': [50, 100, 250], 'gamma': [0.05, 0.15, 0.25, 0.5, 0.75, 0.99], 'kernel': ['rbf']}

    # The features do not depend on the class being modelled, so vectorize once instead of
    # once per class. NOTE: the column keeps its historical name 'glove_50_features' even
    # though it holds the vectors produced by vectorize_text_100.
    df['glove_50_features'] = df['clean_narr'].apply(vectorize_text_100)
    X = np.array(list(df['glove_50_features']))

    # Null labels cannot form a class of their own (x == NaN is never true), so skip them.
    for unique_value in df[column_name].dropna().unique():
        try:
            print(f"now training binary classification model for {unique_value}")
            # create binary target column
            target_col = f"is_{unique_value}"
            df[target_col] = (df[column_name] == unique_value).astype(int)
            print(df[target_col].value_counts())
            y = df[target_col]

            # Split the data into training and test set
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

            # Balance the dataset using RandomUnderSampling
            rus = RandomUnderSampler(random_state=42)
            X_balanced, y_balanced = rus.fit_resample(X_train, y_train)
            print(pd.Series(y_balanced).value_counts())

            # Plain arrays so that the sampled indices are applied positionally; indexing a
            # pandas Series with [] is label-based and depends on the index it carries.
            X_balanced = np.asarray(X_balanced)
            y_balanced = np.asarray(y_balanced)
            n_available = len(X_balanced)

            # Perform grid search for multiple data sizes
            for data_size in data_sizes:
                if data_size > n_available:
                    # np.random.choice(..., replace=False) cannot draw more rows than exist.
                    print(f"Skipping data size {data_size}: only {n_available} balanced training rows available.")
                    continue

                # Randomly sample data (unseeded, so the subset differs between runs)
                sample_idx = np.random.choice(n_available, data_size, replace=False)
                X_sampled = X_balanced[sample_idx]
                y_sampled = y_balanced[sample_idx]

                # initialize SVM and grid search; classes_ is learned by fit, so it is not preset
                svm = SVC(class_weight='balanced')
                grid = GridSearchCV(svm, param_grid, cv=5, scoring='f1', verbose=4, n_jobs=-1)
                grid.fit(X_sampled, y_sampled)

                # Evaluate model on the held-out (unbalanced) test set
                y_pred = grid.predict(X_test)
                print(f1_score(y_test, y_pred, average='weighted'))
                print(classification_report(y_test, y_pred))
                print("The best parameters:", grid.best_params_)
                print("Data size:", data_size)

                # Print grid search results
                results = grid.cv_results_
                for mean_test_score, params in zip(results["mean_test_score"], results["params"]):
                    print(params, "has a score of", mean_test_score)
                print("-----------------------------------------------------------------------")
            print("**************************************************************************")
        except Exception as exc:
            # Keep going with the remaining classes, but report the failure instead of hiding
            # it. KeyboardInterrupt/SystemExit are not caught, so a long search can be aborted.
            print(f"Grid search for {unique_value} failed: {exc!r}")

# Function to create a dictionary of SVM models for each unique value in a given column, the best parameters will be used in this function
def create_svm_dict(df, column_name, sample_size=20000, min_class_ratio=0.01):
    """Train one binary (one-vs-rest) RBF SVM per class of ``column_name``.

    For every non-null distinct value of ``df[column_name]`` a binary target is
    built, 10% of the rows are held out as a test set, the training part is
    balanced with ``RandomUnderSampler`` and ``sample_size`` balanced rows are
    drawn at random to fit an ``SVC`` (``C=50``, ``gamma=0.1``, RBF kernel,
    ``probability=True``). Test-set metrics are printed for each model.

    A class is skipped, with a printed message, when
      * it has no negative or no positive rows, or the minority/majority ratio
        is below ``min_class_ratio``; or
      * fewer than ``sample_size`` balanced training rows are available.
    If training a class raises an exception, the error is printed and the
    remaining classes are still processed.

    Side effects on ``df`` (pass a copy if that matters): a ``'glove_50_features'``
    column and one ``'is_<value>'`` column per class are added.

    Args:
        df: DataFrame with a ``'clean_narr'`` text column and ``column_name``.
        column_name: Name of the label column to build the binary problems from.
        sample_size: Number of balanced training rows used to fit each model.
        min_class_ratio: Minimum minority/majority ratio required to train.

    Returns:
        dict mapping each trained class value to its fitted ``SVC``.

    Raises:
        KeyError: If ``'clean_narr'`` or ``column_name`` is missing from ``df``.
    """
    ml_dict = {}

    # The features do not depend on the class being modelled, so vectorize once instead of
    # once per class. NOTE: the column keeps its historical name 'glove_50_features' even
    # though it holds the vectors produced by vectorize_text_100.
    df['glove_50_features'] = df['clean_narr'].apply(vectorize_text_100)
    X = np.array(list(df['glove_50_features']))

    # Null labels cannot form a class of their own (x == NaN is never true), so skip them.
    for unique_value in df[column_name].dropna().unique():
        try:
            print(f"now training binary classification model for {unique_value}")
            # create binary target column
            target_col = f"is_{unique_value}"
            df[target_col] = (df[column_name] == unique_value).astype(int)
            y = df[target_col]

            # Skip training for highly imbalanced (or single-class) targets
            value_counts = y.value_counts()
            if len(value_counts) < 2 or value_counts.min() / value_counts.max() < min_class_ratio:
                print(f"Skipping training binary classification model for {unique_value} due to class imbalance.")
                continue

            # Split the data into training and test set
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

            # Balance the dataset using RandomUnderSampling
            rus = RandomUnderSampler(random_state=42)
            X_balanced, y_balanced = rus.fit_resample(X_train, y_train)
            print(pd.Series(y_balanced).value_counts())

            # Plain arrays so that the sampled indices are applied positionally; indexing a
            # pandas Series with [] is label-based and depends on the index it carries.
            X_balanced = np.asarray(X_balanced)
            y_balanced = np.asarray(y_balanced)
            n_available = len(X_balanced)

            # np.random.choice(..., replace=False) cannot draw more rows than exist. The
            # original hit that error and silently skipped the class; say so explicitly.
            if n_available < sample_size:
                print(
                    f"Skipping training binary classification model for {unique_value}: "
                    f"only {n_available} balanced training rows available, {sample_size} required."
                )
                continue

            # Randomly sample data (unseeded, so the subset differs between runs)
            sample_idx = np.random.choice(n_available, sample_size, replace=False)
            X_sampled = X_balanced[sample_idx]
            y_sampled = y_balanced[sample_idx]

            # Fit the SVM with the chosen hyper-parameters; classes_ is learned by fit, so it
            # is not preset. probability=True makes predict_proba available on the model.
            svm = SVC(class_weight='balanced', C=50, gamma=0.1, kernel='rbf', probability=True)
            svm.fit(X_sampled, y_sampled)

            # Evaluate model on the held-out (unbalanced) test set
            y_pred = svm.predict(X_test)
            print(f1_score(y_test, y_pred, average='weighted'))
            print(classification_report(y_test, y_pred))

            # Add trained SVM to the dictionary
            ml_dict[unique_value] = svm
            print("-----------------------------------------------------------------------")

        except Exception as exc:
            # Keep going with the remaining classes, but report the failure instead of hiding
            # it. KeyboardInterrupt/SystemExit are not caught, so a long run can be aborted.
            print(f"Training binary classification model for {unique_value} failed: {exc!r}")
    print("**************************************************************************")
    return ml_dict

To observe and pick the best parameters for model training, run the following code. This will take some time and print a lot of information.

In [None]:
# Run the hyper-parameter search for every label column, first on all narratives and then
# on the longer narratives only (narr_len > 500).
# print_grid_search_svm_results only prints its findings and returns None, so its result is
# deliberately not assigned; binding None to names such as product_svms (which the training
# cell uses for the real model dictionaries) was misleading.
long_narr_df = cfpb_df[cfpb_df['narr_len'] > 500]
for label_column in ("combined_product", "Issue", "combined_issue"):
    # Each call gets its own copy because the function adds columns to the frame it receives.
    print_grid_search_svm_results(cfpb_df.copy(), label_column)
    print_grid_search_svm_results(long_narr_df.copy(), label_column)

The following code is dedicated to training multiple SVM classifiers on different categories from the cfpb_df dataframe, especially emphasizing handling class imbalances. It covers three label columns: 'combined_product', 'Issue', and 'combined_issue', each trained on all narratives and on the longer narratives only, where narratives exceed 500 characters in length. The objective is to construct classifiers for the combined product categories and for both the original and combined versions of the issue categories.

Despite the non-significant change in product distribution after combining (due to imbalance challenges), models for 'combined_product' are still trained. The models for combined issues are given preference due to their marginally higher accuracy and significant recall. Given the severe imbalance, where ratios can reach 100:1, the strategy leans towards accepting some false positives as it's deemed better than missing out on detecting any minority cases. 

Once these models are trained, considering the challenges posed by imbalances, they are serialized and saved as .pkl files for potential future usage.

In [None]:
# Train the per-class SVMs for every label column and keep them in dictionaries.
# Every create_svm_dict call receives its own copy of the data: once all narratives, once
# only the longer narratives (narr_len > 500).

# Combining the products did not significantly change the product distribution, so the
# product models are trained on the combined products.
product_svms = create_svm_dict(cfpb_df.copy(), "combined_product")
product_svms_long = create_svm_dict(cfpb_df.loc[cfpb_df['narr_len'] > 500].copy(), "combined_product")


# We observed that the combined issues give a slightly higher accuracy while keeping high recall.
# With an imbalance ratio of around 100:1, we consider some false positives preferable to
# missing any minority cases.
issue_svms = create_svm_dict(cfpb_df.copy(), "Issue")
issue_svms_long = create_svm_dict(cfpb_df.loc[cfpb_df['narr_len'] > 500].copy(), "Issue")
combined_issue_svms = create_svm_dict(cfpb_df.copy(), "combined_issue")
combined_issue_svms_long = create_svm_dict(cfpb_df.loc[cfpb_df['narr_len'] > 500].copy(), "combined_issue")


# Persist every model dictionary to its own pickle file (written in the order listed)
models_by_filename = {
    "_product_svm_models_dict.pkl": product_svms,
    "_product_svm_long_models_dict.pkl": product_svms_long,
    "_combined_issue_svm_models_dict.pkl": combined_issue_svms,
    "_combined_issue_svm_long_models_dict.pkl": combined_issue_svms_long,
    "_issue_svm_models_dict.pkl": issue_svms,
    "_issue_svm_long_models_dict.pkl": issue_svms_long,
}
for filename, models in models_by_filename.items():
    with open(filename, 'wb') as file:
        pickle.dump(models, file)