<a href="https://colab.research.google.com/github/engige/nlp_consumer_complaints_classification/blob/main/joseph_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import zipfile
import pandas as pd

import os

# Location of the zipped dataset on Google Drive and the CSV member inside it
zip_path = '/content/drive/MyDrive/data_comp.zip'
csv_filename = 'consumer_complaints.csv'

# The archive lives on Google Drive. On a fresh kernel this cell runs before
# any mount has happened, so mount Drive here when the file is not visible.
# google.colab is imported lazily, so nothing Colab-specific is needed when
# the archive is already present at zip_path.
if not os.path.exists(zip_path):
    from google.colab import drive
    drive.mount('/content/drive')

# Read the CSV straight out of the archive without extracting it to disk
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    with zip_ref.open(csv_filename) as file:
        df = pd.read_csv(file)

# Display the first few rows of the dataframe
df.head()

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at the mount point below
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Print a structural summary of the dataframe: index range, column names,
# non-null counts, dtypes and memory usage
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162421 entries, 0 to 162420
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  162421 non-null  int64 
 1   product     162421 non-null  object
 2   narrative   162411 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.7+ MB


In [None]:
# Drop the leftover 'Unnamed: 0' column.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a
# second run the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Drop rows whose 'narrative' value is missing
df = df.dropna(subset=['narrative'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162411 entries, 0 to 162420
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   product    162411 non-null  object
 1   narrative  162411 non-null  object
dtypes: object(2)
memory usage: 3.7+ MB


In [None]:
# Flag every row that repeats an earlier (product, narrative) pair, then count the flags
dedup_columns = ['product', 'narrative']
is_repeat = df.duplicated(subset=dedup_columns)
duplicates = is_repeat.sum()

duplicates

37735

In [None]:
# Keep only the first occurrence of each (product, narrative) pair
df = df.drop_duplicates(subset=['product', 'narrative'], keep='first')

# Report (rows, columns) after de-duplication
df.shape

(124676, 2)

In [None]:
import matplotlib.pyplot as plt

# Character length of each complaint narrative.
# `assign` returns a new DataFrame instead of writing a column into the
# de-duplicated frame, which is what made pandas emit SettingWithCopyWarning.
df = df.assign(text_length=df['narrative'].str.len())

# Summary statistics for text length
text_length_summary = df['text_length'].describe()

text_length_summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_length'] = df['narrative'].apply(len)


Unnamed: 0,text_length
count,124676.0
mean,633.359484
std,832.511639
min,3.0
25%,219.0
50%,414.0
75%,752.0
max,20596.0


In [None]:
# Narratives with fewer characters than this are removed
MIN_TEXT_LENGTH = 10

# Keep rows at or above the threshold. `.copy()` makes `df` an independent
# DataFrame, so adding columns to it later does not raise SettingWithCopyWarning.
df = df[df['text_length'] >= MIN_TEXT_LENGTH].copy()

# New shape of the dataset and refreshed summary statistics for text length
new_shape = df.shape
text_length_summary_updated = df['text_length'].describe()

new_shape, text_length_summary_updated

((124633, 3),
 count    124633.00000
 mean        633.57557
 std         832.57394
 min          10.00000
 25%         219.00000
 50%         414.00000
 75%         752.00000
 max       20596.00000
 Name: text_length, dtype: float64)

In [None]:
# Number of complaints per 'product' category, most frequent first
product_column = df['product']
product_distribution = product_column.value_counts()

product_distribution

Unnamed: 0_level_0,count
product,Unnamed: 1_level_1
credit_reporting,56283
debt_collection,21106
mortgages_and_loans,18758
credit_card,15023
retail_banking,13463


## Data Preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Make sure the NLTK resources used below are available locally
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of
# 'punkt'; fetch both so word_tokenize works on old and new versions alike.
nltk.download('punkt_tab')
nltk.download('wordnet')

# English stop-word set and WordNet lemmatizer shared by preprocess_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Defining the preprocessing function
def preprocess_text(text):
    """Normalise one complaint narrative into a space-separated lemma string.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is deleted, the remainder is tokenised with ``word_tokenize``,
    tokens found in the module-level ``stop_words`` set are dropped, and the
    surviving tokens are lemmatised with the module-level ``lemmatizer``.

    Args:
        text: Narrative string to clean.

    Returns:
        The lemmatised tokens joined by single spaces, with no leading or
        trailing whitespace.
    """
    # Characters are deleted, not replaced: "don't" becomes "dont", not "don t"
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    lemmas = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    # split() + join collapses any whitespace runs and trims both ends
    return ' '.join(' '.join(lemmas).split())

# Clean every narrative and store the result in a new column
df['cleaned_narrative'] = df['narrative'].map(preprocess_text)

# Side-by-side sample of the original and cleaned text
df[['narrative', 'cleaned_narrative']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,narrative,cleaned_narrative
0,purchase order day shipping amount receive pro...,purchase order day shipping amount receive pro...
1,forwarded message date tue subject please inve...,forwarded message date tue subject please inve...
2,forwarded message cc sent friday pdt subject f...,forwarded message cc sent friday pdt subject f...
3,payment history missing credit report speciali...,payment history missing credit report speciali...
4,payment history missing credit report made mis...,payment history missing credit report made mis...


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# `df` must already contain the preprocessed 'cleaned_narrative' column
# Step 1: number of whitespace-separated tokens in each cleaned complaint
df['word_count'] = df['cleaned_narrative'].str.split().str.len()

# Step 2: summary statistics of the per-complaint word counts
word_count_summary = df['word_count'].describe()
print("Word Count Summary:\n", word_count_summary)

Word Count Summary:
 count    124633.000000
mean         87.124116
std         110.856002
min           1.000000
25%          30.000000
50%          58.000000
75%         104.000000
max        2684.000000
Name: word_count, dtype: float64


In [None]:
import pandas as pd

# Character count and word count of each cleaned narrative.
# Both columns are always (re)computed from 'cleaned_narrative', so a
# 'text_length' column left over from an earlier step is replaced rather than
# kept. (The former "if column missing" guards were immediately overwritten by
# the same computation and only did the work twice.)
df['text_length'] = df['cleaned_narrative'].str.len()
df['number_of_words'] = df['cleaned_narrative'].str.split().str.len()

# Keep only the columns needed for modelling
cleaned_df = df[['product', 'cleaned_narrative', 'text_length', 'number_of_words']]

# View the first few rows
cleaned_df.head()

Unnamed: 0,product,cleaned_narrative,text_length,number_of_words
0,credit_card,purchase order day shipping amount receive pro...,1705,230
1,credit_card,forwarded message date tue subject please inve...,904,132
2,retail_banking,forwarded message cc sent friday pdt subject f...,1230,173
3,credit_reporting,payment history missing credit report speciali...,903,131
4,credit_reporting,payment history missing credit report made mis...,851,123


In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on an independent copy so that adding a column does not touch `df`
cleaned_df = cleaned_df.copy()

# Learn the product categories, then store their integer codes as 'target'
label_encoder = LabelEncoder()
label_encoder.fit(cleaned_df['product'])
cleaned_df['target'] = label_encoder.transform(cleaned_df['product'])

# Show the first few rows to verify the encoding
cleaned_df.head()

Unnamed: 0,product,cleaned_narrative,text_length,number_of_words,target
0,credit_card,purchase order day shipping amount receive pro...,1705,230,0
1,credit_card,forwarded message date tue subject please inve...,904,132,0
2,retail_banking,forwarded message cc sent friday pdt subject f...,1230,173,4
3,credit_reporting,payment history missing credit report speciali...,903,131,1
4,credit_reporting,payment history missing credit report made mis...,851,123,1


In [None]:
# Pair every category name known to the encoder with the code it is encoded as
class_names = label_encoder.classes_
class_codes = label_encoder.transform(class_names)
label_mapping = dict(zip(class_names, class_codes))

# Print one "category: code" line per product
print("Mapping of product categories to numeric targets:")
for product, target in label_mapping.items():
    print(f"{product}: {target}")

Mapping of product categories to numeric targets:
credit_card: 0
credit_reporting: 1
debt_collection: 2
mortgages_and_loans: 3
retail_banking: 4


In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned text; labels are the encoded product categories
X, y = cleaned_df['cleaned_narrative'], cleaned_df['target']

# Stratified split holding out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shapes of the resulting splits
for split_label, features, labels in (
    ("Training set shapes (X_train, y_train):", X_train, y_train),
    ("Testing set shapes (X_test, y_test):", X_test, y_test),
):
    print(split_label, features.shape, labels.shape)

Training set shapes (X_train, y_train): (99706,) (99706,)
Testing set shapes (X_test, y_test): (24927,) (24927,)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Step 1: TF-IDF features. The vectorizer is fitted on the training text only
# and then reused, unchanged, to transform the test text.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # vocabulary capped at 5000 terms
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 2: Min-max scaling of the TF-IDF features
scaler = MinMaxScaler()

# The scaler is fitted on the training features only, then applied to both sets.
# .toarray() converts the TF-IDF matrices to dense arrays before scaling.
# NOTE(review): dense arrays of this size are memory-hungry - confirm the
# runtime has enough RAM before re-running.
X_train_scaled = scaler.fit_transform(X_train_tfidf.toarray())
X_test_scaled = scaler.transform(X_test_tfidf.toarray())

# Print the shapes of the scaled feature matrices as a sanity check
print("Scaled Training set shape:", X_train_scaled.shape)
print("Scaled Testing set shape:", X_test_scaled.shape)

Scaled Training set shape: (99706, 5000)
Scaled Testing set shape: (24927, 5000)


In [None]:
!pip install xgboost --upgrade



In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, message='Parameters: { "use_label_encoder" } are not used.')

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Initialize and train the XGBoost model.
# `use_label_encoder` is intentionally not passed: the installed XGBoost does
# not use it and only answers with a
# 'Parameters: { "use_label_encoder" } are not used.' warning.
baseline_xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
baseline_xgb_model.fit(X_train_scaled, y_train)

# Step 2: Make predictions on the test set
y_pred_base_xgb = baseline_xgb_model.predict(X_test_scaled)

# Step 3: Calculate accuracy and the per-class classification report
accuracy_base_xgb = accuracy_score(y_test, y_pred_base_xgb)
report_base_xgb = classification_report(y_test, y_pred_base_xgb)

# Step 4: Display the evaluation scores
print(f"Accuracy (Baseline XGBoost): {accuracy_base_xgb}")
print("Classification Report (Baseline XGBoost):\n", report_base_xgb)

Parameters: { "use_label_encoder" } are not used.



Accuracy (Baseline XGBoost): 0.8489188430216231
Classification Report (Baseline XGBoost):
               precision    recall  f1-score   support

           0       0.80      0.78      0.79      3005
           1       0.87      0.91      0.89     11257
           2       0.80      0.76      0.78      4221
           3       0.87      0.82      0.84      3752
           4       0.86      0.86      0.86      2692

    accuracy                           0.85     24927
   macro avg       0.84      0.83      0.83     24927
weighted avg       0.85      0.85      0.85     24927



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Hyper-parameter grid: 2 * 3 * 2 * 2 = 24 candidate combinations
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Base estimator with a fixed seed
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive search: 3-fold CV, accuracy scoring, all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# Best estimator found by the search, used to predict the held-out test set
best_rf_model = grid_search.best_estimator_
y_pred_tuned_rf = best_rf_model.predict(X_test_scaled)

# Test-set accuracy and per-class report
accuracy_tuned_rf = accuracy_score(y_test, y_pred_tuned_rf)
report_tuned_rf = classification_report(y_test, y_pred_tuned_rf)

# Show the winning parameters and the evaluation scores
print("Best Parameters:", grid_search.best_params_)
print(f"Accuracy (Tuned Random Forest): {accuracy_tuned_rf}")
print("Classification Report (Tuned Random Forest):\n", report_tuned_rf)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy (Tuned Random Forest): 0.8523288000962811
Classification Report (Tuned Random Forest):
               precision    recall  f1-score   support

           0       0.82      0.73      0.77      3005
           1       0.85      0.95      0.90     11257
           2       0.86      0.72      0.78      4221
           3       0.88      0.81      0.84      3752
           4       0.85      0.85      0.85      2692

    accuracy                           0.85     24927
   macro avg       0.85      0.81      0.83     24927
weighted avg       0.85      0.85      0.85     24927



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Candidate values that the randomized search samples from
param_dist = {
    'n_estimators': [100, 200, 300, 500],   # trees in the forest
    'max_depth': [10, 20, 30, None],        # tree depth limit (None = no limit)
    'min_samples_split': [2, 5, 10],        # samples needed to split an internal node
    'min_samples_leaf': [1, 2, 4],          # samples needed at a leaf node
    'bootstrap': [True, False],             # whether trees use bootstrap samples
}

# Step 2: Base estimator with a fixed seed
rf_model = RandomForestClassifier(random_state=42)

# Step 3: Randomized search - 20 sampled settings, 3-fold CV, accuracy
# scoring, fixed seed, all available cores
search_options = dict(
    n_iter=20,
    scoring='accuracy',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    **search_options,
)

# Step 4: Run the search on the training data
random_search.fit(X_train_scaled, y_train)

# Steps 5-6: Take the best estimator and predict the test set with it
best_rf_model = random_search.best_estimator_
y_pred_tuned_rf = best_rf_model.predict(X_test_scaled)

# Step 7: Test-set accuracy and per-class report
accuracy_tuned_rf = accuracy_score(y_test, y_pred_tuned_rf)
report_tuned_rf = classification_report(y_test, y_pred_tuned_rf)

# Step 8: Show the winning parameters and the evaluation scores
print("Best Parameters:", random_search.best_params_)
print(f"Accuracy (Tuned Random Forest): {accuracy_tuned_rf}")
print("Classification Report (Tuned Random Forest):\n", report_tuned_rf)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}
Accuracy (Tuned Random Forest): 0.8556184057447748
Classification Report (Tuned Random Forest):
               precision    recall  f1-score   support

           0       0.81      0.74      0.78      3005
           1       0.86      0.94      0.90     11257
           2       0.86      0.73      0.79      4221
           3       0.87      0.82      0.85      3752
           4       0.85      0.85      0.85      2692

    accuracy                           0.86     24927
   macro avg       0.85      0.82      0.83     24927
weighted avg       0.86      0.86      0.85     24927



In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report

# Extra Trees baseline: 100 trees, fixed seed, fitted on the scaled training features
# (fit() returns the estimator itself, so construction and training can be chained)
baseline_et_model = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predictions for the held-out test set
y_pred_base_et = baseline_et_model.predict(X_test_scaled)

# Test-set accuracy and per-class report
accuracy_base_et = accuracy_score(y_test, y_pred_base_et)
report_base_et = classification_report(y_test, y_pred_base_et)

print(f"Accuracy (Baseline Extra Trees): {accuracy_base_et}")
print("Classification Report (Baseline Extra Trees):\n", report_base_et)


Accuracy (Baseline Extra Trees): 0.8572632085690215
Classification Report (Baseline Extra Trees):
               precision    recall  f1-score   support

           0       0.83      0.74      0.78      3005
           1       0.86      0.95      0.90     11257
           2       0.87      0.73      0.79      4221
           3       0.87      0.83      0.85      3752
           4       0.85      0.86      0.86      2692

    accuracy                           0.86     24927
   macro avg       0.85      0.82      0.84     24927
weighted avg       0.86      0.86      0.85     24927



In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# LightGBM baseline with a fixed seed, fitted on the scaled training features
# (fit() returns the estimator itself, so construction and training can be chained)
baseline_lgbm_model = LGBMClassifier(random_state=42).fit(X_train_scaled, y_train)

# Predictions for the held-out test set
y_pred_base_lgbm = baseline_lgbm_model.predict(X_test_scaled)

# Test-set accuracy and per-class report
accuracy_base_lgbm = accuracy_score(y_test, y_pred_base_lgbm)
report_base_lgbm = classification_report(y_test, y_pred_base_lgbm)

print(f"Accuracy (Baseline LightGBM): {accuracy_base_lgbm}")
print("Classification Report (Baseline LightGBM):\n", report_base_lgbm)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.943876 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 592314
[LightGBM] [Info] Number of data points in the train set: 99706, number of used features: 4985
[LightGBM] [Info] Start training from score -2.115820
[LightGBM] [Info] Start training from score -0.794986
[LightGBM] [Info] Start training from score -1.775800
[LightGBM] [Info] Start training from score -1.893776
[LightGBM] [Info] Start training from score -2.225369
Accuracy (Baseline LightGBM): 0.8520078629598428
Classification Report (Baseline LightGBM):
               precision    recall  f1-score   support

           0       0.81      0.79      0.80      3005
           1       0.88      0.90      0.89     11257
           2       0.80      0.77      0.78      4221
           3       0.86      0.83      0.84      3752
           4       0.87      0.87      0.87      2692

    accuracy      