In [1]:
import pandas as pd
import numpy as np

In [2]:
# text processing: regular expressions and NLTK resources
import re
import nltk
from nltk.corpus import stopwords
# fetch the NLTK data packages used by this notebook (no-op when already up to date)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# scikit-learn modules for bag-of-words features, feature selection and modelling
from sklearn import feature_extraction, feature_selection, model_selection, naive_bayes, pipeline, manifold, preprocessing, metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# imbalanced-learn; not referenced in the visible cells (presumably for re-sampling - confirm)
import imblearn

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JYM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JYM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\JYM\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
from IPython.display import display, HTML
# Inject a CSS rule forcing elements with class "container" to 100% width
# (presumably the notebook page container, so wide outputs are not clipped)
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
# Load the CFPB complaints table (path is relative to this notebook's folder)
cfpb_df = pd.read_csv('../../data/CFPB with Duplicate Marked.csv')
# Quick sanity check: column names and (rows, columns)
print(cfpb_df.columns)
print(cfpb_df.shape)

  cfpb_df = pd.read_csv('../../data/CFPB with Duplicate Marked.csv')


Index(['Unnamed: 0', 'Date received', 'Product', 'Sub-product', 'Issue',
       'Sub-issue', 'Consumer complaint narrative', 'Company public response',
       'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID', 'narr_len',
       'days_to_today', 'dupi_id', 'dupi_len'],
      dtype='object')
(1300361, 23)


In [7]:
%%time
# Drop duplicates based on 'dupi_id' column: one row is kept per distinct
# dupi_id (pandas' default keep='first' keeps the first occurrence).
# NOTE(review): rows with a missing dupi_id are treated as duplicates of each
# other by drop_duplicates - confirm the column has no NaN values.
cfpb_df = cfpb_df.drop_duplicates(subset='dupi_id')
print(cfpb_df.shape)

(1106587, 23)
Wall time: 510 ms


### Round 1! Get all the 1-4-grams from the customer complaints.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Learn every 1- to 4-gram that occurs in the complaint narratives.
# Case is preserved (lowercase=False), so differently-cased forms stay distinct.
# Memory note: this step takes about 36GB RAM.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 4),
    lowercase=False,
)
vectorizer.fit(cfpb_df['Consumer complaint narrative'])

In [None]:
# Extract the learned vocabulary (the n-gram strings) from the fitted vectorizer.
# Only the feature names are pulled here; no idf scores are read in this cell.
ngrams = vectorizer.get_feature_names_out()
# This takes about 31GB RAM

In [None]:
# Size of the unfiltered 1- to 4-gram vocabulary
print(len(ngrams))

In [None]:
# This is the initial filtration
%%time
stop_words = set(stopwords.words('english'))
number_pattern = re.compile(r'^\d+|\d+$')  # match numbers at the start or end of a string
# regular expression to match repeating characters
repeating_chars_pattern = re.compile(r'^(.)\1*$')

filtered_vocab = {}

def clean_features(ngram):
    """Return True when an n-gram passes the initial filtration, else False.

    An n-gram is rejected when its first or last whitespace-separated token
    is a stop word, is matched by ``number_pattern`` or is matched by
    ``repeating_chars_pattern`` (both applied with ``.match``). Interior
    tokens are not inspected.
    """
    tokens = ngram.split()
    # Only the two edge tokens decide the outcome.
    for edge in (tokens[0], tokens[-1]):
        if (
            edge in stop_words
            or number_pattern.match(edge)
            or repeating_chars_pattern.match(edge)
        ):
            return False
    return True

# Keep, in their original order, only the n-grams accepted by clean_features
ngrams_to_keep = list(filter(clean_features, ngrams))

In [None]:
# Vocabulary size after the stop-word / number / repeated-character filtration
print(len(ngrams_to_keep))

### Round 2! Find proper document frequency thresholds

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
import matplotlib.pyplot as plt

# New vectorizer restricted to the filtered vocabulary (ngrams_to_keep);
# case is still preserved and n-grams of length 1 to 4 are considered
filtered_vectorizer = TfidfVectorizer(lowercase=False, vocabulary=ngrams_to_keep, ngram_range=(1,4))

# Fit on the narratives and build the TF-IDF document-term matrix X in one pass
X = filtered_vectorizer.fit_transform(cfpb_df['Consumer complaint narrative'])

In [None]:
# Per-n-gram column totals of X. X comes from a TfidfVectorizer, so `tf` is
# the summed TF-IDF weight of each n-gram rather than a raw occurrence count.
tf = X.sum(axis=0).A1

# Per-n-gram document frequency: number of rows with a positive entry
df = (X > 0).sum(axis=0).A1

In [None]:
# Two stacked histograms: summed term weights (top) and document frequencies (bottom)
fig, axs = plt.subplots(2, figsize=(10, 10))

panels = [
    (axs[0], tf, 'Term Frequencies', 50000),
    (axs[1], df, 'Document Frequencies', 50000),
]
for ax, values, title, x_max in panels:
    ax.hist(values, bins=10000)
    ax.set_title(title)
    ax.set_yscale('log')  # log-scaled counts keep the long tail visible
    ax.set_xlim([0, x_max])  # x-axis limited to 0-50000


plt.tight_layout()
plt.show()

In [None]:
# Same two histograms, zoomed in on the low end of each x-axis
fig, axs = plt.subplots(2, figsize=(10, 10))

panels = [
    (axs[0], tf, 'Term Frequencies', 5000),       # x-axis limited to 0-5000
    (axs[1], df, 'Document Frequencies', 10000),  # x-axis limited to 0-10000
]
for ax, values, title, x_max in panels:
    ax.hist(values, bins=10000)
    ax.set_title(title)
    ax.set_yscale('log')  # log-scaled counts keep the long tail visible
    ax.set_xlim([0, x_max])


plt.tight_layout()
plt.show()

We can see that the histogram flattens out after about 80,000 documents, which means that even the most frequent tokens appear in only about 0.07 of all complaints, suggesting a max_df of 0.08. In this situation, even if we set max_df to 0.8, it would filter out hardly any features. On the other hand, the histogram shows a sharp drop after 2,000, suggesting that 1,500 documents is a good threshold for dropping the rarer n-grams.

### Round 3! Further reduce n-gram features through variance and correlation

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
import matplotlib.pyplot as plt

# scikit-learn ignores min_df / max_df whenever an explicit `vocabulary` is
# passed, so the thresholds chosen in Round 2 are applied by hand here.
MIN_DOC_COUNT = 1500  # drop n-grams found in fewer than 1500 complaints
MAX_DOC_SHARE = 0.8   # the observed maximum share is only ~0.08, so this cap removes little or nothing

# `df` holds the per-n-gram document counts computed in Round 2. Its entries
# follow the order of `ngrams_to_keep`, which was the vocabulary for that matrix.
doc_counts = np.asarray(df)
assert doc_counts.shape[0] == len(ngrams_to_keep), \
    "df is out of sync with ngrams_to_keep; re-run the Round 2 cells"
n_docs = len(cfpb_df)
within_thresholds = (doc_counts >= MIN_DOC_COUNT) & (doc_counts <= MAX_DOC_SHARE * n_docs)
thresholded_ngrams = [ngram for ngram, keep in zip(ngrams_to_keep, within_thresholds) if keep]

# Vectorizer restricted to the n-grams that satisfy both thresholds
filtered_vectorizer = TfidfVectorizer(lowercase=False, vocabulary=thresholded_ngrams, ngram_range=(1,4))

# Learn the idf weights on the narratives
filtered_vectorizer.fit(cfpb_df['Consumer complaint narrative'])

In [None]:
%%time
# The vectorizer was fitted in the previous cell, so only transform here;
# calling fit_transform again would repeat the expensive fit for the same result.
X = filtered_vectorizer.transform(cfpb_df['Consumer complaint narrative'])

In [None]:
# Feature names of the Round 3 vectorizer, in column order of X
new_ngrams = filtered_vectorizer.get_feature_names_out()

In [None]:
# Number of n-gram features in the Round 3 vectorizer
print(len(new_ngrams))

In [None]:
%%time
# Binary target: 1 for "Debt collection" complaints, 0 for every other product.
# A vectorized comparison replaces the row-by-row Python lambda; int64 is
# requested explicitly so the dtype does not depend on the platform.
y = (cfpb_df["Product"] == "Debt collection").astype("int64")

In [None]:
# Class balance of the binary target (1 = "Debt collection", 0 = anything else)
y.value_counts()

In [None]:
%%time
from sklearn.feature_selection import chi2

# Chi-squared test of every n-gram column of X against the binary target y;
# returns one score and one p-value per feature, in column order
chi2_scores, p_values = chi2(X, y)

In [None]:
# Histogram of the chi2 scores (log-scaled counts, 500 bins)
plt.hist(chi2_scores, bins=500, log=True)
plt.title('Histogram of Chi2 Scores')
plt.xlabel('Chi2 Score')
plt.ylabel('Frequency')
plt.show()

# Histogram of the p-values, zoomed in on the 0.9-1.0 range where the
# selection thresholds used below (0.995 ... 0.999) lie
plt.hist(p_values, bins=500, log=True)
plt.xlim([0.9, 1.0])
plt.title('Histogram of P-values')
plt.xlabel('P-value')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Number of features whose chi2 p-value exceeds 0.99
np.count_nonzero(p_values > 0.99)

In [None]:
# Number of features whose chi2 p-value exceeds 0.995
np.count_nonzero(p_values > 0.995)

In [None]:
# Number of features whose chi2 p-value exceeds 0.997
np.count_nonzero(p_values > 0.997)

In [None]:
# Number of features whose chi2 p-value exceeds 0.998
np.count_nonzero(p_values > 0.998)

In [None]:
# Number of features whose chi2 p-value exceeds 0.999
np.count_nonzero(p_values > 0.999)

In [None]:
# Use the chi2 p-values as boolean masks over the vectorizer's feature names,
# producing one vocabulary per p-value threshold.
# NOTE(review): these masks keep the features with the HIGHEST p-values, i.e.
# those least associated with the label - confirm that this is the intent.
feature_names = np.array(filtered_vectorizer.get_feature_names_out())
filtered_feature_names_0995 = feature_names[p_values >= 0.995]
filtered_feature_names_0996 = feature_names[p_values >= 0.996]
filtered_feature_names_0997 = feature_names[p_values >= 0.997]
filtered_feature_names_0998 = feature_names[p_values >= 0.998]
# Was 0.997 (copy-paste slip), which made this set identical to the _0997 one
filtered_feature_names_0999 = feature_names[p_values >= 0.999]

In [None]:
# Vocabulary size for each p-value threshold (0.995 through 0.999)
print(len(filtered_feature_names_0995))
print(len(filtered_feature_names_0996))
print(len(filtered_feature_names_0997))
print(len(filtered_feature_names_0998))
print(len(filtered_feature_names_0999))

### Round 4! Get the TF-IDF vectorizers ready: we will create 5, one per threshold, to suit different amounts of computational power

In [None]:
import pickle

In [None]:
# TF-IDF vectorizer restricted to the `filtered_feature_names_0995` vocabulary.
# scikit-learn ignores max_df / min_df when a fixed vocabulary is supplied,
# so those two arguments have no effect on the features here.
filtered_vectorizer = TfidfVectorizer(
    vocabulary=filtered_feature_names_0995,
    ngram_range=(1, 4),
    lowercase=False,
    min_df=1500,
    max_df=0.8,
)
# Learn the idf weights and build the document-term matrix in one pass
X = filtered_vectorizer.fit_transform(cfpb_df['Consumer complaint narrative'])
# Persist the fitted vectorizer for later reuse
with open('tfidf_vectorizer_995.pkl', 'wb') as pkl_file:
    pickle.dump(filtered_vectorizer, pkl_file)

In [None]:
# TF-IDF vectorizer restricted to the `filtered_feature_names_0996` vocabulary.
# scikit-learn ignores max_df / min_df when a fixed vocabulary is supplied,
# so those two arguments have no effect on the features here.
filtered_vectorizer = TfidfVectorizer(
    vocabulary=filtered_feature_names_0996,
    ngram_range=(1, 4),
    lowercase=False,
    min_df=1500,
    max_df=0.8,
)
# Learn the idf weights and build the document-term matrix in one pass
X = filtered_vectorizer.fit_transform(cfpb_df['Consumer complaint narrative'])
# Persist the fitted vectorizer for later reuse
with open('tfidf_vectorizer_996.pkl', 'wb') as pkl_file:
    pickle.dump(filtered_vectorizer, pkl_file)

In [None]:
# TF-IDF vectorizer restricted to the `filtered_feature_names_0997` vocabulary.
# scikit-learn ignores max_df / min_df when a fixed vocabulary is supplied,
# so those two arguments have no effect on the features here.
filtered_vectorizer = TfidfVectorizer(
    vocabulary=filtered_feature_names_0997,
    ngram_range=(1, 4),
    lowercase=False,
    min_df=1500,
    max_df=0.8,
)
# Learn the idf weights and build the document-term matrix in one pass
X = filtered_vectorizer.fit_transform(cfpb_df['Consumer complaint narrative'])
# Persist the fitted vectorizer for later reuse
with open('tfidf_vectorizer_997.pkl', 'wb') as pkl_file:
    pickle.dump(filtered_vectorizer, pkl_file)

In [None]:
# TF-IDF vectorizer restricted to the `filtered_feature_names_0998` vocabulary.
# scikit-learn ignores max_df / min_df when a fixed vocabulary is supplied,
# so those two arguments have no effect on the features here.
filtered_vectorizer = TfidfVectorizer(
    vocabulary=filtered_feature_names_0998,
    ngram_range=(1, 4),
    lowercase=False,
    min_df=1500,
    max_df=0.8,
)
# Learn the idf weights and build the document-term matrix in one pass
X = filtered_vectorizer.fit_transform(cfpb_df['Consumer complaint narrative'])
# Persist the fitted vectorizer for later reuse
with open('tfidf_vectorizer_998.pkl', 'wb') as pkl_file:
    pickle.dump(filtered_vectorizer, pkl_file)

In [None]:
# TF-IDF vectorizer restricted to the `filtered_feature_names_0999` vocabulary.
# scikit-learn ignores max_df / min_df when a fixed vocabulary is supplied,
# so those two arguments have no effect on the features here.
filtered_vectorizer = TfidfVectorizer(
    vocabulary=filtered_feature_names_0999,
    ngram_range=(1, 4),
    lowercase=False,
    min_df=1500,
    max_df=0.8,
)
# Learn the idf weights and build the document-term matrix in one pass
X = filtered_vectorizer.fit_transform(cfpb_df['Consumer complaint narrative'])
# Persist the fitted vectorizer for later reuse
with open('tfidf_vectorizer_999.pkl', 'wb') as pkl_file:
    pickle.dump(filtered_vectorizer, pkl_file)

In [None]:
# Load a trained vectorizer back from disk.
# This notebook only writes tfidf_vectorizer_995.pkl ... tfidf_vectorizer_999.pkl;
# the previously referenced 'tfidf_vectorizer.pkl' is never created here, so a
# fresh run would fail with FileNotFoundError. Change the suffix to pick another threshold.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('tfidf_vectorizer_995.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)