In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd

# Read data
df = pd.read_csv('CFPB with preprocessing')

# Calculate y_target based on 'product' column
df['y_target'] = (df['Product'] == 'Debt collection').astype(int)

# Vectorize the text data with 4-grams
vectorizer = TfidfVectorizer(ngram_range=(1,4))
X = vectorizer.fit_transform(df['unigram_narr'].values.astype('U'))

# Select top 10000 features
X_new = SelectKBest(chi2, k=10000).fit_transform(X, df['y_target'])

# Apply SMOTE oversampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_new, df['y_target']) #undersample uppersample

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize a LogisticRegression model
clf = LogisticRegression(random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Make predictions with the classifier
y_pred = clf.predict(X_test)

# Calculate and print accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Classifier accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, y_pred))


Classifier accuracy: 0.9056083358872203
              precision    recall  f1-score   support

           0       0.91      0.91      0.91      1643
           1       0.90      0.91      0.90      1620

    accuracy                           0.91      3263
   macro avg       0.91      0.91      0.91      3263
weighted avg       0.91      0.91      0.91      3263



In [8]:
print("Number of selected features:", X_new.shape[1])

Number of selected features: 10000


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Read data
df = pd.read_csv('CFPB with preprocessing')

# Calculate y_target based on 'product' column
df['y_target'] = (df['Product'] == 'Debt collection').astype(int)

# Vectorize the text data with 4-grams
vectorizer = TfidfVectorizer(ngram_range=(1,4))
X = vectorizer.fit_transform(df['unigram_narr'].values.astype('U'))

# Apply SMOTE oversampling
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, df['y_target'])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize a LogisticRegression model
clf = LogisticRegression(random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Make predictions with the classifier
y_pred = clf.predict(X_test)

# Calculate and print accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Classifier accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, y_pred))


Classifier accuracy: 0.9589334967821024
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1643
           1       0.96      0.95      0.96      1620

    accuracy                           0.96      3263
   macro avg       0.96      0.96      0.96      3263
weighted avg       0.96      0.96      0.96      3263



In [7]:
# Get the feature names
features = vectorizer.get_feature_names_out()

# Print the number of features
print("Number of features:", len(features))

Number of features: 2405449
