In [6]:
import pandas as pd

# Load the dataset
data = pd.read_csv('./data/complaints_processed.csv', index_col = 0)

# Display the first few rows of the dataset
data.head()

Unnamed: 0,product,narrative
0,credit_card,purchase order day shipping amount receive pro...
1,credit_card,forwarded message date tue subject please inve...
2,retail_banking,forwarded message cc sent friday pdt subject f...
3,credit_reporting,payment history missing credit report speciali...
4,credit_reporting,payment history missing credit report made mis...


In [9]:
data.shape

(162421, 2)

In [4]:
# Display basic information to understand the structure of the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162421 entries, 0 to 162420
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  162421 non-null  int64 
 1   product     162421 non-null  object
 2   narrative   162411 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.7+ MB


In [7]:
# Display a summary of the missing values
data.isnull().sum()

product       0
narrative    10
dtype: int64

The dataset contains 162,421 entries with the following columns:

Unnamed: 0: index column, which can be ignored.

product: The target variable representing the product type (e.g., "credit_card," "retail_banking").

narrative: Text data describing the complaint, which will serve as our feature.
There are some missing values (10) in the "narrative" column.

In [8]:
# checking for duplicates
data.duplicated().value_counts()

False    124679
True      37742
Name: count, dtype: int64

## Baseline model (logistic regression) on Bag-of-Words vectorized text

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Drop rows with missing narrative values
data_cleaned = data.dropna(subset=['narrative'])

# Define features and target
X = data_cleaned['narrative']
y = data_cleaned['product']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Vectorize the text data using Bag-of-Words
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Initialize and train the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_bow, y_train)

# Make predictions and evaluate the model
y_pred = log_reg.predict(X_test_bow)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy is: ", accuracy)
print("The Report shows: ", report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy is:  0.8568174121848351
The Report shows:                       precision    recall  f1-score   support

        credit_card       0.77      0.71      0.74      3113
   credit_reporting       0.89      0.95      0.91     18235
    debt_collection       0.80      0.70      0.74      4630
mortgages_and_loans       0.85      0.76      0.80      3798
     retail_banking       0.84      0.84      0.84      2707

           accuracy                           0.86     32483
          macro avg       0.83      0.79      0.81     32483
       weighted avg       0.85      0.86      0.85     32483



In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Use a subset of the data for quicker training
data_sampled = data_cleaned.sample(n=20000, random_state=42)
X_sample = data_sampled['narrative']
y_sample = data_sampled['product']

# Split the subset into training and testing sets
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample)

# Re-vectorize the text data on the subset using Bag-of-Words
X_train_bow_sample = vectorizer.fit_transform(X_train_sample)
X_test_bow_sample = vectorizer.transform(X_test_sample)

# Initialize and train the Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_bow_sample, y_train_sample)

# Make predictions and evaluate the model
y_pred_sample = nb_model.predict(X_test_bow_sample)
accuracy_sample = accuracy_score(y_test_sample, y_pred_sample)
report_sample = classification_report(y_test_sample, y_pred_sample)

print("Accuracy is: ", accuracy_sample)
print("The Report shows: ", report_sample)

Accuracy is:  0.824
The Report shows:                       precision    recall  f1-score   support

        credit_card       0.63      0.71      0.67       387
   credit_reporting       0.91      0.87      0.89      2254
    debt_collection       0.75      0.67      0.71       565
mortgages_and_loans       0.73      0.86      0.79       467
     retail_banking       0.78      0.88      0.83       327

           accuracy                           0.82      4000
          macro avg       0.76      0.80      0.78      4000
       weighted avg       0.83      0.82      0.83      4000



## Support Vector Machine (SVM)

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Use TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting features to reduce computational load
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_sample)
X_test_tfidf = tfidf_vectorizer.transform(X_test_sample)

# Initialize and train the SVM model
svm_model = LinearSVC(random_state=42, max_iter=1000)
svm_model.fit(X_train_tfidf, y_train_sample)

# Make predictions and evaluate the model
y_pred_svm = svm_model.predict(X_test_tfidf)
accuracy_svm = accuracy_score(y_test_sample, y_pred_svm)
report_svm = classification_report(y_test_sample, y_pred_svm)

print("SVM Accuracy: ", accuracy_svm)
print("SVM Report: ", report_svm)

SVM Accuracy:  0.85175
SVM Report:                       precision    recall  f1-score   support

        credit_card       0.76      0.72      0.74       387
   credit_reporting       0.90      0.92      0.91      2254
    debt_collection       0.75      0.71      0.73       565
mortgages_and_loans       0.81      0.78      0.79       467
     retail_banking       0.84      0.87      0.85       327

           accuracy                           0.85      4000
          macro avg       0.81      0.80      0.81      4000
       weighted avg       0.85      0.85      0.85      4000



## Feature Engineering Techniques

#### 1. Text Preprocessing Enhancements

In [25]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data files
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and define stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function for text preprocessing
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back to string
    return ' '.join(tokens)

# Apply preprocessing
data_cleaned['narrative_cleaned'] = data_cleaned['narrative'].apply(preprocess_text)

# Define features and target
X = data_cleaned['narrative_cleaned']
y = data_cleaned['product']

# Use a subset of the data for faster processing (20,000 samples)
data_sampled = data_cleaned.sample(n=20000, random_state=42)
X_sample = data_sampled['narrative_cleaned']
y_sample = data_sampled['product']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample)

# Vectorize the text data using TF-IDF with enhanced parameters
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,      # Increase the number of features
    ngram_range=(1,2),       # Include unigrams and bigrams
    min_df=5,                # Ignore terms that appear in fewer than 5 documents
    max_df=0.8,              # Ignore terms that appear in more than 80% of documents
    sublinear_tf=True        # Apply sublinear tf scaling
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Make predictions and evaluate the model
y_pred = nb_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\John.Kul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\John.Kul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned['narrative_cleaned'] = data_cleaned['narrative'].apply(preprocess_text)


Accuracy: 0.8365

Classification Report:
                      precision    recall  f1-score   support

        credit_card       0.73      0.71      0.72       387
   credit_reporting       0.89      0.90      0.90      2254
    debt_collection       0.81      0.62      0.70       565
mortgages_and_loans       0.71      0.86      0.78       467
     retail_banking       0.84      0.86      0.85       327

           accuracy                           0.84      4000
          macro avg       0.80      0.79      0.79      4000
       weighted avg       0.84      0.84      0.83      4000



#### 2. Incorporating Additional Features

In [30]:
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Assuming data_sampled and other preprocessing has already been done up to this point.

# Feature: Text Length (number of words)
data_sampled['text_length'] = data_sampled['narrative_cleaned'].apply(lambda x: len(x.split()))

# Feature: Number of Unique Words
data_sampled['unique_words'] = data_sampled['narrative_cleaned'].apply(lambda x: len(set(x.split())))

# Re-split after adding new features
X_sample = data_sampled['narrative_cleaned']
y_sample = data_sampled['product']

X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample)

# Vectorize text data
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2), min_df=5, max_df=0.8, sublinear_tf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Extract additional numerical features
additional_features_train = data_sampled.loc[X_train.index, ['text_length', 'unique_words']].values
additional_features_test = data_sampled.loc[X_test.index, ['text_length', 'unique_words']].values

# Scale numerical features with MinMaxScaler to ensure all values are non-negative
scaler = MinMaxScaler()
additional_features_train_scaled = scaler.fit_transform(additional_features_train)
additional_features_test_scaled = scaler.transform(additional_features_test)

# Combine TF-IDF features with additional numerical features
X_train_combined = hstack([X_train_tfidf, additional_features_train_scaled])
X_test_combined = hstack([X_test_tfidf, additional_features_test_scaled])

# Initialize and train the Multinomial Naive Bayes model
nb_model_combined = MultinomialNB()
nb_model_combined.fit(X_train_combined, y_train)

# Make predictions and evaluate the model
y_pred_combined = nb_model_combined.predict(X_test_combined)
accuracy_combined = accuracy_score(y_test, y_pred_combined)
report_combined = classification_report(y_test, y_pred_combined)

print("Accuracy with Additional Features:", accuracy_combined)
print("\nClassification Report with Additional Features:\n", report_combined)

Accuracy with Additional Features: 0.83625

Classification Report with Additional Features:
                      precision    recall  f1-score   support

        credit_card       0.73      0.71      0.72       387
   credit_reporting       0.89      0.90      0.90      2254
    debt_collection       0.81      0.62      0.70       565
mortgages_and_loans       0.71      0.86      0.78       467
     retail_banking       0.84      0.86      0.85       327

           accuracy                           0.84      4000
          macro avg       0.80      0.79      0.79      4000
       weighted avg       0.84      0.84      0.83      4000



#### 3. Dimensionality Reduction with Truncated SVD (Latent Semantic Analysis)

In [32]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Apply Truncated SVD to reduce dimensionality
svd = TruncatedSVD(n_components=300, random_state=42)  # Adjust n_components as needed
X_train_svd = svd.fit_transform(X_train_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

# Convert SVD-transformed data to sparse format to use with hstack
X_train_svd_sparse = csr_matrix(X_train_svd)
X_test_svd_sparse = csr_matrix(X_test_svd)

# Combine with additional features
X_train_reduced = hstack([X_train_svd_sparse, additional_features_train_scaled])
X_test_reduced = hstack([X_test_svd_sparse, additional_features_test_scaled])

# Initialize and train the Logistic Regression model
lr_model_reduced = LogisticRegression(max_iter=1000, random_state=42)
lr_model_reduced.fit(X_train_reduced, y_train)

# Make predictions and evaluate the model
y_pred_reduced = lr_model_reduced.predict(X_test_reduced)
accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
report_reduced = classification_report(y_test, y_pred_reduced)

print("Accuracy with Dimensionality Reduction:", accuracy_reduced)
print("\nClassification Report with Dimensionality Reduction:\n", report_reduced)

Accuracy with Dimensionality Reduction: 0.85

Classification Report with Dimensionality Reduction:
                      precision    recall  f1-score   support

        credit_card       0.77      0.70      0.73       387
   credit_reporting       0.88      0.93      0.90      2254
    debt_collection       0.80      0.65      0.72       565
mortgages_and_loans       0.82      0.79      0.80       467
     retail_banking       0.85      0.89      0.87       327

           accuracy                           0.85      4000
          macro avg       0.82      0.79      0.81      4000
       weighted avg       0.85      0.85      0.85      4000



#### 4. Feature Selection with Chi-Squared Test

In [33]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Apply Chi-Squared feature selection
selector = SelectKBest(chi2, k=5000)  # Select top 5000 features
X_train_selected = selector.fit_transform(X_train_tfidf, y_train)
X_test_selected = selector.transform(X_test_tfidf)

# Scale additional features using MinMaxScaler to ensure non-negative values
scaler = MinMaxScaler()
additional_features_train_scaled = scaler.fit_transform(additional_features_train)
additional_features_test_scaled = scaler.transform(additional_features_test)

# Combine TF-IDF-selected features with additional scaled features
X_train_final = hstack([X_train_selected, additional_features_train_scaled])
X_test_final = hstack([X_test_selected, additional_features_test_scaled])

# Initialize and train the Multinomial Naive Bayes model
nb_model_final = MultinomialNB()
nb_model_final.fit(X_train_final, y_train)

# Make predictions and evaluate the model
y_pred_final = nb_model_final.predict(X_test_final)
accuracy_final = accuracy_score(y_test, y_pred_final)
report_final = classification_report(y_test, y_pred_final)

print("Accuracy with Feature Selection:", accuracy_final)
print("\nClassification Report with Feature Selection:\n", report_final)

Accuracy with Feature Selection: 0.83875

Classification Report with Feature Selection:
                      precision    recall  f1-score   support

        credit_card       0.73      0.73      0.73       387
   credit_reporting       0.88      0.91      0.90      2254
    debt_collection       0.83      0.60      0.69       565
mortgages_and_loans       0.74      0.85      0.79       467
     retail_banking       0.83      0.88      0.85       327

           accuracy                           0.84      4000
          macro avg       0.80      0.79      0.79      4000
       weighted avg       0.84      0.84      0.84      4000



#### 5. Incorporating Word Embeddings (Optional Advanced Feature Engineering)

In [29]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder

# Tokenize the cleaned narratives
data_sampled['tokens'] = data_sampled['narrative_cleaned'].apply(lambda x: x.split())

# Train a Word2Vec model
w2v_model = Word2Vec(sentences=data_sampled['tokens'], vector_size=100, window=5, min_count=5, workers=4, seed=42)
w2v_model.train(data_sampled['tokens'], total_examples=len(data_sampled['tokens']), epochs=10)

# Function to compute average Word2Vec vectors
def average_w2v(tokens, model, vector_size=100):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Compute average Word2Vec vectors for training and testing sets
X_train_w2v = np.array([average_w2v(tokens, w2v_model) for tokens in X_train.apply(lambda x: x.split())])
X_test_w2v = np.array([average_w2v(tokens, w2v_model) for tokens in X_test.apply(lambda x: x.split())])

# Combine with additional features
X_train_final_w2v = np.hstack([X_train_w2v, additional_features_train_scaled])
X_test_final_w2v = np.hstack([X_test_w2v, additional_features_test_scaled])

# Encode target labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Initialize and train a different model, e.g., Random Forest
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train_final_w2v, y_train_encoded)

# Make predictions and evaluate the model
y_pred_rf = rf_model.predict(X_test_final_w2v)
accuracy_rf = accuracy_score(y_test_encoded, y_pred_rf)
report_rf = classification_report(y_test_encoded, y_pred_rf, target_names=label_encoder.classes_)

print("Accuracy with Word Embeddings:", accuracy_rf)
print("\nClassification Report with Word Embeddings:\n", report_rf)

Accuracy with Word Embeddings: 0.843

Classification Report with Word Embeddings:
                      precision    recall  f1-score   support

        credit_card       0.73      0.65      0.69       387
   credit_reporting       0.87      0.95      0.91      2254
    debt_collection       0.82      0.61      0.70       565
mortgages_and_loans       0.80      0.78      0.79       467
     retail_banking       0.83      0.82      0.82       327

           accuracy                           0.84      4000
          macro avg       0.81      0.76      0.78      4000
       weighted avg       0.84      0.84      0.84      4000



### 7. Next Steps

After implementing these feature engineering techniques, consider the following to further enhance your models:

Hyperparameter Tuning: Use techniques like Grid Search or Random Search to find optimal model parameters.

Cross-Validation: Implement k-fold cross-validation to ensure model robustness.

Advanced Models: Experiment with models like XGBoost, LightGBM, or Deep Learning models (e.g., LSTM, BERT) for potentially better performance.

Ensemble Methods: Combine multiple models to leverage their individual strengths.

### Implementing Hyperparameter Tuning with GridSearchCV

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Define parameter grid for MultinomialNB
param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0]  # Explore different smoothing values
}

# Initialize MultinomialNB model
nb_model = MultinomialNB()

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(nb_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_final, y_train)

# Get the best model from grid search
best_nb_model = grid_search.best_estimator_

# Make predictions with the tuned model
y_pred_tuned = best_nb_model.predict(X_test_final)
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
report_tuned = classification_report(y_test, y_pred_tuned)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy with Hyperparameter Tuning:", accuracy_tuned)
print("\nClassification Report with Hyperparameter Tuning:\n", report_tuned)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'alpha': 0.01}
Accuracy with Hyperparameter Tuning: 0.8375

Classification Report with Hyperparameter Tuning:
                      precision    recall  f1-score   support

        credit_card       0.70      0.75      0.72       387
   credit_reporting       0.91      0.88      0.89      2254
    debt_collection       0.77      0.69      0.73       565
mortgages_and_loans       0.71      0.86      0.78       467
     retail_banking       0.83      0.91      0.87       327

           accuracy                           0.84      4000
          macro avg       0.78      0.82      0.80      4000
       weighted avg       0.84      0.84      0.84      4000



### Alternative: RandomizedSearchCV

In [35]:
from sklearn.model_selection import RandomizedSearchCV

# Initialize RandomizedSearchCV with 10 random parameter samples
random_search = RandomizedSearchCV(nb_model, param_grid, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1, verbose=1, random_state=42)
random_search.fit(X_train_final, y_train)

# Get the best model from random search
best_nb_model_random = random_search.best_estimator_

# Make predictions and evaluate
y_pred_tuned_random = best_nb_model_random.predict(X_test_final)
accuracy_tuned_random = accuracy_score(y_test, y_pred_tuned_random)
report_tuned_random = classification_report(y_test, y_pred_tuned_random)

print("Best Parameters with RandomizedSearchCV:", random_search.best_params_)
print("Accuracy with Hyperparameter Tuning (Random Search):", accuracy_tuned_random)
print("\nClassification Report with Hyperparameter Tuning (Random Search):\n", report_tuned_random)



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters with RandomizedSearchCV: {'alpha': 0.01}
Accuracy with Hyperparameter Tuning (Random Search): 0.8375

Classification Report with Hyperparameter Tuning (Random Search):
                      precision    recall  f1-score   support

        credit_card       0.70      0.75      0.72       387
   credit_reporting       0.91      0.88      0.89      2254
    debt_collection       0.77      0.69      0.73       565
mortgages_and_loans       0.71      0.86      0.78       467
     retail_banking       0.83      0.91      0.87       327

           accuracy                           0.84      4000
          macro avg       0.78      0.82      0.80      4000
       weighted avg       0.84      0.84      0.84      4000



In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import make_scorer, accuracy_score

# Define the MultinomialNB model with the best alpha (if already tuned)
nb_model = MultinomialNB(alpha=grid_search.best_params_['alpha'])  # Replace with best alpha from tuning

# Define custom scorer for accuracy (or use predefined 'accuracy')
scorer = make_scorer(accuracy_score)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(nb_model, X_train_final, y_train, cv=5, scoring=scorer, n_jobs=-1)

# Calculate mean and standard deviation of accuracy
mean_accuracy = cv_scores.mean()
std_accuracy = cv_scores.std()

print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", mean_accuracy)
print("Standard Deviation of CV Accuracy:", std_accuracy)

Cross-Validation Accuracy Scores: [0.849375  0.8359375 0.8378125 0.8290625 0.83375  ]
Mean CV Accuracy: 0.8371875
Standard Deviation of CV Accuracy: 0.006757518960387743


In [37]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0]}

# Set up GridSearchCV with 5-fold cross-validation
grid_search_cv = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search_cv.fit(X_train_final, y_train)

# Results
best_nb_model_cv = grid_search_cv.best_estimator_
print("Best Parameters:", grid_search_cv.best_params_)
print("Best CV Score:", grid_search_cv.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'alpha': 0.01}
Best CV Score: 0.8371875
