<a href="https://colab.research.google.com/github/janaghoniem/Social-Media-Sentiment-Analysis/blob/main/ML_Arabic%26English.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from scipy.sparse import vstack
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

### Arabic Testing

In [18]:
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cleaned_arabic_data.csv')
df.dropna(subset=["cleanedtext"], inplace=True)
X = df["cleanedtext"]
y = df["label"]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


1) Random forest

In [19]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Train-Test Split (80% train/valid, 20% test)
X_temp, X_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Further split the training set into training and validation (80% train, 20% valid from the 80%)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
train_preds = rf_model.predict(X_train)
val_preds = rf_model.predict(X_val)
test_preds = rf_model.predict(X_test)

# Accuracy scores
train_accuracy = accuracy_score(y_train, train_preds)
validation_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)

# Output
print("Random Forest Training Accuracy:", train_accuracy)
print("Random Forest Validation Accuracy:", validation_accuracy)
print("Random Forest Testing Accuracy:", test_accuracy)

# Common setup for all models
X_temp, X_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Combine train and validation sets for GridSearchCV
X_train_val = vstack([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

# Random Forest Hyperparameter Tuning
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42),
                      rf_params,
                      cv=3,
                      n_jobs=-1,
                      verbose=1)
rf_grid.fit(X_train_val, y_train_val)

# Best model
best_rf = rf_grid.best_estimator_

# Predictions
rf_train_preds = best_rf.predict(X_train)
rf_val_preds = best_rf.predict(X_val)
rf_test_preds = best_rf.predict(X_test)

# Accuracy scores
rf_train_acc = accuracy_score(y_train, rf_train_preds)
rf_val_acc = accuracy_score(y_val, rf_val_preds)
rf_test_acc = accuracy_score(y_test, rf_test_preds)

print("Random Forest - Best Parameters:", rf_grid.best_params_)
print("Random Forest Training Accuracy:", rf_train_acc)
print("Random Forest Validation Accuracy:", rf_val_acc)
print("Random Forest Testing Accuracy:", rf_test_acc)
print("----------------------------------------")

Random Forest Training Accuracy: 0.7630631531576579
Random Forest Validation Accuracy: 0.7395184879621991
Random Forest Testing Accuracy: 0.7427435685892148
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Random Forest - Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Random Forest Training Accuracy: 0.9946247312365618
Random Forest Validation Accuracy: 0.9952748818720468
Random Forest Testing Accuracy: 0.8288457211430286
----------------------------------------


2) SVM

In [20]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# First split: 80% train/valid, 20% test
X_temp, X_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Second split: 80% train, 20% validation from the 80% temp data
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 of 80% = 20%

# SVM Classifier
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

# Predictions
train_preds = svm_model.predict(X_train)
val_preds = svm_model.predict(X_val)
test_preds = svm_model.predict(X_test)

# Accuracy scores
train_accuracy = accuracy_score(y_train, train_preds)
validation_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)

# Output
print("SVM Training Accuracy:", train_accuracy)
print("SVM Validation Accuracy:", validation_accuracy)
print("SVM Testing Accuracy:", test_accuracy)

# Common setup for all models
X_temp, X_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Combine train and validation sets for GridSearchCV
X_train_val = vstack([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

# SVM Hyperparameter Tuning
# Improved SVM Hyperparameter Tuning
print("\nStarting SVM hyperparameter tuning...")
svm_params = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l2'],  # Only l2 penalty to avoid incompatible combinations
    'loss': ['squared_hinge'],  # Only squared_hinge to avoid issues
    'dual': [False]  # Prefer primal form for large n_samples > n_features
}

svm_grid = GridSearchCV(
    LinearSVC(random_state=42, max_iter=10000),  # Increased max_iter for convergence
    svm_params,
    cv=3,
    n_jobs=-1,
    verbose=1
)
svm_grid.fit(X_train_val, y_train_val)
best_svm = svm_grid.best_estimator_

# Predictions
svm_train_preds = best_svm.predict(X_train)
svm_val_preds = best_svm.predict(X_val)
svm_test_preds = best_svm.predict(X_test)

# Accuracy scores
svm_train_acc = accuracy_score(y_train, svm_train_preds)
svm_val_acc = accuracy_score(y_val, svm_val_preds)
svm_test_acc = accuracy_score(y_test, svm_test_preds)

print("SVM - Best Parameters:", svm_grid.best_params_)
print("SVM Training Accuracy:", svm_train_acc)
print("SVM Validation Accuracy:", svm_val_acc)
print("SVM Testing Accuracy:", svm_test_acc)
print("----------------------------------------")

SVM Training Accuracy: 0.8954697734886744
SVM Validation Accuracy: 0.8290707267681692
SVM Testing Accuracy: 0.8349208730218255

Starting SVM hyperparameter tuning...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
SVM - Best Parameters: {'C': 0.1, 'dual': False, 'loss': 'squared_hinge', 'penalty': 'l2'}
SVM Training Accuracy: 0.8779688984449222
SVM Validation Accuracy: 0.8720468011700292
SVM Testing Accuracy: 0.8469961749043726
----------------------------------------


3) Logistic regression

In [21]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# First split: 80% train/valid, 20% test
X_temp, X_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Second split: 80% train, 20% validation from the 80% temp data
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 of 80%

# Logistic Regression Model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predictions
train_preds = lr_model.predict(X_train)
val_preds = lr_model.predict(X_val)
test_preds = lr_model.predict(X_test)

# Accuracy scores
train_accuracy = accuracy_score(y_train, train_preds)
validation_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)

# Output
print("Logistic Regression Training Accuracy:", train_accuracy)
print("Logistic Regression Validation Accuracy:", validation_accuracy)
print("Logistic Regression Testing Accuracy:", test_accuracy)

# Common setup for all models
X_temp, X_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Combine train and validation sets for GridSearchCV
X_train_val = vstack([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

# Logistic Regression Hyperparameter Tuning
lr_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

lr_grid = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42),
                      lr_params,
                      cv=3,
                      n_jobs=-1,
                      verbose=1)
lr_grid.fit(X_train_val, y_train_val)

# Best model
best_lr = lr_grid.best_estimator_

# Predictions
lr_train_preds = best_lr.predict(X_train)
lr_val_preds = best_lr.predict(X_val)
lr_test_preds = best_lr.predict(X_test)

# Accuracy scores
lr_train_acc = accuracy_score(y_train, lr_train_preds)
lr_val_acc = accuracy_score(y_val, lr_val_preds)
lr_test_acc = accuracy_score(y_test, lr_test_preds)

print("Logistic Regression - Best Parameters:", lr_grid.best_params_)
print("Logistic Regression Training Accuracy:", lr_train_acc)
print("Logistic Regression Validation Accuracy:", lr_val_acc)
print("Logistic Regression Testing Accuracy:", lr_test_acc)
print("----------------------------------------")

Logistic Regression Training Accuracy: 0.879993999699985
Logistic Regression Validation Accuracy: 0.8408460211505288
Logistic Regression Testing Accuracy: 0.8480462011550288
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Logistic Regression - Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Logistic Regression Training Accuracy: 0.8770438521926096
Logistic Regression Validation Accuracy: 0.8715217880447012
Logistic Regression Testing Accuracy: 0.8470711767794195
----------------------------------------


4) Naive Bayes

In [22]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# First split: 80% train/valid, 20% test
X_temp, X_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Second split: 80% train, 20% validation from the 80% temp data
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 of 80%

# Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predictions
train_preds = nb_model.predict(X_train)
val_preds = nb_model.predict(X_val)
test_preds = nb_model.predict(X_test)

# Accuracy scores
train_accuracy = accuracy_score(y_train, train_preds)
validation_accuracy = accuracy_score(y_val, val_preds)
test_accuracy = accuracy_score(y_test, test_preds)

# Output
print("Naive Bayes Training Accuracy:", train_accuracy)
print("Naive Bayes Validation Accuracy:", validation_accuracy)
print("Naive Bayes Testing Accuracy:", test_accuracy)

# Common setup for all models
X_temp, X_test, y_temp, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Combine train and validation sets for GridSearchCV
X_train_val = vstack([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

# Naive Bayes Hyperparameter Tuning
nb_params = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],
    'fit_prior': [True, False]
}

nb_grid = GridSearchCV(MultinomialNB(),
                      nb_params,
                      cv=3,
                      n_jobs=-1,
                      verbose=1)
nb_grid.fit(X_train_val, y_train_val)

# Best model
best_nb = nb_grid.best_estimator_

# Predictions
nb_train_preds = best_nb.predict(X_train)
nb_val_preds = best_nb.predict(X_val)
nb_test_preds = best_nb.predict(X_test)

# Accuracy scores
nb_train_acc = accuracy_score(y_train, nb_train_preds)
nb_val_acc = accuracy_score(y_val, nb_val_preds)
nb_test_acc = accuracy_score(y_test, nb_test_preds)

print("Naive Bayes - Best Parameters:", nb_grid.best_params_)
print("Naive Bayes Training Accuracy:", nb_train_acc)
print("Naive Bayes Validation Accuracy:", nb_val_acc)
print("Naive Bayes Testing Accuracy:", nb_test_acc)
print("----------------------------------------")

Naive Bayes Training Accuracy: 0.8457922896144807
Naive Bayes Validation Accuracy: 0.8221705542638565
Naive Bayes Testing Accuracy: 0.828545713642841
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Naive Bayes - Best Parameters: {'alpha': 2.0, 'fit_prior': False}
Naive Bayes Training Accuracy: 0.8435421771088555
Naive Bayes Validation Accuracy: 0.8373209330233256
Naive Bayes Testing Accuracy: 0.8301207530188255
----------------------------------------


### English testing

In [26]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/English_cleaned.csv')
X = df['normalized_tweet']
y = df['target']

1) Random Forest

In [24]:
# Split into train+validation and test sets first (80% train_val, 20% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Then split train+validation into train and validation sets (80/20 split of train_val)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)  # 0.25 x 0.8 = 0.2, so final: 60% train, 20% val, 20% test

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,  # Number of trees
    random_state=42,
    n_jobs=-1  # Use all cores for training
)
rf_model.fit(X_train_tfidf, y_train)

# Predictions
y_train_pred = rf_model.predict(X_train_tfidf)
y_val_pred = rf_model.predict(X_val_tfidf)
y_test_pred = rf_model.predict(X_test_tfidf)

# Accuracy calculations
train_acc = accuracy_score(y_train, y_train_pred)
val_acc = accuracy_score(y_val, y_val_pred)
test_acc = accuracy_score(y_test, y_test_pred)

# Output results
print(f"Random Forest Train Accuracy: {train_acc:.4f}")
print(f"Random Forest Validation Accuracy: {val_acc:.4f}")
print(f"Random Forest Test Accuracy: {test_acc:.4f}")

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit grid search to the training data
grid_search.fit(X_train_tfidf, y_train)

# Best parameters and model
print("Best Hyperparameters:", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_

# Evaluate tuned model
y_train_pred_tuned = best_rf_model.predict(X_train_tfidf)
y_val_pred_tuned = best_rf_model.predict(X_val_tfidf)
y_test_pred_tuned = best_rf_model.predict(X_test_tfidf)

# Accuracy calculations for tuned model
train_acc_tuned = accuracy_score(y_train, y_train_pred_tuned)
val_acc_tuned = accuracy_score(y_val, y_val_pred_tuned)
test_acc_tuned = accuracy_score(y_test, y_test_pred_tuned)

# Output tuned model results
print(f"TUNED Random Forest Train Accuracy: {train_acc_tuned:.4f}")
print(f"TUNED Random Forest Validation Accuracy: {val_acc_tuned:.4f}")
print(f"TUNED Random Forest Test Accuracy: {test_acc_tuned:.4f}")

KeyboardInterrupt: 

2) SVM

In [27]:
# Split into train+validation and test sets first (80% train_val, 20% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Then split train+validation into train and validation sets (80/20 split of train_val)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)  # Final: 60% train, 20% val, 20% test

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Linear SVM Model
svm_model = LinearSVC(max_iter=10000, random_state=42)  # Increased max_iter for stability
svm_model.fit(X_train_tfidf, y_train)

# Predictions
y_train_pred = svm_model.predict(X_train_tfidf)
y_val_pred = svm_model.predict(X_val_tfidf)
y_test_pred = svm_model.predict(X_test_tfidf)

# Accuracy calculations
train_acc = accuracy_score(y_train, y_train_pred)
val_acc = accuracy_score(y_val, y_val_pred)
test_acc = accuracy_score(y_test, y_test_pred)

# Output results
print(f"SVM Train Accuracy: {train_acc:.4f}")
print(f"SVM Validation Accuracy: {val_acc:.4f}")
print(f"SVM Test Accuracy: {test_acc:.4f}")

# Define parameter grid
svm_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'loss': ['hinge', 'squared_hinge'],
    'max_iter': [20000, 30000]
}

# Set up GridSearchCV
svm_grid_search = GridSearchCV(
    LinearSVC(random_state=42),
    svm_param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit grid search to training data
svm_grid_search.fit(X_train_tfidf, y_train)

# Best parameters and model
print("Best SVM Hyperparameters:", svm_grid_search.best_params_)
best_svm_model = svm_grid_search.best_estimator_

# Predictions using the tuned model
y_train_pred_tuned = best_svm_model.predict(X_train_tfidf)
y_val_pred_tuned = best_svm_model.predict(X_val_tfidf)
y_test_pred_tuned = best_svm_model.predict(X_test_tfidf)

# Accuracy calculations for tuned model
train_acc_tuned = accuracy_score(y_train, y_train_pred_tuned)
val_acc_tuned = accuracy_score(y_val, y_val_pred_tuned)
test_acc_tuned = accuracy_score(y_test, y_test_pred_tuned)

# Output tuned model results
print(f"TUNED SVM Train Accuracy: {train_acc_tuned:.4f}")
print(f"TUNED SVM Validation Accuracy: {val_acc_tuned:.4f}")
print(f"TUNED SVM Test Accuracy: {test_acc_tuned:.4f}")

SVM Train Accuracy: 0.7900
SVM Validation Accuracy: 0.7856
SVM Test Accuracy: 0.7862
Fitting 3 folds for each of 16 candidates, totalling 48 fits


KeyboardInterrupt: 

3) Logistic Regression

In [None]:
# Split into train+validation and test sets first (80% train_val, 20% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Then split train+validation into train and validation sets (80/20 split of train_val)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)  # 0.25 x 0.8 = 0.2, so final: 60% train, 20% val, 20% test

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Predictions
y_train_pred = model.predict(X_train_tfidf)
y_val_pred = model.predict(X_val_tfidf)
y_test_pred = model.predict(X_test_tfidf)

# Accuracy calculations
train_acc = accuracy_score(y_train, y_train_pred)
val_acc = accuracy_score(y_val, y_val_pred)
test_acc = accuracy_score(y_test, y_test_pred)

# Output results
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Validation Accuracy: {val_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

# Hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [1000]
}

# Grid search setup
grid_search = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Train grid search on training data
grid_search.fit(X_train_tfidf, y_train)

# Best parameters and model
print("Best Logistic Regression Parameters:", grid_search.best_params_)
best_lr_model = grid_search.best_estimator_

# Predictions using tuned model
y_train_pred_tuned = best_lr_model.predict(X_train_tfidf)
y_val_pred_tuned = best_lr_model.predict(X_val_tfidf)
y_test_pred_tuned = best_lr_model.predict(X_test_tfidf)

# Accuracy calculations for tuned model
train_acc_tuned = accuracy_score(y_train, y_train_pred_tuned)
val_acc_tuned = accuracy_score(y_val, y_val_pred_tuned)
test_acc_tuned = accuracy_score(y_test, y_test_pred_tuned)

# Output tuned model results
print(f"TUNED Logistic Regression Train Accuracy: {train_acc_tuned:.4f}")
print(f"TUNED Logistic Regression Validation Accuracy: {val_acc_tuned:.4f}")
print(f"TUNED Logistic Regression Test Accuracy: {test_acc_tuned:.4f}")

4) Naive Bayes

In [None]:
# Split into train+validation and test sets first (80% train_val, 20% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Then split train+validation into train and validation sets (80/20 split of train_val)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)  # Final: 60% train, 20% val, 20% test

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Naive Bayes Model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predictions
y_train_pred = nb_model.predict(X_train_tfidf)
y_val_pred = nb_model.predict(X_val_tfidf)
y_test_pred = nb_model.predict(X_test_tfidf)

# Accuracy calculations
train_acc = accuracy_score(y_train, y_train_pred)
val_acc = accuracy_score(y_val, y_val_pred)
test_acc = accuracy_score(y_test, y_test_pred)

# Output results
print(f"Naive Bayes Train Accuracy: {train_acc:.4f}")
print(f"Naive Bayes Validation Accuracy: {val_acc:.4f}")
print(f"Naive Bayes Test Accuracy: {test_acc:.4f}")

# Parameter grid for Naive Bayes
nb_param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 2.0]  # Laplace smoothing
}

# Set up GridSearchCV
nb_grid_search = GridSearchCV(
    MultinomialNB(),
    nb_param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit grid search on training data
nb_grid_search.fit(X_train_tfidf, y_train)

# Best parameters and model
print("Best Naive Bayes Hyperparameters:", nb_grid_search.best_params_)
best_nb_model = nb_grid_search.best_estimator_

# Predictions using tuned model
y_train_pred_tuned = best_nb_model.predict(X_train_tfidf)
y_val_pred_tuned = best_nb_model.predict(X_val_tfidf)
y_test_pred_tuned = best_nb_model.predict(X_test_tfidf)

# Accuracy calculations for tuned model
train_acc_tuned = accuracy_score(y_train, y_train_pred_tuned)
val_acc_tuned = accuracy_score(y_val, y_val_pred_tuned)
test_acc_tuned = accuracy_score(y_test, y_test_pred_tuned)

# Output tuned model results
print(f"TUNED Naive Bayes Train Accuracy: {train_acc_tuned:.4f}")
print(f"TUNED Naive Bayes Validation Accuracy: {val_acc_tuned:.4f}")
print(f"TUNED Naive Bayes Test Accuracy: {test_acc_tuned:.4f}")