# **ML Classifiers with Count Vectorization & Oversampling**

In [1]:
try:
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd 'drive/My Drive/WEB_MINING_PROJECT/ML_CLASSIFIERS'
except ImportError as e:
    pass

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/WEB_MINING_PROJECT/ML_CLASSIFIERS


In [2]:
! pip install optuna
! pip install imblearn



In [3]:
import pandas as pd
import nltk
import random
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score, make_scorer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
import optuna

#not needed

from sklearn.utils.multiclass import unique_labels
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.linear_model import LogisticRegression





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading Downsampled Dataset

In [4]:
# Load the dataset
df = pd.read_csv("downsampled_dataset_10k.csv")

## **Text Preprocessing**

In [5]:
# Actually we only need the content(i.e. review) & the sentiment for classification
df = df[["review_body", "sentiment"]]
df.head(10)

Unnamed: 0,review_body,sentiment
0,Great tv signal very good buy.I like it,Positive
1,Good sound. Fine Material. Simply perfect!,Positive
2,looks good as shown lots of base. BTW fast shi...,Positive
3,This player is totally awesome! I has all the ...,Positive
4,Very quick delivery and high quality. Sound is...,Positive
5,Awesome,Positive
6,"Extremely nice quality, especially for the pri...",Positive
7,Identical to the one from Verizon that we some...,Positive
8,These are so handy and sound really awesome! P...,Positive
9,Works great!,Positive


In [6]:
# Text normalization
def normalize_text(text):
    # Convert text to lowercase
    text = text.lower()
    return text

# Removing punctuation (not used)
def remove_punctuation(text):
    text = str(text)
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))

# Removing special characters (not used)
def remove_spec_char(text):
    text = str(text)
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

# Tokenization & Stopword removal
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return ' '.join([word for word in tokens if word not in stop_words])

# Lemmatization (not used)
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    #return lemmatized_tokens

# Stemming
def stem_words(text):
    ps = PorterStemmer()
    return ' '.join([ps.stem(word) for word in text.split()])


In [7]:
# Apply preprocessing steps

# lowercase words, remove punctuation & special characters
df["review_body"] = df["review_body"].apply(normalize_text)
#df["review_body"] = df["review_body"].apply(remove_punctuation)
#df["review_body"] = df["review_body"].apply(remove_spec_char)

#tokenization and stopwords removal, stemming or lemmatization
df["review_body"] = df["review_body"].apply(remove_stopwords)
#df["review_body"] = df["review_body"].apply(lemmatize_text)
df["review_body"] = df["review_body"].apply(stem_words)

In [8]:
df.head(10)

Unnamed: 0,review_body,sentiment
0,great tv signal good buy.i like,Positive
1,good sound . fine materi . simpli perfect !,Positive
2,look good shown lot base . btw fast ship recei...,Positive
3,"player total awesom ! featur need , ( somehow ...",Positive
4,quick deliveri high qualiti . sound great cabl...,Positive
5,awesom,Positive
6,"extrem nice qualiti , especi price . boyfriend...",Positive
7,"ident one verizon somehow misplac , fraction c...",Positive
8,handi sound realli awesom ! perfect phone game...,Positive
9,work great !,Positive


## Train/Validation/Test Split

In [9]:
# split into train, val and test data
X_train, X_test = train_test_split(df, test_size=0.2, random_state = 42, stratify=df['sentiment'])
X_val, X_test = train_test_split(X_test, test_size=0.5, random_state = 42, stratify=X_test['sentiment'])


y_train = X_train['sentiment']
y_val = X_val['sentiment']
y_test = X_test['sentiment']


X_train.drop(columns = ['sentiment'], inplace= True)
X_val.drop(columns = ['sentiment'], inplace= True)
X_test.drop(columns = ['sentiment'], inplace= True)
X_train = X_train['review_body']
X_val = X_val['review_body']
X_test = X_test['review_body']


print (f"y_train: {y_train.shape}/ x_train: {X_train.shape}")
print (f"y_train: {y_val.shape}/ x_train: {X_val.shape}")
print (f"y_test: {y_test.shape}/ x_test: {X_test.shape}")



y_train: (7999,)/ x_train: (7999,)
y_train: (1000,)/ x_train: (1000,)
y_test: (1000,)/ x_test: (1000,)


## Count Vectorization  

In [10]:
# creating count vectors
count_vectorizer = CountVectorizer()

X_train_count = count_vectorizer.fit_transform(X_train)
X_val_count = count_vectorizer.transform(X_val)
X_test_count = count_vectorizer.transform(X_test)

print(f"number of terms: {len(count_vectorizer.get_feature_names_out())}")

number of terms: 9114


## Balancing Methods (used in Pipeline)

In [11]:
# Define oversampling and undersampling techniques
oversampler = RandomOverSampler(random_state=42)
undersampler = RandomUnderSampler(random_state=42)
balancing = oversampler

## Naive Bayes

In [12]:
# Define the objective function for hyperparameter optimization
def objective_nb(trial):
    alpha = trial.suggest_float('alpha', 1e-3, 1e3, log = True)
    clf = MultinomialNB(alpha=alpha)
    clf.fit(X_train_count_resampled, y_train_resampled)
    f1 = f1_score(y_val, clf.predict(X_val_count), average="weighted")
    return f1


# Define the hyperparameter optimization study
study_nb = optuna.create_study(direction='maximize')

# Define Naive Bayes pipeline with oversampling
nb_pipeline = Pipeline([
    ('balancing', balancing ),
    ('classifier', MultinomialNB())
])


# Fit and optimize Naive Bayes pipeline
nb_pipeline.fit(X_train_count, y_train)
X_train_count_resampled, y_train_resampled = balancing.fit_resample(X_train_count, y_train)
study_nb.optimize(objective_nb, n_trials=100)



# Print the best hyperparameters and corresponding accuracies
print("Best Naive Bayes Hyperparameters:", study_nb.best_params)
print("Naive Bayes F1 Score:", study_nb.best_value)



[I 2024-05-16 17:39:06,336] A new study created in memory with name: no-name-7f59e0ba-5fad-48dd-9796-e76654cc2749
[I 2024-05-16 17:39:06,469] Trial 0 finished with value: 0.47770706929898765 and parameters: {'alpha': 84.55325033762426}. Best is trial 0 with value: 0.47770706929898765.
[I 2024-05-16 17:39:06,518] Trial 1 finished with value: 0.39292327276299216 and parameters: {'alpha': 386.36305166545765}. Best is trial 0 with value: 0.47770706929898765.
[I 2024-05-16 17:39:06,568] Trial 2 finished with value: 0.6396796788400535 and parameters: {'alpha': 17.60332307226462}. Best is trial 2 with value: 0.6396796788400535.
[I 2024-05-16 17:39:06,619] Trial 3 finished with value: 0.7603244680851063 and parameters: {'alpha': 0.00844999098023753}. Best is trial 3 with value: 0.7603244680851063.
[I 2024-05-16 17:39:06,670] Trial 4 finished with value: 0.45485599759982137 and parameters: {'alpha': 115.99142409225333}. Best is trial 3 with value: 0.7603244680851063.
[I 2024-05-16 17:39:06,719]

Best Naive Bayes Hyperparameters: {'alpha': 1.1570636110086092}
Naive Bayes F1 Score: 0.7792510369798245


### Evaluating Model with best Hyperparameters on Test set 

In [13]:
# Get the best hyperparameters
best_alpha = study_nb.best_params['alpha']

# Train the model with the best hyperparameters
nb_model = MultinomialNB(alpha=best_alpha)
nb_model.fit(X_train_count_resampled, y_train_resampled)

# Evaluate the final model on the test set
nb_pred = nb_model.predict(X_test_count)

f1_test = f1_score(y_test, nb_pred, average='micro')
print("Naive Bayes F1 score on test set:", f1_test)
print("\nNB Classification Report:")
print(classification_report(y_test, nb_pred))

Naive Bayes F1 score on test set: 0.7699999999999999

NB Classification Report:
              precision    recall  f1-score   support

    Negative       0.58      0.73      0.65       168
     Neutral       0.18      0.28      0.22        69
    Positive       0.92      0.82      0.87       763

    accuracy                           0.77      1000
   macro avg       0.56      0.61      0.58      1000
weighted avg       0.81      0.77      0.79      1000



## SVM

In [18]:
# Define the objective function for hyperparameter optimization
def objective_svm(trial):
    C = trial.suggest_float('C', 1e-2, 1e3, log = True)
    gamma = trial.suggest_float('gamma', 1e-2, 1e3, log = True)
    clf = SVC(C=C, gamma=gamma, random_state=42)
    clf.fit(X_train_count_resampled, y_train_resampled)
    f1 = f1_score(y_val, clf.predict(X_val_count), average="weighted")
    return f1

# Define the hyperparameter optimization study
study_svm = optuna.create_study(direction='maximize')

# Define SVM pipeline with oversampling
svm_pipeline = Pipeline([
    ('balancing', balancing),
    ('classifier', SVC(random_state=42))
])

# Fit and optimize SVM pipeline
svm_pipeline.fit(X_train_count, y_train)
X_train_count_resampled, y_train_resampled = balancing.fit_resample(X_train_count, y_train)
study_svm.optimize(objective_svm, n_trials=100)


# Print the best hyperparameters and corresponding accuracies
print("Best SVM Hyperparameters:", study_svm.best_params)
print("SVM F1 Score:", study_svm.best_value)



[I 2024-05-16 17:47:00,812] A new study created in memory with name: no-name-9b78cb16-9a81-450c-8605-272fb755a7e9
[I 2024-05-16 17:47:57,991] Trial 0 finished with value: 0.795730080581036 and parameters: {'C': 12.823795457517726, 'gamma': 0.03400040593618988}. Best is trial 0 with value: 0.795730080581036.
[I 2024-05-16 17:48:26,363] Trial 1 finished with value: 0.8040227050244733 and parameters: {'C': 4.077006319748199, 'gamma': 0.010763681180202257}. Best is trial 1 with value: 0.8040227050244733.
[I 2024-05-16 17:49:48,909] Trial 2 finished with value: 0.07049712860817192 and parameters: {'C': 0.014276195044310212, 'gamma': 45.80548872080507}. Best is trial 1 with value: 0.8040227050244733.
[I 2024-05-16 17:51:22,763] Trial 3 finished with value: 0.6775139747962156 and parameters: {'C': 0.5086699610953508, 'gamma': 195.97239627838286}. Best is trial 1 with value: 0.8040227050244733.
[I 2024-05-16 17:52:54,367] Trial 4 finished with value: 0.6775139747962156 and parameters: {'C': 10

Best SVM Hyperparameters: {'C': 14.304497187304761, 'gamma': 0.013550621221294414}
SVM F1 Score: 0.8119165292308386


### Evaluating Model with best Hyperparameters on Test set 

In [19]:
# Get the best hyperparameters and retrain the model on the training set
best_svm_params = study_svm.best_params
svm_model = SVC(**best_svm_params, random_state=42)
svm_model.fit(X_train_count_resampled, y_train_resampled)

# Evaluate the model on the test set
svm_pred = svm_model.predict(X_test_count)

f1_test = f1_score(y_test, svm_pred, average='micro')
print("SVM F1 score on test set:", f1_test)
print("\nSVM Classification Report:")
print(classification_report(y_test, svm_pred))

SVM F1 score on test set: 0.818

SVM Classification Report:
              precision    recall  f1-score   support

    Negative       0.65      0.73      0.69       168
     Neutral       0.21      0.16      0.18        69
    Positive       0.90      0.90      0.90       763

    accuracy                           0.82      1000
   macro avg       0.59      0.59      0.59      1000
weighted avg       0.81      0.82      0.81      1000



## Random Forest

In [16]:

# Define the objective function for hyperparameter optimization

def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                  min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                  random_state=42)
    clf.fit(X_train_count_resampled, y_train_resampled)
    f1 = f1_score(y_val, clf.predict(X_val_count), average="weighted")
    return f1

# Define the hyperparameter optimization study
study_rf = optuna.create_study(direction='maximize')


# Define Random Forest pipeline with oversampling
rf_pipeline = Pipeline([
    ('balancing', balancing),
    ('classifier', RandomForestClassifier(random_state=42))
])


# Fit and optimize Random Forest pipeline
rf_pipeline.fit(X_train_count, y_train)
X_train_count_resampled, y_train_resampled = balancing.fit_resample(X_train_count, y_train)
study_rf.optimize(objective_rf, n_trials=100)

# Print the best hyperparameters and corresponding accuracies
print("Best Random Forest Hyperparameters:", study_rf.best_params)
print("Random Forest F1 Score:", study_rf.best_value)


[I 2024-05-16 17:41:43,192] A new study created in memory with name: no-name-12656549-c92f-4928-926a-9c8f9d38dcbf
[I 2024-05-16 17:42:06,207] Trial 0 finished with value: 0.8131772017190239 and parameters: {'n_estimators': 88, 'max_depth': 45, 'min_samples_split': 12, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8131772017190239.
[I 2024-05-16 17:42:08,799] Trial 1 finished with value: 0.8049921214679291 and parameters: {'n_estimators': 163, 'max_depth': 30, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8131772017190239.
[I 2024-05-16 17:42:11,831] Trial 2 finished with value: 0.8024669765907176 and parameters: {'n_estimators': 165, 'max_depth': 46, 'min_samples_split': 10, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8131772017190239.
[I 2024-05-16 17:42:12,231] Trial 3 finished with value: 0.7637932402766392 and parameters: {'n_estimators': 75, 'max_depth': 8, 'min_samples_split': 18, 'min_samples_leaf': 5}. Best is trial 0 with value

Best Random Forest Hyperparameters: {'n_estimators': 112, 'max_depth': 38, 'min_samples_split': 12, 'min_samples_leaf': 2}
Random Forest F1 Score: 0.8225994748047674


### Evaluating Model with best Hyperparameters on Test set  

In [17]:
# Get the best hyperparameters and retrain the model on the training set
best_rf_params = study_rf.best_params
rf_model = RandomForestClassifier(**best_rf_params, random_state=42)
rf_model.fit(X_train_count_resampled, y_train_resampled)

# Evaluate the model on the test set
rf_pred = rf_model.predict(X_test_count)

f1_test = f1_score(y_test, rf_pred, average='micro')
print("Random Forest F1 score on test set:", f1_test)
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_pred))

Random Forest F1 score on test set: 0.827

Random Forest Classification Report:
              precision    recall  f1-score   support

    Negative       0.66      0.69      0.67       168
     Neutral       0.29      0.12      0.16        69
    Positive       0.88      0.92      0.90       763

    accuracy                           0.83      1000
   macro avg       0.61      0.58      0.58      1000
weighted avg       0.80      0.83      0.81      1000

