# **ML Classifiers with Count Vectorization & Undersampling**

In [1]:
try:
    # On Google Colab: mount Google Drive and change into the project folder.
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd 'drive/My Drive/WEB_MINING_PROJECT/ML_CLASSIFIERS'
except ImportError as e:
    # google.colab cannot be imported outside Colab; nothing is mounted and the
    # working directory is left unchanged.
    pass

Mounted at /content/drive/
/content/drive/.shortcut-targets-by-id/1EZ3t7HMzSmWcvRl80eFHiMW9UP3zMZxI/WEB_MINING_PROJECT/ML_CLASSIFIERS


In [17]:
! pip install optuna
! pip install imblearn



In [18]:
import random
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK resources used by the preprocessing helpers.
# 'punkt_tab' is fetched in addition to 'punkt' because recent NLTK releases
# load the Punkt tokenizer data from that package when word_tokenize is called;
# without it tokenization fails with a LookupError on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score, make_scorer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
import optuna

#not needed

from sklearn.utils.multiclass import unique_labels
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.linear_model import LogisticRegression





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading Downsampled Dataset

In [19]:
# Load the dataset (the CSV path is relative to the current working directory)
df = pd.read_csv("downsampled_dataset_10k.csv")

# **Text Preprocessing**

In [20]:
# Classification only uses the review text and its sentiment label,
# so every other column is dropped here.
columns_of_interest = ["review_body", "sentiment"]
df = df[columns_of_interest]
df.head(10)

Unnamed: 0,review_body,sentiment
0,Great tv signal very good buy.I like it,Positive
1,Good sound. Fine Material. Simply perfect!,Positive
2,looks good as shown lots of base. BTW fast shi...,Positive
3,This player is totally awesome! I has all the ...,Positive
4,Very quick delivery and high quality. Sound is...,Positive
5,Awesome,Positive
6,"Extremely nice quality, especially for the pri...",Positive
7,Identical to the one from Verizon that we some...,Positive
8,These are so handy and sound really awesome! P...,Positive
9,Works great!,Positive


In [21]:
# Text normalization
def normalize_text(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

# Removing punctuation (not used)
def remove_punctuation(text):
    """Delete every ASCII punctuation character from ``text``.

    The argument is coerced with ``str`` first, so non-string values are
    accepted. Requires the standard-library ``string`` module to be imported
    at the top of the notebook.

    Returns:
        The text with all characters of ``string.punctuation`` removed.
    """
    return str(text).translate(str.maketrans('', '', string.punctuation))

# Removing special characters (not used)
def remove_spec_char(text):
    """Replace non-alphanumeric characters with spaces and collapse whitespace.

    The argument is coerced with ``str`` first. Every character outside
    ``[a-zA-Z0-9]`` becomes a space, then each run of whitespace is reduced to
    a single space. Leading/trailing spaces are not stripped. Requires the
    standard-library ``re`` module to be imported at the top of the notebook.

    Returns:
        The cleaned text.
    """
    # Raw strings keep the backslash in the pattern; '\s' in a plain string
    # literal is an invalid escape sequence.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', str(text))
    return re.sub(r'\s+', ' ', text)

# Tokenization & Stopword removal
# Cache for the stop-word set so the NLTK corpus is read once instead of once
# per review when the helper is applied row by row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def remove_stopwords(text):
    """Tokenize ``text`` and drop tokens found in NLTK's English stop-word list.

    Tokens are produced by ``word_tokenize`` and compared to the stop-word list
    by exact string match; the remaining tokens are joined with single spaces.

    Returns:
        The filtered text as one space-separated string.
    """
    stop_words = _english_stop_words()
    return ' '.join(token for token in word_tokenize(text) if token not in stop_words)

# Lemmatization (not used)
def lemmatize_text(text):
    """Tokenize ``text`` and return its WordNet-lemmatized tokens joined by spaces."""
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(wordnet.lemmatize(token))
    return ' '.join(lemmas)

# Stemming
def stem_words(text):
    """Porter-stem each whitespace-separated word of ``text`` and rejoin with spaces."""
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in text.split()]
    return ' '.join(stems)


In [22]:
# Preprocessing applied to the review text, in this order:
#   1. lowercase
#   2. tokenize and drop English stop words
#   3. Porter stemming
# Punctuation removal, special-character removal and lemmatization are
# deliberately not part of this run.
df["review_body"] = (
    df["review_body"]
    .apply(normalize_text)
    .apply(remove_stopwords)
    .apply(stem_words)
)

In [23]:
# Preview the first rows after preprocessing
df.head(10)

Unnamed: 0,review_body,sentiment
0,great tv signal good buy.i like,Positive
1,good sound . fine materi . simpli perfect !,Positive
2,look good shown lot base . btw fast ship recei...,Positive
3,"player total awesom ! featur need , ( somehow ...",Positive
4,quick deliveri high qualiti . sound great cabl...,Positive
5,awesom,Positive
6,"extrem nice qualiti , especi price . boyfriend...",Positive
7,"ident one verizon somehow misplac , fraction c...",Positive
8,handi sound realli awesom ! perfect phone game...,Positive
9,work great !,Positive


## Train/Validation/Test Split

In [24]:
# Stratified split into train (80%), validation (10%) and test (10%) data.
# The intermediate frames get their own names so no variable changes meaning
# between lines and nothing is mutated in place.
train_df, holdout_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['sentiment']
)
val_df, test_df = train_test_split(
    holdout_df, test_size=0.5, random_state=42, stratify=holdout_df['sentiment']
)

# Targets
y_train = train_df['sentiment']
y_val = val_df['sentiment']
y_test = test_df['sentiment']

# Features: only the review text is used, so selecting that column directly
# replaces the earlier drop-then-select sequence.
X_train = train_df['review_body']
X_val = val_df['review_body']
X_test = test_df['review_body']

# Each line reports the split it actually describes (the validation line
# used to be labelled as the training split).
print (f"y_train: {y_train.shape}/ x_train: {X_train.shape}")
print (f"y_val: {y_val.shape}/ x_val: {X_val.shape}")
print (f"y_test: {y_test.shape}/ x_test: {X_test.shape}")



y_train: (7999,)/ x_train: (7999,)
y_train: (1000,)/ x_train: (1000,)
y_test: (1000,)/ x_test: (1000,)


## Count Vectorization  

In [25]:
# Bag-of-words counts: the vocabulary is learned from the training reviews only
# and then reused unchanged for the validation and test reviews.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_val_count, X_test_count = (
    count_vectorizer.transform(X_val),
    count_vectorizer.transform(X_test),
)

n_terms = len(count_vectorizer.get_feature_names_out())
print(f"number of terms: {n_terms}")

number of terms: 9115


## Balancing Methods (used in Pipeline)

In [26]:
# Candidate resamplers (both seeded). `balancing` selects the one that every
# model section below uses; this notebook runs with random undersampling.
undersampler = RandomUnderSampler(random_state=42)
oversampler = RandomOverSampler(random_state=42)
balancing = undersampler

## Naive Bayes

In [27]:
def objective_nb(trial):
    """Optuna objective for MultinomialNB: weighted F1 on the validation set.

    Samples the smoothing parameter ``alpha`` log-uniformly from [1e-3, 1e3],
    fits the classifier on the resampled training data
    (``X_train_count_resampled`` / ``y_train_resampled``, defined below before
    the study runs) and scores its predictions on ``X_val_count`` / ``y_val``.

    Returns:
        The weighted F1 score, which the study maximizes.
    """
    alpha = trial.suggest_float('alpha', 1e-3, 1e3, log = True)
    clf = MultinomialNB(alpha=alpha)
    clf.fit(X_train_count_resampled, y_train_resampled)
    return f1_score(y_val, clf.predict(X_val_count), average="weighted")


# Hyperparameter study. The sampler is seeded so repeated runs propose the same
# trials, matching the fixed random_state used everywhere else in the notebook.
study_nb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Naive Bayes pipeline: the selected balancing step followed by a
# default-parameter MultinomialNB. The objective above does not reference it.
nb_pipeline = Pipeline([
    ('balancing', balancing ),
    ('classifier', MultinomialNB())
])
nb_pipeline.fit(X_train_count, y_train)

# Balance the training data once; every trial fits on this resampled set.
X_train_count_resampled, y_train_resampled = balancing.fit_resample(X_train_count, y_train)
study_nb.optimize(objective_nb, n_trials=100)

# Report the best hyperparameters and the corresponding validation F1 score
print("Best Naive Bayes Hyperparameters:", study_nb.best_params)
print("Naive Bayes F1 Score:", study_nb.best_value)



[I 2024-05-17 02:19:18,900] A new study created in memory with name: no-name-32d32098-98bb-402c-8fcf-a080da826843
[I 2024-05-17 02:19:19,025] Trial 0 finished with value: 0.34682376675559007 and parameters: {'alpha': 104.37826058477347}. Best is trial 0 with value: 0.34682376675559007.
[I 2024-05-17 02:19:19,048] Trial 1 finished with value: 0.6784910371712016 and parameters: {'alpha': 0.023476782634126624}. Best is trial 1 with value: 0.6784910371712016.
[I 2024-05-17 02:19:19,077] Trial 2 finished with value: 0.4329253991606772 and parameters: {'alpha': 12.459254561487965}. Best is trial 1 with value: 0.6784910371712016.
[I 2024-05-17 02:19:19,092] Trial 3 finished with value: 0.682445451227079 and parameters: {'alpha': 0.03366653594260328}. Best is trial 3 with value: 0.682445451227079.
[I 2024-05-17 02:19:19,109] Trial 4 finished with value: 0.6695508992306336 and parameters: {'alpha': 0.004847445363016423}. Best is trial 3 with value: 0.682445451227079.
[I 2024-05-17 02:19:19,131]

[I 2024-05-17 02:19:20,253] Trial 47 finished with value: 0.595795811019026 and parameters: {'alpha': 1.3520295488997198}. Best is trial 16 with value: 0.705816976498255.
[I 2024-05-17 02:19:20,277] Trial 48 finished with value: 0.6961834734297196 and parameters: {'alpha': 0.26083758369505583}. Best is trial 16 with value: 0.705816976498255.
[I 2024-05-17 02:19:20,308] Trial 49 finished with value: 0.3370131751792321 and parameters: {'alpha': 296.24889756278725}. Best is trial 16 with value: 0.705816976498255.
[I 2024-05-17 02:19:20,335] Trial 50 finished with value: 0.6718623040171496 and parameters: {'alpha': 0.42530393882159545}. Best is trial 16 with value: 0.705816976498255.
[I 2024-05-17 02:19:20,358] Trial 51 finished with value: 0.7048658131071138 and parameters: {'alpha': 0.19649167020921743}. Best is trial 16 with value: 0.705816976498255.
[I 2024-05-17 02:19:20,385] Trial 52 finished with value: 0.702816711613821 and parameters: {'alpha': 0.21028956767790577}. Best is trial 

[I 2024-05-17 02:19:21,620] Trial 95 finished with value: 0.6547380967536724 and parameters: {'alpha': 0.6509926242594879}. Best is trial 79 with value: 0.7058183148763977.
[I 2024-05-17 02:19:21,660] Trial 96 finished with value: 0.35755066111808675 and parameters: {'alpha': 66.41579800485185}. Best is trial 79 with value: 0.7058183148763977.
[I 2024-05-17 02:19:21,694] Trial 97 finished with value: 0.697218773895741 and parameters: {'alpha': 0.2515238234214491}. Best is trial 79 with value: 0.7058183148763977.
[I 2024-05-17 02:19:21,724] Trial 98 finished with value: 0.6718623040171496 and parameters: {'alpha': 0.42354045841598664}. Best is trial 79 with value: 0.7058183148763977.
[I 2024-05-17 02:19:21,756] Trial 99 finished with value: 0.6871982508799073 and parameters: {'alpha': 0.054477228419215394}. Best is trial 79 with value: 0.7058183148763977.


Best Naive Bayes Hyperparameters: {'alpha': 0.18966123526617917}
Naive Bayes F1 Score: 0.7058183148763977


### Evaluating best model on test set

In [28]:
# Refit Naive Bayes on the resampled training data with the alpha chosen by the study
best_alpha = study_nb.best_params['alpha']
nb_model = MultinomialNB(alpha=best_alpha)
nb_model.fit(X_train_count_resampled, y_train_resampled)

# Score the refitted model on the held-out test set
nb_pred = nb_model.predict(X_test_count)
f1_test = f1_score(y_test, nb_pred, average='weighted')

print("Naive Bayes F1 score on test set:", f1_test)
print("\nNB Classification Report:", classification_report(y_test, nb_pred), sep="\n")

Naive Bayes F1 score on test set: 0.6972236893902174

NB Classification Report:
              precision    recall  f1-score   support

    Negative       0.50      0.63      0.56       168
     Neutral       0.14      0.52      0.22        69
    Positive       0.94      0.65      0.77       763

    accuracy                           0.64      1000
   macro avg       0.53      0.60      0.52      1000
weighted avg       0.81      0.64      0.70      1000



## SVM

In [29]:
def objective_svm(trial):
    """Optuna objective for SVC: weighted F1 on the validation set.

    Samples ``C`` and ``gamma`` log-uniformly from [1e-2, 1e3], fits the
    classifier on the resampled training data (``X_train_count_resampled`` /
    ``y_train_resampled``, defined below before the study runs) and scores its
    predictions on ``X_val_count`` / ``y_val``.

    Returns:
        The weighted F1 score, which the study maximizes.
    """
    C = trial.suggest_float('C', 1e-2, 1e3, log = True)
    # NOTE(review): in the recorded run the best gamma sat close to the lower
    # bound of this range; consider widening it downwards.
    gamma = trial.suggest_float('gamma', 1e-2, 1e3, log = True)
    clf = SVC(C=C, gamma=gamma, random_state=42)
    clf.fit(X_train_count_resampled, y_train_resampled)
    return f1_score(y_val, clf.predict(X_val_count), average="weighted")


# Hyperparameter study. The sampler is seeded so repeated runs propose the same
# trials, matching the fixed random_state used everywhere else in the notebook.
study_svm = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# SVM pipeline: the selected balancing step followed by a default-parameter
# SVC. The objective above does not reference it.
svm_pipeline = Pipeline([
    ('balancing', balancing),
    ('classifier', SVC(random_state=42))
])
svm_pipeline.fit(X_train_count, y_train)

# Balance the training data once; every trial fits on this resampled set.
X_train_count_resampled, y_train_resampled = balancing.fit_resample(X_train_count, y_train)
study_svm.optimize(objective_svm, n_trials=100)

# Report the best hyperparameters and the corresponding validation F1 score
print("Best SVM Hyperparameters:", study_svm.best_params)
print("SVM F1 Score:", study_svm.best_value)



[I 2024-05-17 02:19:41,033] A new study created in memory with name: no-name-7b0e9126-01a1-4799-b8ff-d441f6e894ea
[I 2024-05-17 02:19:44,165] Trial 0 finished with value: 0.4997032933436426 and parameters: {'C': 443.7222373331844, 'gamma': 0.4520785539824102}. Best is trial 0 with value: 0.4997032933436426.
[I 2024-05-17 02:19:46,018] Trial 1 finished with value: 0.6346936426891764 and parameters: {'C': 0.1506338171734979, 'gamma': 0.06782996124063874}. Best is trial 1 with value: 0.6346936426891764.
[I 2024-05-17 02:19:48,257] Trial 2 finished with value: 0.14333766608193596 and parameters: {'C': 1.4872654682681017, 'gamma': 123.21695955059664}. Best is trial 1 with value: 0.6346936426891764.
[I 2024-05-17 02:19:50,133] Trial 3 finished with value: 0.568788260642829 and parameters: {'C': 192.228531728834, 'gamma': 0.21493831929969673}. Best is trial 1 with value: 0.6346936426891764.
[I 2024-05-17 02:19:52,103] Trial 4 finished with value: 0.14333766608193596 and parameters: {'C': 0.54

[I 2024-05-17 02:20:55,254] Trial 41 finished with value: 0.7077583724397135 and parameters: {'C': 26.765765562519736, 'gamma': 0.016826885607149823}. Best is trial 16 with value: 0.7290286437492265.
[I 2024-05-17 02:20:56,798] Trial 42 finished with value: 0.7287827418561094 and parameters: {'C': 2.322027846712701, 'gamma': 0.016467137739960353}. Best is trial 16 with value: 0.7290286437492265.
[I 2024-05-17 02:20:58,332] Trial 43 finished with value: 0.727647819200861 and parameters: {'C': 2.961045099363011, 'gamma': 0.019331179376284265}. Best is trial 16 with value: 0.7290286437492265.
[I 2024-05-17 02:20:59,901] Trial 44 finished with value: 0.7245747739089766 and parameters: {'C': 1.9769035890987032, 'gamma': 0.019675810392378777}. Best is trial 16 with value: 0.7290286437492265.
[I 2024-05-17 02:21:01,834] Trial 45 finished with value: 0.11571242528516032 and parameters: {'C': 0.4696108539023888, 'gamma': 13.750168644299501}. Best is trial 16 with value: 0.7290286437492265.
[I 2

[I 2024-05-17 02:22:03,563] Trial 83 finished with value: 0.7022527349279573 and parameters: {'C': 1.8617177349318812, 'gamma': 0.034064653051583446}. Best is trial 74 with value: 0.746288820878101.
[I 2024-05-17 02:22:05,400] Trial 84 finished with value: 0.15283591500451882 and parameters: {'C': 5.968872934064419, 'gamma': 5.5579277298827385}. Best is trial 74 with value: 0.746288820878101.
[I 2024-05-17 02:22:07,013] Trial 85 finished with value: 0.7164055198499394 and parameters: {'C': 0.6850296620455192, 'gamma': 0.014315875642548477}. Best is trial 74 with value: 0.746288820878101.
[I 2024-05-17 02:22:08,673] Trial 86 finished with value: 0.7138979726774288 and parameters: {'C': 3.862049309929031, 'gamma': 0.027263807641946122}. Best is trial 74 with value: 0.746288820878101.
[I 2024-05-17 02:22:10,172] Trial 87 finished with value: 0.7348717241379309 and parameters: {'C': 9.657208621817851, 'gamma': 0.010051833842330094}. Best is trial 74 with value: 0.746288820878101.
[I 2024-0

Best SVM Hyperparameters: {'C': 1.2463864804887763, 'gamma': 0.010718573861398862}
SVM F1 Score: 0.746288820878101


### Evaluating best model on test set

In [30]:
# Refit the SVM on the resampled training data with the parameters chosen by the study
best_svm_params = study_svm.best_params
svm_model = SVC(random_state=42, **best_svm_params)
svm_model.fit(X_train_count_resampled, y_train_resampled)

# Score the refitted model on the held-out test set
svm_pred = svm_model.predict(X_test_count)
f1_test = f1_score(y_test, svm_pred, average='weighted')

print("SVM F1 score on test set:", f1_test)
print("\nSVM Classification Report:", classification_report(y_test, svm_pred), sep="\n")

SVM F1 score on test set: 0.7464067696259112

SVM Classification Report:
              precision    recall  f1-score   support

    Negative       0.63      0.59      0.61       168
     Neutral       0.13      0.36      0.19        69
    Positive       0.89      0.77      0.83       763

    accuracy                           0.71      1000
   macro avg       0.55      0.57      0.54      1000
weighted avg       0.80      0.71      0.75      1000



## Random Forest

In [31]:

def objective_rf(trial):
    """Optuna objective for RandomForestClassifier: weighted F1 on the validation set.

    Samples ``n_estimators`` (50-200), ``max_depth`` (5-50),
    ``min_samples_split`` (2-20) and ``min_samples_leaf`` (1-10), fits the
    forest on the resampled training data (``X_train_count_resampled`` /
    ``y_train_resampled``, defined below before the study runs) and scores its
    predictions on ``X_val_count`` / ``y_val``.

    Returns:
        The weighted F1 score, which the study maximizes.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                  min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                  random_state=42)
    clf.fit(X_train_count_resampled, y_train_resampled)
    return f1_score(y_val, clf.predict(X_val_count), average="weighted")


# Hyperparameter study. The sampler is seeded so repeated runs propose the same
# trials, matching the fixed random_state used everywhere else in the notebook.
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Random Forest pipeline: the selected balancing step followed by a
# default-parameter forest. The objective above does not reference it.
rf_pipeline = Pipeline([
    ('balancing', balancing),
    ('classifier', RandomForestClassifier(random_state=42))
])
rf_pipeline.fit(X_train_count, y_train)

# Balance the training data once; every trial fits on this resampled set.
X_train_count_resampled, y_train_resampled = balancing.fit_resample(X_train_count, y_train)
study_rf.optimize(objective_rf, n_trials=100)

# Report the best hyperparameters and the corresponding validation F1 score
print("Best Random Forest Hyperparameters:", study_rf.best_params)
print("Random Forest F1 Score:", study_rf.best_value)


[I 2024-05-17 02:25:49,384] A new study created in memory with name: no-name-619f58d7-e8be-4e00-a8a2-06d1a3d5ef2b
[I 2024-05-17 02:26:07,630] Trial 0 finished with value: 0.7420753633685284 and parameters: {'n_estimators': 184, 'max_depth': 23, 'min_samples_split': 3, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.7420753633685284.
[I 2024-05-17 02:26:08,189] Trial 1 finished with value: 0.7278914275515465 and parameters: {'n_estimators': 81, 'max_depth': 11, 'min_samples_split': 11, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.7420753633685284.
[I 2024-05-17 02:26:09,131] Trial 2 finished with value: 0.7291871800409415 and parameters: {'n_estimators': 109, 'max_depth': 35, 'min_samples_split': 4, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.7420753633685284.
[I 2024-05-17 02:26:10,136] Trial 3 finished with value: 0.7475344309348314 and parameters: {'n_estimators': 68, 'max_depth': 34, 'min_samples_split': 18, 'min_samples_leaf': 4}. Best is trial 3 with valu

[I 2024-05-17 02:27:02,043] Trial 36 finished with value: 0.7394322905383609 and parameters: {'n_estimators': 96, 'max_depth': 34, 'min_samples_split': 5, 'min_samples_leaf': 4}. Best is trial 4 with value: 0.7571985356306064.
[I 2024-05-17 02:27:05,886] Trial 37 finished with value: 0.7419482248409862 and parameters: {'n_estimators': 107, 'max_depth': 26, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 4 with value: 0.7571985356306064.
[I 2024-05-17 02:27:08,690] Trial 38 finished with value: 0.738697310736308 and parameters: {'n_estimators': 123, 'max_depth': 31, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 4 with value: 0.7571985356306064.
[I 2024-05-17 02:27:09,367] Trial 39 finished with value: 0.7251254189905916 and parameters: {'n_estimators': 79, 'max_depth': 21, 'min_samples_split': 16, 'min_samples_leaf': 10}. Best is trial 4 with value: 0.7571985356306064.
[I 2024-05-17 02:27:10,710] Trial 40 finished with value: 0.7509902200455736 and paramete

[I 2024-05-17 02:28:05,302] Trial 72 finished with value: 0.7487904102310705 and parameters: {'n_estimators': 133, 'max_depth': 18, 'min_samples_split': 14, 'min_samples_leaf': 3}. Best is trial 41 with value: 0.7596463083169482.
[I 2024-05-17 02:28:07,688] Trial 73 finished with value: 0.7573251687134184 and parameters: {'n_estimators': 151, 'max_depth': 22, 'min_samples_split': 13, 'min_samples_leaf': 3}. Best is trial 41 with value: 0.7596463083169482.
[I 2024-05-17 02:28:09,897] Trial 74 finished with value: 0.7415996911656317 and parameters: {'n_estimators': 148, 'max_depth': 22, 'min_samples_split': 15, 'min_samples_leaf': 2}. Best is trial 41 with value: 0.7596463083169482.
[I 2024-05-17 02:28:11,904] Trial 75 finished with value: 0.7564111182934712 and parameters: {'n_estimators': 161, 'max_depth': 20, 'min_samples_split': 17, 'min_samples_leaf': 3}. Best is trial 41 with value: 0.7596463083169482.
[I 2024-05-17 02:28:13,835] Trial 76 finished with value: 0.7472107394847698 and

Best Random Forest Hyperparameters: {'n_estimators': 112, 'max_depth': 22, 'min_samples_split': 4, 'min_samples_leaf': 3}
Random Forest F1 Score: 0.7596463083169482


### Evaluating best model on test set

In [32]:
# Refit the forest on the resampled training data with the parameters chosen by the study
best_rf_params = study_rf.best_params
rf_model = RandomForestClassifier(random_state=42, **best_rf_params)
rf_model.fit(X_train_count_resampled, y_train_resampled)

# Score the refitted model on the held-out test set
rf_pred = rf_model.predict(X_test_count)
f1_test = f1_score(y_test, rf_pred, average='weighted')

print("Random Forest F1 score on test set:", f1_test)
print("\nRandom Forest Classification Report:", classification_report(y_test, rf_pred), sep="\n")

Random Forest F1 score on test set: 0.7583059768458279

Random Forest Classification Report:
              precision    recall  f1-score   support

    Negative       0.66      0.56      0.61       168
     Neutral       0.15      0.39      0.22        69
    Positive       0.89      0.79      0.84       763

    accuracy                           0.73      1000
   macro avg       0.57      0.58      0.55      1000
weighted avg       0.80      0.73      0.76      1000

