# **ML Classifiers with Tf-Idf Vectorization & Oversampling**

In [1]:
try:
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd 'drive/My Drive/WEB_MINING_PROJECT/ML_CLASSIFIERS'
except ImportError as e:
    pass

Mounted at /content/drive/
/content/drive/My Drive/WEB_MINING_PROJECT/ML_CLASSIFIERS


In [2]:
! pip install optuna
! pip install imblearn

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully ins

In [3]:
import pandas as pd
import nltk
import random
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score, make_scorer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
import optuna

#not needed

from sklearn.utils.multiclass import unique_labels
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.linear_model import LogisticRegression





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Loading Downsampled Dataset

In [4]:
# Load the dataset
df = pd.read_csv("downsampled_dataset_10k.csv")

# **Text Preprocessing**

In [5]:
# Actually we only need the content(i.e. review) & the sentiment for classification
df = df[["review_body", "sentiment"]]
df.head(10)

Unnamed: 0,review_body,sentiment
0,Great tv signal very good buy.I like it,Positive
1,Good sound. Fine Material. Simply perfect!,Positive
2,looks good as shown lots of base. BTW fast shi...,Positive
3,This player is totally awesome! I has all the ...,Positive
4,Very quick delivery and high quality. Sound is...,Positive
5,Awesome,Positive
6,"Extremely nice quality, especially for the pri...",Positive
7,Identical to the one from Verizon that we some...,Positive
8,These are so handy and sound really awesome! P...,Positive
9,Works great!,Positive


In [6]:
# Text normalization
def normalize_text(text):
    # Convert text to lowercase
    text = text.lower()
    return text

# Removing punctuation (not used)
def remove_punctuation(text):
    text = str(text)
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))

# Removing special characters (not used)
def remove_spec_char(text):
    text = str(text)
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

# Tokenization & Stopword removal
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return ' '.join([word for word in tokens if word not in stop_words])

# Lemmatization (not used)
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    #return lemmatized_tokens

# Stemming
def stem_words(text):
    ps = PorterStemmer()
    return ' '.join([ps.stem(word) for word in text.split()])


In [7]:
# Apply preprocessing steps

# lowercase words, remove punctuation & special characters
df["review_body"] = df["review_body"].apply(normalize_text)
#df["review_body"] = df["review_body"].apply(remove_punctuation)
#df["review_body"] = df["review_body"].apply(remove_spec_char)

#tokenization and stopwords removal, stemming or lemmatization
df["review_body"] = df["review_body"].apply(remove_stopwords)
#df["review_body"] = df["review_body"].apply(lemmatize_text)
df["review_body"] = df["review_body"].apply(stem_words)

In [8]:
df.head(10)

Unnamed: 0,review_body,sentiment
0,great tv signal good buy.i like,Positive
1,good sound . fine materi . simpli perfect !,Positive
2,look good shown lot base . btw fast ship recei...,Positive
3,"player total awesom ! featur need , ( somehow ...",Positive
4,quick deliveri high qualiti . sound great cabl...,Positive
5,awesom,Positive
6,"extrem nice qualiti , especi price . boyfriend...",Positive
7,"ident one verizon somehow misplac , fraction c...",Positive
8,handi sound realli awesom ! perfect phone game...,Positive
9,work great !,Positive


## Train/Validation/Test Split

In [9]:
# split into train, val and test data
X_train, X_test = train_test_split(df, test_size=0.2, random_state = 42, stratify=df['sentiment'])
X_val, X_test = train_test_split(X_test, test_size=0.5, random_state = 42, stratify=X_test['sentiment'])


y_train = X_train['sentiment']
y_val = X_val['sentiment']
y_test = X_test['sentiment']


X_train.drop(columns = ['sentiment'], inplace= True)
X_val.drop(columns = ['sentiment'], inplace= True)
X_test.drop(columns = ['sentiment'], inplace= True)
X_train = X_train['review_body']
X_val = X_val['review_body']
X_test = X_test['review_body']


print (f"y_train: {y_train.shape}/ x_train: {X_train.shape}")
print (f"y_train: {y_val.shape}/ x_train: {X_val.shape}")
print (f"y_test: {y_test.shape}/ x_test: {X_test.shape}")



y_train: (7999,)/ x_train: (7999,)
y_train: (1000,)/ x_train: (1000,)
y_test: (1000,)/ x_test: (1000,)


## Tf-Idf Vectorization  

In [10]:
# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


## Balancing Methods (used in Pipeline)

In [11]:
# Define oversampling and undersampling techniques
oversampler = RandomOverSampler(random_state=42)
undersampler = RandomUnderSampler(random_state=42)
balancing = oversampler

## Naive Bayes

In [12]:
# Define the objective function for hyperparameter optimization
def objective_nb(trial):
    alpha = trial.suggest_float('alpha', 1e-3, 1e3, log = True)
    clf = MultinomialNB(alpha=alpha)
    clf.fit(X_train_tfidf_resampled, y_train_resampled)
    f1 = f1_score(y_val, clf.predict(X_val_tfidf), average="weighted")
    return f1


# Define the hyperparameter optimization study
study_nb = optuna.create_study(direction='maximize')

# Define Naive Bayes pipeline with oversampling
nb_pipeline = Pipeline([
    ('balancing', balancing ),
    ('classifier', MultinomialNB())
])


# Fit and optimize Naive Bayes pipeline
nb_pipeline.fit(X_train_tfidf, y_train)
X_train_tfidf_resampled, y_train_resampled = balancing.fit_resample(X_train_tfidf, y_train)
study_nb.optimize(objective_nb, n_trials=100)



# Print the best hyperparameters and corresponding accuracies
print("Best Naive Bayes Hyperparameters:", study_nb.best_params)
print("Naive Bayes F1 Score:", study_nb.best_value)



[I 2024-05-17 12:05:41,419] A new study created in memory with name: no-name-06af328b-6529-41e2-b070-ae71ad2d79d1
[I 2024-05-17 12:05:41,575] Trial 0 finished with value: 0.761242598397144 and parameters: {'alpha': 0.001351870301054873}. Best is trial 0 with value: 0.761242598397144.
[I 2024-05-17 12:05:41,636] Trial 1 finished with value: 0.7690601463554142 and parameters: {'alpha': 0.06384571070451203}. Best is trial 1 with value: 0.7690601463554142.
[I 2024-05-17 12:05:41,695] Trial 2 finished with value: 0.7114153024627116 and parameters: {'alpha': 88.46860261962992}. Best is trial 1 with value: 0.7690601463554142.
[I 2024-05-17 12:05:41,756] Trial 3 finished with value: 0.7062529050512486 and parameters: {'alpha': 50.19277813858076}. Best is trial 1 with value: 0.7690601463554142.
[I 2024-05-17 12:05:41,815] Trial 4 finished with value: 0.708926731481879 and parameters: {'alpha': 70.62551851291335}. Best is trial 1 with value: 0.7690601463554142.
[I 2024-05-17 12:05:41,895] Trial 

Best Naive Bayes Hyperparameters: {'alpha': 0.07477901849366936}
Naive Bayes F1 Score: 0.7716480083389174


### Evaluating Model with Best Hyperparameters on Test set

In [13]:
# Get the best hyperparameters
best_alpha = study_nb.best_params['alpha']

# Train the model with the best hyperparameters
nb_model = MultinomialNB(alpha=best_alpha)
nb_model.fit(X_train_tfidf_resampled, y_train_resampled)

# Evaluate the final model on the test set
nb_pred = nb_model.predict(X_test_tfidf)

f1_test = f1_score(y_test, nb_pred, average='weighted')
print("Naive Bayes F1 score on test set:", f1_test)
print("\nNB Classification Report:")
print(classification_report(y_test, nb_pred))

Naive Bayes F1 score on test set: 0.7875099505327245

NB Classification Report:
              precision    recall  f1-score   support

    Negative       0.59      0.69      0.64       168
     Neutral       0.21      0.38      0.27        69
    Positive       0.92      0.82      0.87       763

    accuracy                           0.77      1000
   macro avg       0.57      0.63      0.59      1000
weighted avg       0.82      0.77      0.79      1000



## SVM

In [16]:
# Define the objective function for hyperparameter optimization
def objective_svm(trial):
    C = trial.suggest_float('C', 1e-2, 1e3, log = True)
    gamma = trial.suggest_float('gamma', 1e-2, 1e3, log = True)
    clf = SVC(C=C, gamma=gamma, random_state=42)
    clf.fit(X_train_tfidf_resampled, y_train_resampled)
    f1 = f1_score(y_val, clf.predict(X_val_tfidf), average="weighted")
    return f1

# Define the hyperparameter optimization study
study_svm = optuna.create_study(direction='maximize')

# Define SVM pipeline with oversampling
svm_pipeline = Pipeline([
    ('balancing', balancing),
    ('classifier', SVC(random_state=42))
])

# Fit and optimize SVM pipeline
svm_pipeline.fit(X_train_tfidf, y_train)
X_train_tfidf_resampled, y_train_resampled = balancing.fit_resample(X_train_tfidf, y_train)
study_svm.optimize(objective_svm, n_trials=100)


# Print the best hyperparameters and corresponding accuracies
print("Best SVM Hyperparameters:", study_svm.best_params)
print("SVM F1 Score:", study_svm.best_value)



[I 2024-05-17 12:25:35,467] A new study created in memory with name: no-name-c4ae0482-ff78-4aef-8333-a93e32d14821
[I 2024-05-17 12:26:46,478] Trial 0 finished with value: 0.8168062616836035 and parameters: {'C': 15.860466944210888, 'gamma': 0.09679090670643395}. Best is trial 0 with value: 0.8168062616836035.
[I 2024-05-17 12:27:11,170] Trial 1 finished with value: 0.809771634414119 and parameters: {'C': 68.33916860125272, 'gamma': 0.2047802044362455}. Best is trial 0 with value: 0.8168062616836035.
[I 2024-05-17 12:28:01,458] Trial 2 finished with value: 0.8025082015987969 and parameters: {'C': 3.0471425367417555, 'gamma': 0.08170986115071353}. Best is trial 0 with value: 0.8168062616836035.
[I 2024-05-17 12:29:45,793] Trial 3 finished with value: 0.22885511949694218 and parameters: {'C': 0.011237486498511487, 'gamma': 0.2385451619550573}. Best is trial 0 with value: 0.8168062616836035.
[I 2024-05-17 12:31:35,377] Trial 4 finished with value: 0.6775139747962156 and parameters: {'C': 8

Best SVM Hyperparameters: {'C': 1.0956562780450525, 'gamma': 0.46264070110255034}
SVM F1 Score: 0.8252663008056259


### Evaluating Model with Best Hyperparameters on Test set

In [17]:
# Get the best hyperparameters and retrain the model on the training set
best_svm_params = study_svm.best_params
svm_model = SVC(**best_svm_params, random_state=42)
svm_model.fit(X_train_tfidf_resampled, y_train_resampled)

# Evaluate the model on the test set
svm_pred = svm_model.predict(X_test_tfidf)

f1_test = f1_score(y_test, svm_pred, average='weighted')
print("SVM F1 score on test set:", f1_test)
print("\nSVM Classification Report:")
print(classification_report(y_test, svm_pred))

SVM F1 score on test set: 0.8293834454474838

SVM Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.74      0.70       168
     Neutral       0.29      0.17      0.22        69
    Positive       0.91      0.92      0.91       763

    accuracy                           0.84      1000
   macro avg       0.62      0.61      0.61      1000
weighted avg       0.82      0.84      0.83      1000



## Random Forest

In [14]:

# Define the objective function for hyperparameter optimization

def objective_rf(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 5, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                  min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                  random_state=42)
    clf.fit(X_train_tfidf_resampled, y_train_resampled)
    f1 = f1_score(y_val, clf.predict(X_val_tfidf), average="weighted")
    return f1

# Define the hyperparameter optimization study
study_rf = optuna.create_study(direction='maximize')


# Define Random Forest pipeline with oversampling
rf_pipeline = Pipeline([
    ('balancing', balancing),
    ('classifier', RandomForestClassifier(random_state=42))
])


# Fit and optimize Random Forest pipeline
rf_pipeline.fit(X_train_tfidf, y_train)
X_train_tfidf_resampled, y_train_resampled = balancing.fit_resample(X_train_tfidf, y_train)
study_rf.optimize(objective_rf, n_trials=100)

# Print the best hyperparameters and corresponding accuracies
print("Best Random Forest Hyperparameters:", study_rf.best_params)
print("Random Forest F1 Score:", study_rf.best_value)


[I 2024-05-17 12:16:14,411] A new study created in memory with name: no-name-8d98641a-1587-4e89-9cd8-a920642a5e3d
[I 2024-05-17 12:16:39,784] Trial 0 finished with value: 0.7981677612191426 and parameters: {'n_estimators': 167, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.7981677612191426.
[I 2024-05-17 12:16:41,193] Trial 1 finished with value: 0.7711298701424117 and parameters: {'n_estimators': 186, 'max_depth': 7, 'min_samples_split': 11, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.7981677612191426.
[I 2024-05-17 12:16:42,321] Trial 2 finished with value: 0.7970291357919964 and parameters: {'n_estimators': 50, 'max_depth': 43, 'min_samples_split': 16, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.7981677612191426.
[I 2024-05-17 12:16:44,646] Trial 3 finished with value: 0.8150066336646328 and parameters: {'n_estimators': 53, 'max_depth': 42, 'min_samples_split': 15, 'min_samples_leaf': 1}. Best is trial 3 with value

Best Random Forest Hyperparameters: {'n_estimators': 153, 'max_depth': 48, 'min_samples_split': 20, 'min_samples_leaf': 3}
Random Forest F1 Score: 0.8270906681528267


### Evaluating Model with best Hyperparameters on Test set

In [15]:
# Get the best hyperparameters and retrain the model on the training set
best_rf_params = study_rf.best_params
rf_model = RandomForestClassifier(**best_rf_params, random_state=42)
rf_model.fit(X_train_tfidf_resampled, y_train_resampled)

# Evaluate the model on the test set
rf_pred = rf_model.predict(X_test_tfidf)

f1_test = f1_score(y_test, rf_pred, average='weighted')
print("Random Forest F1 score on test set:", f1_test)
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_pred))

Random Forest F1 score on test set: 0.8220251748251749

Random Forest Classification Report:
              precision    recall  f1-score   support

    Negative       0.63      0.74      0.68       168
     Neutral       0.34      0.20      0.25        69
    Positive       0.90      0.90      0.90       763

    accuracy                           0.83      1000
   macro avg       0.63      0.62      0.61      1000
weighted avg       0.82      0.83      0.82      1000

