# **Importing Libraries and Loading the data**





In [1]:
import pandas as pd
import numpy as np
import random
import re
import matplotlib.pyplot as plt
from datetime import datetime
from tqdm import tqdm
from sklearn import model_selection, linear_model, metrics
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

In [2]:
try:
    from google.colab import drive
    drive.mount('/content/drive/')
    %cd 'drive/My Drive/WEB_MINING_PROJECT/FASTTEXT'
except ImportError as e:
    pass

Mounted at /content/drive/
/content/drive/My Drive/WEB_MINING_PROJECT/FASTTEXT


In [3]:
# Load the dataset
df1 = pd.read_csv("downsampled_dataset_10k.csv")
df1.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,sentiment
0,US,6108596,RN8YG8Q0AS227,B00P8LFJ3Y,406643994,Lava HD-8000 OmniPro,Electronics,5,0,0,N,Y,Five Stars,Great tv signal very good buy.I like it,2015-08-21,Positive
1,US,37870958,R1U4X7M4TMY84A,B00WBS32K4,527761468,PonoMusic Pono Portable Music Player,Electronics,5,1,2,N,Y,Five Stars,Good sound. Fine Material. Simply perfect!,2015-08-29,Positive
2,US,38204831,R291XVGVS56XGR,B00XVT4DLO,80192107,DIVOIX® DV110 In-Ear Headphone Earbuds Lightwe...,Electronics,5,0,0,N,Y,Five Stars,looks good as shown lots of base. BTW fast shi...,2015-08-22,Positive
3,US,26611731,R2Y9PUSXVAMOUU,B00N1KWERI,97589125,"Eclipse 8GB 2.8"" Supra Fit Bluetooth MP3 with ...",Electronics,5,0,0,N,Y,Love it!,This player is totally awesome! I has all the ...,2015-08-18,Positive
4,US,47611641,R1O2U9QMV39FK2,B004LTEUDO,533949740,Mediabridge 3.5mm Male To Male Stereo Audio Ca...,Electronics,5,0,0,N,Y,High Quality Cable,Very quick delivery and high quality. Sound is...,2015-08-20,Positive


In [4]:
df1.shape

(9999, 16)

In [5]:
import numpy as np
import random
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [6]:
# Splitting the data into features (X) and target (y)
X = df1['review_body']
y = df1['sentiment']

In [7]:
from sklearn.model_selection import train_test_split
# split into train, val and test data
X_train, X_test = train_test_split(df1, test_size=0.2, random_state=42, stratify=df1['sentiment'])
X_val, X_test = train_test_split(X_test, test_size=0.5, random_state=42, stratify=X_test['sentiment'])

y_train = X_train['sentiment']
y_val = X_val['sentiment']
y_test = X_test['sentiment']

X_train.drop(columns=['sentiment'], inplace=True)
X_val.drop(columns=['sentiment'], inplace=True)
X_test.drop(columns=['sentiment'], inplace=True)

X_train = X_train['review_body']
X_val = X_val['review_body']
X_test = X_test['review_body']

print(f"y_train: {y_train.shape}/ x_train: {X_train.shape}")
print(f"y_val: {y_val.shape}/ x_val: {X_val.shape}")
print(f"y_test: {y_test.shape}/ x_test: {X_test.shape}")


y_train: (7999,)/ x_train: (7999,)
y_val: (1000,)/ x_val: (1000,)
y_test: (1000,)/ x_test: (1000,)


In [8]:
!pip install gensim
import gensim.downloader as api

# Load the FastText model
WV_fastText = api.load("fasttext-wiki-news-subwords-300")



In [9]:
def preprocess_and_vectorize(text, wv=WV_fastText):
    words = re.findall(r"(?u)\b\w\w+\b", text.lower())
    embeddings = []
    for word in words:
        if word in wv:
            embeddings.append(wv[word])
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        # If no embeddings are found, return a zero vector
        return np.zeros(wv.vector_size)

##Training set

In [10]:
# Apply preprocessing and vectorization to the training set
X_train_embeddings = []
for text in tqdm(X_train):
    if isinstance(text, str):  # Check if text is a string
        X_train_embeddings.append(preprocess_and_vectorize(text))
    else:
        X_train_embeddings.append(np.zeros_like(WV_fastText['word']))

100%|██████████| 7999/7999 [00:01<00:00, 5584.41it/s]


In [11]:
# Convert embeddings to numpy array and drop any NaN values
X_train_embeddings = np.array(X_train_embeddings)
X_train_embeddings = X_train_embeddings[~np.isnan(X_train_embeddings).any(axis=1)]

In [12]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression classifier
logistic_regression = LogisticRegression()

# Increase max_iter value
log_reg = LogisticRegression(max_iter=1000)

# Fit the Logistic Regression classifier on the training data
log_reg.fit(X_train_embeddings, y_train)

# Predict on training set
y_pred_train = log_reg.predict(X_train_embeddings)

# Calculate F1-score on training set
f1_train = f1_score(y_train, y_pred_train, average='weighted')
print("Training Set F1-score:", f1_train)

Training Set F1-score: 0.7514499803520344


##Validation Hyperparameter tuning: GridSearchCV

In [13]:
from sklearn.model_selection import GridSearchCV

# Apply preprocessing and vectorization to the validation set
X_val_embeddings = [preprocess_and_vectorize(text) for text in tqdm(X_val)]

# Convert embeddings to numpy array and drop any NaN values
X_val_embeddings = np.array(X_val_embeddings)
X_val_embeddings = X_val_embeddings[~np.isnan(X_val_embeddings).any(axis=1)]


# Define the parameter grid for GridSearchCV
param_grid = {
    'C': [0.1, 1.0, 10.0],  # Regularization parameter
    'penalty': ['l2'],  # Only applying L2 penalty
}

# Initialize the Logistic Regression classifier
logistic_regression = LogisticRegression(max_iter=1000, solver='lbfgs')  # Specify solver

# Initialize GridSearchCV
grid_search = GridSearchCV(logistic_regression, param_grid, cv=5, scoring='f1_weighted')

# Fit GridSearchCV on the validation set
grid_search.fit(X_val_embeddings, y_val)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Predict on validation set with best parameters
y_pred_val = grid_search.predict(X_val_embeddings)

# Calculate F1-score on validation set
f1_val = f1_score(y_val, y_pred_val, average='weighted')
print("Validation Set F1-score:", f1_val)


100%|██████████| 1000/1000 [00:00<00:00, 6476.95it/s]


Best Parameters: {'C': 10.0, 'penalty': 'l2'}
Validation Set F1-score: 0.791350911403137


In [14]:
#train the final model with the best parameters and evaluate it on the test set
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Apply preprocessing and vectorization to the testing set
X_test_embeddings = []
for text in tqdm(X_test):
    if isinstance(text, str):  # Check if text is a string
        X_test_embeddings.append(preprocess_and_vectorize(text))
    else:
        X_test_embeddings.append(np.zeros_like(WV_fastText['word']))

# Initialize Logistic Regression with best parameters
best_logistic_regression = LogisticRegression(C=10.0, penalty='l2', max_iter=1000)

# Fit the model on the resampled training data
best_logistic_regression.fit(X_train_embeddings, y_train)

# Predict on the test set
y_pred_test = best_logistic_regression.predict(X_test_embeddings)

# Calculate F1-score on the test set
f1_test = f1_score(y_test, y_pred_test, average='micro')
print("Test Set F1-score:", f1_test)

# Calculate classification report on testing set
print("Testing Set Classification Report:")
print(classification_report(y_test, y_pred_test))

100%|██████████| 1000/1000 [00:00<00:00, 6613.20it/s]


Test Set F1-score: 0.843
Testing Set Classification Report:
              precision    recall  f1-score   support

    Negative       0.75      0.60      0.66       168
     Neutral       0.50      0.01      0.03        69
    Positive       0.86      0.97      0.91       763

    accuracy                           0.84      1000
   macro avg       0.70      0.53      0.53      1000
weighted avg       0.82      0.84      0.81      1000

