<a href="https://colab.research.google.com/github/lebe1/text-oriented-data-science-project/blob/main/Data_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading the Dataset



## Connect to Google Drive

In [8]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [9]:
folder_path = '/content/drive/MyDrive/'

## Imports

In [10]:
!pip install wandb



In [11]:
import pandas as pd
import numpy as np
import nltk
import time
from nltk.corpus import stopwords
import string
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report

import wandb


nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading the CSV File

In [12]:
file_name = 'combined_reviews.csv'

file_path = folder_path + file_name
df = pd.read_csv(file_path)

In [13]:
df.head()

Unnamed: 0,rating,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category,reviewToken
0,5.0,2017-01-16,ASWLL1VJA7WOG,Great product... just what I wanted. Works gr...,Five Stars,1484524800,All_Beauty,"['great', 'product', 'want', 'works', 'great',..."
1,5.0,2008-12-08,A265K3A7V83112,"After seeing the popularity of this shoe, I de...",What can i say? chucks rock,1228694400,Clothing_Shoes_and_Jewelry,"['see', 'popularity', 'shoe', 'decide', 'test'..."
2,5.0,2013-02-08,A1D18EJF6LHYDV,I was nervousness about the scent because IVe ...,Smells great,1360281600,All_Beauty,"['nervousness', 'scent', 'ive', 'never', 'try'..."
3,5.0,2018-02-15,A25EOTX5I354I2,"I LOVE the smell. A bit expensive, so I cant b...",Five Stars,1518652800,Luxury_Beauty,"['love', 'smell', 'bit', 'expensive', 'buy', '..."
4,5.0,2013-11-11,A1DFZPQPCHBYTY,Found this stuff in Japan and wondered if I co...,Super lathery nice soap!,1384128000,All_Beauty,"['found', 'stuff', 'japan', 'wonder', 'could',..."


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rating          12000 non-null  float64
 1   reviewTime      12000 non-null  object 
 2   reviewerID      12000 non-null  object 
 3   reviewText      11975 non-null  object 
 4   summary         11991 non-null  object 
 5   unixReviewTime  12000 non-null  int64  
 6   category        12000 non-null  object 
 7   reviewToken     12000 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 750.1+ KB


# Build the model

In [15]:
# Preprocessing
def preprocess_text(text):
    # Convert only string instances to lowercase
    text = text.lower() if isinstance(text, str)  else ''
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)


df['preprocessedText'] = df['reviewText'].apply(preprocess_text)

# Tokenize text again for word2vec
df['tokenized_text'] = df['preprocessedText'].str.split()

df['preprocessedText'].head()


Unnamed: 0,preprocessedText
0,great product wanted works great stylish
1,seeing popularity shoe decided test impressed ...
2,nervousness scent ive never tried love paul mi...
3,love smell bit expensive cant buy often would ...
4,found stuff japan wondered could find 3drops g...


In [89]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=100)
X_tfidf = tfidf.fit_transform(df['preprocessedText']).toarray()

# Word2Vec Embeddings
w2v_model = Word2Vec(sentences=df['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)
def get_sentence_embedding(word_list):
    word_vecs = [w2v_model.wv[word] for word in word_list if word in w2v_model.wv]
    if word_vecs:
        return np.mean(word_vecs, axis=0)
    else:
        return np.zeros(w2v_model.vector_size)

X_w2v = np.array([get_sentence_embedding(word_list) for word_list in df['tokenized_text']])


## Including wandb for analysis during model training

In [19]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33ml-beccard[0m ([33ml-beccard-tu-wien[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [29]:
X_combined = np.hstack((X_tfidf, X_w2v))
y = df['rating']

X_train, X_test, y_train, y_test = train_test_split(X_w2v, y, test_size=0.2, random_state=42)

wandb_project_name = "DOPP analysis"
wandb_run_name = "rf_experiment-6-estimators-500"

rf_config = {
    "n_estimators": 500,
    "max_depth": None,
    "random_state": 42,
    "test_size": 0.2,
    "dataset": "Word2Vec"
}


wandb.init(
    project=wandb_project_name,
    name=wandb_run_name,
    config=rf_config
)


start_time = time.time()

rf_model = RandomForestClassifier(
    n_estimators=rf_config["n_estimators"],
    max_depth=rf_config["max_depth"],
    random_state=rf_config["random_state"]
)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_rf, average='macro')
precision = precision_score(y_test, y_pred_rf, average='macro')
recall = recall_score(y_test, y_pred_rf, average='macro')

# Log metrics to W&B
wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_rf))

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

Execution Time: 67.39437866210938 seconds
Precision Score: 0.8265042886613451
Recall Score: 0.48876636982713356
F1 Score: 0.583058928001704 

              precision    recall  f1-score   support

         1.0       0.85      0.32      0.47        68
         2.0       0.90      0.37      0.52        73
         3.0       0.86      0.39      0.54       182
         4.0       0.72      0.38      0.50       351
         5.0       0.81      0.98      0.89      1726

    accuracy                           0.81      2400
   macro avg       0.83      0.49      0.58      2400
weighted avg       0.81      0.81      0.78      2400



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,67.39438
F1 Score,0.58306
Precision Score,0.8265
Recall Score,0.48877


In [41]:
X_combined = np.hstack((X_tfidf, X_w2v))
y = df['rating']

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

svc_config = {
    "random_state": 42,
    "test_size": 0.2,
    "max_iter": 10000,
    "penalty": "l1",
    "dataset": "Combined"
}

wandb_project_name = "DOPP analysis"

wandb.init(
    project=wandb_project_name,
    name="svc_experiment-3",
    config=svc_config
)

start_time = time.time()

linear_svc_model = LinearSVC(random_state=svc_config["random_state"], penalty=svc_config["penalty"])
linear_svc_model.fit(X_train, y_train)

y_pred_svc = linear_svc_model.predict(X_test)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_svc, average='macro')
precision = precision_score(y_test, y_pred_svc, average='macro')
recall = recall_score(y_test, y_pred_svc, average='macro')

wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_svc))

wandb.finish()

Execution Time: 22.80403232574463 seconds
Precision Score: 0.5481849115690947
Recall Score: 0.2836730018456059
F1 Score: 0.30986140029291037 

              precision    recall  f1-score   support

         1.0       0.35      0.10      0.16        68
         2.0       0.60      0.04      0.08        73
         3.0       0.51      0.15      0.24       182
         4.0       0.52      0.14      0.22       351
         5.0       0.76      0.98      0.86      1726

    accuracy                           0.74      2400
   macro avg       0.55      0.28      0.31      2400
weighted avg       0.69      0.74      0.67      2400





0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,22.80403
F1 Score,0.30986
Precision Score,0.54818
Recall Score,0.28367


## Qualitative misclassification analysis

In [45]:
print(np.unique(y_pred_svc))
print(np.unique(y_pred_rf))


[1. 2. 3. 4. 5.]
[1. 2. 3. 4. 5.]


We see our models predict all given classes.  
Now, let's understand why some classes are misclassified.

In [53]:
false_preds_svc = y_pred_svc != y_test

misclassified_predictions = y_pred_svc[false_preds_svc]
misclassified_labels = y_test[false_preds_svc]

In [73]:
misclassified_predictions

array([5., 5., 5., 5., 5., 5., 5., 4., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 1., 5., 5.,
       4., 5., 5., 5., 4., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 1.,
       5., 5., 5., 5., 4., 5., 2., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 2., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 4.,
       5., 5., 4., 5., 5., 3., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 4., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 3., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 4., 4., 5.,
       5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 4., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 4., 5., 5., 5., 5., 5., 5., 5., 3., 3., 5., 5., 5., 5.,
       5., 5., 5., 5., 1., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 5., 5., 3., 5., 5., 5., 5., 5., 5., 5., 5., 5.,
       5., 5., 5., 5., 4., 5., 5., 5., 5., 5., 4., 5., 4., 4., 1., 5., 5.,
       5., 5., 5., 5., 3.

In [71]:
np.unique(misclassified_predictions, return_counts=True)

(array([1., 2., 3., 4., 5.]), array([ 13,   2,  27,  45, 533]))

By this frequency count, it is observable that most of the time a 5-star-rating is predicting wrong (533 times in total), which makes sense since the original dataset is quite imbalanced. The grade 2 has been misclassified the least with only 2 wrong predictions.

In [55]:
misclassified_labels

Unnamed: 0,rating
6494,3.0
1720,3.0
9120,4.0
9663,4.0
5277,1.0
...,...
5157,2.0
9586,4.0
1206,2.0
4295,4.0


In [58]:
df_misclassified = df.iloc[misclassified_labels.index]

In [64]:
df_misclassified["misclassified_rating"] = misclassified_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_misclassified["misclassified_rating"] = misclassified_predictions


In [65]:
df_misclassified.iloc[0]

Unnamed: 0,6494
rating,3.0
reviewTime,2014-08-18
reviewerID,AOEUN9718KVRD
reviewText,The foam conditioner is easy to use and doesn'...
summary,Don't like the scent.
unixReviewTime,1408320000
category,All_Beauty
reviewToken,"['foam', 'conditioner', 'easy', 'use', 'leave'..."
preprocessedText,foam conditioner easy use doesnt leave greasy ...
tokenized_text,"[foam, conditioner, easy, use, doesnt, leave, ..."


In [68]:
df_misclassified.iloc[0]["reviewText"]

"The foam conditioner is easy to use and doesn't leave greasy residue on hair. There was no problem in rinsing it off.\nI just don't like the smell, it bothers me. It's very chemical."

Based on the review text, we can observe that the model does not really understand the final critizing words of the this review. The review text itself is reasonable to give this three stars.

In [72]:
df_misclassified.iloc[14]

Unnamed: 0,11074
rating,3.0
reviewTime,2017-05-25
reviewerID,ATFW3Q0V6Q4XI
reviewText,It felt smaller than other Dickie's shorts.
summary,A little smaller
unixReviewTime,1495670400
category,Clothing_Shoes_and_Jewelry
reviewToken,"['feel', 'small', 'dicky', 'short']"
preprocessedText,felt smaller dickies shorts
tokenized_text,"[felt, smaller, dickies, shorts]"


In [69]:
df_misclassified.iloc[14]["reviewText"]

"It felt smaller than other Dickie's shorts."

Again, the review text presents some kind of critique, which should be understood by the model not to rate it with five stars.

In [74]:
df_misclassified.iloc[31]

Unnamed: 0,11957
rating,5.0
reviewTime,2014-09-05
reviewerID,A2I9O5E0Q731GN
reviewText,Wish I could get some more!
summary,Five Stars
unixReviewTime,1409875200
category,All_Beauty
reviewToken,"['wish', 'could', 'get']"
preprocessedText,wish could get
tokenized_text,"[wish, could, get]"


In [75]:
df_misclassified.iloc[31]["reviewText"]

'Wish I could get some more!'

This is an interesting case since we have the opposite behaviour of the model now predicting a high rated product with the lowest class of one star.
This review text is easy to understand for a human but since we remove stopwords for model training it might be possible that the sentence ends up with only two words in "Wish more!". Based on an assumption like this a one star rating seems plausible.

## Balancing optimizations

In [77]:
df["rating"].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5.0,8546
4.0,1765
3.0,904
1.0,406
2.0,379


We have only 379 two star ratings as the least represented class. Therefore, we will take only 379 random samples from the other classes.

In [82]:
df_rating_2 = df[df["rating"] == 2]

In [97]:
df_rating_1 = df[df["rating"] == 1].sample(n=379, random_state=42)
df_rating_3 = df[df["rating"] == 3].sample(n=379, random_state=42)
df_rating_4 = df[df["rating"] == 4].sample(n=379, random_state=42)
df_rating_5 = df[df["rating"] == 5].sample(n=379, random_state=42)

In [84]:
# Merge all dataframes
df_balanced = pd.concat([df_rating_2, df_rating_1, df_rating_3, df_rating_4, df_rating_5])

In [85]:
df_balanced.shape

(1895, 10)

In [86]:
df_balanced["rating"].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
2.0,379
1.0,379
3.0,379
4.0,379
5.0,379


In [87]:
df_balanced.head()

Unnamed: 0,rating,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category,reviewToken,preprocessedText,tokenized_text
64,2.0,2015-10-25,A2R95Z7PTF5ZRO,The XEN-TAN products are some of the priciest ...,Not for everyone....,1445731200,Luxury_Beauty,"['xentan', 'product', 'priciest', 'market', 'y...",xentan products priciest market yet im sure de...,"[xentan, products, priciest, market, yet, im, ..."
95,2.0,2017-01-22,A2V5R832QCSOMX,UPDATE ON 8-28-17: Since I first purchased th...,Be careful if you have sensitive skin!,1485043200,All_Beauty,"['update', '82817', 'since', 'first', 'purchas...",update 82817 since first purchased ive getting...,"[update, 82817, since, first, purchased, ive, ..."
121,2.0,2013-08-18,A3LT26VKSXZFQM,shows like I have put dried mustard in my hair...,Can't get the hang of it.,1376784000,Luxury_Beauty,"['show', 'like', 'put', 'dry', 'mustard', 'hai...",shows like put dried mustard hair couple hours...,"[shows, like, put, dried, mustard, hair, coupl..."
224,2.0,2017-06-25,A2ZJMLZ1IA2YA9,I didn't realize when I request this that it i...,Dermablend Quick Fix Concealer Natural,1498348800,Luxury_Beauty,"['realize', 'request', 'oldschool', 'waxy', 'c...",didnt realize request oldschool waxy concealer...,"[didnt, realize, request, oldschool, waxy, con..."
261,2.0,2015-08-15,A2PG8IVYXCKKGE,"Size runs large. I wear a woman's size 9, but...",Size runs large. I wear a woman's size 9 ...,1439596800,Clothing_Shoes_and_Jewelry,"['size', 'run', 'large', 'wear', 'woman', 'siz...",size runs large wear womans size 9 get 88and12,"[size, runs, large, wear, womans, size, 9, get..."


In [90]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=100)
X_tfidf_balanced = tfidf.fit_transform(df_balanced['preprocessedText']).toarray()

# Word2Vec Embeddings
w2v_model = Word2Vec(sentences=df_balanced['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)

X_w2v_balanced = np.array([get_sentence_embedding(word_list) for word_list in df_balanced['tokenized_text']])

In [102]:
X_combined_balanced = np.hstack((X_tfidf_balanced, X_w2v_balanced))
y_balanced = df_balanced['rating']

X_train, X_test, y_train, y_test = train_test_split(X_combined_balanced, y_balanced, test_size=0.2, random_state=42)

wandb_project_name = "DOPP analysis"
wandb_run_name = "rf_balanced"

rf_config = {
    "n_estimators": 200,
    "max_depth": None,
    "random_state": 42,
    "test_size": 0.2,
    "dataset": "Balanced-Combined"
}


wandb.init(
    project=wandb_project_name,
    name=wandb_run_name,
    config=rf_config
)


start_time = time.time()

rf_model = RandomForestClassifier(
    n_estimators=rf_config["n_estimators"],
    max_depth=rf_config["max_depth"],
    random_state=rf_config["random_state"]
)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_rf, average='macro')
precision = precision_score(y_test, y_pred_rf, average='macro')
recall = recall_score(y_test, y_pred_rf, average='macro')

# Log metrics to W&B
wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_rf))

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

Execution Time: 9.131192684173584 seconds
Precision Score: 0.5283233481866831
Recall Score: 0.5245834165834166
F1 Score: 0.5241172288691164 

              precision    recall  f1-score   support

         1.0       0.60      0.62      0.61        78
         2.0       0.58      0.47      0.52        77
         3.0       0.52      0.61      0.56        75
         4.0       0.40      0.43      0.41        75
         5.0       0.54      0.50      0.52        74

    accuracy                           0.53       379
   macro avg       0.53      0.52      0.52       379
weighted avg       0.53      0.53      0.52       379



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,9.13119
F1 Score,0.52412
Precision Score,0.52832
Recall Score,0.52458


In [95]:
X_combined_balanced = np.hstack((X_tfidf_balanced, X_w2v_balanced))
y_balanced = df_balanced['rating']

X_train, X_test, y_train, y_test = train_test_split(X_combined_balanced, y_balanced, test_size=0.2, random_state=42)

svc_config = {
    "random_state": 42,
    "test_size": 0.2,
    "max_iter": 1000,
    "penalty": "l2",
    "dataset": "Balanced-Combined"
}

wandb_project_name = "DOPP analysis"

wandb.init(
    project=wandb_project_name,
    name="svc_balanced",
    config=svc_config
)

start_time = time.time()

linear_svc_model = LinearSVC(random_state=svc_config["random_state"], penalty=svc_config["penalty"])
linear_svc_model.fit(X_train, y_train)

y_pred_svc = linear_svc_model.predict(X_test)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_svc, average='macro')
precision = precision_score(y_test, y_pred_svc, average='macro')
recall = recall_score(y_test, y_pred_svc, average='macro')

wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_svc))

wandb.finish()

Execution Time: 0.1198265552520752 seconds
Precision Score: 0.4353986342116287
Recall Score: 0.4369243369243369
F1 Score: 0.4327628734877484 

              precision    recall  f1-score   support

         1.0       0.46      0.58      0.51        78
         2.0       0.41      0.45      0.43        77
         3.0       0.42      0.36      0.39        75
         4.0       0.41      0.31      0.35        75
         5.0       0.48      0.49      0.48        74

    accuracy                           0.44       379
   macro avg       0.44      0.44      0.43       379
weighted avg       0.44      0.44      0.43       379



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,0.11983
F1 Score,0.43276
Precision Score,0.4354
Recall Score,0.43692


By the metrices, we can observe that the random forest model does not improve, whereas the svc model trained with a balanced dataset results in the best performances of every svc model run so far. The random forest outperforms the svc model in every metric though.

## Oversampling
In our next approach, we target the sample count of 1765, which is the count of class 4. Therefore, we have to oversample classes 1 to 3 and undersample the majority class 5.

## TODO
Fix data leakage creating a test set first and then doing TFIDF/Word2Vec transformation.

In [98]:
df_rating_4 = df[df["rating"] == 4]
df_rating_1 = df[df["rating"] == 1].sample(n=1765, random_state=42, replace=True)
df_rating_2 = df[df["rating"] == 2].sample(n=1765, random_state=42, replace=True)
df_rating_3 = df[df["rating"] == 3].sample(n=1765, random_state=42, replace=True)
df_rating_5 = df[df["rating"] == 5].sample(n=1765, random_state=42)

In [99]:
df_oversampled = pd.concat([df_rating_2, df_rating_1, df_rating_3, df_rating_4, df_rating_5])

In [100]:
df_oversampled["rating"].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
2.0,1765
1.0,1765
3.0,1765
4.0,1765
5.0,1765


In [101]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=100)
X_tfidf_oversampled = tfidf.fit_transform(df_oversampled['preprocessedText']).toarray()

# Word2Vec Embeddings
w2v_model = Word2Vec(sentences=df_oversampled['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)

X_w2v_oversampled = np.array([get_sentence_embedding(word_list) for word_list in df_oversampled['tokenized_text']])

In [103]:
X_combined_oversampled = np.hstack((X_tfidf_oversampled, X_w2v_oversampled))
y_oversampled = df_oversampled['rating']

X_train, X_test, y_train, y_test = train_test_split(X_combined_oversampled, y_oversampled, test_size=0.2, random_state=42)

wandb_project_name = "DOPP analysis"
wandb_run_name = "rf_oversampled"

rf_config = {
    "n_estimators": 200,
    "max_depth": None,
    "random_state": 42,
    "test_size": 0.2,
    "dataset": "Oversampled-Combined"
}


wandb.init(
    project=wandb_project_name,
    name=wandb_run_name,
    config=rf_config
)


start_time = time.time()

rf_model = RandomForestClassifier(
    n_estimators=rf_config["n_estimators"],
    max_depth=rf_config["max_depth"],
    random_state=rf_config["random_state"]
)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_rf, average='macro')
precision = precision_score(y_test, y_pred_rf, average='macro')
recall = recall_score(y_test, y_pred_rf, average='macro')

# Log metrics to W&B
wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_rf))

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

Execution Time: 21.750843048095703 seconds
Precision Score: 0.8664517420516858
Recall Score: 0.8654219383813275
F1 Score: 0.8656000188491205 

              precision    recall  f1-score   support

         1.0       0.97      0.97      0.97       327
         2.0       0.98      0.98      0.98       381
         3.0       0.93      0.91      0.92       368
         4.0       0.70      0.75      0.72       347
         5.0       0.76      0.72      0.74       342

    accuracy                           0.87      1765
   macro avg       0.87      0.87      0.87      1765
weighted avg       0.87      0.87      0.87      1765



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,21.75084
F1 Score,0.8656
Precision Score,0.86645
Recall Score,0.86542


In [104]:
X_combined_oversampled = np.hstack((X_tfidf_oversampled, X_w2v_oversampled))
y_oversampled = df_oversampled['rating']

X_train, X_test, y_train, y_test = train_test_split(X_combined_oversampled, y_oversampled, test_size=0.2, random_state=42)

svc_config = {
    "random_state": 42,
    "test_size": 0.2,
    "max_iter": 1000,
    "penalty": "l2",
    "dataset": "Oversampled-Combined"
}

wandb_project_name = "DOPP analysis"

wandb.init(
    project=wandb_project_name,
    name="svc_balanced",
    config=svc_config
)

start_time = time.time()

linear_svc_model = LinearSVC(random_state=svc_config["random_state"], penalty=svc_config["penalty"])
linear_svc_model.fit(X_train, y_train)

y_pred_svc = linear_svc_model.predict(X_test)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_svc, average='macro')
precision = precision_score(y_test, y_pred_svc, average='macro')
recall = recall_score(y_test, y_pred_svc, average='macro')

wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_svc))

wandb.finish()

Execution Time: 9.2102210521698 seconds
Precision Score: 0.5506393688260347
Recall Score: 0.55516599143837
F1 Score: 0.5516456972259235 

              precision    recall  f1-score   support

         1.0       0.60      0.67      0.63       327
         2.0       0.57      0.60      0.58       381
         3.0       0.56      0.51      0.54       368
         4.0       0.46      0.40      0.43       347
         5.0       0.57      0.60      0.58       342

    accuracy                           0.55      1765
   macro avg       0.55      0.56      0.55      1765
weighted avg       0.55      0.55      0.55      1765



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,9.21022
F1 Score,0.55165
Precision Score,0.55064
Recall Score,0.55517
