<a href="https://colab.research.google.com/github/lebe1/text-oriented-data-science-project/blob/main/Data_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading the Dataset



## Connect to Google Drive

In [57]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [58]:
folder_path = '/content/drive/MyDrive/'

## Imports

In [59]:
!pip install wandb



In [117]:
import pandas as pd
import numpy as np
import nltk
import time
from nltk.corpus import stopwords
import string
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report

import wandb

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading the CSV File

In [61]:
file_name = 'combined_reviews.csv'

file_path = folder_path + file_name
df = pd.read_csv(file_path)

In [125]:
df.head()

Unnamed: 0,rating,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category,reviewToken,preprocessedText,tokenized_text
0,5,2017-01-16,ASWLL1VJA7WOG,Great product... just what I wanted. Works gr...,Five Stars,1484524800,All_Beauty,"['great', 'product', 'want', 'works', 'great',...",great product wanted works great stylish,"[great, product, wanted, works, great, stylish]"
1,5,2008-12-08,A265K3A7V83112,"After seeing the popularity of this shoe, I de...",What can i say? chucks rock,1228694400,Clothing_Shoes_and_Jewelry,"['see', 'popularity', 'shoe', 'decide', 'test'...",seeing popularity shoe decided test impressed ...,"[seeing, popularity, shoe, decided, test, impr..."
2,5,2013-02-08,A1D18EJF6LHYDV,I was nervousness about the scent because IVe ...,Smells great,1360281600,All_Beauty,"['nervousness', 'scent', 'ive', 'never', 'try'...",nervousness scent ive never tried love paul mi...,"[nervousness, scent, ive, never, tried, love, ..."
3,5,2018-02-15,A25EOTX5I354I2,"I LOVE the smell. A bit expensive, so I cant b...",Five Stars,1518652800,Luxury_Beauty,"['love', 'smell', 'bit', 'expensive', 'buy', '...",love smell bit expensive cant buy often would ...,"[love, smell, bit, expensive, cant, buy, often..."
4,5,2013-11-11,A1DFZPQPCHBYTY,Found this stuff in Japan and wondered if I co...,Super lathery nice soap!,1384128000,All_Beauty,"['found', 'stuff', 'japan', 'wonder', 'could',...",found stuff japan wondered could find 3drops g...,"[found, stuff, japan, wondered, could, find, 3..."


In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rating          12000 non-null  float64
 1   reviewTime      12000 non-null  object 
 2   reviewerID      12000 non-null  object 
 3   reviewText      11975 non-null  object 
 4   summary         11991 non-null  object 
 5   unixReviewTime  12000 non-null  int64  
 6   category        12000 non-null  object 
 7   reviewToken     12000 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 750.1+ KB


# Build the model

In [64]:
# Preprocessing
def preprocess_text(text):
    # Convert only string instances to lowercase
    text = text.lower() if isinstance(text, str)  else ''
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)


df['preprocessedText'] = df['reviewText'].apply(preprocess_text)

# Tokenize text again for word2vec
df['tokenized_text'] = df['preprocessedText'].str.split()

df['preprocessedText'].head()


Unnamed: 0,preprocessedText
0,great product wanted works great stylish
1,seeing popularity shoe decided test impressed ...
2,nervousness scent ive never tried love paul mi...
3,love smell bit expensive cant buy often would ...
4,found stuff japan wondered could find 3drops g...


In [65]:
y = df['rating']
X = pd.DataFrame({'preprocessedText': df['preprocessedText'], 'tokenized_text': df['tokenized_text'], 'reviewText': df["reviewText"]})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Combine train and test sets
df_train = pd.DataFrame({'preprocessedText': X_train['preprocessedText'], 'tokenized_text': X_train['tokenized_text'], 'reviewText': X_train["reviewText"], 'rating': y_train})
df_test = pd.DataFrame({'preprocessedText': X_test['preprocessedText'], 'tokenized_text': X_test['tokenized_text'], 'reviewText': X_test["reviewText"], 'rating': y_test})

# Save them into csv files
df_train.to_csv(folder_path + 'train.csv', index=True)
df_test.to_csv(folder_path + 'test.csv', index=True)


In [66]:
X_train.head()

Unnamed: 0,preprocessedText,tokenized_text,reviewText
9182,comfortable shoes,"[comfortable, shoes]",Very comfortable shoes.
11091,really like cleanser gentle skin face feels cl...,"[really, like, cleanser, gentle, skin, face, f...",I really like this cleanser. It is gentle on t...
6428,nails home love cnd shellac great prodct truly...,"[nails, home, love, cnd, shellac, great, prodc...",I do my own nails at home and love CND Shellac...
288,finding natural shampoo isnt disappointing cha...,"[finding, natural, shampoo, isnt, disappointin...","Finding a ""natural"" shampoo that isn't disappo..."
2626,shower lotion amazing smells wonderful never w...,"[shower, lotion, amazing, smells, wonderful, n...",The in shower lotion is amazing. And it smells...


In [67]:
print(common_texts)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


In [68]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer()
X_tfidf_train = tfidf.fit_transform(X_train['preprocessedText']).toarray()
X_tfidf_test = tfidf.transform(X_test['preprocessedText']).toarray()

# Word2Vec Embeddings
w2v_model_train = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

def get_sentence_embedding(word_list, model):
    word_vecs = [model.wv[word] for word in word_list if word in model.wv]
    if word_vecs:
        return np.mean(word_vecs, axis=0)
    else:
        return np.zeros(model.vector_size)

X_w2v_train = np.array([get_sentence_embedding(word_list, w2v_model_train) for word_list in X_train['tokenized_text']])
X_w2v_test = np.array([get_sentence_embedding(word_list, w2v_model_train) for word_list in X_test['tokenized_text']])

X_train_vectorized = np.hstack((X_tfidf_train, X_w2v_train))
X_test_vectorized = np.hstack((X_tfidf_test, X_w2v_test))


## Including wandb for analysis during model training

In [69]:
wandb.login()

True

In [70]:
wandb_project_name = "DOPP analysis"
wandb_run_name = "rf_experiment-9-estimators-100"

rf_config = {
    "n_estimators": 100,
    "max_depth": None,
    "random_state": 42,
    "test_size": 0.2,
    "dataset": "Word2Vec"
}


wandb.init(
    project=wandb_project_name,
    name=wandb_run_name,
    config=rf_config
)


start_time = time.time()

rf_model = RandomForestClassifier(
    n_estimators=rf_config["n_estimators"],
    max_depth=rf_config["max_depth"],
    random_state=rf_config["random_state"]
)
rf_model.fit(X_train_vectorized, y_train)

y_pred_rf = rf_model.predict(X_test_vectorized)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_rf, average='macro')
precision = precision_score(y_test, y_pred_rf, average='macro')
recall = recall_score(y_test, y_pred_rf, average='macro')

# Log metrics to W&B
wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_rf))

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

Execution Time: 118.98339509963989 seconds
Precision Score: 0.8809310081132928
Recall Score: 0.4893494597601945
F1 Score: 0.5915729568156776 

              precision    recall  f1-score   support

         1.0       0.77      0.35      0.48        68
         2.0       0.97      0.38      0.55        73
         3.0       0.99      0.37      0.54       182
         4.0       0.88      0.34      0.49       351
         5.0       0.80      0.99      0.89      1726

    accuracy                           0.81      2400
   macro avg       0.88      0.49      0.59      2400
weighted avg       0.83      0.81      0.78      2400



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,118.9834
F1 Score,0.59157
Precision Score,0.88093
Recall Score,0.48935


In [71]:
svc_config = {
    "random_state": 42,
    "test_size": 0.2,
    "max_iter": 1000,
    "penalty": "l1",
    "dataset": "Combined"
}

wandb_project_name = "DOPP analysis"

wandb.init(
    project=wandb_project_name,
    name="svc_experiment-4",
    config=svc_config
)

start_time = time.time()

linear_svc_model = LinearSVC(random_state=svc_config["random_state"], penalty=svc_config["penalty"])
linear_svc_model.fit(X_train_vectorized, y_train)

y_pred_svc = linear_svc_model.predict(X_test_vectorized)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_svc, average='macro')
precision = precision_score(y_test, y_pred_svc, average='macro')
recall = recall_score(y_test, y_pred_svc, average='macro')

wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_svc))

wandb.finish()

Execution Time: 2.0560805797576904 seconds
Precision Score: 0.6633378211874695
Recall Score: 0.516365226302919
F1 Score: 0.5710940107187673 

              precision    recall  f1-score   support

         1.0       0.57      0.38      0.46        68
         2.0       0.68      0.38      0.49        73
         3.0       0.61      0.40      0.48       182
         4.0       0.60      0.47      0.53       351
         5.0       0.86      0.95      0.90      1726

    accuracy                           0.81      2400
   macro avg       0.66      0.52      0.57      2400
weighted avg       0.79      0.81      0.79      2400



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,2.05608
F1 Score,0.57109
Precision Score,0.66334
Recall Score,0.51637


## Qualitative misclassification analysis

In [72]:
print(np.unique(y_pred_svc))
print(np.unique(y_pred_rf))


[1. 2. 3. 4. 5.]
[1. 2. 3. 4. 5.]


We see our models predict all given classes.  
Now, let's understand why some classes are misclassified.

In [73]:
false_preds_svc = y_pred_svc != y_test

misclassified_predictions = y_pred_svc[false_preds_svc]
misclassified_labels = y_test[false_preds_svc]

In [74]:
misclassified_predictions

array([5., 5., 5., 3., 5., 5., 5., 5., 1., 3., 4., 4., 5., 5., 5., 5., 5.,
       5., 4., 5., 5., 5., 5., 4., 4., 5., 1., 5., 5., 2., 5., 5., 5., 4.,
       5., 4., 4., 5., 5., 4., 3., 5., 5., 5., 5., 4., 5., 4., 5., 5., 5.,
       4., 4., 5., 5., 5., 5., 1., 4., 5., 2., 4., 3., 5., 5., 3., 5., 5.,
       2., 5., 5., 4., 5., 5., 5., 5., 5., 5., 5., 5., 4., 5., 4., 3., 5.,
       4., 5., 5., 1., 5., 5., 4., 4., 3., 5., 5., 5., 5., 5., 4., 5., 4.,
       4., 1., 4., 5., 5., 5., 4., 4., 5., 5., 5., 1., 5., 5., 5., 3., 5.,
       4., 5., 2., 4., 4., 5., 4., 5., 5., 4., 3., 5., 5., 5., 5., 5., 4.,
       5., 4., 1., 3., 3., 5., 5., 5., 5., 3., 5., 4., 5., 5., 4., 5., 4.,
       4., 4., 4., 4., 5., 3., 4., 5., 3., 4., 2., 5., 4., 5., 5., 5., 5.,
       5., 5., 5., 5., 5., 4., 5., 4., 5., 4., 4., 4., 3., 5., 5., 1., 5.,
       3., 4., 4., 5., 4., 5., 3., 5., 4., 4., 2., 3., 1., 4., 4., 5., 5.,
       5., 5., 1., 5., 1., 5., 5., 5., 5., 5., 4., 5., 3., 5., 5., 5., 5.,
       3., 5., 5., 5., 5.

In [75]:
np.unique(misclassified_predictions, return_counts=True)

(array([1., 2., 3., 4., 5.]), array([ 20,  13,  46, 108, 278]))

By this frequency count, it is observable that most of the time a 5-star-rating is predicting wrong, which makes sense since the original dataset is quite imbalanced. The grade 2 has been misclassified the least but is also the least represented class in the dataset.

In [76]:
misclassified_labels

Unnamed: 0,rating
6494,3.0
9120,4.0
9663,4.0
5277,1.0
8546,3.0
...,...
7889,4.0
5157,2.0
9586,4.0
4295,4.0


In [77]:
df_misclassified = df.iloc[misclassified_labels.index]

In [78]:
df_misclassified["misclassified_rating"] = misclassified_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_misclassified["misclassified_rating"] = misclassified_predictions


In [79]:
df_misclassified.iloc[0]

Unnamed: 0,6494
rating,3.0
reviewTime,2014-08-18
reviewerID,AOEUN9718KVRD
reviewText,The foam conditioner is easy to use and doesn'...
summary,Don't like the scent.
unixReviewTime,1408320000
category,All_Beauty
reviewToken,"['foam', 'conditioner', 'easy', 'use', 'leave'..."
preprocessedText,foam conditioner easy use doesnt leave greasy ...
tokenized_text,"[foam, conditioner, easy, use, doesnt, leave, ..."


In [80]:
df_misclassified.iloc[0]["reviewText"]

"The foam conditioner is easy to use and doesn't leave greasy residue on hair. There was no problem in rinsing it off.\nI just don't like the smell, it bothers me. It's very chemical."

Based on the review text, we can observe that the model does not really understand the final critizing words of the this review. The review text itself is reasonable to give this three stars.

In [81]:
df_misclassified.iloc[14]

Unnamed: 0,11815
rating,4.0
reviewTime,2015-05-14
reviewerID,A22HY3D260KUAW
reviewText,One of my recent samples has been Mario Badesc...
summary,I like the way it makes my skin tight
unixReviewTime,1431561600
category,Luxury_Beauty
reviewToken,"['one', 'recent', 'sample', 'mario', 'badescu'..."
preprocessedText,one recent samples mario badescu skin care pro...
tokenized_text,"[one, recent, samples, mario, badescu, skin, c..."


In [82]:
df_misclassified.iloc[14]["reviewText"]

'One of my recent samples has been Mario Badescu Skin Care Products. I started off with the Almond and Honey Face Scrub which was a pretty gentle scrub. It smells AWESOME, I am a big fan of the sweet almond scent! I give this scent 5 stars :-) I scrubbed it onto my moistened face and rinsed it off, it left my skin feeling very clean and soft. I then applied some of the Seaweed Cleansing Lotion (toner) it smelled pretty plain and earthy. The Seaweed scent was just ok, but it left my skin feeling clean and tight. Next I applied a small amount of the Ceramide Herbal Eye Cream. It smells like a non-scented lotion. This was just ok. I then massaged the Hyaluronic Moisturizer (SPF 15) into my face and neck which also smells like a non-scented lotion. It went on light, its not really thick, which I liked. Out of all these products I loved the Almond and Honey Scrub & the rest of the products were just ok.'

Again, the review text presents some kind of critique, which should be understood by the model not to rate it with five stars.

In [83]:
df_misclassified.iloc[31]

Unnamed: 0,2036
rating,4.0
reviewTime,2017-03-04
reviewerID,A2YKWYC3WQJX5J
reviewText,"As someone with sensitive skin, I appreciate g..."
summary,Good for sensitive skin
unixReviewTime,1488585600
category,Luxury_Beauty
reviewToken,"['someone', 'sensitive', 'skin', 'appreciate',..."
preprocessedText,someone sensitive skin appreciate gentle clean...
tokenized_text,"[someone, sensitive, skin, appreciate, gentle,..."


In [84]:
df_misclassified.iloc[31]["reviewText"]

'As someone with sensitive skin, I appreciate gentle cleansers.\nAs someone that will occasionally wear heavy foundation, I appreciate a cleanser that will remove every last bit of makeup.\nThis cleanser treats my sensitive skin with kid gloves, but I find that I need a toner with it to remove the last traces of makeup.\nIt contains the soothing benefits of oat and aloe vera. Sage and borage extracts are also beneficial to skin, however, fragrance and yellow dye, not so much.'

This is an interesting case since we have the opposite behaviour of the model now predicting a higher rated product of 4 with a lower rating of 2.
This review text is easy to understand for a human but since we remove stopwords for model training it might be possible that the sentence ends up with complete different meaning with meaningful words like `died` and `carefully`. Based on an assumption like this a two star rating seems plausible.

## Balancing optimizations

In [85]:
df_train["rating"].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5.0,6820
4.0,1414
3.0,722
1.0,338
2.0,306


We have only 306 two star ratings as the least represented class. Therefore, we will take only 306 random samples from the other classes.

In [86]:
df_rating_2 = df_train[df_train["rating"] == 2]

In [87]:
df_rating_1 = df_train[df_train["rating"] == 1].sample(n=306, random_state=42)
df_rating_3 = df_train[df_train["rating"] == 3].sample(n=306, random_state=42)
df_rating_4 = df_train[df_train["rating"] == 4].sample(n=306, random_state=42)
df_rating_5 = df_train[df_train["rating"] == 5].sample(n=306, random_state=42)

In [88]:
# Merge all dataframes
df_balanced = pd.concat([df_rating_2, df_rating_1, df_rating_3, df_rating_4, df_rating_5])

In [89]:
df_balanced.shape

(1530, 4)

In [90]:
df_balanced["rating"].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
2.0,306
1.0,306
3.0,306
4.0,306
5.0,306


In [91]:
df_balanced.head()

Unnamed: 0,preprocessedText,tokenized_text,reviewText,rating
10238,feel like got large 68,"[feel, like, got, large, 68]",feel like I should have got a large.\n\ni am a...,2.0
3499,purchased product read review amazoncom well f...,"[purchased, product, read, review, amazoncom, ...",I purchased this product when I read a review ...,2.0
1026,2 coats topcoat still chipped thought price wo...,"[2, coats, topcoat, still, chipped, thought, p...",2 coats and a topcoat and still it chipped. T...,2.0
10506,email query received listed site converse mens...,"[email, query, received, listed, site, convers...",This is the email query I received.... the wer...,2.0
6728,long walking tougher barefoot,"[long, walking, tougher, barefoot]",Long walking is tougher than barefoot,2.0


In [92]:
y_train_balanced = df_balanced['rating']

X_tfidf_train_balanced = tfidf.fit_transform(df_balanced['preprocessedText']).toarray()
X_tfidf_test_balanced = tfidf.transform(X_test['preprocessedText']).toarray()

# Using w2v model with train set only as recommended here: https://stackoverflow.com/a/70900433/19932351
X_w2v_train_balanced = np.array([get_sentence_embedding(word_list, w2v_model_train) for word_list in df_balanced['tokenized_text']])
X_w2v_test_balanced = np.array([get_sentence_embedding(word_list, w2v_model_train) for word_list in X_test['tokenized_text']])

X_train_vectorized_balanced = np.hstack((X_tfidf_train_balanced, X_w2v_train_balanced))
X_test_vectorized_balanced = np.hstack((X_tfidf_test_balanced, X_w2v_test_balanced))

In [93]:
X_train_vectorized_balanced.shape

(1530, 5877)

In [94]:
X_test_vectorized_balanced.shape

(2400, 5877)

In [95]:
wandb_project_name = "DOPP analysis"
wandb_run_name = "rf_balanced-2-estimators-100"

rf_config = {
    "n_estimators": 100,
    "max_depth": None,
    "random_state": 42,
    "test_size": 0.2,
    "dataset": "Balanced-Combined"
}


wandb.init(
    project=wandb_project_name,
    name=wandb_run_name,
    config=rf_config
)


start_time = time.time()

rf_model = RandomForestClassifier(
    n_estimators=rf_config["n_estimators"],
    max_depth=rf_config["max_depth"],
    random_state=rf_config["random_state"]
)
rf_model.fit(X_train_vectorized_balanced, y_train_balanced)

y_pred_rf = rf_model.predict(X_test_vectorized_balanced)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_rf, average='macro')
precision = precision_score(y_test, y_pred_rf, average='macro')
recall = recall_score(y_test, y_pred_rf, average='macro')

# Log metrics to W&B
wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_rf))

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

Execution Time: 2.7628962993621826 seconds
Precision Score: 0.43000363773035327
Recall Score: 0.5496563911596176
F1 Score: 0.46494174468149074 

              precision    recall  f1-score   support

         1.0       0.27      0.46      0.34        68
         2.0       0.29      0.58      0.38        73
         3.0       0.32      0.45      0.38       182
         4.0       0.35      0.57      0.43       351
         5.0       0.92      0.70      0.80      1726

    accuracy                           0.65      2400
   macro avg       0.43      0.55      0.46      2400
weighted avg       0.76      0.65      0.69      2400



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,2.7629
F1 Score,0.46494
Precision Score,0.43
Recall Score,0.54966


In [96]:
svc_config = {
    "random_state": 42,
    "test_size": 0.2,
    "max_iter": 1000,
    "penalty": "l2",
    "dataset": "Balanced-Combined"
}

wandb_project_name = "DOPP analysis"

wandb.init(
    project=wandb_project_name,
    name="svc_balanced",
    config=svc_config
)

start_time = time.time()

linear_svc_model = LinearSVC(random_state=svc_config["random_state"], penalty=svc_config["penalty"])
linear_svc_model.fit(X_train_vectorized_balanced, y_train_balanced)

y_pred_svc = linear_svc_model.predict(X_test_vectorized_balanced)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_svc, average='macro')
precision = precision_score(y_test, y_pred_svc, average='macro')
recall = recall_score(y_test, y_pred_svc, average='macro')

wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_svc))

wandb.finish()

Execution Time: 0.1225895881652832 seconds
Precision Score: 0.3991277698172116
Recall Score: 0.5576925754959859
F1 Score: 0.4316517105973183 

              precision    recall  f1-score   support

         1.0       0.24      0.57      0.34        68
         2.0       0.21      0.58      0.31        73
         3.0       0.30      0.48      0.37       182
         4.0       0.32      0.54      0.40       351
         5.0       0.93      0.62      0.74      1726

    accuracy                           0.59      2400
   macro avg       0.40      0.56      0.43      2400
weighted avg       0.75      0.59      0.64      2400



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,0.12259
F1 Score,0.43165
Precision Score,0.39913
Recall Score,0.55769


By the metrices, we can observe that the random forest improves in terms of predicting several classes, whereas the svc model trained with a balanced dataset results in a lower performance not recognizing the majority class that well anymore.

## Oversampling


In [97]:
df_rating_1 = df_train[df_train["rating"] == 1]
df_rating_2 = df_train[df_train["rating"] == 2]
df_rating_3 = df_train[df_train["rating"] == 3]
df_rating_4 = df_train[df_train["rating"] == 4]
df_rating_5 = df_train[df_train["rating"] == 5]
print("1",len(df_rating_1))
print("2", len(df_rating_2))
print("3", len(df_rating_3))
print("4",len(df_rating_4))
print("5", len(df_rating_5))

1 338
2 306
3 722
4 1414
5 6820


In our next approach, we target the sample count of 1414, which is the count of class 4, the second most represented class. Therefore, we have to oversample classes 1 to 3 and undersample the majority class 5.

In [98]:
df_rating_1 = df_train[df_train["rating"] == 1].sample(n=1414, random_state=42, replace=True)
df_rating_2 = df_train[df_train["rating"] == 2].sample(n=1414, random_state=42, replace=True)
df_rating_3 = df_train[df_train["rating"] == 3].sample(n=1414, random_state=42, replace=True)
df_rating_5 = df_train[df_train["rating"] == 5].sample(n=1414, random_state=42)

In [99]:
df_oversampled = pd.concat([df_rating_2, df_rating_1, df_rating_3, df_rating_4, df_rating_5])

In [100]:
df_oversampled["rating"].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
2.0,1414
1.0,1414
3.0,1414
4.0,1414
5.0,1414


In [101]:
y_train_oversampled = df_oversampled['rating']

X_tfidf_train_oversampled = tfidf.fit_transform(df_oversampled['preprocessedText']).toarray()
X_tfidf_test_oversampled = tfidf.transform(X_test['preprocessedText']).toarray()

# Using w2v model with train set only as recommended here: https://stackoverflow.com/a/70900433/19932351
X_w2v_train_oversampled = np.array([get_sentence_embedding(word_list, w2v_model_train) for word_list in df_oversampled['tokenized_text']])
X_w2v_test_oversampled = np.array([get_sentence_embedding(word_list, w2v_model_train) for word_list in X_test['tokenized_text']])

X_train_vectorized_oversampled = np.hstack((X_tfidf_train_oversampled, X_w2v_train_oversampled))
X_test_vectorized_oversampled = np.hstack((X_tfidf_test_oversampled, X_w2v_test_oversampled))


In [102]:
wandb_project_name = "DOPP analysis"
wandb_run_name = "rf_oversampled"

rf_config = {
    "n_estimators": 100,
    "max_depth": None,
    "random_state": 42,
    "test_size": 0.2,
    "dataset": "Oversampled-Combined"
}


wandb.init(
    project=wandb_project_name,
    name=wandb_run_name,
    config=rf_config
)


start_time = time.time()

rf_model = RandomForestClassifier(
    n_estimators=rf_config["n_estimators"],
    max_depth=rf_config["max_depth"],
    random_state=rf_config["random_state"]
)
rf_model.fit(X_train_vectorized_oversampled, y_train_oversampled)

y_pred_rf = rf_model.predict(X_test_vectorized_oversampled)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_rf, average='macro')
precision = precision_score(y_test, y_pred_rf, average='macro')
recall = recall_score(y_test, y_pred_rf, average='macro')

# Log metrics to W&B
wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_rf))

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

Execution Time: 36.6269371509552 seconds
Precision Score: 0.6533820401226568
Recall Score: 0.5490315971025749
F1 Score: 0.5673749938122136 

              precision    recall  f1-score   support

         1.0       0.53      0.41      0.46        68
         2.0       0.76      0.40      0.52        73
         3.0       0.68      0.39      0.50       182
         4.0       0.38      0.74      0.50       351
         5.0       0.92      0.80      0.86      1726

    accuracy                           0.74      2400
   macro avg       0.65      0.55      0.57      2400
weighted avg       0.80      0.74      0.76      2400



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,36.62694
F1 Score,0.56737
Precision Score,0.65338
Recall Score,0.54903


In [103]:
svc_config = {
    "random_state": 42,
    "test_size": 0.2,
    "max_iter": 1000,
    "penalty": "l2",
    "dataset": "Oversampled-Combined"
}

wandb_project_name = "DOPP analysis"

wandb.init(
    project=wandb_project_name,
    name="svc_balanced",
    config=svc_config
)

start_time = time.time()

linear_svc_model = LinearSVC(random_state=svc_config["random_state"], penalty=svc_config["penalty"])
linear_svc_model.fit(X_train_vectorized_oversampled, y_train_oversampled)

y_pred_svc = linear_svc_model.predict(X_test_vectorized_oversampled)
end_time = time.time()
execution_time = end_time - start_time

f1Score = f1_score(y_test, y_pred_svc, average='macro')
precision = precision_score(y_test, y_pred_svc, average='macro')
recall = recall_score(y_test, y_pred_svc, average='macro')

wandb.log({
    "Execution Time": execution_time,
    "F1 Score": f1Score,
    "Precision Score": precision,
    "Recall Score": recall
})

print("Execution Time:", execution_time, "seconds")
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1Score, "\n")
print(classification_report(y_test, y_pred_svc))

wandb.finish()

Execution Time: 0.5385520458221436 seconds
Precision Score: 0.4991385022698241
Recall Score: 0.5691498052091545
F1 Score: 0.52035302821631 

              precision    recall  f1-score   support

         1.0       0.34      0.43      0.38        68
         2.0       0.41      0.51      0.45        73
         3.0       0.45      0.49      0.47       182
         4.0       0.37      0.68      0.47       351
         5.0       0.93      0.74      0.82      1726

    accuracy                           0.70      2400
   macro avg       0.50      0.57      0.52      2400
weighted avg       0.78      0.70      0.72      2400



0,1
Execution Time,▁
F1 Score,▁
Precision Score,▁
Recall Score,▁

0,1
Execution Time,0.53855
F1 Score,0.52035
Precision Score,0.49914
Recall Score,0.56915
