<a href="https://colab.research.google.com/github/lebe1/text-oriented-data-science-project/blob/main/BERT_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading the Dataset

## Connect to Google Drive

In [15]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# force_remount=True is passed so that re-running this cell mounts the drive again.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [16]:
# Location of the combined reviews CSV inside the mounted Google Drive.
DRIVE_DATA_DIR = '/content/drive/MyDrive/DOPP_Ex2_data'
csv_path = f'{DRIVE_DATA_DIR}/combined_reviews.csv'

In [17]:
!pip install datasets



## Imports

In [4]:
import pandas as pd
import numpy as np
import nltk
import time
from nltk.corpus import stopwords
import string
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import classification_report


from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from datasets import Dataset


## Reading the CSV File

In [18]:
# Load the combined reviews CSV from Google Drive into a DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [19]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,rating,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category,reviewToken
0,5.0,2017-01-16,ASWLL1VJA7WOG,Great product... just what I wanted. Works gr...,Five Stars,1484524800,All_Beauty,"['great', 'product', 'want', 'works', 'great',..."
1,5.0,2008-12-08,A265K3A7V83112,"After seeing the popularity of this shoe, I de...",What can i say? chucks rock,1228694400,Clothing_Shoes_and_Jewelry,"['see', 'popularity', 'shoe', 'decide', 'test'..."
2,5.0,2013-02-08,A1D18EJF6LHYDV,I was nervousness about the scent because IVe ...,Smells great,1360281600,All_Beauty,"['nervousness', 'scent', 'ive', 'never', 'try'..."
3,5.0,2018-02-15,A25EOTX5I354I2,"I LOVE the smell. A bit expensive, so I cant b...",Five Stars,1518652800,Luxury_Beauty,"['love', 'smell', 'bit', 'expensive', 'buy', '..."
4,5.0,2013-11-11,A1DFZPQPCHBYTY,Found this stuff in Japan and wondered if I co...,Super lathery nice soap!,1384128000,All_Beauty,"['found', 'stuff', 'japan', 'wonder', 'could',..."


In [7]:
# Print column dtypes and non-null counts (used to spot columns with missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rating          12000 non-null  float64
 1   reviewTime      12000 non-null  object 
 2   reviewerID      12000 non-null  object 
 3   reviewText      11975 non-null  object 
 4   summary         11991 non-null  object 
 5   unixReviewTime  12000 non-null  int64  
 6   category        12000 non-null  object 
 7   reviewToken     12000 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 750.1+ KB


In [8]:
# Preview only the two columns used for modelling: the star rating and the review body.
df.loc[:, ['rating', 'reviewText']].head(n=5)

Unnamed: 0,rating,reviewText
0,5.0,Great product... just what I wanted. Works gr...
1,5.0,"After seeing the popularity of this shoe, I de..."
2,5.0,I was nervousness about the scent because IVe ...
3,5.0,"I LOVE the smell. A bit expensive, so I cant b..."
4,5.0,Found this stuff in Japan and wondered if I co...


In [9]:
# Preprocess: derive model inputs from `df` WITHOUT modifying it, so this cell can
# be re-run safely. (An in-place `df['rating'] = df['rating'] - 1` shifts the labels
# one step further on every execution.)

# Star ratings 1-5 become class ids 0-4, matching the 5-label classification head.
rating_labels = df['rating'].astype(int) - 1

# Missing reviews become an empty string. A plain `astype(str)` would turn NaN into
# the literal text "nan", which would then be tokenized as if it were a real review.
review_texts = df['reviewText'].fillna('').astype(str)

# Hold out 20% of all rows as the test set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    review_texts, rating_labels, test_size=0.2, random_state=42
)

# Create validation set: 20% of the remaining (non-test) rows.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)

# Convert to Hugging Face dataset format; every column is passed as a plain Python
# list so texts and labels are handled the same way.
train_dataset = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
val_dataset = Dataset.from_dict({"text": val_texts.tolist(), "label": val_labels.tolist()})
test_dataset = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})


In [10]:
# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated to, and padded up to, a fixed length of 128 tokens.
    Returns whatever the tokenizer returns for the batch.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize every split in batches, in the order train -> validation -> test.
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Set format for PyTorch: expose only these columns, as torch tensors.
model_columns = ("input_ids", "attention_mask", "label")
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format(type="torch", columns=list(model_columns))

# Pretrained BERT with a sequence-classification head sized for the 5 rating classes.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/7680 [00:00<?, ? examples/s]

Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Fine-tuning hyperparameters, collected in one place so they are easy to scan.
training_config = dict(
    output_dir="./results",          # where checkpoints are written
    eval_strategy="epoch",           # evaluate on the validation set after each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,              # cap on the number of checkpoints kept
    logging_dir="./logs",
)
training_args = TrainingArguments(**training_config)

# Define Trainer: trains on the training split, evaluates on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Evaluate the model on the validation split and show the returned metrics
results = trainer.evaluate()
print("Evaluation results:", results)

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [12]:
# Run the trained model on the held-out test split.
raw_preds = trainer.predict(test_dataset)

# Extract predictions and labels: the predicted class is the index of the largest
# score in each row of the prediction array.
predictions = np.argmax(raw_preds.predictions, axis=1)
labels = raw_preds.label_ids

# Pin the full list of class ids. Without `labels=`, classification_report infers
# the classes from the data and raises a ValueError when a class is missing from
# both the true and predicted labels, because the five `target_names` no longer
# match the number of inferred classes.
class_ids = list(range(5))
report = classification_report(
    labels,
    predictions,
    labels=class_ids,
    # Display names are the original star ratings (class id + 1).
    target_names=[str(class_id + 1) for class_id in class_ids],
)
print(report)

              precision    recall  f1-score   support

           1       0.75      0.65      0.69        68
           2       0.57      0.47      0.51        73
           3       0.71      0.62      0.66       182
           4       0.60      0.60      0.60       351
           5       0.92      0.95      0.93      1726

    accuracy                           0.85      2400
   macro avg       0.71      0.66      0.68      2400
weighted avg       0.84      0.85      0.85      2400

