# ISY503 - Assessment 3 
Sentiment Analysis using NLP

This project uses python version 3.11.10

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joaquinmorales13a06/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data upload
First, we're going to read and parse multiple XML-formatted review files from the specified file paths, extract key information such as ratings and review texts from each review, and compile all this data into a single pandas DataFrame for analysis.

In [3]:
# Function to parse review files
def parse_review_files(file_paths):
    # Define the columns for the DataFrame
    columns = ["unique_id", "asin", "product_name", "product_type", "helpful", 
               "rating", "title", "date", "reviewer", "reviewer_location", "review_text"]
    data = []

    for file_path in file_paths:
        # Initialize an empty list to store each review's content temporarily
        review = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                review.append(line.strip())
                # End of a review block
                if line.strip() == "</review>":
                    try:
                        # Convert the review to a single XML structure
                        review_xml = "\n".join(review)
                        # Parse the XML content
                        root = ET.fromstring(review_xml)
                        # Extract values into a dictionary
                        review_data = {
                            "unique_id": root.find('unique_id').text if root.find('unique_id') is not None else None,
                            "asin": root.find('asin').text if root.find('asin') is not None else None,
                            "product_name": root.find('product_name').text if root.find('product_name') is not None else None,
                            "product_type": root.find('product_type').text if root.find('product_type') is not None else None,
                            "helpful": root.find('helpful').text if root.find('helpful') is not None else None,
                            "rating": float(root.find('rating').text) if root.find('rating') is not None else None,
                            "title": root.find('title').text if root.find('title') is not None else None,
                            "date": root.find('date').text if root.find('date') is not None else None,
                            "reviewer": root.find('reviewer').text if root.find('reviewer') is not None else None,
                            "reviewer_location": root.find('reviewer_location').text if root.find('reviewer_location') is not None else None,
                            "review_text": root.find('review_text').text if root.find('review_text') is not None else None
                        }
                        # Append the parsed data to the list
                        data.append(review_data)
                    except ET.ParseError:
                        # Skip entries that are not well-formed XML
                        print(f"Skipping a malformed entry in file: {file_path}")
                    finally:
                        # Reset review list for the next block
                        review = []

    # Create a DataFrame with the combined data from all files
    df_reviews = pd.DataFrame(data, columns=columns)
    return df_reviews

# File paths
file_paths = [
    './DATA/books/positive.review', 
    './DATA/books/negative.review',
    './DATA/dvd/positive.review',
    './DATA/dvd/negative.review',
    './DATA/electronics/positive.review',
    './DATA/electronics/negative.review',
    './DATA/kitchen_&_housewares/positive.review',
    './DATA/kitchen_&_housewares/negative.review',
]

# Parse all specified files
df_reviews = parse_review_files(file_paths)

# Display the DataFrame
df_reviews.head()

Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entr

Unnamed: 0,unique_id,asin,product_name,product_type,helpful,rating,title,date,reviewer,reviewer_location,review_text
0,\n0785758968:one_of_the_best_crichton_novels:j...,\n0785758968\n,\nSphere: Books: Michael Crichton\n,\nbooks\n,\n0 of 1\n,5.0,\nOne of the best Crichton novels\n,"\nJuly 1, 2006\n",\nJoseph M\n,"\nColorado, USA\n",\nSphere by Michael Crichton is an excellant n...
1,\n0452279550:the_medicine_of_the_future:wafa_r...,\n0452279550\n,\nHealing from the Heart: A Leading Surgeon Co...,\nbooks\n,\n34 of 41\n,4.0,\nThe Medicine of the Future\n,"\nNovember 6, 2002\n",\nWafa Rashed\n,"\nJabriya, KUWAIT\n",\nDr. Oz is an accomplished heart surgeon in t...
2,"\n1599620065:beautiful!:sarah_silva_""sar""\n",\n1599620065\n,\nMythology: DC Comics Art of Alex Ross 2007 C...,\nbooks\n,\n\n,5.0,\nBeautiful!\n,"\nJune 13, 2006\n","\nSarah Silva ""Sar""\n","\nSan Diego, CA USA\n",\nThe most gorgeous artwork in comic books. Co...
3,\n0743277724:for_lovers_of_robicheaux:g._rouss...,\n0743277724\n,\nPegasus Descending: A Dave Robicheaux Novel ...,\nbooks\n,\n1 of 1\n,4.0,\nFor lovers of Robicheaux\n,"\nNovember 2, 2006\n",\nG. Rousseau\n,"\nFinistere, France\n",\nThis book is for lovers of Robicheaux. His ...
4,\n061318114X:excellent_and_broad_survey_of_the...,\n061318114X\n,"\nGuns, Germs, and Steel: The Fates of Human S...",\nbooks\n,\n7 of 9\n,5.0,\nExcellent and broad survey of the developmen...,"\nOctober 6, 2006\n","\nPatrick D. Goonan ""www.meaningful-life.us""\n","\nPleasanton, CA\n",\nThis is going to be a short and sweet review...


# Data Cleaning process
Next, we're going to clean the dataset by stripping whitespace and dropping unnecessary columns and missing values, encode the ratings into binary labels for positive and negative sentiments, remove outlier reviews that are too short, prepare the data by converting reviews and labels into lists, split the data into training, validation, and test sets, initialize the BERT tokenizer, define a custom ReviewDataset class to tokenize and encode the reviews, and create dataset objects for training, validation, and testing.

In [4]:
# Clean the data
df_reviews = df_reviews.map(lambda x: x.strip() if isinstance(x, str) else x)
df_reviews.drop(columns=["unique_id", "asin", "product_name", "product_type", "helpful", "title", "date", "reviewer", "reviewer_location"], inplace=True)
df_reviews.dropna(inplace=True)

# Encode the labels for 'positive' and 'negative'
df_reviews['label'] = df_reviews['rating'].apply(lambda x: 1 if x >= 3 else 0)

# Conduct outlier removal to eliminate really short or wrong reviews
df_reviews['review_length'] = df_reviews['review_text'].apply(lambda x: len(x.split()))
df_reviews = df_reviews[df_reviews['review_length'] >= 5]
df_reviews.drop(columns=['rating', 'review_length'], inplace=True)

# Display the DataFrame
df_reviews.head()

Unnamed: 0,review_text,label
0,Sphere by Michael Crichton is an excellant nov...,1
1,Dr. Oz is an accomplished heart surgeon in the...,1
2,The most gorgeous artwork in comic books. Cont...,1
3,This book is for lovers of Robicheaux. His de...,1
4,This is going to be a short and sweet review b...,1


In [5]:
# Prepare the data
reviews = df_reviews['review_text'].tolist()
labels = df_reviews['label'].tolist()

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(reviews, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [6]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 128
BATCH_SIZE = 16

# Define the Dataset class
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, item):
        review = str(self.reviews[item])
        label = self.labels[item]
        
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',  # Return PyTorch tensors
        )
        
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),  # Flatten to 1D tensor
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = ReviewDataset(
    reviews=X_train,
    labels=y_train,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

val_dataset = ReviewDataset(
    reviews=X_val,
    labels=y_val,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

test_dataset = ReviewDataset(
    reviews=X_test,
    labels=y_test,
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# Model Training and Testing
Finally, we're going to define the model by loading a pre-trained BERT sequence classification model configured for binary classification, set up the training arguments specifying parameters like batch size, number of epochs, and evaluation strategy, define a function to compute evaluation metrics such as accuracy, precision, recall, and F1 score, instantiate the Trainer with the model, training arguments, and datasets, train the model using the training data, and finally test the model on the test dataset to evaluate its performance and print the test metrics.

In [7]:
# Define the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=2,              
    per_device_train_batch_size=BATCH_SIZE,  
    per_device_eval_batch_size=BATCH_SIZE,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
    eval_strategy="steps",     # Updated parameter name
    eval_steps=100,                  
    save_steps=500,                  
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Define the compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Instantiate the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Train your model
trainer.train()

# Test the model
test_results = trainer.predict(test_dataset)
print("Test Metrics:", test_results.metrics)

  0%|          | 0/490 [00:00<?, ?it/s]

{'loss': 0.2449, 'grad_norm': 1.998392939567566, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.04}
{'loss': 0.1529, 'grad_norm': 17.16566276550293, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.08}
{'loss': 0.1575, 'grad_norm': 3.36753249168396, 'learning_rate': 3e-06, 'epoch': 0.12}
{'loss': 0.1835, 'grad_norm': 1.4633985757827759, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.16}
{'loss': 0.1161, 'grad_norm': 15.111444473266602, 'learning_rate': 5e-06, 'epoch': 0.2}
{'loss': 0.2327, 'grad_norm': 5.126888751983643, 'learning_rate': 6e-06, 'epoch': 0.24}
{'loss': 0.183, 'grad_norm': 4.10707950592041, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.29}
{'loss': 0.2024, 'grad_norm': 19.38473892211914, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.33}
{'loss': 0.0957, 'grad_norm': 2.114967107772827, 'learning_rate': 9e-06, 'epoch': 0.37}
{'loss': 0.1101, 'grad_norm': 0.5976212620735168, 'learning_rate': 1e-05, 'epoch': 0.41}


  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 0.43557608127593994, 'eval_accuracy': 0.8721624850657109, 'eval_f1': 0.8788221970554927, 'eval_precision': 0.8565121412803532, 'eval_recall': 0.9023255813953488, 'eval_runtime': 8.4739, 'eval_samples_per_second': 98.773, 'eval_steps_per_second': 6.254, 'epoch': 0.41}
{'loss': 0.1448, 'grad_norm': 7.147447109222412, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.45}
{'loss': 0.0875, 'grad_norm': 0.3766646087169647, 'learning_rate': 1.2e-05, 'epoch': 0.49}
{'loss': 0.0711, 'grad_norm': 1.2705013751983643, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.53}
{'loss': 0.1175, 'grad_norm': 0.34913870692253113, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.57}
{'loss': 0.162, 'grad_norm': 0.17149928212165833, 'learning_rate': 1.5e-05, 'epoch': 0.61}
{'loss': 0.1804, 'grad_norm': 6.665289878845215, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.65}
{'loss': 0.122, 'grad_norm': 0.9197729825973511, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.69}
{'loss':

  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 0.6709767580032349, 'eval_accuracy': 0.8494623655913979, 'eval_f1': 0.8428927680798005, 'eval_precision': 0.9086021505376344, 'eval_recall': 0.786046511627907, 'eval_runtime': 8.4547, 'eval_samples_per_second': 98.998, 'eval_steps_per_second': 6.269, 'epoch': 0.82}
{'loss': 0.1536, 'grad_norm': 0.15850085020065308, 'learning_rate': 2.1e-05, 'epoch': 0.86}
{'loss': 0.072, 'grad_norm': 0.15548285841941833, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.9}
{'loss': 0.0178, 'grad_norm': 0.09007176011800766, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.94}
{'loss': 0.1759, 'grad_norm': 26.490074157714844, 'learning_rate': 2.4e-05, 'epoch': 0.98}
{'loss': 0.1003, 'grad_norm': 0.5370209813117981, 'learning_rate': 2.5e-05, 'epoch': 1.02}
{'loss': 0.0747, 'grad_norm': 17.784093856811523, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.06}
{'loss': 0.1242, 'grad_norm': 5.101869106292725, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.1}
{'loss': 0.0183, 'grad_n

  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 0.6526992321014404, 'eval_accuracy': 0.8566308243727598, 'eval_f1': 0.8636363636363636, 'eval_precision': 0.8444444444444444, 'eval_recall': 0.8837209302325582, 'eval_runtime': 8.8962, 'eval_samples_per_second': 94.085, 'eval_steps_per_second': 5.958, 'epoch': 1.22}
{'loss': 0.0184, 'grad_norm': 0.09183143079280853, 'learning_rate': 3.1e-05, 'epoch': 1.27}
{'loss': 0.0896, 'grad_norm': 0.08778955042362213, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.31}
{'loss': 0.1549, 'grad_norm': 2.842386484146118, 'learning_rate': 3.3e-05, 'epoch': 1.35}
{'loss': 0.0106, 'grad_norm': 0.14584599435329437, 'learning_rate': 3.4000000000000007e-05, 'epoch': 1.39}
{'loss': 0.1224, 'grad_norm': 25.73926544189453, 'learning_rate': 3.5e-05, 'epoch': 1.43}
{'loss': 0.1043, 'grad_norm': 0.1102948933839798, 'learning_rate': 3.6e-05, 'epoch': 1.47}
{'loss': 0.0851, 'grad_norm': 24.53289031982422, 'learning_rate': 3.7e-05, 'epoch': 1.51}
{'loss': 0.0422, 'grad_norm': 0.16173535585403442, '

  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 0.6012182235717773, 'eval_accuracy': 0.8530465949820788, 'eval_f1': 0.8501827040194885, 'eval_precision': 0.8925831202046036, 'eval_recall': 0.8116279069767441, 'eval_runtime': 8.6853, 'eval_samples_per_second': 96.37, 'eval_steps_per_second': 6.102, 'epoch': 1.63}
{'loss': 0.2111, 'grad_norm': 0.3872028887271881, 'learning_rate': 4.1e-05, 'epoch': 1.67}
{'loss': 0.1476, 'grad_norm': 1.3868128061294556, 'learning_rate': 4.2e-05, 'epoch': 1.71}
{'loss': 0.1325, 'grad_norm': 1.516251802444458, 'learning_rate': 4.3e-05, 'epoch': 1.76}
{'loss': 0.2181, 'grad_norm': 0.8209595680236816, 'learning_rate': 4.4000000000000006e-05, 'epoch': 1.8}
{'loss': 0.1818, 'grad_norm': 0.18251252174377441, 'learning_rate': 4.5e-05, 'epoch': 1.84}
{'loss': 0.06, 'grad_norm': 0.1612289994955063, 'learning_rate': 4.600000000000001e-05, 'epoch': 1.88}
{'loss': 0.2027, 'grad_norm': 0.16316230595111847, 'learning_rate': 4.7e-05, 'epoch': 1.92}
{'loss': 0.19, 'grad_norm': 0.8881770968437195, 'learnin

  0%|          | 0/53 [00:00<?, ?it/s]

Test Metrics: {'test_loss': 0.5396663546562195, 'test_accuracy': 0.8566308243727598, 'test_f1': 0.8620689655172413, 'test_precision': 0.8278145695364238, 'test_recall': 0.8992805755395683, 'test_runtime': 8.7886, 'test_samples_per_second': 95.237, 'test_steps_per_second': 6.031}


In [10]:
# Save the trained model
model.save_pretrained('./model')

# Save the tokenizer
tokenizer.save_pretrained('./model')

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')