# ISY503 - Assessment 3 
Sentiment Analysis using NLP

This project uses python version 3.11.10

In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import nltk.data
from nltk.tokenize import word_tokenize
nltk.download ('punkt')
from numpy import loadtxt
import re
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joaquinmorales13a06/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data upload
Data is divided into .review files (8 total) which includes positive and negative reviews alongside other metadata. I'll create a function that reads all the .review files and add them into a single dataframe

In [2]:
# function that iterates on all file paths and parses the XML content
def parse_review_files(file_paths):
    # Define the columns for the DataFrame
    columns = ["unique_id", "asin", "product_name", "product_type", "helpful", 
               "rating", "title", "date", "reviewer", "reviewer_location", "review_text"]
    data = []

    for file_path in file_paths:
        # Initialize an empty list to store each review's content temporarily
        review = []
        with open(file_path, 'r') as file:
            for line in file:
                review.append(line.strip())
                # End of a review block
                if line.strip() == "</review>":
                    try:
                        # Convert the review to a single XML structure
                        review_xml = "\n".join(review)
                        # Parse the XML content
                        root = ET.fromstring(review_xml)
                        # Extract values into a dictionary
                        review_data = {
                            "unique_id": root.find('unique_id').text if root.find('unique_id') is not None else None,
                            "asin": root.find('asin').text if root.find('asin') is not None else None,
                            "product_name": root.find('product_name').text if root.find('product_name') is not None else None,
                            "product_type": root.find('product_type').text if root.find('product_type') is not None else None,
                            "helpful": root.find('helpful').text if root.find('helpful') is not None else None,
                            "rating": float(root.find('rating').text) if root.find('rating') is not None else None,
                            "title": root.find('title').text if root.find('title') is not None else None,
                            "date": root.find('date').text if root.find('date') is not None else None,
                            "reviewer": root.find('reviewer').text if root.find('reviewer') is not None else None,
                            "reviewer_location": root.find('reviewer_location').text if root.find('reviewer_location') is not None else None,
                            "review_text": root.find('review_text').text if root.find('review_text') is not None else None
                        }
                        # Append the parsed data to the list
                        data.append(review_data)
                    except ET.ParseError:
                        # Skip entries that are not well-formed XML
                        print(f"Skipping a malformed entry in file: {file_path}")
                    finally:
                        # Reset review list for the next block
                        review = []

    # Create a DataFrame with the combined data from all files
    df_reviews = pd.DataFrame(data, columns=columns)
    return df_reviews

# file paths:
file_paths = [
    './DATA/books/positive.review', 
    './DATA/books/negative.review',
    './DATA/dvd/positive.review',
    './DATA/dvd/negative.review',
    './DATA/electronics/positive.review',
    './DATA/electronics/negative.review',
    './DATA/kitchen_&_housewares/positive.review',
    './DATA/kitchen_&_housewares/negative.review',
]

# Call the function to parse all specified files
df_reviews = parse_review_files(file_paths)

# Display the DataFrame
df_reviews.head()


Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entr

Unnamed: 0,unique_id,asin,product_name,product_type,helpful,rating,title,date,reviewer,reviewer_location,review_text
0,\n0785758968:one_of_the_best_crichton_novels:j...,\n0785758968\n,\nSphere: Books: Michael Crichton\n,\nbooks\n,\n0 of 1\n,5.0,\nOne of the best Crichton novels\n,"\nJuly 1, 2006\n",\nJoseph M\n,"\nColorado, USA\n",\nSphere by Michael Crichton is an excellant n...
1,\n0452279550:the_medicine_of_the_future:wafa_r...,\n0452279550\n,\nHealing from the Heart: A Leading Surgeon Co...,\nbooks\n,\n34 of 41\n,4.0,\nThe Medicine of the Future\n,"\nNovember 6, 2002\n",\nWafa Rashed\n,"\nJabriya, KUWAIT\n",\nDr. Oz is an accomplished heart surgeon in t...
2,"\n1599620065:beautiful!:sarah_silva_""sar""\n",\n1599620065\n,\nMythology: DC Comics Art of Alex Ross 2007 C...,\nbooks\n,\n\n,5.0,\nBeautiful!\n,"\nJune 13, 2006\n","\nSarah Silva ""Sar""\n","\nSan Diego, CA USA\n",\nThe most gorgeous artwork in comic books. Co...
3,\n0743277724:for_lovers_of_robicheaux:g._rouss...,\n0743277724\n,\nPegasus Descending: A Dave Robicheaux Novel ...,\nbooks\n,\n1 of 1\n,4.0,\nFor lovers of Robicheaux\n,"\nNovember 2, 2006\n",\nG. Rousseau\n,"\nFinistere, France\n",\nThis book is for lovers of Robicheaux. His ...
4,\n061318114X:excellent_and_broad_survey_of_the...,\n061318114X\n,"\nGuns, Germs, and Steel: The Fates of Human S...",\nbooks\n,\n7 of 9\n,5.0,\nExcellent and broad survey of the developmen...,"\nOctober 6, 2006\n","\nPatrick D. Goonan ""www.meaningful-life.us""\n","\nPleasanton, CA\n",\nThis is going to be a short and sweet review...


# Data Cleaning process
Once all .review files have been appended to a single variable df_reviews, now I'll proceed with further cleaning each cell value, dropping columns not needed for the analysis and na values

In [3]:
# Remove leading and trailing whitespace characters from each value in the DataFrame
df_reviews = df_reviews.map(lambda x: x.strip() if isinstance(x, str) else x)

# Drop the columns that are not needed for the sentiment analysis
df_reviews.drop(columns=["unique_id", "asin", "product_name", "product_type", "helpful", "title", "date", "reviewer", "reviewer_location"], inplace=True)

# Display the cleaned DataFrame
df_reviews

# Count the number of NaN values in the DataFrame
na_counts = df_reviews.isna().sum()
print("NaN values in each column:\n", na_counts)

# Drop rows with any NaN values
df_reviews.dropna(inplace=True)

# Randomize the data
df_reviews = df_reviews.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the cleaned and randomized DataFrame
df_reviews.head()



NaN values in each column:
 rating         0
review_text    0
dtype: int64


Unnamed: 0,rating,review_text
0,5.0,Nothing much to say about them. They're high ...
1,1.0,"I have spent time with lynksys and Dell, both ..."
2,5.0,I bought this book for my son as a study guide...
3,5.0,"On Nation, Underprivileged is a timely discuss..."
4,5.0,Patton describes qualitative research design a...


In [8]:
# Function that removes HTML tags, hyperlinks, punctuation, and extra whitespace from the review text
def clean_review_text(text):
    # Remove HTML tags but keep their content
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove hyperlinks
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove undesired symbols and punctuation, keep alphanumerics
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert text to lowercase
    text = text.lower()
    # Remove alphanumeric words
    text = re.sub(r'\b\w*\d\w*\b', '', text)
    return text

# Apply the cleaning function to the review_text column
df_reviews['review_text'] = df_reviews['review_text'].apply(clean_review_text)

# Display the cleaned DataFrame
df_reviews.head()

print(df_reviews['review_text'][0])


nothing much to say about them theyre high quality high speed rewritable discs that do what they are supposed to do ive been using these kind of rewritable discs for a while now and i havent had any problems with them what so ever even when i burn at higher than x they come with cases too so you wont have to worry about scratching the cds
