# ISY503 - Assessment 3 
Sentiment Analysis using NLP

This project uses Python version 3.11.10.

In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import nltk.data
from nltk.tokenize import word_tokenize
nltk.download ('punkt')
from numpy import loadtxt
import re
import xml.etree.ElementTree as ET

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joaquinmorales13a06/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Data upload
The data is split across 8 .review files, which contain positive and negative reviews alongside other metadata. I'll create a function that reads all the .review files and combines them into a single DataFrame.

In [2]:
# Helpers and entry point that read .review files and parse their XML content
def _child_text(root, tag):
    """Return the text of the first direct child `tag` of `root`, or None if that child is absent."""
    node = root.find(tag)
    return node.text if node is not None else None


def _parse_rating(root):
    """Return the <rating> child as a float, or None if it is absent, empty, or not numeric."""
    raw = _child_text(root, 'rating')
    if raw is None:
        return None
    try:
        # float() tolerates the surrounding newlines left around the value
        return float(raw)
    except ValueError:
        return None


def parse_review_files(file_paths):
    """Parse <review> blocks from several files into a single DataFrame.

    Each file is read line by line. Lines are stripped and accumulated until a
    line equal to "</review>" is seen; the accumulated lines are then joined
    with newlines and parsed as one XML element. Blocks that are not
    well-formed XML are reported with a printed message and skipped. Lines left
    over after the last "</review>" of a file are discarded.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the .review files to read.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed review with the columns unique_id,
        asin, product_name, product_type, helpful, rating, title, date,
        reviewer, reviewer_location and review_text. Text values are returned
        exactly as found in the XML (not stripped). A missing tag yields None;
        rating is a float, or missing when it cannot be converted.
    """
    # Define the columns for the DataFrame
    columns = ["unique_id", "asin", "product_name", "product_type", "helpful",
               "rating", "title", "date", "reviewer", "reviewer_location", "review_text"]
    data = []

    for file_path in file_paths:
        # Lines of the review block currently being accumulated
        review = []
        # Explicit encoding keeps results independent of the platform default;
        # errors='replace' stops a single undecodable byte from aborting the load.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            for line in file:
                stripped = line.strip()
                review.append(stripped)
                if stripped != "</review>":
                    continue

                # End of a review block: build the XML text and reset the
                # buffer up front so the next block always starts clean.
                review_xml = "\n".join(review)
                review = []
                try:
                    root = ET.fromstring(review_xml)
                except ET.ParseError:
                    # Skip entries that are not well-formed XML
                    print(f"Skipping a malformed entry in file: {file_path}")
                    continue

                record = {column: _child_text(root, column) for column in columns}
                # rating is the only numeric field; tolerate empty/garbled values
                record["rating"] = _parse_rating(root)
                data.append(record)

    # Create a DataFrame with the combined data from all files
    df_reviews = pd.DataFrame(data, columns=columns)
    return df_reviews

# Input files: one positive and one negative .review file per product category
file_paths = [
    f'./DATA/{category}/{sentiment}.review'
    for category in ('books', 'dvd', 'electronics', 'kitchen_&_housewares')
    for sentiment in ('positive', 'negative')
]

# Parse every listed file into one combined DataFrame
df_reviews = parse_review_files(file_paths)

# Preview the first rows of the result
df_reviews.head()


Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entr

Unnamed: 0,unique_id,asin,product_name,product_type,helpful,rating,title,date,reviewer,reviewer_location,review_text
0,\n0785758968:one_of_the_best_crichton_novels:j...,\n0785758968\n,\nSphere: Books: Michael Crichton\n,\nbooks\n,\n0 of 1\n,5.0,\nOne of the best Crichton novels\n,"\nJuly 1, 2006\n",\nJoseph M\n,"\nColorado, USA\n",\nSphere by Michael Crichton is an excellant n...
1,\n0452279550:the_medicine_of_the_future:wafa_r...,\n0452279550\n,\nHealing from the Heart: A Leading Surgeon Co...,\nbooks\n,\n34 of 41\n,4.0,\nThe Medicine of the Future\n,"\nNovember 6, 2002\n",\nWafa Rashed\n,"\nJabriya, KUWAIT\n",\nDr. Oz is an accomplished heart surgeon in t...
2,"\n1599620065:beautiful!:sarah_silva_""sar""\n",\n1599620065\n,\nMythology: DC Comics Art of Alex Ross 2007 C...,\nbooks\n,\n\n,5.0,\nBeautiful!\n,"\nJune 13, 2006\n","\nSarah Silva ""Sar""\n","\nSan Diego, CA USA\n",\nThe most gorgeous artwork in comic books. Co...
3,\n0743277724:for_lovers_of_robicheaux:g._rouss...,\n0743277724\n,\nPegasus Descending: A Dave Robicheaux Novel ...,\nbooks\n,\n1 of 1\n,4.0,\nFor lovers of Robicheaux\n,"\nNovember 2, 2006\n",\nG. Rousseau\n,"\nFinistere, France\n",\nThis book is for lovers of Robicheaux. His ...
4,\n061318114X:excellent_and_broad_survey_of_the...,\n061318114X\n,"\nGuns, Germs, and Steel: The Fates of Human S...",\nbooks\n,\n7 of 9\n,5.0,\nExcellent and broad survey of the developmen...,"\nOctober 6, 2006\n","\nPatrick D. Goonan ""www.meaningful-life.us""\n","\nPleasanton, CA\n",\nThis is going to be a short and sweet review...


# Data Cleaning process
Now that all .review files have been combined into a single DataFrame, df_reviews, I'll proceed with the data cleaning process.

In [3]:
# Remove leading and trailing whitespace characters from each string value in
# the DataFrame, then drop the unique_id column as it is not needed.
# Reassigning the chained result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run no longer raises
# KeyError because unique_id has already been removed.
df_reviews = (
    df_reviews
    .map(lambda value: value.strip() if isinstance(value, str) else value)
    .drop(columns=['unique_id'], errors='ignore')
)

# Display the cleaned DataFrame
df_reviews



Unnamed: 0,asin,product_name,product_type,helpful,rating,title,date,reviewer,reviewer_location,review_text
0,0785758968,Sphere: Books: Michael Crichton,books,0 of 1,5.0,One of the best Crichton novels,"July 1, 2006",Joseph M,"Colorado, USA",Sphere by Michael Crichton is an excellant nov...
1,0452279550,Healing from the Heart: A Leading Surgeon Comb...,books,34 of 41,4.0,The Medicine of the Future,"November 6, 2002",Wafa Rashed,"Jabriya, KUWAIT",Dr. Oz is an accomplished heart surgeon in the...
2,1599620065,Mythology: DC Comics Art of Alex Ross 2007 Cal...,books,,5.0,Beautiful!,"June 13, 2006","Sarah Silva ""Sar""","San Diego, CA USA",The most gorgeous artwork in comic books. Cont...
3,0743277724,Pegasus Descending: A Dave Robicheaux Novel (D...,books,1 of 1,4.0,For lovers of Robicheaux,"November 2, 2006",G. Rousseau,"Finistere, France",This book is for lovers of Robicheaux. His de...
4,061318114X,"Guns, Germs, and Steel: The Fates of Human Soc...",books,7 of 9,5.0,Excellent and broad survey of the development ...,"October 6, 2006","Patrick D. Goonan ""www.meaningful-life.us""","Pleasanton, CA",This is going to be a short and sweet review b...
...,...,...,...,...,...,...,...,...,...,...
5577,B0000C3GWV,"JBL Duet PC Speakers (2-Speaker, Silver): Elec...",electronics,1 of 4,1.0,Blah sound overall,"March 17, 2006","J. E. Mildren ""Coffee drinker extraordinaire""","Lawrence, Kan.",I was really excited for these affordable spea...
5578,B0000C3GWV,"JBL Duet PC Speakers (2-Speaker, Silver): Elec...",electronics,0 of 4,2.0,"Where's the headphone jack, Jack?","March 16, 2006",D. G. Springs,,These speakers sound great - but sometimes you...
5579,B0000BVUZ2,Targus PA206U Mini Retractable RJ-11 Cord (Pla...,electronics,1 of 2,1.0,Does not retract,"October 11, 2005",R. Abramson,,Agree with the prior reviewers. I can not get...
5580,B00032Q1IK,Lexar Media Single-Slot USB 2.0 Multi-Card Rea...,electronics,,2.0,Don't buy the generic one,"September 5, 2006",Feliciano,Venezuela,I have bought several of these. Some came in a...
