# ISY503 - Assessment 3 
Sentiment Analysis using NLP

This project uses Python 3.11.10.

In [2]:
#import necessary libraries
import pandas as pd
import numpy as np
from numpy import loadtxt
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import re
import pickle

import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joaquinmorales13a06/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Data upload
The data is divided into eight `.review` files, which contain positive and negative reviews alongside other metadata. I'll create a function that reads all the `.review` files and combines them into a single DataFrame.

In [15]:
# Helper: text of the first direct child called `tag`, or None when that child is absent
def _child_text(root, tag):
    node = root.find(tag)
    return node.text if node is not None else None


# Helper: the review's rating as a float, or None when it is missing or not numeric
def _parse_rating(root):
    try:
        # float() tolerates the surrounding newlines left by the line-joined XML;
        # a missing tag / empty tag gives None -> TypeError, junk text -> ValueError
        return float(_child_text(root, 'rating'))
    except (TypeError, ValueError):
        return None


# function that iterates on all file paths and parses the XML content
def parse_review_files(file_paths, encoding="utf-8", errors="replace"):
    """Parse ``.review`` files into a single DataFrame, one row per review.

    Each file is read line by line. Lines are accumulated until a line equal to
    ``</review>`` is seen; the accumulated block is then parsed as one XML
    document and its child elements are copied into a row.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the review files to read.
    encoding : str, default "utf-8"
        Text encoding passed to ``open`` so the result does not depend on the
        platform's default encoding.
    errors : str, default "replace"
        Decoding error policy passed to ``open``.

    Returns
    -------
    pandas.DataFrame
        Columns: unique_id, asin, product_name, product_type, helpful, rating,
        title, date, reviewer, reviewer_location, review_text. Text values are
        returned exactly as found in the XML (not stripped). A field whose tag
        is absent is None; ``rating`` is also None when it cannot be converted
        to float.

    Notes
    -----
    Blocks that are not well-formed XML are skipped and reported with a printed
    message; they do not abort the load.
    """
    # Define the columns for the DataFrame
    columns = ["unique_id", "asin", "product_name", "product_type", "helpful",
               "rating", "title", "date", "reviewer", "reviewer_location", "review_text"]
    data = []

    for file_path in file_paths:
        # Lines of the review block currently being accumulated
        review = []
        with open(file_path, 'r', encoding=encoding, errors=errors) as file:
            for line in file:
                stripped = line.strip()
                review.append(stripped)
                if stripped != "</review>":
                    continue

                # End of a review block: take the block and reset the buffer
                # before parsing so a failure can never leak into the next review
                review_xml = "\n".join(review)
                review = []
                try:
                    root = ET.fromstring(review_xml)
                except ET.ParseError:
                    # Skip entries that are not well-formed XML
                    print(f"Skipping a malformed entry in file: {file_path}")
                    continue

                review_data = {
                    column: _parse_rating(root) if column == "rating" else _child_text(root, column)
                    for column in columns
                }
                data.append(review_data)

    # Create a DataFrame with the combined data from all files
    df_reviews = pd.DataFrame(data, columns=columns)
    return df_reviews

# One (positive, negative) pair of review files per product domain
review_file_pairs = (
    ('./DATA/books/positive.review', './DATA/books/negative.review'),
    ('./DATA/dvd/positive.review', './DATA/dvd/negative.review'),
    ('./DATA/electronics/positive.review', './DATA/electronics/negative.review'),
    ('./DATA/kitchen_&_housewares/positive.review', './DATA/kitchen_&_housewares/negative.review'),
)

# Flatten the pairs into the ordered list of paths expected by the parser
file_paths = [path for pair in review_file_pairs for path in pair]

# Load every review file into one combined DataFrame
df_reviews = parse_review_files(file_paths)

# Preview the first rows
df_reviews.head()


Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entry in file: ./DATA/books/positive.review
Skipping a malformed entr

Unnamed: 0,unique_id,asin,product_name,product_type,helpful,rating,title,date,reviewer,reviewer_location,review_text
0,\n0785758968:one_of_the_best_crichton_novels:j...,\n0785758968\n,\nSphere: Books: Michael Crichton\n,\nbooks\n,\n0 of 1\n,5.0,\nOne of the best Crichton novels\n,"\nJuly 1, 2006\n",\nJoseph M\n,"\nColorado, USA\n",\nSphere by Michael Crichton is an excellant n...
1,\n0452279550:the_medicine_of_the_future:wafa_r...,\n0452279550\n,\nHealing from the Heart: A Leading Surgeon Co...,\nbooks\n,\n34 of 41\n,4.0,\nThe Medicine of the Future\n,"\nNovember 6, 2002\n",\nWafa Rashed\n,"\nJabriya, KUWAIT\n",\nDr. Oz is an accomplished heart surgeon in t...
2,"\n1599620065:beautiful!:sarah_silva_""sar""\n",\n1599620065\n,\nMythology: DC Comics Art of Alex Ross 2007 C...,\nbooks\n,\n\n,5.0,\nBeautiful!\n,"\nJune 13, 2006\n","\nSarah Silva ""Sar""\n","\nSan Diego, CA USA\n",\nThe most gorgeous artwork in comic books. Co...
3,\n0743277724:for_lovers_of_robicheaux:g._rouss...,\n0743277724\n,\nPegasus Descending: A Dave Robicheaux Novel ...,\nbooks\n,\n1 of 1\n,4.0,\nFor lovers of Robicheaux\n,"\nNovember 2, 2006\n",\nG. Rousseau\n,"\nFinistere, France\n",\nThis book is for lovers of Robicheaux. His ...
4,\n061318114X:excellent_and_broad_survey_of_the...,\n061318114X\n,"\nGuns, Germs, and Steel: The Fates of Human S...",\nbooks\n,\n7 of 9\n,5.0,\nExcellent and broad survey of the developmen...,"\nOctober 6, 2006\n","\nPatrick D. Goonan ""www.meaningful-life.us""\n","\nPleasanton, CA\n",\nThis is going to be a short and sweet review...


# Data Cleaning process
Now that all `.review` files have been combined into a single DataFrame, `df_reviews`, I'll clean it further: strip whitespace from each cell value, drop the columns not needed for the analysis, and remove rows with missing (NaN) values.

In [16]:
# Remove leading and trailing whitespace characters from each string value in the DataFrame
df_reviews = df_reviews.map(lambda x: x.strip() if isinstance(x, str) else x)

# Keep only the columns needed for the sentiment analysis.
# Selecting the wanted columns (rather than dropping the others in place) means
# re-running this cell does not fail on columns that are already gone.
df_reviews = df_reviews[["rating", "review_text"]]

# Count the number of NaN values in each remaining column
na_counts = df_reviews.isna().sum()
print("NaN values in each column:\n", na_counts)

# Drop rows with any NaN values, then rebuild a contiguous 0..n-1 index so that
# later positional and label-based lookups agree even when rows were removed
df_reviews = df_reviews.dropna().reset_index(drop=True)

# Display df_reviews shape
print(df_reviews.shape)


NaN values in each column:
 rating         0
review_text    0
dtype: int64
(5582, 2)


In [13]:
# Initialize an empty list to store the processed reviews
corpus = []

# Create an instance of PorterStemmer for stemming words
stemmer = PorterStemmer()

# Iterate over the review texts directly: this does not depend on the
# DataFrame's index labels, so it keeps working when rows have been dropped
for review in df_reviews['review_text']:
    # Replace everything except ASCII letters (punctuation, digits, ...) with a space
    review = re.sub('[^a-zA-Z]', ' ', review)

    # Lowercase and split the text into individual words
    review = review.lower().split()

    # Stem each word, skipping English stopwords
    review = [stemmer.stem(word) for word in review if word not in STOPWORDS]

    # Join the processed words back into a single string and store it
    corpus.append(' '.join(review))

# Display the first 5 processed reviews
corpus[:5]


['sphere michael crichton excel novel certainli hardest put crichton novel read stori revolv around man name norman johnson johnson phycologist travel civilan remot locat pacif ocean help navi top secret misssion quickli learn ocean half mile long spaceship civilan travel center feet ocean live research spacecraft join navi personel help run oper howev surfac typhoon come support ship surfac must leav team ten stuck feet surfac ocean day sea find spacecraft actual american ship explor black hole brought back strang thing back earth novel research crichton novel still lot inform random thing law partial pressur behavior analysi would strongli recommend book',
 'dr oz accomplish heart surgeon field cardiac transplant describ combin complementari medicin e g hypnosi reflexolog yoga messag acupunctur etc orthodox western medicin excel forward dr dean ornish interest epilogu contain overview complementari medicin techniqu bulk book contain stori patient dr oz treat use revolutionari way car

In [18]:
# Using Count Vectorizer to convert the text data into a matrix of token counts,
# limited to 2500 features
cv = CountVectorizer(max_features=2500)

# Fit the vectorizer on the corpus and build the dense feature matrix;
# the labels are the review ratings
X = cv.fit_transform(corpus).toarray()
Y = df_reviews['rating'].values

# Save the fitted Count Vectorizer. The `with` block closes the file handle
# deterministically so the pickle is fully flushed to disk.
with open('Models/count_vectorizer.pkl', 'wb') as model_file:
    pickle.dump(cv, model_file)



In [19]:
# Shapes of the feature matrix and the label vector, one per line
print(X.shape, Y.shape, sep="\n")

(5582, 2500)
(5582,)


In [20]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=42, test_size=0.3
)

# Report the shape of every partition
print(
    f"X train: {X_train.shape}",
    f"y train: {Y_train.shape}",
    f"X test: {X_test.shape}",
    f"y test: {Y_test.shape}",
    sep="\n",
)

# Largest value found in each feature partition
print(
    f"X train max value: {X_train.max()}",
    f"X test max value: {X_test.max()}",
    sep="\n",
)

X train: (3907, 2500)
y train: (3907,)
X test: (1675, 2500)
y test: (1675,)
X train max value: 29
X test max value: 56


In [21]:
# Scale the features with MinMaxScaler. The scaler is fitted on the training
# split only and then applied to the test split, so test values are scaled with
# the training min/max and can fall outside [0, 1] (the test split has a larger
# maximum count than the training split).
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the fitted scaler. The `with` block closes the file handle
# deterministically so the pickle is fully flushed to disk.
with open('Models/min_max_scaler.pkl', 'wb') as model_file:
    pickle.dump(scaler, model_file)