# This notebook aims to clean and preprocess the Amazon Review csv file for further analysis

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
stop = stopwords.words('english')
nltk.download('vader_lexicon')
nltk.download('punkt')
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
reviews = pd.read_csv('amazon_reviews.csv')

In [3]:
reviews.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion
0,d5a7e529-2077-42dd-a624-21df7a9d5bc0,sateesh gunda,"Cheating,I have cancelled my subscription befo...",1,0,28.18.0.100,2024-09-21 16:07:04,28.18.0.100
1,84c32777-b078-47dd-b8f2-d42e9843482b,Alexis Harber,I'm echoing everyone else's sentiment the new ...,1,0,28.18.0.100,2024-09-21 16:03:47,28.18.0.100
2,7e30d63b-042b-4175-9af9-09b36d0496b1,Carly-jade Howard,I really like it it's great for bdays,3,0,28.18.0.100,2024-09-21 15:39:27,28.18.0.100
3,b6acf26a-1cd7-4e63-87ff-dc57524e5ab7,Bryan Shackelford,"App keeps crashing I open it ,it crashes can't...",1,1,28.18.0.100,2024-09-21 15:28:41,28.18.0.100
4,1f39586f-4ae5-4a37-8a02-1492293e83c3,Tim Tucker,"It's messed up, not accurate. I had to go into...",2,1,28.18.0.100,2024-09-21 15:19:33,28.18.0.100


# Data Cleaning

In [4]:
# Checking for nulls
reviews.isnull().sum()

reviewId                   0
userName                   5
content                    4
score                      0
thumbsUpCount              0
reviewCreatedVersion    6373
at                         0
appVersion              6373
dtype: int64

In [5]:
# Drop columns reviewCreatedVersion and appVersion
reviews.drop(['reviewCreatedVersion', 'appVersion', 'reviewId'], axis=1, inplace=True)

In [6]:
# Fill username nulls with 'Unknown'
reviews['userName'].fillna('Unknown', inplace=True)

In [7]:
reviews.dropna(inplace=True)

In [8]:
reviews.shape

(61353, 5)

### Fixing encoding issues â€™ were showing in excel instead of " ' ".

In [9]:
# Fix encoding issues in text by re-encoding from Windows-1252 to UTF-8
def fix_encoding(text):
    return text.encode('windows-1252', errors='ignore').decode('utf-8', errors='ignore')

In [10]:
# Apply the fix_encoding function to the Review column
reviews['content'] = reviews['content'].apply(fix_encoding)

In [11]:
# Map treebank part of speech tags to WordNet part of speech tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [12]:
# Data preprocessing function
def preprocess_text(text):
    """
    Preprocess text by removing punctuation, stop words, and lemmatizing
    """
    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Lemmatize tokens using their part of speech tags
    lemmatizer = WordNetLemmatizer()
    pos_tags = nltk.pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tags]
    
    return ' '.join(lemmatized_tokens)

In [13]:
# Apply the preprocess_text function to the Review column
reviews['content'] = reviews['content'].apply(preprocess_text)

In [14]:
# Remove rows with empty content
reviews = reviews[reviews['content'] != '']

In [17]:
reviews.head()

Unnamed: 0,userName,content,score,thumbsUpCount,at
0,sateesh gunda,CheatingI cancel subscription 2days even thoug...,1,0,2024-09-21 16:07:04
1,Alexis Harber,Im echo everyone elses sentiment new update la...,1,0,2024-09-21 16:03:47
2,Carly-jade Howard,really like great bdays,3,0,2024-09-21 15:39:27
3,Bryan Shackelford,App keep crash open crash cant even use guess ...,1,1,2024-09-21 15:28:41
4,Tim Tucker,messed accurate go website get accurate depict...,2,1,2024-09-21 15:19:33


In [16]:
# Save the cleaned data to a new CSV file
# reviews.to_csv('cleaned_reviews.csv', index=False)