# Research Paper

## Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from textblob import TextBlob
import re
import emoji
from spellchecker import SpellChecker
from nltk.corpus import stopwords

## Preparing for Sentiment Analysis

### Loading datasets

In [None]:
# Shopee reviews: every column in the file is loaded.
shopee_dataset = pd.read_csv("shopee_reviews.csv")

# eBay reviews: keep only the title and content columns.
ebay_dataset = pd.read_csv(
    "ebay_reviews.csv",
    usecols=['review title', 'review content'],
)

# Amazon reviews: keep the product/user identifiers, timestamp and review text.
amazon_dataset = pd.read_csv(
    "Reviews.csv",
    usecols=['ProductId', 'UserId', 'Time', 'Text'],
)

# Walmart reviews: keep the review text and the reviewer name.
walmart_dataset = pd.read_csv(
    "marketing_sample_for_walmart_com-walmart_product_reviews__20200401_20200630__30k_data.csv",
    usecols=['Review', 'Reviewer Name'],
)

### Setting up functions for computing polarity and subjectivity, and for classifying the polarity score

In [None]:
# Get subjectivity score of text
def getSubjectivity(text):
    """Return the TextBlob subjectivity score of ``text``.

    The score is read from ``TextBlob(text).sentiment.subjectivity``.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Get polarity score of text
def getPolarity(text):
    """Return the TextBlob polarity score of ``text``.

    The score is read from ``TextBlob(text).sentiment.polarity``.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Analyze polarity score
def getAnalysis(polarity_score):
    """Classify a polarity score as 'Negative', 'Neutral' or 'Positive'.

    Scores below zero are 'Negative', a score of exactly zero is 'Neutral',
    and everything else is 'Positive'.
    """
    if polarity_score < 0:
        return 'Negative'
    if polarity_score == 0:
        return 'Neutral'
    # Anything that is neither below nor equal to zero lands here; that
    # includes NaN, for which both comparisons above are False.
    return 'Positive'

### Text pre-processing

Pre-processing techniques used:

 - Converting all characters to lowercase
 - Removing URLs
 - Removing hashtags
 - Removing numerical data
 - Converting emojis to words
 - Removing extra whitespaces
 - Correcting spelling
 - Removing stopwords
 - Tokenization
 - Lemmatization

In [None]:
# Converting all characters to lowercase
def lowercasing(text):
    """Return a copy of ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

# Remove URLs
def remove_urls(text):
    """Return ``text`` with URLs removed.

    Anything starting with ``http://``, ``https://`` or ``www.`` is deleted
    up to the next whitespace character. Surrounding whitespace is left
    untouched, and the match is case-sensitive.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Remove hashtags
def remove_hashtags(text):
    """Return ``text`` with hashtags removed.

    A hashtag is a ``#`` followed by at least one non-whitespace character;
    the whole run up to the next whitespace is deleted. A lone ``#`` followed
    by whitespace is kept.
    """
    return re.sub(r'#([^\s]+)', r'', text)

# Remove numerical data
def remove_num(text):
    """Return ``text`` with every character for which ``str.isdigit`` is true removed."""
    return "".join(ch for ch in text if not ch.isdigit())

# Emojis to words
def emojiToWord(text):
    """Replace emojis in ``text`` with their names written as plain words.

    ``emoji.demojize`` is called with ``__`` as both delimiters. The double
    underscores are then turned into single spaces, and any remaining single
    underscore is turned into a space as well. Note that the last step also
    affects underscores that were already present in ``text``.
    """
    named = emoji.demojize(text, delimiters=("__", "__"))
    # Order matters: collapsing "__" first gives one space per delimiter
    # rather than two.
    return named.replace('__', ' ').replace('_', ' ')

# Remove extra whitespace
def remove_extraws(text):
    """Collapse each whitespace run in ``text`` to one space and trim both ends."""
    words = text.split()
    return " ".join(words)

# Spell checker
spell = SpellChecker()
def correct_spellings(text):
    """Return ``text`` with words unknown to the spell checker replaced.

    The text is split on whitespace, each word reported by
    ``spell.unknown`` is replaced with ``spell.correction(word)``, and the
    words are re-joined with single spaces. A word for which the checker
    offers no correction is kept unchanged.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() can return None when it has no candidate; fall
            # back to the original word so the join below never sees None.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
def remove_sw(text):
    """Return the list of words from ``text`` that are not English stopwords.

    ``text`` may be a string, which is split on whitespace into words, or an
    iterable of already-tokenized words, which is filtered as-is. Membership
    is tested against ``stop_words`` exactly as given (no case folding).
    """
    # Iterating a str directly would walk it character by character and drop
    # single letters that happen to be stopwords, so split strings first.
    tokens = text.split() if isinstance(text, str) else text
    return [w for w in tokens if w not in stop_words]

In [None]:
# Preprocessing datasets 
# [code here]