In [54]:
# Load dataset
import pandas as pd

file_path = "Review.csv"
df = pd.read_csv(file_path)

pd.set_option('display.max_colwidth', None)

print(df)

                                                                           Review
0   The product arrived on time. Packaging was great, and the quality is amazing!
1                                        THIS PRODUCT IS JUST AMAZING! I LOVE IT.
2     I bought this phone for $799, and it has a 120Hz display. Totally worth it!
3                         Wow!!! This product is awesome... but a bit expensive??
4                                             The laptop works perfectly fine.   
5    Check out the full product details here: https://example.com/product-details
6         <div><h2>Great Purchase!</h2><p>I am happy with this product.</p></div>
7                The battry life is excelent, but the chargin cable is too short.
8                       I can't believe it's so good! Didn't expect such quality.
9                   Love this product! ???? Fast delivery ??, amazing quality! ??
10                       TBH, I wasnít expecting much, but OMG, this is awesome!!
11              

In [55]:
# Lowercase conversion
def convert_to_lowercase(text):
    return text.lower()

df["lowercased"] = df["Review"].apply(convert_to_lowercase)

pd.set_option('display.max_colwidth', None)

print(df["lowercased"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5      check out the full product details here: https://example.com/product-details
6           <div><h2>great purchase!</h2><p>i am happy with this product.</p></div>
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasnít expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [56]:
# Removal of URLs
import re

def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text)

df["urls_removed"] = df["lowercased"].apply(remove_urls)

pd.set_option('display.max_colwidth', None)

print(df["urls_removed"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6           <div><h2>great purchase!</h2><p>i am happy with this product.</p></div>
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasnít expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [57]:
# Removal of HTML tags
from bs4 import BeautifulSoup

def remove_html_tags(text):
    return BeautifulSoup(text, "html.parser").get_text()

df["html_removed"] = df["urls_removed"].apply(remove_html_tags)

pd.set_option('display.max_colwidth', None)

print(df["html_removed"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasnít expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [58]:
# Removal of emojis (if any)
import emoji

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

df["emojis_removed"] = df["html_removed"].apply(remove_emojis)

pd.set_option('display.max_colwidth', None)

print(df["emojis_removed"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10                         tbh, i wasnít expecting much, but omg, this is awesome!!
11                            this is the best product i have ever used in m

In [59]:
# Replace internet slang/chat words

slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "I don't know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ikr": "I know right",
    "asap": "as soon as possible",
    "bff": "best friend forever",
    "gg": "good game",
    "hmu": "hit me up",
    "rofl": "rolling on the floor laughing"
}

def replace_slang(text):
    import re

    escaped_slang_words = []
    for word in slang_dict.keys():
        escaped_slang_words.append(re.escape(word))

    slang_pattern = r'\b(' + '|'.join(escaped_slang_words) + r')\b'

    def replace_match(match):
        slang_word = match.group(0)
        return slang_dict[slang_word.lower()]

    return re.sub(slang_pattern, replace_match, text, flags=re.IGNORECASE)

df["slangs_replaced"] = df["emojis_removed"].apply(replace_slang)

pd.set_option('display.max_colwidth', None)

print(df["slangs_replaced"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                         i can't believe it's so good! didn't expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10          to be honest, i wasnít expecting much, but oh my god, this is awesome!!
11                            this is the best product i have ever used in m

In [60]:
# Replace Contractions
import re

contractions_dict = {
    "wasn't": "was not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "won't": "will not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is"
}

escaped_contractions = []
for contraction in contractions_dict.keys():
    escaped_contractions.append(re.escape(contraction))

pattern = r'\b(' + "|".join(escaped_contractions) + r')\b'
compiled_pattern = re.compile(pattern, flags=re.IGNORECASE)

def replace_contractions(text):
    def replace_match(match):
        return contractions_dict[match.group(0).lower()]
    return compiled_pattern.sub(replace_match, text)

df["contractions_replaced"] = df["slangs_replaced"].apply(replace_contractions)

pd.set_option('display.max_colwidth', None)

print(df["contractions_replaced"])

0     the product arrived on time. packaging was great, and the quality is amazing!
1                                          this product is just amazing! i love it.
2       i bought this phone for $799, and it has a 120hz display. totally worth it!
3                           wow!!! this product is awesome... but a bit expensive??
4                                               the laptop works perfectly fine.   
5                                         check out the full product details here: 
6                                      great purchase!i am happy with this product.
7                  the battry life is excelent, but the chargin cable is too short.
8                      i cannot believe it is so good! did not expect such quality.
9                     love this product! ???? fast delivery ??, amazing quality! ??
10          to be honest, i wasnít expecting much, but oh my god, this is awesome!!
11                            this is the best product i have ever used in m

In [61]:
# Remove punctuations and special characters
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df["punctuations_removed"] = df["contractions_replaced"].apply(remove_punctuation)

pd.set_option('display.max_colwidth', None)

print(df["punctuations_removed"])

0     the product arrived on time packaging was great and the quality is amazing
1                                         this product is just amazing i love it
2        i bought this phone for 799 and it has a 120hz display totally worth it
3                                wow this product is awesome but a bit expensive
4                                             the laptop works perfectly fine   
5                                       check out the full product details here 
6                                     great purchasei am happy with this product
7                 the battry life is excelent but the chargin cable is too short
8                     i cannot believe it is so good did not expect such quality
9                             love this product  fast delivery  amazing quality 
10            to be honest i wasnít expecting much but oh my god this is awesome
11                          this is the best product i have ever used in my life
12    the shoes were comfort

In [62]:
# Remove numbers
import re

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

df["numbers_removed"] = df["punctuations_removed"].apply(remove_numbers)

pd.set_option('display.max_colwidth', None)

print(df["numbers_removed"])

0     the product arrived on time packaging was great and the quality is amazing
1                                         this product is just amazing i love it
2              i bought this phone for  and it has a hz display totally worth it
3                                wow this product is awesome but a bit expensive
4                                             the laptop works perfectly fine   
5                                       check out the full product details here 
6                                     great purchasei am happy with this product
7                 the battry life is excelent but the chargin cable is too short
8                     i cannot believe it is so good did not expect such quality
9                             love this product  fast delivery  amazing quality 
10            to be honest i wasnít expecting much but oh my god this is awesome
11                          this is the best product i have ever used in my life
12    the shoes were comfort

In [63]:
# Correct spelling mistakes
from autocorrect import Speller

spell = Speller(lang='en')

def correct_spelling(text):
    return spell(text)

df["spelling_corrected"] = df["numbers_removed"].apply(correct_spelling)

pd.set_option('display.max_colwidth', None)

print(df["spelling_corrected"])

0     the product arrived on time packaging was great and the quality is amazing
1                                         this product is just amazing i love it
2              i bought this phone for  and it has a hz display totally worth it
3                                wow this product is awesome but a bit expensive
4                                             the laptop works perfectly fine   
5                                       check out the full product details here 
6                                     great purchased am happy with this product
7              the battery life is excellent but the charging cable is too short
8                     i cannot believe it is so good did not expect such quality
9                             love this product  fast delivery  amazing quality 
10            to be honest i wasnít expecting much but oh my god this is awesome
11                          this is the best product i have ever used in my life
12    the shoes were comfort

In [64]:
# Remove stopwords
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = []

    for word in words:
        if word.lower() not in stop_words:
            filtered_words.append(word)

    return " ".join(filtered_words)

df["stopwords_removed"] = df["spelling_corrected"].apply(remove_stopwords)

pd.set_option('display.max_colwidth', None)

print(df["stopwords_removed"])

0          product arrived time packaging great quality amazing
1                                          product amazing love
2                         bought phone hz display totally worth
3                             wow product awesome bit expensive
4                                   laptop works perfectly fine
5                                    check full product details
6                                 great purchased happy product
7                   battery life excellent charging cable short
8                            cannot believe good expect quality
9                    love product fast delivery amazing quality
10                  honest wasnít expecting much oh god awesome
11                                  best product ever used life
12    shoes comfortable fitting nicely worked perfectly jogging
Name: stopwords_removed, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/d7fd5ab3-e375-4f2f-91b8-
[nltk_data]     5b636c02958c/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:
# Apply stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def apply_stemming(text):
    words = text.split()
    stemmed_words = []

    for word in words:
        stemmed_words.append(stemmer.stem(word))

    return " ".join(stemmed_words)

df["stemmed"] = df["stopwords_removed"].apply(apply_stemming)

pd.set_option('display.max_colwidth', None)

print(df["stemmed"])

0     product arriv time packag great qualiti amaz
1                                product amaz love
2              bought phone hz display total worth
3                    wow product awesom bit expens
4                       laptop work perfectli fine
5                        check full product detail
6                      great purchas happi product
7              batteri life excel charg cabl short
8                cannot believ good expect qualiti
9          love product fast deliveri amaz qualiti
10         honest wasnít expect much oh god awesom
11                      best product ever use life
12        shoe comfort fit nice work perfectli jog
Name: stemmed, dtype: object


In [66]:
# Apply lemmatization
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def apply_lemmatization(text):
    words = text.split()
    lemmatized_words = []

    for word in words:
        lemmatized_words.append(lemmatizer.lemmatize(word))

    return " ".join(lemmatized_words)

df["lemmatized"] = df["stopwords_removed"].apply(apply_lemmatization)

pd.set_option('display.max_colwidth', None)

print(df["lemmatized"])

0         product arrived time packaging great quality amazing
1                                         product amazing love
2                        bought phone hz display totally worth
3                            wow product awesome bit expensive
4                                   laptop work perfectly fine
5                                    check full product detail
6                                great purchased happy product
7                  battery life excellent charging cable short
8                           cannot believe good expect quality
9                   love product fast delivery amazing quality
10                 honest wasnít expecting much oh god awesome
11                                 best product ever used life
12    shoe comfortable fitting nicely worked perfectly jogging
Name: lemmatized, dtype: object


[nltk_data] Downloading package wordnet to
[nltk_data]     /home/d7fd5ab3-e375-4f2f-91b8-
[nltk_data]     5b636c02958c/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/d7fd5ab3-e375-4f2f-91b8-
[nltk_data]     5b636c02958c/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [67]:
df.to_csv("Processed_Reviews.csv", index=False)

In [72]:
df = pd.read_csv("UNITENReview.csv")

df["processed"] = df["Review"].apply(preprocess_text)

df.to_csv("UNITEN_Processed.csv", index=False)