In [1]:
# Import necessary libraries
import pandas as pd
import re
import emoji
import string
from bs4 import BeautifulSoup
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK resources (if not already downloaded)
nltk.download('stopwords')
nltk.download('wordnet')

# Step 1: Read the source data
df = pd.read_csv("UNITENReview.csv")

# Step 2: Convert text to lowercase
df["lowercased"] = df["Review"].apply(lambda x: x.lower())

# Step 3: Remove URLs
df["urls_removed"] = df["lowercased"].apply(lambda x: re.sub(r'http\S+|www\S+', '', x))

# Step 4: Remove HTML tags
df["html_removed"] = df["urls_removed"].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())

# Step 5: Remove emojis
df["emojis_removed"] = df["html_removed"].apply(lambda x: emoji.replace_emoji(x, replace=''))

# Step 6: Replace internet slang/chat words
slang_dict = {
    "tbh": "to be honest", "omg": "oh my god", "lol": "laugh out loud", "idk": "I don't know",
    "brb": "be right back", "btw": "by the way", "imo": "in my opinion", "smh": "shaking my head",
    "fy": "for your information", "np": "no problem", "ikr": "I know right", "asap": "as soon as possible",
    "bff": "best friend forever", "gg": "good game", "hmu": "hit me up", "rofl": "rolling on the floor laughing"
}
def replace_slang(text):
    for slang, full_form in slang_dict.items():
        text = re.sub(r'\b' + re.escape(slang) + r'\b', full_form, text, flags=re.IGNORECASE)
    return text
df["slangs_replaced"] = df["emojis_removed"].apply(replace_slang)

# Step 7: Replace contractions
contractions_dict = {
    "wasn't": "was not", "isn't": "is not", "aren't": "are not", "weren't": "were not",
    "doesn't": "does not", "don't": "do not", "didn't": "did not", "can't": "cannot",
    "couldn't": "could not", "shouldn't": "should not", "wouldn't": "would not",
    "won't": "will not", "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
    "i'm": "I am", "you're": "you are", "he's": "he is", "she's": "she is", "it's": "it is",
    "we're": "we are", "they're": "they are", "i've": "I have", "you've": "you have",
    "we've": "we have", "they've": "they have", "i'd": "I would", "you'd": "you would",
    "he'd": "he would", "she'd": "she would", "we'd": "we would", "they'd": "they would",
    "i'll": "I will", "you'll": "you will", "he'll": "he will", "she'll": "she will",
    "we'll": "we will", "they'll": "they will", "let's": "let us", "that's": "that is",
    "who's": "who is", "what's": "what is", "where's": "where is", "when's": "when is",
    "why's": "why is"
}
def replace_contractions(text):
    for contraction, full_form in contractions_dict.items():
        text = re.sub(r'\b' + re.escape(contraction) + r'\b', full_form, text, flags=re.IGNORECASE)
    return text
df["contractions_replaced"] = df["slangs_replaced"].apply(replace_contractions)

# Step 8: Remove punctuation and special characters
df["punctuations_removed"] = df["contractions_replaced"].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# Step 9: Remove numbers
df["numbers_removed"] = df["punctuations_removed"].apply(lambda x: re.sub(r'\d+', '', x))

# Step 10: Correct spelling mistakes
spell = Speller(lang='en')
df["spelling_corrected"] = df["numbers_removed"].apply(lambda x: spell(x))

# Step 11: Remove stopwords
stop_words = set(stopwords.words('english'))
df["stopwords_removed"] = df["spelling_corrected"].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Step 12: Lemmatization
lemmatizer = WordNetLemmatizer()
df["lemmatized"] = df["stopwords_removed"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Save the result to a CSV file
df.to_csv("Processed_UNITENReviews.csv", index=False)

# Display the final processed data
print(df[["Review", "lemmatized"]].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emier\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emier\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  df["html_removed"] = df["urls_removed"].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())


                                              Review  \
0  Im happy with uniten actually, even the people...   
1  I’m having a pretty good time here, happy to m...   
2        a very neutral place in terms of everything   
3  I would say Uniten it's  a good university  bu...   
4   UNITEN is well-regarded, particularly for its...   

                                          lemmatized  
0             im happy united actually even people w  
1           i’m pretty good time happy meet w people  
2                      neutral place term everything  
3  would say united good university issue need im...  
4  united wellregarded particularly strong engine...  
