In [8]:
import pandas as pd

# Load datase
df = pd.read_csv("cleaned_text_only.csv")
df = df[['text']].dropna()
df.columns = ['Review']  # Rename for consistency with previous pipeline

# Display few entries
df.head()

def convert_to_lowercase(text):
    return text.lower()

df["lowercased"] = df["Review"].apply(convert_to_lowercase)
df["lowercased"].head()

import re

def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text)

df["urls_removed"] = df["lowercased"].apply(remove_urls)
df["urls_removed"].head()

from bs4 import BeautifulSoup

def remove_html_tags(text):
    return BeautifulSoup(text, "html.parser").get_text()

df["html_removed"] = df["urls_removed"].apply(remove_html_tags)
df["html_removed"].head()

import emoji

def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

df["emojis_removed"] = df["html_removed"].apply(remove_emojis)
df["emojis_removed"].head()

# Replace internet slang/chat words 
# Dictionary of slang words and their replacements 
slang_dict = { 
    "tbh": "to be honest", 
    "omg": "oh my god", 
    "lol": "laugh out loud", 
    "idk": "I don't know", 
    "brb": "be right back", 
    "btw": "by the way", 
    "imo": "in my opinion", 
    "smh": "shaking my head", 
    "fyi": "for your information", 
    "np": "no problem",
    "ikr": "I know right", 
    "asap": "as soon as possible", 
    "bff": "best friend forever", 
    "gg": "good game", 
    "hmu": "hit me up", 
    "rofl": "rolling on the floor laughing",
    "imho": "in my humble opinion",
    "sick": "awesome",  # Context-dependent
    "cap": "lie", 
} 
 
# Function to replace slang words 
def replace_slang(text): 
    # Create a list of escaped slang words 
    escaped_slang_words = []  # Empty list to store escaped slang words 
 
    for word in slang_dict.keys(): 
        escaped_word = re.escape(word)  # Ensure special characters are escaped 
        escaped_slang_words.append(escaped_word)  # Add to list 
 
    # Join the words using '|' 
    slang_pattern = r'\b(' + '|'.join(escaped_slang_words) + r')\b' 
 
    # Define a replacement function 
    def replace_match(match): 
        slang_word = match.group(0)  # Extract matched slang word 
        return slang_dict[slang_word.lower()]  # Replace with full form 
 
    # Use regex to replace slang words with full forms 
    replaced_text = re.sub(slang_pattern, replace_match, text, flags=re.IGNORECASE) 
 
    return replaced_text

# Apply the function to the column 
df["slangs_replaced"] = df["emojis_removed"].apply(replace_slang) 
 
# Display column content without truncation 
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width 
print(df["slangs_replaced"])

# Replace Contractions 
contractions_dict = { 
    "wasn't": "was not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "won't": "will not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is",
    "dont": "do not",
    "aint": "is not",
    "heres": "here is",
    "thats": "that is",
    "cant": "cannot",
    "wont": "will not",
    "whats": "what is",
    "doesnt": "does not",
    "didnt": "did not",
    "havent": "have not",
    "couldnt": "could not",
    "shouldnt": "should not",
    "wouldnt": "would not",
    "hes": "he is",
    "theyre": "they are"
} 

# Build the regex pattern for contractions 
escaped_contractions = []  # List to store escaped contractions 

for contraction in contractions_dict.keys(): 
    escaped_contraction = re.escape(contraction)  # Escape special characters (e.g., apostrophes) 
    escaped_contractions.append(escaped_contraction)  # Add to list 

# Join the escaped contractions with '|' 
joined_contractions = "|".join(escaped_contractions) 

# Create a regex pattern with word boundaries (\b) 
contractions_pattern = r'\b(' + joined_contractions + r')\b' 

# Compile the regex 
compiled_pattern = re.compile(contractions_pattern, flags=re.IGNORECASE) 

# Define a function to replace contractions 
def replace_contractions(text): 

# Function to handle each match found 
    def replace_match(match): 
        matched_word = match.group(0)  # Extract matched contraction 
        lower_matched_word = matched_word.lower()  # Convert to lowercase
        expanded_form = contractions_dict[lower_matched_word]  # Get full form from dictionary 
        return expanded_form  # Return the expanded form 

    # Apply regex substitution 
    expanded_text = compiled_pattern.sub(replace_match, text) 
    return expanded_text  # Return modified text 

# Apply the function to a DataFrame column 
df["contractions_replaced"] = df["slangs_replaced"].apply(replace_contractions) 

# Display column content without truncation 
pd.set_option('display.max_colwidth', None)
df["contractions_replaced"].head()

import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df["punctuations_removed"] = df["contractions_replaced"].apply(remove_punctuation)
df["punctuations_removed"].head()

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

df["numbers_removed"] = df["punctuations_removed"].apply(remove_numbers)
df["numbers_removed"].head()

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    return " ".join([word for word in words if word.lower() not in stop_words])

df["stopwords_removed"] = df["numbers_removed"].apply(remove_stopwords)
df["stopwords_removed"].head()

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem_text(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

df["stemmed"] = df["stopwords_removed"].apply(stem_text)
df["stemmed"].head()

from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    return " ".join([lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags])

df["lemmatized"] = df["stopwords_removed"].apply(lemmatize_text)
df["lemmatized"].head()

# Remove columns with all NaNs
df = df.dropna(axis=1, how='all')

# Remove columns with more than 90% missing values
df = df.loc[:, df.isnull().mean() < 0.9]

# Remove columns with no variance (same value in all rows)
df = df.loc[:, df.nunique() > 1]

# Remove columns that are only whitespace or empty strings
df = df.loc[:, ~(df.apply(lambda col: col.astype(str).str.strip().eq('').all()))]

# Save only the 'lemmatized' column
df[["lemmatized"]].to_csv("Processed_News_Lemmatized.csv", index=False)
print("Saved as Processed_News.csv with only the lemmatized column.")

  return BeautifulSoup(text, "html.parser").get_text()


0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

[nltk_data] Downloading package stopwords to C:\Users\Afiq
[nltk_data]     Fikri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Afiq
[nltk_data]     Fikri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Afiq
[nltk_data]     Fikri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Afiq Fikri\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Saved as Processed_News.csv with only the lemmatized column.
