In [67]:
# Import dependencies
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [68]:
# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4') # For lemmatizer synonyms

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\baqui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baqui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\baqui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\baqui\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [69]:
# Load dataset
df = pd.read_csv('before_preprocess.csv')
df.head()

Unnamed: 0,Review Text,Label
0,"sir okay armygreen shorts nice""""",1
1,"di pareha yong mga size nila may sobrang liit""""",0
2,super worth it ang ganda Sombra grabi order na...,1
3,"ganda po salamat""""",1
4,"maayos pagkadeliver maganda den sya""""",0


In [70]:
# Define additional Tagalog stopwords
tagalog_stopwords = set([
    'ang', 'ng', 'sa', 'mga', 'si', 'kay', 'ni', 'ito', 'iyon', 'doon', 'dito',
    'ay', 'na', 'pa', 'rin', 'naman', 'para', 'habang', 'kung', 'kasi', 'at',
    'hindi', 'oo', 'huwag', 'wala', 'may', 'meron', 'nasa', 'baka', 'bakit',
    'paano', 'saan', 'kailan', 'lahat', 'amin', 'atin', 'kanila', 'kami', 'kayo',
    'ikaw', 'siya', 'nila', 'niya', 'natin', 'tayo', 'ako', 'ikaw', 'ko', 'mo'
])

In [71]:
# Combine English and Tagalog stopwords
stop_words = set(stopwords.words('english')).union(tagalog_stopwords)

In [72]:
# Initialize lemmatizer for English
lemmatizer = WordNetLemmatizer()

In [73]:
# Define simple rule-based Tagalog lemmatizer
def tagalog_lemmatize(word):
    # Remove common prefixes and suffixes
    prefixes = ['mag', 'nag', 'pag', 'ma', 'ka', 'um', 'in', 'pinaka', 'pina', 'ika']
    suffixes = ['an', 'han', 'in']

    for pre in prefixes:
        if word.startswith(pre) and len(word) > len(pre) + 2:
            word = word[len(pre):]
            break

    for suf in suffixes:
        if word.endswith(suf) and len(word) > len(suf) + 2:
            word = word[:-len(suf)]
            break

    return word

In [74]:
# Define preprocessing function
def preprocess_text(text: str) -> list[str]:
    if not isinstance(text, str):
        return []

    # Lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove emojis (non-BMP Unicode characters)
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [tok for tok in tokens if tok not in stop_words]

    # Lemmatize (English) and Tagalog simplify
    lemmatized = []
    for tok in tokens:
        if tok in stopwords.words('english'):
            lemmatized.append(lemmatizer.lemmatize(tok))
        else:
            lemmatized.append(tagalog_lemmatize(tok))

    return lemmatized

In [75]:
# Apply to text columns
text_cols = ['Review Text']
for col in text_cols:
    df[f'{col}_processed'] = df[col].apply(preprocess_text)

In [76]:
# Preview the result
df.head()

Unnamed: 0,Review Text,Label,Review Text_processed
0,"sir okay armygreen shorts nice""""",1,"[sir, okay, armygreen, shorts, nice]"
1,"di pareha yong mga size nila may sobrang liit""""",0,"[di, pareha, yong, size, sobrang, liit]"
2,super worth it ang ganda Sombra grabi order na...,1,"[super, worth, ganda, sombra, grabi, order, di..."
3,"ganda po salamat""""",1,"[ganda, po, salamat]"
4,"maayos pagkadeliver maganda den sya""""",0,"[ayos, kadeliver, anda, den, sya]"


In [77]:
df.to_csv('after_process.csv')