<a href="https://colab.research.google.com/github/jiyuutheosum/Machine-Learning/blob/main/Baraocor_Activity_Week13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Title: Text Normalization and Text Feature Extraction - Week 13 Laboratory Activity


**Name:** `[ Jalanie Baraocor ] `

**Section:** `[4R8]`

---
**Objective:**

This activity aims to enable students to apply text normalization and feature extraction techniques as essential steps in Text Mining and Decision Support Systems. Students will learn how to clean and standardize raw text data (e.g., lowercasing, removing stopwords, stemming, and lemmatization) and transform it into numerical representations such as Bag-of-Words (BoW) and TF-IDF vectors for further machine learning and analytical applications.



In [None]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **1. Import and Install Packages**

In [None]:
# Install packages (Colab)
!pip install -q nltk scikit-learn pandas tqdm pyspellchecker textblob

# TextBlob needs corpora
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

# Imports
from google.colab import drive
import re
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from spellchecker import SpellChecker
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.9/7.2 MB[0m [31m28.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m6.3/7.2 MB[0m [31m101.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.2/7.2 MB[0m [31m93.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


# **2. Load the Dataset**

In [None]:
# Location of the reviews CSV on the mounted Drive.
INPUT_CSV = "/content/drive/MyDrive/Colab Notebooks/dataset.csv"
df = pd.read_csv(INPUT_CSV)

print("Rows:", len(df))
print("Columns:", df.columns.tolist())
# The message names the exact (lowercase) column the check looks for.
assert 'review' in df.columns, "Dataset must have a 'review' column"
# Blank out missing reviews before casting: astype(str) alone would turn NaN
# into the literal word "nan", which would then be treated as review text.
df['review'] = df['review'].fillna('').astype(str)
df.head(5)


Rows: 46742
Columns: ['Unnamed: 0', 'recommendationid', 'language', 'review', 'Reaction']


Unnamed: 0.1,Unnamed: 0,recommendationid,language,review,Reaction
0,0,77057085,english,Is good. Do play.,0
1,1,77052689,english,AAAAAAAA,0
2,2,77049252,english,Fun game,1
3,3,77049089,english,"Great game, worth every penny!",0
4,4,35101272,english,Like,0


# **3. Perform text cleaning on the `review` column**

In [None]:
# Settings
# Spelling-correction switches read by process_review: USE_SPELLCHECKER is
# checked first, and the TextBlob path runs only when USE_SPELLCHECKER is False.
USE_SPELLCHECKER = True      # pyspellchecker (faster)
USE_TEXTBLOB_CORRECT = False # True = slower but sometimes better
# Language name passed to stopwords.words() below.
LANG = 'english'

# Helpers (module-level objects shared by the functions defined below)
# Stopword set consulted by remove_stopwords.
stop_words = set(stopwords.words(LANG))
# Spell checker used by correct_spelling_fast.
spell = SpellChecker()  # pyspellchecker
# Stemmer used by stem_tokens.
stemmer = PorterStemmer()
# Lemmatizer used by lemmatize_tokens.
lemmatizer = WordNetLemmatizer()

# map nltk POS tags to wordnet POS for lemmatizer
from nltk import pos_tag
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag string into the corresponding WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def basic_clean(text):
    """Lowercase `text` and reduce it to space-separated runs of a-z letters.

    Steps, in order: lowercase; replace every character that is neither a-z
    nor whitespace with a space; shrink any character repeated three or more
    times in a row down to two (e.g. "soooo" -> "soo"); collapse whitespace
    runs to a single space and trim both ends.
    """
    substitutions = (
        (r'[^a-z\s]', ' '),        # punctuation / digits / other symbols -> space
        (r'(.)\1{2,}', r'\1\1'),   # 3+ repeats of one character -> exactly 2
        (r'\s+', ' '),             # whitespace runs -> single space
    )
    result = text.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

# Corrections already computed, keyed by token. Shared across calls so a word
# that recurs in many reviews is only sent through the spell checker once.
# Re-create this dict if `spell` is replaced or its dictionary is modified.
_correction_cache = {}

def correct_spelling_fast(text):
    """Spell-correct `text` token by token with pyspellchecker.

    The text is split on whitespace. Tokens of one or two characters are kept
    verbatim; every longer token is replaced by `spell.correction(token)`, or
    left unchanged when that call returns None. The tokens are re-joined with
    single spaces.
    """
    corrected = []
    for token in text.split():
        # Very short tokens are kept as-is (only the length is checked).
        if len(token) <= 2:
            corrected.append(token)
            continue
        if token not in _correction_cache:
            cand = spell.correction(token)
            # Fall back to the original token when no correction is returned.
            _correction_cache[token] = cand if cand is not None else token
        corrected.append(_correction_cache[token])
    return ' '.join(corrected)

def correct_spelling_textblob(text):
    """Return `text` after TextBlob's `correct()` pass, converted back to a plain str.

    Described in this notebook as the slower alternative to correct_spelling_fast.
    """
    blob = TextBlob(text)
    fixed = blob.correct()
    return str(fixed)

def stem_tokens(tokens):
    """Return a new list holding `stemmer.stem(token)` for each token, in order."""
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

def lemmatize_tokens(tokens):
    """POS-tag `tokens`, then lemmatize each one using its mapped WordNet POS.

    Returns a new list of lemmas in the same order as the input tokens.
    """
    lemmas = []
    for word, tag in pos_tag(tokens):
        wn_pos = get_wordnet_pos(tag)
        lemmas.append(lemmatizer.lemmatize(word, wn_pos))
    return lemmas

def remove_stopwords(tokens):
    """Return a new list of the tokens that are not in `stop_words`, order preserved."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Full pipeline that returns: cleaned_raw, tokens, stemmed_tokens, lemmatized_tokens, final_text
def process_review(text):
    """Run one review through the full normalization pipeline.

    Stages: basic_clean -> optional spelling correction -> word_tokenize ->
    stemming and lemmatization (both computed from the same tokens) ->
    stopword removal on the lemmas.

    Returns a dict with keys "cleaned", "tokens", "stemmed_tokens",
    "lemmatized_tokens" and "final_text" (the stopword-free lemmas joined
    with spaces).
    """
    cleaned = basic_clean(text)

    # At most one corrector runs; the pyspellchecker path takes precedence.
    if USE_SPELLCHECKER:
        cleaned = correct_spelling_fast(cleaned)
    elif USE_TEXTBLOB_CORRECT:
        cleaned = correct_spelling_textblob(cleaned)

    tokens = word_tokenize(cleaned)
    result = {"cleaned": cleaned, "tokens": tokens}

    # Stems and lemmas are derived independently from the same token list.
    result["stemmed_tokens"] = stem_tokens(tokens)
    lemmas = lemmatize_tokens(tokens)
    result["lemmatized_tokens"] = lemmas

    # The final text is built from the lemmas, not the stems.
    result["final_text"] = ' '.join(remove_stopwords(lemmas))
    return result

# Randomly sample up to SAMPLE_SIZE rows (fixed seed) for faster, repeatable testing.
# min() keeps df.sample from raising when the dataset has fewer rows than SAMPLE_SIZE.
SAMPLE_SIZE = 5000
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).copy()

# Apply the pipeline to the sample exactly once (with a progress bar).
# Each element of `processed` is the dict returned by process_review.
tqdm.pandas()
processed = df_sample['review'].progress_apply(process_review)

# Expand results into columns: new column name -> key in the process_review dict.
column_to_key = {
    'cleaned': 'cleaned',
    'tokens': 'tokens',
    'stemmed_tokens': 'stemmed_tokens',
    'lemmatized_tokens': 'lemmatized_tokens',
    'final_cleaned_review': 'final_text',
}
for column, key in column_to_key.items():
    # key=key binds the current key so each lambda reads its own field.
    df_sample[column] = processed.apply(lambda result, key=key: result[key])

# Quick peek
df_sample[['review', 'final_cleaned_review']].head(8)


100%|██████████| 5000/5000 [17:31<00:00,  4.75it/s]
100%|██████████| 5000/5000 [16:48<00:00,  4.96it/s]


Unnamed: 0,review,final_cleaned_review
30583,This game has got to be one of the most memora...,game get one memorable game play truly withsto...
28537,The weirdest shit I've ever played and yet ins...,weird shit ever play yet insanely fun definite...
11192,This game + the original Binding of Isaac got ...,game original binding isaac get rouge like gam...
18237,Binding of Isaac: Cumtopia (REMASTERED),binding isaac utopia remastered
16235,great,great
35586,"this game causes the perfect amount of rage, b...",game cause perfect amount rage amount fun addi...
13558,"I love this game, there is some RNG and unfair...",love game ring unfairness never anything overw...
27057,"This is one of my most favorite games ever,wit...",one favorite game ever ton fun gamelan content


# **4. Extract text features using Bag-of-Words (BoW)**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer settings - tune these as needed
count_vect = CountVectorizer(
    max_features=None,    # None keeps the whole vocabulary; set an int (e.g. 5000) to keep only the most frequent tokens
    min_df=1,             # ignore tokens that appear in fewer than min_df docs
    ngram_range=(1,1)     # unigram; use (1,2) for bigrams too
)

# Fit on the cleaned text; `corpus` is a plain list of strings, one per sampled review.
corpus = df_sample['final_cleaned_review'].fillna('').tolist()
X_counts = count_vect.fit_transform(corpus)  # sparse matrix (n_docs x n_features)
print("BoW shape:", X_counts.shape)

# Convert to a dense DataFrame with feature names as columns, indexed like
# df_sample so each row keeps its original row id.
bow_df = pd.DataFrame(
    X_counts.toarray(),
    columns=count_vect.get_feature_names_out(),
    index=df_sample.index,
)
bow_df.head()

BoW shape: (5000, 5263)


Unnamed: 0,aa,ab,abandon,abby,abel,abilities,ability,able,abnormally,abomination,...,yuan,yum,yummy,yup,yuppie,za,ze,zelda,zero,zz
30583,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28537,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11192,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16235,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# **5. Save the BoW features to a CSV file**

In [None]:
# Write the BoW matrix on its own; the index column carries the original row id.
BOW_CSV = "bow_features.csv"
bow_df.to_csv(BOW_CSV, index=True)
print("Saved BoW features to", BOW_CSV)

# Second file: raw and cleaned review text side by side with the BoW columns.
# Both frames are re-indexed 0..n-1 so the column-wise concat pairs rows by position.
# NOTE(review): a vocabulary token named "review" would produce a duplicate
# column header in this merged file - confirm whether that matters downstream.
merged_bow = pd.concat(
    [
        df_sample[['review', 'final_cleaned_review']].reset_index(drop=True),
        bow_df.reset_index(drop=True),
    ],
    axis=1,
)
merged_bow.to_csv("bow_features_with_reviews.csv", index=False)
print("Saved merged file bow_features_with_reviews.csv")


Saved BoW features to bow_features.csv
Saved merged file bow_features_with_reviews.csv


# **6. Extract text features using TF-IDF**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer configured with the same vocabulary settings as the BoW step.
tfidf_vect = TfidfVectorizer(
    max_features=None,
    min_df=1,
    ngram_range=(1,1),
    smooth_idf=True,
    sublinear_tf=False
)

# `corpus` is the list of cleaned review strings built in the BoW cell.
X_tfidf = tfidf_vect.fit_transform(corpus)
print("TF-IDF shape:", X_tfidf.shape)

# Dense DataFrame with feature names as columns, indexed like df_sample.
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=tfidf_vect.get_feature_names_out(),
    index=df_sample.index,
)
tfidf_df.head()


TF-IDF shape: (5000, 5263)


Unnamed: 0,aa,ab,abandon,abby,abel,abilities,ability,able,abnormally,abomination,...,yuan,yum,yummy,yup,yuppie,za,ze,zelda,zero,zz
30583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **7. Save the TF-IDF features to a CSV file**

In [None]:
# Write the TF-IDF matrix on its own; the index column carries the original row id.
TFIDF_CSV = "tfidf_features.csv"
tfidf_df.to_csv(TFIDF_CSV, index=True)
print("Saved TF-IDF features to", TFIDF_CSV)

# Second file: raw and cleaned review text side by side with the TF-IDF columns.
# Both frames are re-indexed 0..n-1 so the column-wise concat pairs rows by position.
merged_tfidf = pd.concat(
    [
        df_sample[['review', 'final_cleaned_review']].reset_index(drop=True),
        tfidf_df.reset_index(drop=True),
    ],
    axis=1,
)
merged_tfidf.to_csv("tfidf_features_with_reviews.csv", index=False)
print("Saved merged file tfidf_features_with_reviews.csv")
