# **Perform tokenization, stopword removal, stemming, and lemmatization on a sample dataset. Compare how these preprocessing steps impact the quality of text representation.**

# Importing Libraries



In [None]:
!pip install nltk pandas



In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK data packages this notebook relies on: tokenizer data
# ('punkt_tab'), the stopword lists, and the WordNet corpus.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Loading the Dataset

In [None]:
# Read the Disneyland reviews CSV (latin1-encoded) and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="/content/DisneylandReviews.csv",
    encoding='latin1',
)
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


# Data Preprocessing

In [None]:
# Module-level NLTK helpers, built once and shared by every preprocess() call.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()
stop_words = {*stopwords.words("english")}

def preprocess(text):
    """Run one review through each preprocessing stage.

    Stemming and lemmatization are both applied to the stopword-filtered
    tokens, so they are parallel alternatives rather than chained steps.

    Args:
        text: Raw review text. Anything that is not a ``str`` (for example
            the float NaN pandas uses for an empty CSV cell) is treated as
            an empty document.

    Returns:
        A 4-tuple of token lists ``(tokens, filtered, stemmed, lemmatized)``:
        ``tokens`` is the ``word_tokenize`` output of the lower-cased text,
        ``filtered`` keeps alphabetic tokens that are not in ``stop_words``,
        ``stemmed`` holds the ``stemmer`` output for ``filtered``, and
        ``lemmatized`` holds the ``lemmatizer`` output for ``filtered``.
    """
    # Missing values have no .lower(); return empty stages instead of raising.
    if not isinstance(text, str):
        return [], [], [], []

    # 1. Tokenization
    tokens = word_tokenize(text.lower())

    # 2. Stopword removal + keep alphabetic words only (drops punctuation,
    #    numbers and clitics such as "'ve")
    filtered = [w for w in tokens if w.isalpha() and w not in stop_words]

    # 3. Stemming
    stemmed = [stemmer.stem(w) for w in filtered]

    # 4. Lemmatization (lemmatize() is called without a POS argument)
    lemmatized = [lemmatizer.lemmatize(w) for w in filtered]

    return tokens, filtered, stemmed, lemmatized


In [None]:
import nltk
# NOTE(review): presumably kept for NLTK releases that look up 'punkt'
# instead of 'punkt_tab' — confirm whether it is still needed.
nltk.download('punkt')

# Run preprocess() once per review and split the 4-tuple into one column per
# stage. Calling it separately for every column repeats all four stages each time.
stages = df["Review_Text"].apply(preprocess)
for position, column in enumerate(["tokens", "no_stopwords", "stemmed", "lemmatized"]):
    # Bind position as a default argument so each column selects its own stage.
    df[column] = stages.apply(lambda result, i=position: result[i])

df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,tokens,no_stopwords,stemmed,lemmatized
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,"[if, you, 've, ever, been, to, disneyland, any...","[ever, disneyland, anywhere, find, disneyland,...","[ever, disneyland, anywher, find, disneyland, ...","[ever, disneyland, anywhere, find, disneyland,..."
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,"[its, been, a, while, since, d, last, time, we...","[since, last, time, visit, hk, disneyland, yet...","[sinc, last, time, visit, hk, disneyland, yet,...","[since, last, time, visit, hk, disneyland, yet..."
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,"[thanks, god, it, wasn, t, too, hot, or, too, ...","[thanks, god, hot, humid, visiting, park, othe...","[thank, god, hot, humid, visit, park, otherwis...","[thanks, god, hot, humid, visiting, park, othe..."
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,"[hk, disneyland, is, a, great, compact, park, ...","[hk, disneyland, great, compact, park, unfortu...","[hk, disneyland, great, compact, park, unfortu...","[hk, disneyland, great, compact, park, unfortu..."
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,"[the, location, is, not, in, the, city, ,, too...","[location, city, took, around, hour, kowlon, k...","[locat, citi, took, around, hour, kowlon, kid,...","[location, city, took, around, hour, kowlon, k..."


# Comparing Representation Quality

In [None]:
def get_vocab_size(list_of_docs):
    """Count the distinct items across all documents.

    Args:
        list_of_docs: Iterable of documents, each itself an iterable of
            hashable items (e.g. a list of token strings).

    Returns:
        The number of unique items found over every document.
    """
    return len({token for doc in list_of_docs for token in doc})

# Make sure the preprocessing columns exist before measuring vocabulary size.
# They are rebuilt only when one is missing, and preprocess() then runs once
# per review instead of once per column.
stage_columns = ["tokens", "no_stopwords", "stemmed", "lemmatized"]
if not set(stage_columns).issubset(df.columns):
    stages = df["Review_Text"].apply(preprocess)
    for position, column in enumerate(stage_columns):
        # Bind position as a default argument so each column selects its own stage.
        df[column] = stages.apply(lambda result, i=position: result[i])

# Distinct-token count at each preprocessing stage.
results = {
    "Original Tokens": get_vocab_size(df["tokens"]),
    "After Stopword Removal": get_vocab_size(df["no_stopwords"]),
    "After Stemming": get_vocab_size(df["stemmed"]),
    "After Lemmatization": get_vocab_size(df["lemmatized"])
}

pd.DataFrame(results, index=["Vocabulary Size"])

Unnamed: 0,Original Tokens,After Stopword Removal,After Stemming,After Lemmatization
Vocabulary Size,66541,38817,26670,34982
