In [68]:
import pandas as pd
import re
import string
import nltk
from langdetect import detect  # Import the language detection module
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer



# Fetch every NLTK resource the cells below rely on, so the notebook also runs
# in a fresh environment: tokenizer models for word_tokenize, the stop-word
# lists for stopwords.words, and the WordNet data for WordNetLemmatizer.
# nltk.download() reports packages that are already present as up to date
# instead of fetching them again.
# NOTE(review): recent NLTK releases load 'punkt_tab' inside word_tokenize while
# older ones load 'punkt' -- both are requested here; confirm against the
# installed NLTK version and drop whichever is not needed.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to C:\Users\Izzham
[nltk_data]     Burhan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [69]:
# Load the per-topic CSV files into pandas DataFrames.
# Directory that holds the "Data-<topic>-state.csv" files, relative to the notebook.
STATE_DATA_DIR = '../data/state'


def load_state_csv(topic):
    """Read one per-topic state CSV into a DataFrame.

    Parameters
    ----------
    topic : str
        Topic part of the file name, e.g. ``'education'`` for
        ``Data-education-state.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``<STATE_DATA_DIR>/Data-<topic>-state.csv``.
    """
    path = '{}/Data-{}-state.csv'.format(STATE_DATA_DIR, topic)
    # NOTE(review): 'unicode_escape' is kept from the original code. It also
    # interprets backslash sequences in the file -- confirm the files' real
    # encoding (e.g. utf-8 or latin-1) before relying on non-ASCII text.
    return pd.read_csv(path, encoding='unicode_escape')


df_edu = load_state_csv('education')
df_pol = load_state_csv('politics')
df_ent = load_state_csv('entertainment')
df_foo = load_state_csv('food')
df_spo = load_state_csv('sports')
df_tec = load_state_csv('tech')

In [70]:
# Stack the six per-topic frames on top of each other; ignore_index=True
# replaces the per-file row labels with one continuous index.
topic_frames = [df_edu, df_pol, df_ent, df_foo, df_spo, df_tec]
df = pd.concat(topic_frames, ignore_index=True)


In [71]:
# Discard the 'location' and 'interest' columns; every other column is kept.
df = df.drop(columns=['location', 'interest'])

In [72]:
def clean_text(text):
    """Normalise one raw text value into lowercase, space-separated words.

    Steps, in order:
      1. remove every run of non-whitespace characters starting with ``http``
         (http/https links);
      2. remove every character that is not an ASCII letter, a digit or
         whitespace (punctuation, ``@``, ``_``, emoji, accented letters, ...);
      3. collapse runs of whitespace to a single space and strip both ends;
      4. lowercase the result.

    Parameters
    ----------
    text : str or any scalar
        The raw text. Missing values (``None``, ``NaN``, ``pd.NA``) yield an
        empty string; other non-string scalars are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    # read_csv turns empty cells into NaN (a float); re.sub would raise
    # TypeError on those, so handle non-string input up front.
    if not isinstance(text, str):
        if pd.isna(text):
            return ''
        text = str(text)

    # Remove URLs (http/https links)
    text = re.sub(r'http\S+', '', text)

    # Remove special characters and non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove extra spaces and leading/trailing spaces
    text = ' '.join(text.split())
    text = text.lower()

    return text

In [73]:
# Run clean_text over every value of the 'text' column and keep the result
# next to the raw text in a new 'cleaned_text' column.
df['cleaned_text'] = df['text'].map(clean_text)

In [74]:
# Display the DataFrame (raw 'text' next to 'cleaned_text') as this cell's output.
df

Unnamed: 0,text,cleaned_text
0,Going to a technology conference in Kuala Lumpur,going to a technology conference in kuala lumpur
1,Watching a football match in Johor Bahru,watching a football match in johor bahru
2,Trying a famous local food in Penang,trying a famous local food in penang
3,Attending a music concert in Kuala Lumpur,attending a music concert in kuala lumpur
4,Following the latest political news in KL,following the latest political news in kl
...,...,...
28835,Cybersecurity fellas???!,cybersecurity fellas
28836,@malaysiakini @handsome2288 Not practical in t...,malaysiakini handsome2288 not practical in the...
28837,This course teaches how to use the technology ...,this course teaches how to use the technology ...
28838,@Digital_Muscle Yep bro,digitalmuscle yep bro


In [75]:
# Tokenization: split each cleaned string into a list of word tokens with
# NLTK's word_tokenize and store the lists in a new 'tokens' column.
df['tokens'] = df['cleaned_text'].apply(word_tokenize)

In [76]:
# Preview only: show the frame without the raw 'text' column. The result is
# not assigned, so df itself still contains 'text'.
df.drop(columns=['text'])

Unnamed: 0,cleaned_text,tokens
0,going to a technology conference in kuala lumpur,"[going, to, a, technology, conference, in, kua..."
1,watching a football match in johor bahru,"[watching, a, football, match, in, johor, bahru]"
2,trying a famous local food in penang,"[trying, a, famous, local, food, in, penang]"
3,attending a music concert in kuala lumpur,"[attending, a, music, concert, in, kuala, lumpur]"
4,following the latest political news in kl,"[following, the, latest, political, news, in, kl]"
...,...,...
28835,cybersecurity fellas,"[cybersecurity, fellas]"
28836,malaysiakini handsome2288 not practical in the...,"[malaysiakini, handsome2288, not, practical, i..."
28837,this course teaches how to use the technology ...,"[this, course, teaches, how, to, use, the, tec..."
28838,digitalmuscle yep bro,"[digitalmuscle, yep, bro]"


In [77]:
# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def _drop_stopwords(tokens):
    """Return the tokens whose lowercase form is not in stop_words, in order."""
    return [word for word in tokens if word.lower() not in stop_words]


# Filter every token list and keep the result in a new 'tokens_stopwords' column.
df['tokens_stopwords'] = df['tokens'].apply(_drop_stopwords)

In [78]:
# Preview only: show the frame without 'text' and 'cleaned_text' so the
# 'tokens' and 'tokens_stopwords' columns can be compared. df is left unchanged.
df.drop(columns=['text', 'cleaned_text'])

Unnamed: 0,tokens,tokens_stopwords
0,"[going, to, a, technology, conference, in, kua...","[going, technology, conference, kuala, lumpur]"
1,"[watching, a, football, match, in, johor, bahru]","[watching, football, match, johor, bahru]"
2,"[trying, a, famous, local, food, in, penang]","[trying, famous, local, food, penang]"
3,"[attending, a, music, concert, in, kuala, lumpur]","[attending, music, concert, kuala, lumpur]"
4,"[following, the, latest, political, news, in, kl]","[following, latest, political, news, kl]"
...,...,...
28835,"[cybersecurity, fellas]","[cybersecurity, fellas]"
28836,"[malaysiakini, handsome2288, not, practical, i...","[malaysiakini, handsome2288, practical, era, d..."
28837,"[this, course, teaches, how, to, use, the, tec...","[course, teaches, use, technology, teaching, m..."
28838,"[digitalmuscle, yep, bro]","[digitalmuscle, yep, bro]"


In [79]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token, preserving order.

    lemmatize() is called with the word only, so NLTK's default
    part-of-speech setting applies to every token.
    """
    return list(map(lemmatizer.lemmatize, tokens))


# Lemmatize the stop-word-filtered tokens into a new 'lemmas' column.
df['lemmas'] = df['tokens_stopwords'].apply(_lemmatize_tokens)

In [82]:
# Preview only: hide 'text', 'cleaned_text' and 'tokens' so 'tokens_stopwords'
# and 'lemmas' can be compared side by side. df is left unchanged.
df.drop(columns=['text', 'cleaned_text', 'tokens'])

Unnamed: 0,tokens_stopwords,lemmas
0,"[going, technology, conference, kuala, lumpur]","[going, technology, conference, kuala, lumpur]"
1,"[watching, football, match, johor, bahru]","[watching, football, match, johor, bahru]"
2,"[trying, famous, local, food, penang]","[trying, famous, local, food, penang]"
3,"[attending, music, concert, kuala, lumpur]","[attending, music, concert, kuala, lumpur]"
4,"[following, latest, political, news, kl]","[following, latest, political, news, kl]"
...,...,...
28835,"[cybersecurity, fellas]","[cybersecurity, fella]"
28836,"[malaysiakini, handsome2288, practical, era, d...","[malaysiakini, handsome2288, practical, era, d..."
28837,"[course, teaches, use, technology, teaching, m...","[course, teach, use, technology, teaching, mea..."
28838,"[digitalmuscle, yep, bro]","[digitalmuscle, yep, bro]"
