In [60]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')  # Download the Punkt tokenizer

[nltk_data] Downloading package punkt to C:\Users\Izzham
[nltk_data]     Burhan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [61]:
# Read the labelled dataset and discard rows whose 'text' entry is missing
# (dropna only removes missing values; it does not filter by type).
df = pd.read_csv('../data/train/text_dataset_labeled.csv').dropna(subset=['text'])

# Plain-list copies of the three columns.
text, location, interest = (
    df[column].tolist() for column in ('text', 'location', 'interest')
)

# Display the filtered frame.
df

Unnamed: 0,text,location,interest
0,attempt hari tu act like smart deep tech entre...,Unknown,Technology
1,mosti cradle ni clueless whats going tech tal...,Unknown,Technology
2,hightech export country 942b hong kong 431b g...,Unknown,Technology
3,tbh many tech talent dont actually need focus...,Unknown,Technology
4,make 180000 tech consultant london grew extrem...,Unknown,Technology
...,...,...,...
27592,smartwatch one better sport ea garmin amazfit ...,Terengganu,Sports
27593,abearfromsea heaven football maracana,Terengganu,Sports
27594,king pele former youth sport minister khairykj...,Terengganu,Sports
27595,love football,Terengganu,Sports


# Education

### TF-IDF

In [62]:
# Restrict the corpus to the documents labelled 'Education'.
text_edu = df.loc[df["interest"] == 'Education', "text"]

# TF-IDF settings: English stop words removed, a term must occur in at least
# 2 documents and in at most 85% of them, vocabulary capped at 8000 features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.85,
    max_features=8000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_edu)

# Aggregate every term's TF-IDF weight over all documents (column sums).
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1

# One row per term, ordered from the highest to the lowest aggregated score.
tfidf_df = pd.DataFrame(
    {'word': feature_names, 'tfidf_score': tfidf_scores}
).sort_values('tfidf_score', ascending=False)

# Parallel lists of the ranked terms and their scores.
words_tfidf = list(tfidf_df['word'])
tfidf_scores = list(tfidf_df['tfidf_score'])

In [63]:
# Display the ranked TF-IDF table (the plain word list is available as words_tfidf).
tfidf_df

Unnamed: 0,word,tfidf_score
5760,school,192.673493
6294,student,159.530308
6948,university,120.084658
2074,education,114.526637
6518,teacher,95.592679
...,...,...
3430,jewish,0.339615
7023,uyghur,0.339615
752,bahai,0.339615
3681,kuasa,0.324386


### Word2Vec

In [64]:
# Tokenise every Education document into a list of word tokens.
tokenized_text_edu = list(map(nltk.word_tokenize, text_edu))

# Train a CBOW model (sg=0): 100-dimensional vectors, context window of 5,
# and min_count=1 so that no token is discarded for being rare.
model = Word2Vec(
    sentences=tokenized_text_edu,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)

# Persist the trained model, then read it back from the same location.
model.save(f"word2vec/word2vec_model/education")
model = Word2Vec.load(f"word2vec/word2vec_model/education")

# The 200 vocabulary entries closest to 'education', as (word, similarity) pairs.
similar_words = model.wv.most_similar('education', topn=200)

# Tabulate the neighbours for the later comparison step.
word2vec_df = pd.DataFrame(similar_words, columns=['word', 'word2vec_score'])

# Echo every neighbour with its similarity.
for word, score in similar_words:
    print(f"Word: {word}, Similarity Score: {score}")


Word: student, Similarity Score: 0.999872088432312
Word: curriculum, Similarity Score: 0.9998666644096375
Word: u, Similarity Score: 0.9998417496681213
Word: also, Similarity Score: 0.9998376965522766
Word: new, Similarity Score: 0.9998355507850647
Word: get, Similarity Score: 0.9998329281806946
Word: need, Similarity Score: 0.9998323321342468
Word: one, Similarity Score: 0.9998297691345215
Word: im, Similarity Score: 0.9998227953910828
Word: people, Similarity Score: 0.999820351600647
Word: work, Similarity Score: 0.9998168349266052
Word: study, Similarity Score: 0.99981290102005
Word: time, Similarity Score: 0.9998105764389038
Word: year, Similarity Score: 0.9998093247413635
Word: quota, Similarity Score: 0.999803900718689
Word: n, Similarity Score: 0.9998020529747009
Word: today, Similarity Score: 0.9998016953468323
Word: like, Similarity Score: 0.9997994303703308
Word: make, Similarity Score: 0.9997988939285278
Word: di, Similarity Score: 0.9997968673706055
Word: 1, Similarity Scor

In [65]:
# Display the Word2Vec similarity table for the Education corpus.
word2vec_df

Unnamed: 0,word,word2vec_score
0,student,0.999872
1,curriculum,0.999867
2,u,0.999842
3,also,0.999838
4,new,0.999836
...,...,...
195,scholarship,0.999436
196,asia,0.999436
197,actually,0.999423
198,higher,0.999422


#### Comparison

In [66]:
# Relative weights used when blending the TF-IDF and Word2Vec scores.
tfidf_weight, word2vec_weight = 0.6, 0.4

# Candidate vocabulary: every TF-IDF term followed by the Word2Vec neighbours
# (a term present in both tables therefore appears twice).
combined_words = [*tfidf_df['word'], *word2vec_df['word']]
combined_words

['school',
 'student',
 'university',
 'education',
 'teacher',
 'year',
 'learning',
 'curriculum',
 'day',
 'online',
 'like',
 '2023',
 'im',
 'time',
 'high',
 'international',
 'new',
 'kid',
 'study',
 'need',
 'dont',
 'good',
 'world',
 'make',
 'old',
 'country',
 'life',
 'work',
 'today',
 'research',
 'got',
 'people',
 'course',
 'child',
 'holiday',
 'know',
 'free',
 'quota',
 'friend',
 'week',
 'best',
 'science',
 'want',
 'phd',
 'college',
 'way',
 'love',
 'said',
 'program',
 'yang',
 'degree',
 'tak',
 'opportunity',
 'di',
 'class',
 'scholarship',
 'primary',
 'graduate',
 'join',
 'home',
 'read',
 'state',
 'application',
 'help',
 'english',
 'say',
 'better',
 'parent',
 'learn',
 'prize',
 'minister',
 'job',
 'national',
 'going',
 'great',
 'public',
 'let',
 'apply',
 'really',
 'think',
 'team',
 'future',
 'look',
 'right',
 'thing',
 'uk',
 'change',
 'universiti',
 'project',
 'event',
 'thank',
 'guy',
 'nak',
 'video',
 'teaching',
 'dan',
 'ni',


In [67]:
# Blend the TF-IDF score and the Word2Vec similarity of every candidate word.
#
# Both tables are turned into dictionaries once, so each word costs two
# constant-time lookups instead of two full DataFrame scans per iteration.
# Words are expected to be unique within each table (vocabulary entries), so
# the lookup returns the same value the first-match scan did.
#
# NOTE(review): tfidf_score is a sum over documents (values far above 1 in the
# displayed output) while Word2Vec similarities are at most 1, so the weighted
# sum is dominated by TF-IDF. Consider normalising if a balanced blend is intended.
tfidf_lookup = dict(zip(tfidf_df['word'], tfidf_df['tfidf_score']))
word2vec_lookup = dict(zip(word2vec_df['word'], word2vec_df['word2vec_score']))

combined_scores = {}

for word in combined_words:
    # A word missing from either table contributes 0.0 for that component.
    tfidf_score = tfidf_lookup.get(word, 0.0)
    word2vec_similarity = word2vec_lookup.get(word, 0.0)

    # Weighted blend of the two components.
    combined_score = (tfidf_weight * tfidf_score) + (word2vec_weight * word2vec_similarity)

    # Keyed by word; a repeated word simply overwrites its own (identical) score.
    combined_scores[word] = combined_score

In [68]:
# Display the word -> combined score mapping for the Education corpus.
combined_scores

{'school': 116.00400998423316,
 'student': 96.11813334206742,
 'university': 72.45070844446712,
 'education': 68.71598216145863,
 'teacher': 57.75551914448821,
 'year': 31.292362954810937,
 'learning': 30.397753804756352,
 'curriculum': 28.573020526829794,
 'day': 27.682140346653934,
 'online': 27.25323656968427,
 'like': 25.95806908349412,
 '2023': 25.926802800864944,
 'im': 25.46561237837841,
 'time': 25.142718171764596,
 'high': 25.008410582797758,
 'international': 22.4222158121062,
 'new': 21.171648407689332,
 'kid': 20.718768165455806,
 'study': 20.53717933552658,
 'need': 20.185791379125988,
 'dont': 19.94266987491288,
 'good': 17.45492970562201,
 'world': 17.331752710879883,
 'make': 16.9907387591548,
 'old': 16.71864896176268,
 'country': 16.639967027845607,
 'life': 16.5450312945571,
 'work': 16.524534610251045,
 'today': 16.02370727451083,
 'research': 16.00831874740898,
 'got': 15.901419593391608,
 'people': 15.8824942605305,
 'course': 15.797845864100008,
 'child': 15.7020

In [69]:
# Rank the (word, combined score) pairs from highest to lowest score.
sorted_words = list(combined_scores.items())
sorted_words.sort(key=lambda pair: pair[1], reverse=True)

# Keep the 70 best-scoring words.
top_words = [pair[0] for pair in sorted_words[:70]]
top_words

['school',
 'student',
 'university',
 'education',
 'teacher',
 'year',
 'learning',
 'curriculum',
 'day',
 'online',
 'like',
 '2023',
 'im',
 'time',
 'high',
 'international',
 'new',
 'kid',
 'study',
 'need',
 'dont',
 'good',
 'world',
 'make',
 'old',
 'country',
 'life',
 'work',
 'today',
 'research',
 'got',
 'people',
 'course',
 'child',
 'know',
 'free',
 'holiday',
 'quota',
 'friend',
 'week',
 'best',
 'science',
 'want',
 'college',
 'way',
 'love',
 'phd',
 'program',
 'yang',
 'degree',
 'tak',
 'said',
 'opportunity',
 'di',
 'class',
 'scholarship',
 'primary',
 'graduate',
 'join',
 'read',
 'state',
 'help',
 'english',
 'say',
 'better',
 'home',
 'parent',
 'learn',
 'application',
 'minister']

In [70]:
# Destination file for the selected keywords, written one word per line.
# NOTE(review): the path says "_50" but top_words is taken from a [:70] slice;
# confirm which count is intended.
output_file_path = "comparison_50/word_education_50.txt"

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(output_file_path, 'w') as file:
    for word in top_words:
        file.write(word + '\n')

print("Words saved to", output_file_path)

Words saved to comparison_50/word_education_50.txt


-------------------------------------------------

# Sports

### TF-IDF

In [71]:
# Restrict the corpus to the documents labelled 'Sports'.
text_spo = df.loc[df["interest"] == 'Sports', "text"]

# TF-IDF settings: English stop words removed, a term must occur in at least
# 2 documents and in at most 85% of them, vocabulary capped at 8000 features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.85,
    max_features=8000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_spo)

# Aggregate every term's TF-IDF weight over all documents (column sums).
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1

# One row per term, ordered from the highest to the lowest aggregated score.
tfidf_df = pd.DataFrame(
    {'word': feature_names, 'tfidf_score': tfidf_scores}
).sort_values('tfidf_score', ascending=False)

# Parallel lists of the ranked terms and their scores.
words_tfidf = list(tfidf_df['word'])
tfidf_scores = list(tfidf_df['tfidf_score'])

In [72]:
# Number of terms in the Sports TF-IDF table (the word list itself is words_tfidf).
len(tfidf_df)

6605

### Word2Vec

In [73]:
# Tokenise every Sports document into a list of word tokens.
tokenized_text_spo = list(map(nltk.word_tokenize, text_spo))

# Train a CBOW model (sg=0): 100-dimensional vectors, context window of 5,
# and min_count=1 so that no token is discarded for being rare.
model = Word2Vec(
    sentences=tokenized_text_spo,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)

# Persist the trained model, then read it back from the same location.
model.save(f"word2vec/word2vec_model/sports")
model = Word2Vec.load(f"word2vec/word2vec_model/sports")

# The 2000 vocabulary entries closest to 'sports', as (word, similarity) pairs.
similar_words = model.wv.most_similar('sports', topn=2000)

# Tabulate the neighbours for the later comparison step.
word2vec_df = pd.DataFrame(similar_words, columns=['word', 'word2vec_score'])

# Echo every neighbour with its similarity.
for word, score in similar_words:
    print(f"Word: {word}, Similarity Score: {score}")

Word: slovenia, Similarity Score: 0.7817763686180115
Word: hail, Similarity Score: 0.780600368976593
Word: now, Similarity Score: 0.7739757299423218
Word: packed, Similarity Score: 0.7702217698097229
Word: tag, Similarity Score: 0.769760251045227
Word: deserved, Similarity Score: 0.7696970105171204
Word: politics, Similarity Score: 0.7693976759910583
Word: kissing, Similarity Score: 0.7678263783454895
Word: turned, Similarity Score: 0.7672895789146423
Word: eg, Similarity Score: 0.7671666741371155
Word: nz, Similarity Score: 0.766899049282074
Word: offered, Similarity Score: 0.7658381462097168
Word: cheap, Similarity Score: 0.7654291391372681
Word: rm7917, Similarity Score: 0.7652967572212219
Word: cousin, Similarity Score: 0.765125572681427
Word: anyway, Similarity Score: 0.7650758624076843
Word: register, Similarity Score: 0.7649638056755066
Word: pmmyt, Similarity Score: 0.7648493051528931
Word: gopro, Similarity Score: 0.7644664645195007
Word: luxury, Similarity Score: 0.7644196152

In [74]:
# Display the Word2Vec similarity table for the Sports corpus (row count: len(word2vec_df)).
word2vec_df

Unnamed: 0,word,word2vec_score
0,slovenia,0.781776
1,hail,0.780600
2,now,0.773976
3,packed,0.770222
4,tag,0.769760
...,...,...
1995,traffic,0.739643
1996,sold,0.739641
1997,loses,0.739617
1998,boxing,0.739604


#### Comparison

In [75]:
# Candidate vocabulary: every TF-IDF term followed by the Word2Vec neighbours
# (a term present in both tables therefore appears twice).
combined_words = [*tfidf_df['word'], *word2vec_df['word']]
len(combined_words)

8605

In [76]:
# Blend the TF-IDF score and the Word2Vec similarity of every candidate word.
# tfidf_weight and word2vec_weight are not set in this cell; they must already
# exist in the notebook namespace.
#
# Both tables are turned into dictionaries once, so each word costs two
# constant-time lookups instead of two full DataFrame scans per iteration.
# Words are expected to be unique within each table (vocabulary entries), so
# the lookup returns the same value the first-match scan did.
#
# NOTE(review): tfidf_score is a sum over documents (values far above 1 in the
# displayed output) while Word2Vec similarities are at most 1, so the weighted
# sum is dominated by TF-IDF. Consider normalising if a balanced blend is intended.
tfidf_lookup = dict(zip(tfidf_df['word'], tfidf_df['tfidf_score']))
word2vec_lookup = dict(zip(word2vec_df['word'], word2vec_df['word2vec_score']))

combined_scores = {}

for word in combined_words:
    # A word missing from either table contributes 0.0 for that component.
    tfidf_score = tfidf_lookup.get(word, 0.0)
    word2vec_similarity = word2vec_lookup.get(word, 0.0)

    # Weighted blend of the two components.
    combined_score = (tfidf_weight * tfidf_score) + (word2vec_weight * word2vec_similarity)

    # Keyed by word; a repeated word simply overwrites its own (identical) score.
    combined_scores[word] = combined_score

In [77]:
# Display the word -> combined score mapping for the Sports corpus.
combined_scores

{'football': 125.56616608245227,
 'badminton': 93.22156733752377,
 'hiking': 79.90585991011119,
 'sport': 65.37082192215118,
 'cycling': 54.36860385120404,
 'world': 48.30401124792217,
 'jogging': 41.85631320892057,
 'team': 40.1639009510491,
 'demi': 35.64159626560725,
 'time': 34.541526998388626,
 'day': 33.49739460669072,
 'footballtweet': 30.14807655512172,
 'game': 29.999535958086334,
 'year': 27.494749334496703,
 'like': 27.112264056686193,
 'player': 26.53904487634405,
 'club': 25.412490792376133,
 'result': 24.630351842508748,
 'love': 24.162526214505693,
 'final': 24.01200618239161,
 'best': 23.445213457601973,
 'good': 23.32962155666918,
 'fan': 23.24738298624871,
 'dont': 22.832950832758222,
 'new': 22.253567398120364,
 'play': 21.704649044232223,
 'chionship': 20.75731638761515,
 'ma': 20.6843477492413,
 'twtjogging': 20.140874643281272,
 'takraw': 20.081830871422493,
 'im': 19.787316473051042,
 'men': 19.71322728532648,
 'def': 19.50566781865676,
 'today': 19.4917938311700

In [78]:
# Rank the (word, combined score) pairs from highest to lowest score.
sorted_words = list(combined_scores.items())
sorted_words.sort(key=lambda pair: pair[1], reverse=True)

# Keep the 70 best-scoring words.
top_words = [pair[0] for pair in sorted_words[:70]]
top_words

['football',
 'badminton',
 'hiking',
 'sport',
 'cycling',
 'world',
 'jogging',
 'team',
 'demi',
 'time',
 'day',
 'footballtweet',
 'game',
 'year',
 'like',
 'player',
 'club',
 'result',
 'love',
 'final',
 'best',
 'good',
 'fan',
 'dont',
 'new',
 'play',
 'chionship',
 'ma',
 'twtjogging',
 'takraw',
 'im',
 'men',
 'def',
 'today',
 'win',
 'athlete',
 '2023',
 'sepak',
 'know',
 'chion',
 'national',
 'need',
 'woman',
 'playing',
 'watch',
 'cup',
 'medal',
 'badmintonphoto',
 'match',
 'nature',
 'week',
 'people',
 'event',
 'penang',
 'lee',
 'thailand',
 'think',
 'gemilangkanlagi',
 'man',
 'come',
 'league',
 'make',
 'thank',
 'track',
 'coach',
 'thing',
 'open',
 'country',
 'friend',
 'court']

In [79]:
# Destination file for the selected keywords, written one word per line.
# NOTE(review): the path says "_50" but top_words is taken from a [:70] slice;
# confirm which count is intended.
output_file_path = "comparison_50/word_sports_50.txt"

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(output_file_path, 'w') as file:
    for word in top_words:
        file.write(word + '\n')

print("Words saved to", output_file_path)

Words saved to comparison_50/word_sports_50.txt


# Politics

### TF-IDF

In [80]:
# Restrict the corpus to the documents labelled 'Politics'.
text_pol = df.loc[df["interest"] == 'Politics', "text"]

# TF-IDF settings: English stop words removed, a term must occur in at least
# 2 documents and in at most 85% of them, vocabulary capped at 8000 features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.85,
    max_features=8000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_pol)

# Aggregate every term's TF-IDF weight over all documents (column sums).
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1

# One row per term, ordered from the highest to the lowest aggregated score.
tfidf_df = pd.DataFrame(
    {'word': feature_names, 'tfidf_score': tfidf_scores}
).sort_values('tfidf_score', ascending=False)

# Parallel lists of the ranked terms and their scores.
words_tfidf = list(tfidf_df['word'])
tfidf_scores = list(tfidf_df['tfidf_score'])

In [81]:
# Display the ranked TF-IDF table (the plain word list is available as words_tfidf).
tfidf_df

Unnamed: 0,word,tfidf_score
2307,government,130.098232
4270,politics,97.830979
4269,politician,90.272994
4022,parliament,67.119402
1781,election,64.014159
...,...,...
5636,teka,0.335839
4545,rakus,0.335839
5663,tertua,0.335839
5762,toksah,0.335839


### Word2Vec

In [82]:
# Tokenise every Politics document into a list of word tokens.
tokenized_text_pol = list(map(nltk.word_tokenize, text_pol))

# Train a CBOW model (sg=0): 100-dimensional vectors, context window of 5,
# and min_count=1 so that no token is discarded for being rare.
model = Word2Vec(
    sentences=tokenized_text_pol,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)

# Persist the trained model, then read it back from the same location.
model.save(f"word2vec/word2vec_model/politics")
model = Word2Vec.load(f"word2vec/word2vec_model/politics")

# The 50 vocabulary entries closest to 'politics', as (word, similarity) pairs.
similar_words = model.wv.most_similar('politics', topn=50)

# Tabulate the neighbours for the later comparison step.
word2vec_df = pd.DataFrame(similar_words, columns=['word', 'word2vec_score'])

# Echo every neighbour with its similarity.
for word, score in similar_words:
    print(f"Word: {word}, Similarity Score: {score}")

Word: government, Similarity Score: 0.9998753070831299
Word: politician, Similarity Score: 0.9998558759689331
Word: country, Similarity Score: 0.9998437762260437
Word: like, Similarity Score: 0.9998337626457214
Word: n, Similarity Score: 0.9998278021812439
Word: u, Similarity Score: 0.9998217225074768
Word: also, Similarity Score: 0.9997979402542114
Word: parliament, Similarity Score: 0.999794602394104
Word: time, Similarity Score: 0.9997850060462952
Word: people, Similarity Score: 0.9997835159301758
Word: one, Similarity Score: 0.9997792840003967
Word: need, Similarity Score: 0.9997355937957764
Word: many, Similarity Score: 0.999717652797699
Word: governance, Similarity Score: 0.9997022747993469
Word: issue, Similarity Score: 0.9996989369392395
Word: new, Similarity Score: 0.9996934533119202
Word: dont, Similarity Score: 0.9996836185455322
Word: dan, Similarity Score: 0.9996824860572815
Word: day, Similarity Score: 0.999679684638977
Word: would, Similarity Score: 0.9996772408485413
Wo

In [83]:
# Display the Word2Vec similarity table for the Politics corpus.
word2vec_df

Unnamed: 0,word,word2vec_score
0,government,0.999875
1,politician,0.999856
2,country,0.999844
3,like,0.999834
4,n,0.999828
5,u,0.999822
6,also,0.999798
7,parliament,0.999795
8,time,0.999785
9,people,0.999784


#### Comparison

In [84]:
# Relative weights used when blending the TF-IDF and Word2Vec scores.
tfidf_weight, word2vec_weight = 0.6, 0.4

# Candidate vocabulary: every TF-IDF term followed by the Word2Vec neighbours
# (a term present in both tables therefore appears twice).
combined_words = [*tfidf_df['word'], *word2vec_df['word']]
combined_words

['government',
 'politics',
 'politician',
 'parliament',
 'election',
 'state',
 'country',
 'like',
 'people',
 'governance',
 'dont',
 'need',
 'said',
 'say',
 'minister',
 'time',
 'party',
 'ni',
 'good',
 'right',
 'political',
 'day',
 'new',
 'know',
 'year',
 'anwar',
 'rakyat',
 'issue',
 'pm',
 'think',
 'today',
 'energy',
 'work',
 'want',
 'vote',
 'national',
 'pa',
 'make',
 'change',
 'tak',
 'support',
 'prime',
 'better',
 'world',
 'law',
 'power',
 'yg',
 'yang',
 '2023',
 'american',
 'anwaribrahim',
 'thing',
 'come',
 'ph',
 'mp',
 'leader',
 'member',
 'ibrahim',
 'life',
 'nak',
 'malay',
 'pn',
 'let',
 'im',
 'public',
 'unity',
 'local',
 'race',
 'religion',
 'plan',
 'really',
 'news',
 'going',
 'money',
 'dan',
 'stillgray',
 'policy',
 'way',
 'chinese',
 'current',
 'mean',
 'action',
 'dia',
 'upcoming',
 'look',
 'selangor',
 'help',
 'transition',
 'end',
 'semua',
 'umno',
 'je',
 'hope',
 'indonesia',
 'dr',
 'ada',
 'medium',
 'thats',
 'corrup

In [85]:
# Blend the TF-IDF score and the Word2Vec similarity of every candidate word.
#
# Both tables are turned into dictionaries once, so each word costs two
# constant-time lookups instead of two full DataFrame scans per iteration.
# Words are expected to be unique within each table (vocabulary entries), so
# the lookup returns the same value the first-match scan did.
#
# NOTE(review): tfidf_score is a sum over documents (values far above 1 in the
# displayed output) while Word2Vec similarities are at most 1, so the weighted
# sum is dominated by TF-IDF. Consider normalising if a balanced blend is intended.
tfidf_lookup = dict(zip(tfidf_df['word'], tfidf_df['tfidf_score']))
word2vec_lookup = dict(zip(word2vec_df['word'], word2vec_df['word2vec_score']))

combined_scores = {}

for word in combined_words:
    # A word missing from either table contributes 0.0 for that component.
    tfidf_score = tfidf_lookup.get(word, 0.0)
    word2vec_similarity = word2vec_lookup.get(word, 0.0)

    # Weighted blend of the two components.
    combined_score = (tfidf_weight * tfidf_score) + (word2vec_weight * word2vec_similarity)

    # Keyed by word; a repeated word simply overwrites its own (identical) score.
    combined_scores[word] = combined_score

# Display the word -> combined score mapping.
combined_scores

{'government': 78.458889566569,
 'politics': 58.69858723954094,
 'politician': 54.56373891679606,
 'parliament': 40.67155914769631,
 'election': 38.40849511642457,
 'state': 33.07213195502215,
 'country': 26.111512624303476,
 'like': 22.85682579216477,
 'people': 22.610580277286832,
 'governance': 21.753445020071837,
 'dont': 19.864536766237737,
 'need': 18.77881052168117,
 'said': 18.645345429917306,
 'say': 18.627853198293415,
 'minister': 18.148426315822665,
 'time': 18.23258425115572,
 'party': 17.99331838606234,
 'ni': 16.72731227843162,
 'good': 16.293832732596595,
 'right': 16.551801266561213,
 'political': 16.04399723257078,
 'day': 15.564345900039719,
 'new': 15.440995180761501,
 'know': 15.152279112632634,
 'year': 14.803416982990386,
 'anwar': 14.000525298731167,
 'rakyat': 14.104133111987434,
 'issue': 13.975241366858006,
 'pm': 13.603605118844467,
 'think': 13.408754072907753,
 'today': 12.823015713902027,
 'energy': 12.324614343738999,
 'work': 11.546928908625052,
 'want'

In [86]:
# Rank the (word, combined score) pairs from highest to lowest score.
sorted_words = list(combined_scores.items())
sorted_words.sort(key=lambda pair: pair[1], reverse=True)

# Keep the 70 best-scoring words.
top_words = [pair[0] for pair in sorted_words[:70]]
top_words

['government',
 'politics',
 'politician',
 'parliament',
 'election',
 'state',
 'country',
 'like',
 'people',
 'governance',
 'dont',
 'need',
 'said',
 'say',
 'time',
 'minister',
 'party',
 'ni',
 'right',
 'good',
 'political',
 'day',
 'new',
 'know',
 'year',
 'rakyat',
 'anwar',
 'issue',
 'pm',
 'think',
 'today',
 'energy',
 'want',
 'make',
 'work',
 'vote',
 'national',
 'pa',
 'tak',
 'support',
 'change',
 'prime',
 'power',
 'yg',
 'yang',
 'better',
 'world',
 'law',
 'ph',
 '2023',
 'mp',
 'american',
 'anwaribrahim',
 'thing',
 'come',
 'leader',
 'nak',
 'member',
 'ibrahim',
 'life',
 'malay',
 'pn',
 'let',
 'im',
 'public',
 'unity',
 'local',
 'race',
 'religion',
 'dan']

In [87]:
# Destination file for the selected keywords, written one word per line.
# NOTE(review): the path says "_50" but top_words is taken from a [:70] slice;
# confirm which count is intended.
output_file_path = "comparison_50/word_politics_50.txt"

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(output_file_path, 'w') as file:
    for word in top_words:
        file.write(word + '\n')

print("Words saved to", output_file_path)

Words saved to comparison_50/word_politics_50.txt


# Technology

### TF-IDF

In [88]:
# Restrict the corpus to the documents labelled 'Technology'.
text_tec = df.loc[df["interest"] == 'Technology', "text"]

# TF-IDF settings: English stop words removed, a term must occur in at least
# 2 documents and in at most 85% of them, vocabulary capped at 8000 features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.85,
    max_features=8000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_tec)

# Aggregate every term's TF-IDF weight over all documents (column sums).
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1

# One row per term, ordered from the highest to the lowest aggregated score.
tfidf_df = pd.DataFrame(
    {'word': feature_names, 'tfidf_score': tfidf_scores}
).sort_values('tfidf_score', ascending=False)

# Parallel lists of the ranked terms and their scores.
words_tfidf = list(tfidf_df['word'])
tfidf_scores = list(tfidf_df['tfidf_score'])

### Word2Vec

In [89]:
# Tokenise every Technology document into a list of word tokens.
tokenized_text_tec = list(map(nltk.word_tokenize, text_tec))

# Train a CBOW model (sg=0): 100-dimensional vectors, context window of 5,
# and min_count=1 so that no token is discarded for being rare.
model = Word2Vec(
    sentences=tokenized_text_tec,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)

# Persist the trained model, then read it back from the same location.
model.save(f"word2vec/word2vec_model/technology")
model = Word2Vec.load(f"word2vec/word2vec_model/technology")

# The 2000 vocabulary entries closest to 'technology', as (word, similarity) pairs.
similar_words = model.wv.most_similar('technology', topn=2000)

# Tabulate the neighbours for the later comparison step.
word2vec_df = pd.DataFrame(similar_words, columns=['word', 'word2vec_score'])

# Echo every neighbour with its similarity.
for word, score in similar_words:
    print(f"Word: {word}, Similarity Score: {score}")

Word: first, Similarity Score: 0.9998703002929688
Word: tech, Similarity Score: 0.9998647570610046
Word: new, Similarity Score: 0.9998621344566345
Word: also, Similarity Score: 0.9998592734336853
Word: world, Similarity Score: 0.9998573660850525
Word: one, Similarity Score: 0.9998571872711182
Word: like, Similarity Score: 0.9998548626899719
Word: u, Similarity Score: 0.9998469352722168
Word: n, Similarity Score: 0.999843418598175
Word: global, Similarity Score: 0.9998372793197632
Word: software, Similarity Score: 0.9998306035995483
Word: cybersecurity, Similarity Score: 0.9998250603675842
Word: ai, Similarity Score: 0.999822199344635
Word: business, Similarity Score: 0.9998213052749634
Word: system, Similarity Score: 0.9998210072517395
Word: time, Similarity Score: 0.9998197555541992
Word: country, Similarity Score: 0.9998185038566589
Word: iot, Similarity Score: 0.9998176097869873
Word: 5g, Similarity Score: 0.9998169541358948
Word: team, Similarity Score: 0.9998154640197754
Word: uni

#### Comparison

In [90]:
# Candidate vocabulary: every TF-IDF term followed by the Word2Vec neighbours
# (a term present in both tables therefore appears twice).
combined_words = [*tfidf_df['word'], *word2vec_df['word']]
len(combined_words)

8796

In [91]:
# Blend the TF-IDF score and the Word2Vec similarity of every candidate word.
# tfidf_weight and word2vec_weight are not set in this cell; they must already
# exist in the notebook namespace.
#
# Both tables are turned into dictionaries once, so each word costs two
# constant-time lookups instead of two full DataFrame scans per iteration.
# Words are expected to be unique within each table (vocabulary entries), so
# the lookup returns the same value the first-match scan did.
#
# NOTE(review): tfidf_score is a sum over documents (values far above 1 in the
# displayed output) while Word2Vec similarities are at most 1, so the weighted
# sum is dominated by TF-IDF. Consider normalising if a balanced blend is intended.
tfidf_lookup = dict(zip(tfidf_df['word'], tfidf_df['tfidf_score']))
word2vec_lookup = dict(zip(word2vec_df['word'], word2vec_df['word2vec_score']))

combined_scores = {}

for word in combined_words:
    # A word missing from either table contributes 0.0 for that component.
    tfidf_score = tfidf_lookup.get(word, 0.0)
    word2vec_similarity = word2vec_lookup.get(word, 0.0)

    # Weighted blend of the two components.
    combined_score = (tfidf_weight * tfidf_score) + (word2vec_weight * word2vec_similarity)

    # Keyed by word; a repeated word simply overwrites its own (identical) score.
    combined_scores[word] = combined_score

In [92]:
# Display the word -> combined score mapping for the Technology corpus.
combined_scores

{'digital': 93.55883197180103,
 'technology': 75.88850955168283,
 'cybersecurity': 42.9583770160402,
 '5g': 42.24919639052953,
 'software': 39.528955687342844,
 'tech': 32.077254439400356,
 'new': 30.222651918356068,
 'security': 26.126652976361214,
 '2023': 25.168019763427083,
 'cyber': 22.831193099888228,
 'year': 22.649606499099626,
 'minister': 22.028671723238105,
 'day': 21.474011633763794,
 'world': 21.423746984076796,
 'innovation': 20.49948788596028,
 'like': 20.308824697796435,
 'art': 20.271065240248742,
 'economy': 20.13740349530091,
 'data': 20.027034306458514,
 'country': 19.851192280021763,
 'ai': 19.464183595944284,
 'development': 18.414396041866166,
 'company': 18.34453717783466,
 'science': 18.074463982553457,
 'use': 17.901937858179256,
 'good': 17.466126476527638,
 'coding': 16.821227615064647,
 'future': 16.786415146529418,
 'communication': 16.64482341746911,
 'work': 16.543520466390547,
 'industry': 16.538422655629997,
 'global': 16.323211952387016,
 'university'

In [93]:
# Rank the (word, combined score) pairs from highest to lowest score.
sorted_words = list(combined_scores.items())
sorted_words.sort(key=lambda pair: pair[1], reverse=True)

# Keep the 70 best-scoring words.
top_words = [pair[0] for pair in sorted_words[:70]]
top_words

['digital',
 'technology',
 'cybersecurity',
 '5g',
 'software',
 'tech',
 'new',
 'security',
 '2023',
 'cyber',
 'year',
 'minister',
 'day',
 'world',
 'innovation',
 'like',
 'art',
 'economy',
 'data',
 'country',
 'ai',
 'development',
 'company',
 'science',
 'use',
 'good',
 'coding',
 'future',
 'communication',
 'work',
 'industry',
 'global',
 'university',
 'programming',
 'iot',
 'learn',
 'network',
 'blockchain',
 'time',
 'make',
 'need',
 'dont',
 'business',
 'im',
 'read',
 'government',
 'best',
 'people',
 'experience',
 'engineer',
 'team',
 'opportunity',
 'know',
 'said',
 'great',
 'penang',
 'service',
 'today',
 'research',
 'using',
 '2022',
 'say',
 'bank',
 'sarawak',
 'currency',
 'intelligence',
 'asia',
 'student',
 'project',
 'story']

In [94]:
# Destination file for the selected keywords, written one word per line.
# NOTE(review): the path says "_50" but top_words is taken from a [:70] slice;
# confirm which count is intended.
output_file_path = "comparison_50/word_technology_50.txt"

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(output_file_path, 'w') as file:
    for word in top_words:
        file.write(word + '\n')

print("Words saved to", output_file_path)

Words saved to comparison_50/word_technology_50.txt


# FoodnDrink

### TF-IDF

In [95]:
# Restrict the corpus to the documents labelled 'FoodnDrink'.
text_foo = df.loc[df["interest"] == 'FoodnDrink', "text"]

# TF-IDF settings: English stop words removed, a term must occur in at least
# 2 documents and in at most 85% of them, vocabulary capped at 8000 features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=2,
    max_df=0.85,
    max_features=8000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_foo)

# Aggregate every term's TF-IDF weight over all documents (column sums).
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1

# One row per term, ordered from the highest to the lowest aggregated score.
tfidf_df = pd.DataFrame(
    {'word': feature_names, 'tfidf_score': tfidf_scores}
).sort_values('tfidf_score', ascending=False)

# Parallel lists of the ranked terms and their scores.
words_tfidf = list(tfidf_df['word'])
tfidf_scores = list(tfidf_df['tfidf_score'])

In [96]:
# Display the ranked TF-IDF table (the plain word list is available as words_tfidf).
tfidf_df

Unnamed: 0,word,tfidf_score
1899,food,232.236842
4253,restaurant,95.005595
2960,like,59.125307
2079,good,54.743759
1516,drink,50.349275
...,...,...
3809,physio,0.311794
40,16th,0.311794
800,burial,0.311794
13,10th,0.311794


### Word2Vec

In [97]:
# Tokenise every FoodnDrink document into a list of word tokens.
tokenized_text_foo = list(map(nltk.word_tokenize, text_foo))

# Train a CBOW model (sg=0): 100-dimensional vectors, context window of 5,
# and min_count=1 so that no token is discarded for being rare.
model = Word2Vec(
    sentences=tokenized_text_foo,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)

# Persist the trained model, then read it back from the same location.
model.save(f"word2vec/word2vec_model/food")
model = Word2Vec.load(f"word2vec/word2vec_model/food")

# The 50 vocabulary entries closest to 'food', as (word, similarity) pairs.
similar_words = model.wv.most_similar('food', topn=50)

# Tabulate the neighbours for the later comparison step.
word2vec_df = pd.DataFrame(similar_words, columns=['word', 'word2vec_score'])

# Echo every neighbour with its similarity.
for word, score in similar_words:
    print(f"Word: {word}, Similarity Score: {score}")

Word: restaurant, Similarity Score: 0.9998951554298401
Word: n, Similarity Score: 0.9998536109924316
Word: cooking, Similarity Score: 0.9998423457145691
Word: today, Similarity Score: 0.9998181462287903
Word: day, Similarity Score: 0.9998169541358948
Word: kedah, Similarity Score: 0.9998127818107605
Word: people, Similarity Score: 0.9998003840446472
Word: go, Similarity Score: 0.9998002648353577
Word: im, Similarity Score: 0.9997991919517517
Word: time, Similarity Score: 0.9997960329055786
Word: even, Similarity Score: 0.9997957944869995
Word: one, Similarity Score: 0.999788761138916
Word: new, Similarity Score: 0.9997815489768982
Word: like, Similarity Score: 0.999779224395752
Word: love, Similarity Score: 0.9997664093971252
Word: u, Similarity Score: 0.9997659921646118
Word: good, Similarity Score: 0.999764621257782
Word: drink, Similarity Score: 0.9997587203979492
Word: need, Similarity Score: 0.9997579455375671
Word: 2, Similarity Score: 0.9997574090957642
Word: country, Similarity

In [98]:
# Display the Word2Vec similarity table for the FoodnDrink corpus.
word2vec_df

Unnamed: 0,word,word2vec_score
0,restaurant,0.999895
1,n,0.999854
2,cooking,0.999842
3,today,0.999818
4,day,0.999817
5,kedah,0.999813
6,people,0.9998
7,go,0.9998
8,im,0.999799
9,time,0.999796


#### Comparison

In [99]:
# Candidate vocabulary: every TF-IDF term followed by the Word2Vec neighbours
# (a term present in both tables therefore appears twice).
combined_words = [*tfidf_df['word'], *word2vec_df['word']]
combined_words

['food',
 'restaurant',
 'like',
 'good',
 'drink',
 'satay',
 'im',
 'review',
 'teh',
 'cooking',
 'tarik',
 'eat',
 'kedah',
 'day',
 'dont',
 'kuala',
 'chicken',
 'best',
 'lumpur',
 'time',
 'love',
 'durian',
 'penang',
 'people',
 'make',
 'roti',
 'want',
 'place',
 'need',
 'canai',
 'know',
 'today',
 'great',
 'new',
 'street',
 'got',
 'come',
 'water',
 'really',
 'jacksonwang852',
 'delicious',
 'dinner',
 'rice',
 'year',
 'town',
 'home',
 'work',
 'nasi',
 'better',
 'look',
 'lot',
 'thing',
 'country',
 'friend',
 'world',
 'said',
 'free',
 'try',
 'price',
 'local',
 'na',
 'halal',
 'oil',
 'way',
 'lunch',
 'nice',
 'say',
 'gt',
 'spicy',
 'cafe',
 'family',
 'fast',
 'tea',
 'feel',
 'think',
 'ive',
 'court',
 'order',
 'eating',
 'meal',
 'travel',
 'night',
 'taste',
 'korean',
 'thank',
 'kl',
 'buy',
 'la',
 'went',
 'foodie',
 'going',
 'security',
 'sarawak',
 'thai',
 'poisoning',
 'life',
 'thats',
 'guy',
 'kuching',
 'ni',
 'video',
 'george',
 'com

In [100]:
# Build word -> score lookups once, so the loop below does constant-time
# dictionary reads instead of re-scanning both DataFrames for every word.
# drop_duplicates keeps the first row per word, which is the same row the
# previous `.values[0]` selection picked.
tfidf_lookup = (
    tfidf_df.drop_duplicates(subset='word')
    .set_index('word')['tfidf_score']
    .to_dict()
)
word2vec_lookup = (
    word2vec_df.drop_duplicates(subset='word')
    .set_index('word')['word2vec_score']
    .to_dict()
)

combined_scores = {}

for word in combined_words:
    # A word missing from one source contributes 0.0 for that source.
    tfidf_score = tfidf_lookup.get(word, 0.0)
    word2vec_similarity = word2vec_lookup.get(word, 0.0)

    # Weighted sum of the two signals. tfidf_weight and word2vec_weight are
    # not defined in this cell and must already exist in the kernel.
    combined_score = (tfidf_weight * tfidf_score) + (word2vec_weight * word2vec_similarity)

    combined_scores[word] = combined_score

combined_scores

{'food': 139.34210534345974,
 'restaurant': 57.40331508968644,
 'like': 35.87509570058808,
 'good': 33.24616136568793,
 'drink': 30.60946842682055,
 'satay': 30.025918409036834,
 'im': 28.470873288909523,
 'review': 27.56006726394704,
 'teh': 25.72361785726063,
 'cooking': 25.761595177773806,
 'tarik': 24.79008545293502,
 'eat': 24.83605944141941,
 'kedah': 24.835281952912673,
 'day': 22.720437965859848,
 'dont': 22.039434456970422,
 'kuala': 21.470595182158988,
 'chicken': 20.904455759847842,
 'best': 21.240951200017744,
 'lumpur': 20.49436760088783,
 'time': 20.87676325776943,
 'love': 20.847633245018276,
 'durian': 19.69291454362053,
 'penang': 19.098557424867632,
 'people': 19.282136775078985,
 'make': 18.24558882954721,
 'roti': 17.566433761687026,
 'want': 17.489131104941592,
 'place': 16.486827123833308,
 'need': 16.44841335500009,
 'canai': 15.69637401027381,
 'know': 15.970637584226521,
 'today': 15.7280789328046,
 'great': 14.030442886967002,
 'new': 14.401871469555795,
 'str

In [101]:
# Number of highest-scoring words kept for this category.
# NOTE(review): the file written in the next cell is named "..._50" while 70
# words are kept here; confirm which count is intended.
TOP_N_WORDS = 70

# Rank (word, combined score) pairs from highest to lowest score.
sorted_words = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

# Keep only the words of the TOP_N_WORDS best-scoring pairs.
top_words = [word for word, _ in sorted_words[:TOP_N_WORDS]]
top_words

['food',
 'restaurant',
 'like',
 'good',
 'drink',
 'satay',
 'im',
 'review',
 'cooking',
 'teh',
 'eat',
 'kedah',
 'tarik',
 'day',
 'dont',
 'kuala',
 'best',
 'chicken',
 'time',
 'love',
 'lumpur',
 'durian',
 'people',
 'penang',
 'make',
 'roti',
 'want',
 'place',
 'need',
 'know',
 'today',
 'canai',
 'new',
 'great',
 'street',
 'got',
 'come',
 'water',
 'really',
 'jacksonwang852',
 'delicious',
 'dinner',
 'year',
 'rice',
 'town',
 'home',
 'work',
 'nasi',
 'country',
 'friend',
 'world',
 'better',
 'look',
 'lot',
 'thing',
 'try',
 'price',
 'local',
 'said',
 'free',
 'na',
 'halal',
 'family',
 'oil',
 'think',
 'way',
 'lunch',
 'nice',
 'say',
 'gt']

In [102]:
# Destination file for this category's keyword list.
output_file_path = "comparison_50/word_foodndrink_50.txt"

# Write one word per line. The encoding is pinned to UTF-8 so the file content
# does not depend on the platform's default encoding. The with-block closes the
# file on exit, so no explicit close() call is needed.
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.writelines(f"{word}\n" for word in top_words)

print("Words saved to", output_file_path)

Words saved to comparison_50/word_foodndrink_50.txt


# Entertainment

### TF-IDF

In [103]:
# Restrict the corpus to the posts labelled with the Entertainment interest.
text_ent = df.loc[df["interest"] == 'Entertainment', "text"]

# Vectoriser settings: lowercase the text, drop English stop words, ignore
# terms found in fewer than 2 documents or in more than 85% of them, and cap
# the vocabulary at 8000 features.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=2,
    max_df=0.85,
    max_features=8000,
)

# Learn the vocabulary and build the document-term TF-IDF matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(text_ent)

# One aggregate score per term: the column sums of the matrix, flattened to 1-D.
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_scores = tfidf_matrix.sum(axis=0).A1

# Term/score table, ordered from the highest to the lowest aggregate score.
tfidf_df = pd.DataFrame(
    {'word': feature_names, 'tfidf_score': tfidf_scores}
).sort_values(by='tfidf_score', ascending=False)

# Plain Python lists of the ranked terms and their scores.
words_tfidf = tfidf_df['word'].tolist()
tfidf_scores = tfidf_df['tfidf_score'].tolist()

In [104]:
# Show the TF-IDF table for the Entertainment posts, ordered by descending tfidf_score.
tfidf_df

Unnamed: 0,word,tfidf_score
2471,game,197.178516
4042,movie,146.551785
4083,music,128.877174
6096,tiktok,123.874164
6405,video,97.133297
...,...,...
1528,cosmopolitan,0.251063
4978,relay,0.251063
4573,piki,0.251063
1324,choom,0.251063


### Word2Vec

In [105]:
# Split every Entertainment post into word tokens for Word2Vec training.
tokenized_text_ent = [nltk.word_tokenize(sentence) for sentence in text_ent]

# Train 100-dimensional embeddings with a context window of 5. sg=0 selects
# CBOW, and min_count=1 keeps every token in the vocabulary.
model = Word2Vec(sentences=tokenized_text_ent, vector_size=100, window=5, min_count=1, sg=0)

# One path variable for both save and load, so the two cannot drift apart.
model_path = "word2vec/word2vec_model/entertainment"
model.save(model_path)
model = Word2Vec.load(model_path)

# The 50 nearest neighbours of the category keyword in the embedding space.
# NOTE(review): this lookup raises KeyError if 'entertainment' never occurs in
# the tokenised corpus.
# NOTE(review): the printed similarities are all around 0.9996, so the ranking
# barely separates words; confirm the model is trained well enough for this
# ordering to be meaningful.
similar_words = model.wv.most_similar('entertainment', topn=50)

for word, score in similar_words:
    print(f"Word: {word}, Similarity Score: {score}")

# Tabulate the neighbours so they can be combined with the TF-IDF scores.
word2vec_df = pd.DataFrame(similar_words, columns=['word', 'word2vec_score'])

Word: fan, Similarity Score: 0.9996376633644104
Word: event, Similarity Score: 0.9996209144592285
Word: today, Similarity Score: 0.9996187090873718
Word: team, Similarity Score: 0.9996159672737122
Word: back, Similarity Score: 0.9996153712272644
Word: world, Similarity Score: 0.9996153116226196
Word: gaming, Similarity Score: 0.9996089339256287
Word: live, Similarity Score: 0.9996035099029541
Word: let, Similarity Score: 0.9995855093002319
Word: got, Similarity Score: 0.999583899974823
Word: man, Similarity Score: 0.999582827091217
Word: movie, Similarity Score: 0.9995821714401245
Word: still, Similarity Score: 0.9995821118354797
Word: come, Similarity Score: 0.9995748996734619
Word: culture, Similarity Score: 0.9995720982551575
Word: even, Similarity Score: 0.9995704889297485
Word: n, Similarity Score: 0.9995697736740112
Word: love, Similarity Score: 0.9995670318603516
Word: also, Similarity Score: 0.9995642900466919
Word: way, Similarity Score: 0.9995582699775696
Word: make, Similari

In [106]:
# Row count of the similarity table; 50 matches the topn=50 passed to most_similar.
len(word2vec_df)

50

#### Comparison

In [107]:
# Merge the TF-IDF vocabulary with the Word2Vec neighbours into one candidate list.
# A word that appears in both sources would otherwise be listed (and later scored)
# twice; dict.fromkeys drops the repeats while keeping first-occurrence order.
combined_words = list(
    dict.fromkeys(tfidf_df['word'].tolist() + word2vec_df['word'].tolist())
)
combined_words

['game',
 'movie',
 'music',
 'tiktok',
 'video',
 'gaming',
 'check',
 'festival',
 'like',
 'new',
 'good',
 'day',
 'time',
 'love',
 'im',
 'artist',
 'comedy',
 'culture',
 'best',
 'dance',
 '2023',
 'play',
 'event',
 'know',
 'indonesia',
 'dont',
 'philippine',
 'watch',
 'let',
 'apple',
 'need',
 'live',
 'youtube',
 'people',
 '10',
 'make',
 'year',
 'guy',
 'thunivu',
 'want',
 'japan',
 'kuala',
 'today',
 'world',
 'fan',
 'thailand',
 'song',
 'country',
 'lumpur',
 'na',
 'watching',
 'film',
 'trending',
 'seen',
 'right',
 'really',
 'wait',
 'korea',
 'got',
 'way',
 'think',
 'thank',
 'kpop',
 'come',
 'varisu',
 'esports',
 'ticket',
 'international',
 'tonight',
 'release',
 'great',
 '2022',
 'team',
 'friday',
 'going',
 'concert',
 'life',
 'tamil',
 'vietnam',
 'south',
 'week',
 'look',
 'night',
 'hope',
 'big',
 'chart',
 'ajithkumar',
 'vibe',
 'tipcoineth',
 'man',
 'official',
 'end',
 'jailer',
 'player',
 'happy',
 'performance',
 'thanks',
 'main',

In [108]:
# Build word -> score lookups once, so the loop below does constant-time
# dictionary reads instead of re-scanning both DataFrames for every word.
# drop_duplicates keeps the first row per word, which is the same row the
# previous `.values[0]` selection picked.
tfidf_lookup = (
    tfidf_df.drop_duplicates(subset='word')
    .set_index('word')['tfidf_score']
    .to_dict()
)
word2vec_lookup = (
    word2vec_df.drop_duplicates(subset='word')
    .set_index('word')['word2vec_score']
    .to_dict()
)

combined_scores = {}

for word in combined_words:
    # A word missing from one source contributes 0.0 for that source.
    tfidf_score = tfidf_lookup.get(word, 0.0)
    word2vec_similarity = word2vec_lookup.get(word, 0.0)

    # Weighted sum of the two signals. tfidf_weight and word2vec_weight are
    # not defined in this cell and must already exist in the kernel.
    combined_score = (tfidf_weight * tfidf_score) + (word2vec_weight * word2vec_similarity)

    combined_scores[word] = combined_score

combined_scores

{'game': 118.30710985019554,
 'movie': 88.3309036634639,
 'music': 77.32630441485999,
 'tiktok': 74.32449848053206,
 'video': 58.27997792899668,
 'gaming': 50.56703504770939,
 'check': 49.16463239060198,
 'festival': 45.96888502343048,
 'like': 35.026750282737396,
 'new': 34.24291467080164,
 'good': 31.387540572315963,
 'day': 31.082277282240273,
 'time': 28.449696512812604,
 'love': 27.712593842260127,
 'im': 25.099095447183092,
 'artist': 24.423113316808625,
 'comedy': 24.146665332601263,
 'culture': 24.13428388906071,
 'best': 24.02917459738461,
 'dance': 23.626848376582977,
 '2023': 23.302399525406653,
 'play': 22.258463631321376,
 'event': 20.86778959248044,
 'know': 19.63174057875734,
 'indonesia': 19.616162715902185,
 'dont': 19.48633427888858,
 'philippine': 19.485202431579044,
 'watch': 18.540511976542877,
 'let': 18.924383402012417,
 'apple': 17.519824595044437,
 'need': 17.737954731846642,
 'live': 17.653170597057713,
 'youtube': 17.133511216984537,
 'people': 16.55536950213

In [109]:
# Number of highest-scoring words kept for this category.
# NOTE(review): the file written in the next cell is named "..._50" while 70
# words are kept here; confirm which count is intended.
TOP_N_WORDS = 70

# Rank (word, combined score) pairs from highest to lowest score.
sorted_words = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

# Keep only the words of the TOP_N_WORDS best-scoring pairs.
top_words = [word for word, _ in sorted_words[:TOP_N_WORDS]]
top_words

['game',
 'movie',
 'music',
 'tiktok',
 'video',
 'gaming',
 'check',
 'festival',
 'like',
 'new',
 'good',
 'day',
 'time',
 'love',
 'im',
 'artist',
 'comedy',
 'culture',
 'best',
 'dance',
 '2023',
 'play',
 'event',
 'know',
 'indonesia',
 'dont',
 'philippine',
 'let',
 'watch',
 'need',
 'live',
 'apple',
 'youtube',
 'people',
 'make',
 '10',
 'year',
 'guy',
 'thunivu',
 'today',
 'world',
 'fan',
 'want',
 'japan',
 'kuala',
 'thailand',
 'song',
 'country',
 'lumpur',
 'na',
 'watching',
 'film',
 'trending',
 'really',
 'seen',
 'got',
 'right',
 'way',
 'think',
 'wait',
 'korea',
 'come',
 'thank',
 'kpop',
 'varisu',
 'esports',
 'ticket',
 'international',
 'release',
 'team']

In [110]:
# Destination file for this category's keyword list.
output_file_path = "comparison_50/word_entertainment_50.txt"

# Write one word per line. The encoding is pinned to UTF-8 so the file content
# does not depend on the platform's default encoding. The with-block closes the
# file on exit, so no explicit close() call is needed.
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.writelines(f"{word}\n" for word in top_words)

print("Words saved to", output_file_path)

Words saved to comparison_50/word_entertainment_50.txt
