# Word Embedding

## Load Data

In [1]:
import polars as pl
import pandas as pd
from utils.load_data import load_parquet_data

In [2]:
data = load_parquet_data("xl")

In [3]:
data.head()

text,label,processed_text
str,str,list[str]
"""Kylie Jenner W...","""true""","[""kylie"", ""jenner"", ... ""store""]"
"""All the Detail...","""true""","[""details"", ""pippa"", ... ""cake""]"
"""Inside Ginnife...","""true""","[""inside"", ""ginnifer"", ... ""world""]"
"""( ! #Axeliito_...","""false""","[""rihanna"", ""stop"", ... ""wizkhalifa""]"
"""Kylie Jenner F...","""true""","[""kylie"", ""jenner"", ... ""online""]"


In [4]:
corpus = data.select(pl.col('processed_text'))['processed_text'].to_list()

## Word2Vec

In [115]:
from gensim.models import word2vec, KeyedVectors
import numpy as np

In [116]:
# Load Pretrain
pretrained_model = KeyedVectors.load_word2vec_format('./input/GoogleNews-vectors-negative300.bin', binary=True)

In [117]:
model_word2vec = word2vec.Word2Vec(vector_size=300)
model_word2vec.build_vocab(corpus)

total_examples = model_word2vec.corpus_count
model_word2vec.wv.vectors_lockf = np.zeros(len(model_word2vec.wv), dtype=np.float32)  

model_word2vec.wv.intersect_word2vec_format("./input/GoogleNews-vectors-negative300.bin", binary=True, lockf=0)
model_word2vec.train(corpus, total_examples=total_examples, epochs=5)


(76748436, 78729690)

In [118]:
# https://www.kaggle.com/code/jerrykuo7727/word2vec
def most_similar(w2v_model, words, topn=10):
    similar_df = pd.DataFrame()
    for word in words:
        try:
            similar_words = pd.DataFrame(w2v_model.wv.most_similar(word, topn=topn), columns=[word, word + '_cos'])
            similar_df = pd.concat([similar_df, similar_words], axis=1)
        except:
            print(word, "not found in Word2Vec model!")
    return similar_df

In [119]:
keywords =  ["breaking", "apple", "human", "vote", "vehicle", "automobile", "accident", "school", "keyboard", "fake"]
most_similar(model_word2vec, keywords)

Unnamed: 0,breaking,breaking_cos,apple,apple_cos,human,human_cos,vote,vote_cos,vehicle,vehicle_cos,automobile,automobile_cos,accident,accident_cos,school,school_cos,keyboard,keyboard_cos,fake,fake_cos
0,broke,0.701365,pear,0.64507,humans,0.591796,votes,0.767614,car,0.78211,auto,0.785605,crash,0.803911,elementary,0.786863,guitar,0.546592,bogus,0.742246
1,broken,0.646636,fruit,0.641015,humankind,0.563631,voting,0.727397,vehicles,0.749218,automotive,0.700227,collision,0.715666,schools,0.741191,piano,0.541354,phony,0.729877
2,break,0.619844,berry,0.63023,mankind,0.534641,voted,0.648475,truck,0.702707,car,0.583837,mishap,0.709202,kindergarten,0.652981,laptop,0.527177,fakes,0.632785
3,smashing,0.564886,strawberry,0.605826,humanity,0.526227,ballot,0.621582,van,0.652571,autos,0.547123,wreck,0.683038,teacher,0.638241,typing,0.523362,phoney,0.615343
4,shattering,0.561211,peach,0.602587,animal,0.498778,referendum,0.619166,cruiser,0.629438,motorcycle,0.546715,incident,0.644203,students,0.630152,lockscreen,0.489538,counterfeit,0.613548
5,breaks,0.499756,potato,0.596094,bodily,0.489653,election,0.602672,motorcycle,0.607554,motor,0.537497,crashes,0.609865,classroom,0.628162,headphones,0.489038,fraudulent,0.560973
6,busting,0.468841,grape,0.593586,beings,0.468619,voters,0.584192,tractor,0.58814,wheeler,0.532463,fatality,0.59388,teachers,0.615827,knob,0.483956,false,0.555666
7,tearing,0.466209,blueberry,0.586667,sentient,0.453971,elections,0.566956,suv,0.585731,vehicle,0.529985,motorcyclist,0.530279,college,0.60819,headset,0.474204,fraudulently,0.514293
8,breaker,0.450508,mango,0.575186,natural,0.448333,ballots,0.565094,cars,0.579368,vehicles,0.527167,explosion,0.513803,student,0.605563,mouse,0.473759,imposters,0.492663
9,breakers,0.449866,melon,0.571999,earthly,0.412962,polls,0.542506,scooter,0.577402,cars,0.520825,tragedy,0.512204,preschool,0.567553,syncing,0.468594,impostor,0.487694


In [120]:
def evaluate_model(per_list):
    result = list(map(lambda x: {"section": x["section"], "correct": len(x["correct"]), "incorrect": len(x["incorrect"]), "score": len(x["correct"]) / sum([len(x["correct"]), len(x["incorrect"])])}, per_list))
    return pd.DataFrame(result)

In [121]:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
from gensim.test.utils import datapath

performance = model_word2vec.wv.evaluate_word_analogies(datapath("questions-words.txt"))

evaluate_model(performance[1])

Unnamed: 0,section,correct,incorrect,score
0,capital-common-countries,114,392,0.225296
1,capital-world,128,1035,0.11006
2,currency,31,77,0.287037
3,city-in-state,270,1866,0.126404
4,family,376,44,0.895238
5,gram1-adjective-to-adverb,273,483,0.361111
6,gram2-opposite,275,277,0.498188
7,gram3-comparative,756,56,0.931034
8,gram4-superlative,414,48,0.896104
9,gram5-present-participle,338,82,0.804762


In [122]:
model_word2vec.save('./output/word2vec.model')

In [114]:
# Google Score
# section	        correct	incorrect	score
# Total accuracy	14307	5023	0.740145
# performance = google.evaluate_word_analogies(datapath("questions-words.txt"))

# evaluate_model(performance[1])

Unnamed: 0,section,correct,incorrect,score
0,capital-common-countries,421,85,0.832016
1,capital-world,3552,816,0.813187
2,currency,230,578,0.284653
3,city-in-state,1779,688,0.721119
4,family,436,70,0.86166
5,gram1-adjective-to-adverb,290,702,0.292339
6,gram2-opposite,353,459,0.434729
7,gram3-comparative,1216,116,0.912913
8,gram4-superlative,987,135,0.879679
9,gram5-present-participle,829,227,0.785038


## FastText

In [147]:
from gensim.models.fasttext import FastText

In [148]:
model_FastText = FastText(
    corpus,
    min_count=5,
    vector_size=500,
    workers=4,
    epochs=10,
    window=8,
    sg=0,
)

In [149]:
keywords =  ["breaking", "apple", "human", "vote", "vehicle", "automobile", "accident"]
most_similar(model_FastText, keywords)

Unnamed: 0,breaking,breaking_cos,apple,apple_cos,human,human_cos,vote,vote_cos,vehicle,vehicle_cos,automobile,automobile_cos,accident,accident_cos
0,breakingbad,0.929433,applebee,0.847189,subhuman,0.87483,votevotevote,0.89497,vehicles,0.751481,mobile,0.758848,caraccident,0.889498
1,breakingdawn,0.889907,applecup,0.819756,humanly,0.861708,ivote,0.872577,cubicle,0.590498,automation,0.618116,accidental,0.857783
2,breakin,0.859077,appl,0.783729,inhuman,0.837466,votem,0.840758,car,0.470535,automotive,0.615731,accidentally,0.786618
3,breaki,0.826572,grapple,0.759658,humanoid,0.816815,govote,0.838701,cle,0.457736,mobil,0.611205,accio,0.68283
4,breakingnew,0.781423,pineapple,0.730572,humankind,0.784458,votedem,0.827706,popsicle,0.420781,mobilephone,0.610657,incident,0.640525
5,breakingnews,0.718127,pple,0.725114,humanright,0.772316,votegop,0.824126,motorcycle,0.411498,automate,0.605889,trident,0.61039
6,groundbreaking,0.704342,applehead,0.707455,humans,0.756182,voteoutgop,0.799495,cycle,0.410812,mobilemarkete,0.565845,président,0.557403
7,leaking,0.695665,applenews,0.689499,humane,0.752255,voten,0.794324,tentacle,0.410409,mobilise,0.561255,accsc,0.536898
8,braking,0.671086,applegate,0.64283,humayun,0.74916,covote,0.792506,bicycle,0.392035,exxonmobil,0.533392,flaccid,0.522833
9,freaking,0.660106,appleton,0.631131,humanize,0.744237,votegopout,0.786066,barnacle,0.384174,automaker,0.530886,acc,0.517022


In [150]:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
from gensim.test.utils import datapath

performance = model_FastText.wv.evaluate_word_analogies(datapath("questions-words.txt"))

evaluate_model(performance[1])

Unnamed: 0,section,correct,incorrect,score
0,capital-common-countries,5,501,0.009881
1,capital-world,3,1160,0.00258
2,currency,0,108,0.0
3,city-in-state,9,2127,0.004213
4,family,38,382,0.090476
5,gram1-adjective-to-adverb,488,268,0.645503
6,gram2-opposite,397,155,0.719203
7,gram3-comparative,175,637,0.215517
8,gram4-superlative,175,287,0.378788
9,gram5-present-participle,46,374,0.109524


In [151]:
model_FastText.save('./output/fastText.model')