# Word2Vec

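Train a custom Word2Vec model on the preprocessed FakeNewsNet text, compare several training configurations, and evaluate the result.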

In [55]:
import pandas as pd
from gensim.models.word2vec import Word2Vec

In [56]:
# Load the preprocessed FakeNewsNet frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
master_df = pd.read_pickle("../../dataset/FakeNewsNet/data/FakeNewsNet.pkl")
master_df.head()

Unnamed: 0,text,processed_text,label
0,On Air with Ryan Seacrest is offering you a ch...,"[air, ryan, seacrest, offer, chance, win, nigh...",False
1,‘American Idol’ final: How to vote for the sea...,"[american, idol, final, vote, season, winner, ...",False
2,@ScottDisick @KrisJenner @khloekardashian — LA...,"[latest, art, shame, revenge, prank, banksy, s...",False
3,@foquinha Youngblood - 5 Seconds of Summer \nO...,"[youngblood, seconds, summer, little, mix, del...",False
4,Kylie Jenner ‘Open’ To Reconciliation With Tyg...,"[kylie, jenner, open, reconciliation, tyga, pr...",False


In [57]:
# The `processed_text` column (one token list per row) is used as the training corpus.
corpus = master_df['processed_text']
corpus.head()

0    [air, ryan, seacrest, offer, chance, win, nigh...
1    [american, idol, final, vote, season, winner, ...
2    [latest, art, shame, revenge, prank, banksy, s...
3    [youngblood, seconds, summer, little, mix, del...
4    [kylie, jenner, open, reconciliation, tyga, pr...
Name: processed_text, dtype: object

In [58]:
# Total number of words (tokens) across every document in the corpus
from functools import reduce

word_count = reduce(lambda total, n_tokens: total + n_tokens, map(len, corpus), 0)

print("total word count: ", word_count)


total word count:  14274283


In [59]:
# Baseline model: no hyper-parameters are passed, so gensim's defaults apply throughout.
model = Word2Vec(corpus)


In [60]:
# Persist the baseline model, then reload it from the same file (save/load round trip).
model.save('word2vec.model')
model = Word2Vec.load('word2vec.model')

In [61]:
# https://www.kaggle.com/code/jerrykuo7727/word2vec
def most_similar(w2v_model, words, topn=10):
    """Tabulate the nearest neighbours of several words side by side.

    Parameters
    ----------
    w2v_model : object
        Anything exposing ``wv.most_similar(word, topn=...)`` that returns a
        sequence of ``(neighbour, score)`` pairs (e.g. a gensim ``Word2Vec``).
    words : iterable of str
        Query words, processed in the given order.
    topn : int, default 10
        Number of neighbours requested per query word.

    Returns
    -------
    pandas.DataFrame
        Two columns per query word that was found: one named after the word
        (its neighbours) followed by ``'cos'`` (the score the model returned).
        An empty frame is returned when no query word was found.

    Notes
    -----
    Only ``KeyError`` (a word missing from the vocabulary) is reported and
    skipped. Any other exception propagates instead of being mislabelled as
    "not found".
    """
    frames = []
    for word in words:
        try:
            neighbours = w2v_model.wv.most_similar(word, topn=topn)
        except KeyError:
            # Out-of-vocabulary word: report it and carry on with the rest.
            print(word, "not found in Word2Vec model!")
            continue
        frames.append(pd.DataFrame(neighbours, columns=[word, 'cos']))

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(frames, axis=1)

In [62]:
# Sanity check: nearest neighbours of "vote" in the baseline model.
most_similar(model, ["vote"])

Unnamed: 0,vote,cos
0,ballot,0.618587
1,voting,0.592086
2,primary,0.571004
3,voter,0.567515
4,bluewave,0.566575
5,democrats,0.560515
6,republican,0.555501
7,gop,0.555487
8,voteblue,0.554184
9,poll,0.552276


## Training

In [76]:
# 500-dimensional vectors, 5 training epochs.
model_500_5 = Word2Vec(corpus, vector_size=500, epochs=5, workers=23)


In [77]:
# 500-dimensional vectors, 6 training epochs.
model_500_6 = Word2Vec(corpus, vector_size=500, epochs=6, workers=23)


In [78]:
# 500-dimensional vectors, 7 training epochs.
model_500_7 = Word2Vec(corpus, vector_size=500, epochs=7, workers=23)


In [79]:
# 500-dimensional vectors, 10 training epochs.
model_500_10 = Word2Vec(corpus, vector_size=500, epochs=10, workers=23)


In [80]:
# Skip-gram variant: sg=1 selects skip-gram. Without it gensim trains CBOW,
# which would make this model a duplicate of model_500_5.
model_500_sg = Word2Vec(corpus, vector_size=500, epochs=5, workers=23, sg=1)


In [122]:
# 250-dimensional vectors, 7 training epochs.
model_250 = Word2Vec(corpus, vector_size=250, epochs=7, workers=23)


In [87]:
# Probe words used to compare the nearest neighbours produced by each model.
keywords =  ["breaking", "apple", "human", "vote", "vehicle", "accident"]

In [88]:
# Neighbours of the probe keywords: 500 dimensions, 5 epochs.
most_similar(model_500_5, keywords)

Unnamed: 0,breaking,cos,apple,cos.1,human,cos.2,vote,cos.3,vehicle,cos.4,accident,cos.5
0,exchanges,0.438959,yin,0.451181,being,0.520337,voting,0.539416,mph,0.466876,pools,0.539132
1,buzzybuzz,0.382622,melon,0.424632,flesh,0.43036,voter,0.496674,truck,0.464131,motorcycle,0.50756
2,cyprium,0.377038,meanie,0.416631,dignity,0.427236,ballot,0.48084,motorcycle,0.455011,crash,0.498002
3,zonamya,0.372098,pharma,0.410145,compassion,0.392797,voteblue,0.464302,thames,0.447322,injures,0.48778
4,buzzort,0.371601,rolex,0.408076,nature,0.391212,democrats,0.453721,extraterrestrial,0.441024,slidey,0.477539
5,urbannews,0.369779,plexpy,0.406957,creature,0.383316,alpha,0.452392,traffic,0.418433,bikes,0.477509
6,mbeattv,0.36564,tremendously,0.390824,humanity,0.38226,gop,0.432233,helicopter,0.417836,atv,0.461765
7,flawlessdabarber,0.363069,tit,0.390436,consumption,0.382223,bluewave,0.419087,manufacturer,0.416941,explosions,0.453067
8,eci,0.354194,mome,0.383456,environment,0.380569,democrat,0.413391,construction,0.403104,moped,0.444621
9,trendi,0.34451,antler,0.381947,imperfect,0.380404,dems,0.409947,emit,0.399922,fayetteville,0.431429


In [89]:
# Neighbours of the probe keywords: 500 dimensions, 6 epochs.
most_similar(model_500_6, keywords)

Unnamed: 0,breaking,cos,apple,cos.1,human,cos.2,vote,cos.3,vehicle,cos.4,accident,cos.5
0,exchanges,0.417054,yin,0.435317,being,0.46822,voting,0.54239,truck,0.451054,pools,0.510555
1,cyprium,0.374235,meanie,0.413739,fundamental,0.418462,voteblue,0.481289,motorcycle,0.43024,crash,0.493152
2,buzzort,0.340663,antler,0.400904,flesh,0.40529,voter,0.457329,mph,0.429411,injures,0.491444
3,buzzybuzz,0.339859,rolex,0.399191,creature,0.398972,ballot,0.447271,thames,0.412026,atv,0.464071
4,eci,0.337216,mome,0.394749,dignity,0.388367,gop,0.431104,cdc,0.388085,fayetteville,0.459692
5,urbannews,0.328643,pharma,0.386509,individual,0.38082,democrats,0.429731,helicopter,0.380238,bikes,0.459205
6,flawlessdabarber,0.328396,comparably,0.373826,nature,0.36241,dems,0.407165,car,0.375431,motorcycle,0.44633
7,handcuffs,0.327043,stockmonstervip,0.369559,useless,0.354725,votethemout,0.406074,manufacturer,0.374366,moped,0.432866
8,wiretap,0.326903,disenfranchisement,0.368192,humanright,0.353567,republican,0.402777,construction,0.372601,explosions,0.427052
9,trendi,0.326777,mikemyers,0.368114,humanity,0.347885,democrat,0.387587,fatality,0.368722,fatal,0.422043


In [90]:
# Neighbours of the probe keywords: 500 dimensions, 7 epochs.
most_similar(model_500_7, keywords)

Unnamed: 0,breaking,cos,apple,cos.1,human,cos.2,vote,cos.3,vehicle,cos.4,accident,cos.5
0,exchanges,0.426331,unifo,0.432653,being,0.414453,voting,0.538985,mph,0.425717,pools,0.525801
1,cyprium,0.380248,meanie,0.417196,flesh,0.370228,voter,0.477641,truck,0.403937,bikes,0.455962
2,buzzort,0.375402,pharma,0.391806,creature,0.353799,ballot,0.457414,motorcycle,0.392672,crash,0.449442
3,buzzybuzz,0.373887,yin,0.390505,fundamental,0.34397,voteblue,0.457156,thames,0.382135,atv,0.442774
4,flawlessdabarber,0.357561,mome,0.377874,individual,0.343883,gop,0.42834,helicopter,0.370444,motorcycle,0.43044
5,urbannews,0.349999,rolex,0.377766,humanright,0.334656,democrats,0.413847,manufacturer,0.366754,prone,0.402015
6,funclickearn,0.349615,fea,0.374285,dignity,0.332811,senate,0.405516,electric,0.358797,fayetteville,0.39628
7,zonamya,0.345827,plexpy,0.371703,humanity,0.327826,bluewave,0.38281,technology,0.358062,slidey,0.395151
8,loweel,0.342385,adieu,0.371311,innocent,0.325946,votethemout,0.381064,car,0.356985,injures,0.384673
9,lfn,0.335878,comparably,0.366774,americans,0.323674,republicans,0.379249,extraterrestrial,0.353255,explosions,0.383183


In [91]:
# Neighbours of the probe keywords: 500 dimensions, 10 epochs.
most_similar(model_500_10, keywords)

Unnamed: 0,breaking,cos,apple,cos.1,human,cos.2,vote,cos.3,vehicle,cos.4,accident,cos.5
0,exchanges,0.416924,meanie,0.357662,being,0.371634,voting,0.50136,thames,0.367521,pools,0.46624
1,eci,0.336456,mikemyers,0.347859,dignity,0.327175,voter,0.4249,mph,0.33557,injures,0.409859
2,buzzort,0.334464,antler,0.345965,humanright,0.307693,voteblue,0.414003,truck,0.332306,crash,0.394994
3,cyprium,0.329362,mome,0.345242,americans,0.304852,ballot,0.393309,electric,0.331347,bikes,0.387638
4,flawlessdabarber,0.326311,yin,0.34117,innocent,0.303304,votethemout,0.39092,motorcycle,0.330665,atv,0.386451
5,mbeattv,0.320425,unifo,0.341032,creature,0.300423,democrats,0.379051,car,0.32415,fayetteville,0.372428
6,zonamya,0.318399,plexpy,0.33498,humanity,0.29608,gop,0.37205,extraterrestrial,0.320559,moped,0.35231
7,sallbuzz,0.314246,fea,0.32722,flesh,0.293997,election,0.363062,cdc,0.314493,amtrak,0.329778
8,urbannews,0.314149,stockmonstervip,0.324505,fundamental,0.292063,republicans,0.358044,rodeo,0.308429,explosions,0.325625
9,buzzybuzz,0.314117,melon,0.316253,people,0.290871,dems,0.357225,manufacturer,0.30789,motorcycle,0.322056


In [95]:
# Same probe on the baseline model (trained without explicit hyper-parameters) for comparison.
most_similar(model, keywords)

Unnamed: 0,breaking,cos,apple,cos.1,human,cos.2,vote,cos.3,vehicle,cos.4,accident,cos.5
0,exchanges,0.500357,yin,0.528692,nature,0.595733,ballot,0.618587,dealership,0.5733,bikes,0.606011
1,cyprium,0.465331,rolex,0.500336,being,0.594188,voting,0.592086,building,0.572427,pools,0.605606
2,indemand,0.455227,pharma,0.495481,compassion,0.552437,primary,0.571004,traffic,0.570699,motorcycle,0.601385
3,mbeattv,0.441607,tremendously,0.495385,dignity,0.533861,voter,0.567515,motorcycle,0.569911,crash,0.571329
4,loweel,0.4297,adieu,0.493164,moral,0.533005,bluewave,0.566575,truck,0.566742,sardinia,0.569161
5,zonamya,0.426993,tit,0.491788,consequence,0.531268,democrats,0.560515,helicopter,0.562758,car,0.558914
6,eci,0.419441,plexpy,0.463487,compassionate,0.526458,republican,0.555501,airline,0.557669,atv,0.538764
7,handcuffs,0.41627,itune,0.453315,equal,0.525899,gop,0.555487,car,0.557085,prone,0.498045
8,urbannews,0.414062,yeezysupply,0.446986,sensitive,0.523872,voteblue,0.554184,plane,0.547339,bike,0.494187
9,ecumenical,0.410702,techno,0.443304,empathy,0.522454,poll,0.552276,construction,0.542437,moped,0.493632


In [123]:
# Neighbours of the probe keywords: 250 dimensions, 7 epochs.
most_similar(model_250, keywords)

Unnamed: 0,breaking,cos,apple,cos.1,human,cos.2,vote,cos.3,vehicle,cos.4,accident,cos.5
0,exchanges,0.437924,yin,0.488874,dignity,0.444096,voting,0.567061,truck,0.463257,crash,0.531878
1,cyprium,0.368732,meanie,0.458439,being,0.437808,voteblue,0.553423,mph,0.444558,pools,0.516852
2,lfn,0.368585,pharma,0.436913,nature,0.409769,voter,0.543077,motorcycle,0.436871,bikes,0.486595
3,mbeattv,0.36827,melon,0.424928,compassionate,0.392117,ballot,0.480696,dealership,0.423763,injures,0.473686
4,eci,0.350465,mikemyers,0.411653,equal,0.3846,votethemout,0.474488,helicopter,0.417763,atv,0.456459
5,buzzort,0.350216,rolex,0.407189,constitute,0.383155,democrats,0.469075,airline,0.410671,slidey,0.413118
6,loweel,0.345359,mome,0.398567,fundamental,0.373746,dems,0.455365,electric,0.410222,fayetteville,0.406551
7,flawlessdabarber,0.341343,fea,0.39647,individual,0.3734,gop,0.454687,plane,0.405979,stunk,0.401609
8,urbannews,0.338753,saturdaythoughts,0.396042,flesh,0.369197,bluewave,0.440488,bike,0.40064,motorcycle,0.397158
9,funclickearn,0.337582,laments,0.392178,people,0.367715,republican,0.433325,extraterrestrial,0.399638,saratoga,0.393342


## Evaluation

In [127]:
def evaluate_model(per_list):
    """Summarise per-section analogy results as a table.

    Parameters
    ----------
    per_list : iterable of dict
        One dict per section with the keys ``"section"`` (label),
        ``"correct"`` and ``"incorrect"`` (sized collections of evaluated items).

    Returns
    -------
    pandas.DataFrame
        Columns ``section``, ``correct`` (count), ``incorrect`` (count) and
        ``score`` (``correct / (correct + incorrect)``). ``score`` is NaN for
        a section with no evaluated items. The columns are present even when
        ``per_list`` is empty.
    """
    rows = []
    for section in per_list:
        n_correct = len(section["correct"])
        n_incorrect = len(section["incorrect"])
        n_total = n_correct + n_incorrect
        rows.append({
            "section": section["section"],
            "correct": n_correct,
            "incorrect": n_incorrect,
            # A section with nothing evaluated has no defined accuracy;
            # report NaN instead of raising ZeroDivisionError.
            "score": n_correct / n_total if n_total else float("nan"),
        })
    return pd.DataFrame(rows, columns=["section", "correct", "incorrect", "score"])


In [128]:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
from gensim.test.utils import datapath

# Run the word-analogy benchmark on the questions-words.txt file located via gensim's datapath helper.
performance = model_250.wv.evaluate_word_analogies(datapath("questions-words.txt"))

# The second element is handed to evaluate_model, which reads each entry's
# "section", "correct" and "incorrect" keys to build the per-section table.
evaluate_model(performance[1])

Unnamed: 0,section,correct,incorrect,score
0,capital-common-countries,15,447,0.032468
1,capital-world,18,786,0.022388
2,currency,0,88,0.0
3,city-in-state,13,2041,0.006329
4,family,58,404,0.125541
5,gram1-adjective-to-adverb,0,756,0.0
6,gram2-opposite,0,380,0.0
7,gram3-comparative,1,599,0.001667
8,gram4-superlative,8,498,0.01581
9,gram5-present-participle,16,364,0.042105


In [129]:
"""
    https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
    
    By default it uses an academic dataset WS-353 but one can create a dataset specific to your business based on it. 
    It contains word pairs together with human-assigned similarity judgments. 
    It measures the relatedness or co-occurrence of two words. 
    For example, "coast" and "shore" are very similar as they appear in the same context. 
    At the same time "clothes" and "closet" are less similar because they are related but not interchangeable.
"""

# Compare the model's similarities with the human judgments in wordsim353.tsv.
# NOTE(review): per the gensim docs the result is (Pearson, Spearman, out-of-vocabulary ratio);
# confirm this against the installed gensim version.
performance = model_250.wv.evaluate_word_pairs(datapath("wordsim353.tsv"))

performance


(PearsonRResult(statistic=0.4149751119646313, pvalue=4.8304376628154305e-15),
 SignificanceResult(statistic=0.4114452045704089, pvalue=8.626547954583478e-15),
 7.365439093484419)

In [132]:
# Save the 250-dimensional model.
# NOTE(review): this reuses the path 'word2vec.model', so it overwrites the baseline model saved earlier in the notebook.
model_250.save('word2vec.model')

## Visualize

In [None]:
# TODO