# Building a vectorizer model with gensim

In [2]:
import gensim
import pandas as pd
from tqdm.notebook import tqdm

In [4]:
# Load the translated Kurdish reviews; later cells read the `KurdishText`
# column of this frame.
df = pd.read_csv("../data/translated/kurdish-reviews5.csv")
# Alternative corpus, currently disabled:
# df = pd.read_csv("../data/medical-kurdish-dataset.csv")

In [5]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(47, 4)

In [6]:
# Replace missing review texts with empty strings so every row holds a str
# before tokenization — presumably to avoid the TypeError that the
# commented-out loop below guards against.
df["KurdishText"] = df["KurdishText"].fillna("")

In [7]:

# Retired per-row variant of the tokenization step, kept for reference only;
# the next code cell tokenizes the whole column with `.apply` instead.
# res={}
# for i,row in tqdm(df.iterrows(),total=len(df)):
#     try:
#         text=row.KurdishText
#         myId=row.Id
#         gensim_results=gensim.utils.simple_preprocess(text)
#         res[myId]=gensim_results
#     except TypeError:
#         print("typerror for text: ",text)
#         continue



### Tokenizing the Kurdish reviews

In [8]:
# Tokenize every review in one pass over the column instead of the row loop;
# the result holds one token list per review.
review_text = df["KurdishText"].apply(gensim.utils.simple_preprocess)

### Building the model and training Word2Vec

In [9]:
# Create an untrained Word2Vec model. No corpus is passed here, so the
# vocabulary is built and the weights are trained in the following cells.
model = gensim.models.Word2Vec(window=10, min_count=2, workers=4)

In [10]:
# Build the model's vocabulary from the tokenized reviews.
model.build_vocab(review_text, progress_per=100)

In [11]:
# Epoch count configured on the model; passed to train() below.
model.epochs

5

In [12]:
# Corpus size recorded on the model; passed to train() below as total_examples.
model.corpus_count

47

In [13]:
# Train on the tokenized reviews, using the corpus size and epoch count
# already stored on the model.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(47571, 68625)

In [248]:
# Persist the trained model to disk.
# NOTE(review): this cell's execution count (248) is out of sequence, and the
# recorded outputs above show a 47-row corpus while the file name says 20000.
# Running the notebook top to bottom would overwrite that file with a model
# trained on whatever CSV is currently loaded — confirm the intended corpus
# before re-running.
model.save("../models/kurdish-word2vec-20000.model")

In [15]:
# Nearest neighbours of one query word (approx. gloss: "happy").
# NOTE(review): the scores recorded below are all tightly clustered around
# 0.92, which may indicate an undertrained embedding — interpret with care.
model.wv.most_similar("دڵخۆش")

[('پتەوکردنی', 0.9288321733474731),
 ('بکەن', 0.9254183769226074),
 ('شەوانەکان', 0.9253820180892944),
 ('كات', 0.9253047108650208),
 ('شوباتی', 0.9234508275985718),
 ('ڕوداوە', 0.922707200050354),
 ('بڕیارە', 0.9226899147033691),
 ('هانی', 0.9222294688224792),
 ('خەڵک', 0.9221873879432678),
 ('قەوارەی', 0.9221734404563904)]

In [250]:
# Similarity score between two words (approx. glosses: "sad" and "happy").
model.wv.similarity(w1="دڵتەنگ", w2="دڵخۆش")

0.6588227

In [251]:
# Odd-one-out check: ask the model which word fits the list least.
model.wv.doesnt_match(["گونجاو", "هەرزان", "باش", "قاوە"])

'قاوە'

In [252]:
# Second odd-one-out check, on a different word list.
model.wv.doesnt_match(["چا", "چای", "چوکلێت", "قاوە", "کچ"])


'کچ'

In [253]:
# Vector-arithmetic query: words closest to (positive words - negative word).
# NOTE(review): if this is meant as the classic "king - man + woman" analogy,
# the positive list would normally hold "king" and "woman"; here both
# positives look like royal titles (approx. glosses: queen, king; negative:
# man) — confirm the intended query.
model.wv.most_similar(
    positive=["شاژن", "پاشا"],
    negative=["پیاو"],
)

[('ئاشکرا', 0.4046480059623718),
 ('سریلانکا', 0.4013921618461609),
 ('ژاپۆنی', 0.38860073685646057),
 ('گەلانی', 0.3874826729297638),
 ('وردکراوەکە', 0.38216421008110046),
 ('بڕوای', 0.3812050521373749),
 ('ئەمریکایە', 0.3808383345603943),
 ('سۆگۆ', 0.3801935911178589),
 ('ناوزەد', 0.37941908836364746),
 ('تەقلیدیانەی', 0.374645859003067)]

In [244]:
# Raw embedding vector stored for a single word.
model.wv["باش"]

array([-2.2584677 , -1.3639876 , -0.8485835 ,  1.2601333 , -0.23954327,
        2.2976618 , -0.8422657 , -2.2540371 ,  0.5392582 , -1.4882792 ,
       -4.607214  , -2.207439  ,  2.427937  ,  1.2737063 , -2.8636103 ,
       -1.5466975 , -1.7363892 ,  3.0353637 ,  1.4955368 ,  4.617238  ,
       -2.529456  , -3.2289004 , -0.7921148 , -3.0685492 ,  2.8004937 ,
       -3.1982856 ,  6.767797  , -0.73264503,  2.174561  , -3.4936006 ,
       -1.0328193 ,  0.6688558 ,  3.44486   , -2.4575179 ,  2.6412258 ,
       -0.6756578 ,  2.9294686 ,  4.7435136 ,  0.51138073,  1.2657015 ,
        1.681758  ,  3.8996522 ,  0.292119  , -1.7589962 , -1.7251204 ,
       -4.079862  ,  0.8179134 , -5.6168694 ,  1.5404359 , -2.312481  ,
        0.8291275 ,  0.18192616, -2.182993  ,  2.6200056 , -3.3055346 ,
       -1.3190445 ,  4.478493  , -1.1130794 , -4.6242695 ,  0.62258834,
        1.4787244 ,  1.7091652 ,  1.2086477 , -0.674437  , -2.420332  ,
       -4.8585763 , -1.5338455 , -0.01571877, -0.915234  , -1.97

In [245]:
# Distance between two words — the counterpart of the `similarity` call
# earlier (gensim documents it as 1 - cosine similarity).
model.wv.distance(w1="ناوازە", w2="نایاب")

0.649310827255249

## Comparison between the 2,000 model and the 20,000 model

In [256]:
# Reload both saved models for a side-by-side comparison. Only the 20000 file
# is written by this notebook; the 2000 file must already exist on disk.
two_model = gensim.models.Word2Vec.load("../models/kurdish-word2vec-2000.model")
twenty_model = gensim.models.Word2Vec.load("../models/kurdish-word2vec-20000.model")


In [265]:
# Top-2 neighbours of a query word in the smaller model.
# NOTE(review): the `twenty_model` cell below queries a different word, so the
# two outputs are not a like-for-like comparison.
two_model.wv.most_similar("گران", topn=2)

[('بوون', 0.9917393922805786), ('کڕی', 0.9916492104530334)]

In [289]:
# Top-2 neighbours of a query word in the larger model.
# NOTE(review): the `two_model` cell above queries a different word, so the
# two outputs are not a like-for-like comparison.
twenty_model.wv.most_similar("گەیاندن", topn=2)


[('گەیاندنی', 0.6742141246795654), ('گەیاندنەکە', 0.6029205918312073)]