# Building a vectorizer model with gensim

In [153]:
import gensim
import pandas as pd
from tqdm.notebook import tqdm

In [154]:
# Load the translated Kurdish reviews; the `KurdishText` column is what gets
# tokenized and used for training further down.
df=pd.read_csv("../data/translated/kurdish-reviews6.csv")
# Alternative dataset (currently disabled):
# df=pd.read_csv("../data/medical-kurdish-dataset.csv")

In [155]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape    

(30000, 11)

In [156]:
# Replace missing review texts with empty strings so NaN values never reach
# the tokenizer.
df['KurdishText'] = df['KurdishText'].fillna('')

In [157]:

# res={}
# for i,row in tqdm(df.iterrows(),total=len(df)):
#     try:
#         text=row.KurdishText
#         myId=row.Id
#         gensim_results=gensim.utils.simple_preprocess(text)
#         res[myId]=gensim_results
#     except TypeError:
#         print("typerror for text: ",text)
#         continue



### Tokenizing the Kurdish reviews

In [158]:
# Tokenize every review with gensim's simple_preprocess using Series.apply
# (instead of an explicit iterrows loop). The result is a Series holding one
# list of tokens per review.
review_text=df.KurdishText.apply(gensim.utils.simple_preprocess)

### building the model and training word2vec

In [159]:
# Create an untrained Word2Vec model. No corpus is passed here, so the
# vocabulary is built and training is started explicitly in the next steps.
# NOTE(review): per the gensim docs, exactly reproducible vectors require
# workers=1 (plus a fixed seed / PYTHONHASHSEED); with workers=4 the results
# can differ slightly from run to run.
model=gensim.models.Word2Vec(
    window=10,     # max distance between the current and the predicted word
    min_count=2,   # ignore words whose total frequency is lower than 2
    workers=4      # number of worker threads used during training
)

In [160]:
# Scan the tokenized reviews to build the model vocabulary; progress_per
# controls how often gensim logs scanning progress.
model.build_vocab(review_text,progress_per=100)

In [161]:
# Epoch count stored on the model; it was not set in the constructor call,
# so this is gensim's default.
model.epochs

5

In [162]:
# Number of sentences (here: one tokenized review each) seen by build_vocab.
model.corpus_count

30000

In [163]:
# Train on the same tokenized reviews that were used to build the vocabulary;
# total_examples and epochs are read back from the model itself.
# Per the gensim docs, the returned tuple is
# (effective word count after dropping unknown words, total raw word count).
model.train(review_text,total_examples=model.corpus_count,epochs=model.epochs)

(8131975, 9922710)

In [164]:
# Persist the trained model; the corpus size is embedded in the file name so
# that models trained on different amounts of data do not overwrite each other.
model.save(f"../models/kurdish-word2vec-{model.corpus_count}.model")

In [206]:
# Nearest neighbours (cosine similarity, gensim default topn) of the query
# word. Approximate English gloss of the query: "expiration".
model.wv.most_similar("بەسەرچوون")

[('بەرواری', 0.865491509437561),
 ('بەسەرچوونی', 0.8429747223854065),
 ('بەسەرچوونیان', 0.8340602517127991),
 ('نیسانی', 0.7773920893669128),
 ('حوزەیرانی', 0.7524710297584534),
 ('jan', 0.7447219491004944),
 ('گەیاندرایە', 0.7416166067123413),
 ('پاکێجەکە', 0.7401153445243835),
 ('oct', 0.7377173900604248),
 ('پشکنی', 0.7356353998184204)]

In [166]:
# Cosine similarity between two words.
# Approximate English glosses: w1 = "sad", w2 = "happy".
model.wv.similarity(w1="دڵتەنگ",w2="دڵخۆش")

0.76048034

In [196]:
# Odd-one-out test: returns the word that fits the rest of the list least.
# Approximate English glosses, in list order: "suitable", "cheap", "good", "girl".
model.wv.doesnt_match(["گونجاو","هەرزان","باش","کچ"])

'کچ'

In [193]:
# Odd-one-out test on drink-related words plus one animal.
# Approximate English glosses, in list order:
# "tea", "tea" (inflected form), "cat", "coffee", "milk".
model.wv.doesnt_match(["چا","چای","پشیلە","قاوە","شیر"])


'پشیلە'

In [195]:
# Odd-one-out test on animals plus one fruit.
# Approximate English glosses, in list order:
# "lion", "dog", "cat", "bird", "orange".
model.wv.doesnt_match(["شێر","سەگ","پشیلە","باڵندە","پرتەقاڵ"])


'پرتەقاڵ'

In [169]:
# Odd-one-out test on emotion words plus one price-related word.
# Approximate English glosses, in list order:
# "happy", "expensive", "sad", "sorrowful".
model.wv.doesnt_match(["دڵخۆش","گران","دڵتەنگ","خەمبار"])


'گران'

In [170]:
# Vector-arithmetic query: positive terms are added, negative terms subtracted.
# NOTE(review): with approximate glosses this computes "queen" + "king" - "man".
# The usual king - man + woman analogy would put the word for "woman" (ژن)
# in `positive` and expect "queen" (شاژن) back. As written, "queen" is itself
# an input, and most_similar does not return input words, so it can never show
# up in the result — confirm the intended query.
model.wv.most_similar(positive=["شاژن","پاشا"],negative=["پیاو"])

[('egrave', 0.7382482290267944),
 ('cr', 0.7165148258209229),
 ('ڕۆژهەڵاتی', 0.6775225400924683),
 ('هەڵمکردن', 0.6752817630767822),
 ('دروێنە', 0.6732248067855835),
 ('ئەمریکای', 0.6725807785987854),
 ('نەریتی', 0.6571123003959656),
 ('دەستوەردان', 0.6523147821426392),
 ('سوێتە', 0.6464184522628784),
 ('چینەکانی', 0.6381911635398865)]

In [171]:
# Raw embedding vector learned for a single word (approximate gloss: "good").
model.wv["باش"]

array([-2.6172478 ,  1.45575   , -0.21326923, -2.0738397 ,  0.4716205 ,
       -2.6992457 , -2.0740469 , -0.7920629 ,  0.00628766, -0.06026602,
       -0.24524309,  2.1522872 ,  2.3297627 ,  2.1529198 , -3.1394815 ,
       -1.210419  ,  0.07800918, -0.44206458, -0.83098996,  1.0399299 ,
        0.22150122, -1.109959  ,  0.65006506, -1.5981456 ,  2.6995044 ,
       -0.05390262,  4.0762434 ,  0.2719617 ,  1.807954  , -1.3387496 ,
        0.4806367 ,  1.2275134 , -0.98409253, -2.6822445 ,  4.1402826 ,
       -1.1665434 , -0.41551536, -0.7742945 ,  1.6125237 , -1.1872507 ,
        0.8419438 , -1.9698278 ,  1.4520111 ,  0.5330274 , -2.7859385 ,
       -1.3103911 ,  0.6321787 , -0.78267956, -0.76309246, -0.687131  ,
        2.4477892 , -1.632078  ,  2.0252657 , -0.3197764 , -1.9605489 ,
        0.14420328,  1.4096761 ,  1.5112597 , -2.144825  ,  0.7348722 ,
        2.0299315 ,  0.55946475, -0.77947587, -3.190896  , -0.88962245,
       -3.6717885 ,  1.2174424 ,  0.8748749 , -2.2511363 ,  2.49

In [172]:
# Cosine distance (1 - cosine similarity) between two words.
# Approximate English glosses: w1 = "unique/exceptional", w2 = "rare/excellent".
model.wv.distance(w1="ناوازە",w2="نایاب")

0.5627623498439789

## Comparison between the 20,000-review model and the 30,000-review model

In [173]:
# Reload two previously saved models so they can be compared on the same query.
# NOTE(review): the variable names do not match the files they hold —
# `two_model` is the 20000-sentence model and `twenty_model` is the
# 30000-sentence model. The 20000 file is not written anywhere in this run;
# presumably it comes from an earlier run on a smaller corpus — confirm it exists.
two_model=gensim.models.Word2Vec.load("../models/kurdish-word2vec-20000.model")
twenty_model=gensim.models.Word2Vec.load("../models/kurdish-word2vec-30000.model")


#### Similarity comparison

In [181]:
# Show the three nearest neighbours of the same query word in each loaded
# model, one line per model, so the two rankings can be compared directly.
text="باش"
for label, loaded in (("model 1", two_model), ("model 2", twenty_model)):
    neighbours = loaded.wv.most_similar(text, topn=3)
    print(f'{label}: {neighbours}')

model 1: [('باشتر', 0.7169090509414673), ('گونجاو', 0.6994403600692749), ('ئاسان', 0.6794298887252808)]
model 2: [('گونجاو', 0.7060096263885498), ('باشتر', 0.6612275838851929), ('شایستە', 0.6326435208320618)]
