# Building a vectorizer model with gensim

In [23]:
import gensim
import pandas as pd
from tqdm.notebook import tqdm

In [24]:
df=pd.read_csv("../data/translated/kurdish-reviews9.csv")
# df=pd.read_csv("../data/medical-kurdish-dataset.csv")

In [25]:
df.shape    

(75279, 11)

In [26]:
df.head()

Unnamed: 0,Id,KurdishText,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,چەندین بەرهەمی خۆراکی سەگی قوتووکراوی ڤیتالیتی...,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,بەرهەمەکە گەیشتە لای کە بە ناوی Jumbo Salted P...,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,ئەمە شیرینییەکە کە لە دەوروبەری چەند سەدەیەکدا...,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,ئەگەر بەدوای پێکهاتەی نهێنی ڕۆبیتوسیندا دەگەڕێ...,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,تافی نایاب بە نرخێکی نایاب. جۆرێکی بەرفراوانی ...,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [27]:
df['KurdishText'] = df['KurdishText'].fillna('')

In [28]:

# res={}
# for i,row in tqdm(df.iterrows(),total=len(df)):
#     try:
#         text=row.KurdishText
#         myId=row.Id
#         gensim_results=gensim.utils.simple_preprocess(text)
#         res[myId]=gensim_results
#     except TypeError:
#         print("typerror for text: ",text)
#         continue



### tokenizing the kurdih reviews

In [29]:
# continuing with old way not looping
review_text=df.KurdishText.apply(gensim.utils.simple_preprocess)

### building the model and training word2vec

In [30]:
model=gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4
)

In [31]:
model.build_vocab(review_text,progress_per=100)

In [32]:
model.epochs

5

In [33]:
model.corpus_count

75279

In [34]:
model.train(review_text,total_examples=model.corpus_count,epochs=5)

(20815381, 25180775)

In [35]:
model.save(f"../models/kurdish-word2vec-{model.corpus_count}.model")

In [36]:
model.wv.most_similar("دڵ")

[('نەخۆشییەکانی', 0.8756718039512634),
 ('خوێن', 0.8586665987968445),
 ('خوێنبەرەکان', 0.8529062271118164),
 ('بەڕێوەبردنی', 0.8256251215934753),
 ('ڕێگریکردن', 0.814089834690094),
 ('چالاکیی', 0.8050230145454407),
 ('فسفۆر', 0.7880133986473083),
 ('شێرپەنجە', 0.7855885028839111),
 ('جەڵتەی', 0.7839686274528503),
 ('مێنادیۆن', 0.7801363468170166)]

In [37]:
model.wv.similarity(w1="دڵتەنگ",w2="دڵخۆش")

0.7983525

In [38]:
model.wv.doesnt_match(["گونجاو","هەرزان","باش","کچ"])

'کچ'

In [39]:
model.wv.doesnt_match(["چا","چای","پشیلە","قاوە","شیر"])


'پشیلە'

In [40]:
model.wv.doesnt_match(["شێر","سەگ","پشیلە","باڵندە","پرتەقاڵ"])


'پرتەقاڵ'

In [41]:
model.wv.doesnt_match(["دڵخۆش","گران","دڵتەنگ","خەمبار"])


'گران'

In [42]:
model.wv.most_similar(positive=["شاژن","پاشا"],negative=["پیاو"])

[('توندترین', 0.6572930812835693),
 ('canada', 0.6530537009239197),
 ('پینگشوی', 0.6431078314781189),
 ('پیتزاکەیان', 0.6412283182144165),
 ('فیلیپ', 0.6367488503456116),
 ('شارەکانی', 0.6315086483955383),
 ('toxicology', 0.6306287050247192),
 ('چیۆ', 0.6282880306243896),
 ('پارێزگای', 0.6282369494438171),
 ('ڕووەکدا', 0.6252650022506714)]

In [43]:
model.wv["باش"]

array([-1.1929961 ,  0.51924175, -0.74429405, -4.710522  ,  0.31205124,
       -5.2910805 , -1.7026699 , -1.8074635 ,  0.1406476 ,  1.180593  ,
       -2.9354897 , -1.7606179 ,  2.034902  ,  0.8709499 , -1.5353724 ,
       -3.6651635 , -0.4548875 ,  0.20891607, -0.36300564,  0.42292854,
        0.46465954, -2.755293  ,  2.330648  , -5.3687954 ,  0.21620622,
       -1.5957713 ,  2.5542397 ,  1.6081854 ,  1.4147915 , -1.4009733 ,
       -0.3379254 ,  0.8386915 , -2.1979232 ,  3.3237066 ,  3.7057314 ,
       -0.8145219 , -1.0016776 ,  1.2162306 , -2.402222  ,  1.0426123 ,
        0.18449575, -2.6088705 , -0.5371913 , -1.5655692 ,  1.1663175 ,
       -2.419917  ,  1.3923601 ,  0.34320232,  0.43659243, -0.9940651 ,
        1.2988131 ,  3.040159  ,  1.2378373 ,  2.5186384 ,  0.889813  ,
        2.3639197 ,  2.2881238 , -3.2758002 , -1.5401207 ,  0.228828  ,
        2.0922208 ,  1.1712044 , -2.1530445 , -2.307461  ,  2.7780008 ,
       -2.4855216 ,  0.6950483 , -1.5084782 ,  1.1238075 ,  2.65

In [44]:
model.wv.distance(w1="ناوازە",w2="نایاب")

0.4876762628555298

## Comparison between 2000 model and 20,000 model

In [88]:
two_model=gensim.models.Word2Vec.load("../models/kurdish-word2vec-20000.model")
twenty_model=gensim.models.Word2Vec.load("../models/kurdish-word2vec-75279.model")
eng=gensim.models.Word2Vec.load("../models/english-word2vec.model")



#### Similiarity comparison

In [114]:
text="کۆنەپەرستی"
texteng="cat"
print(f'model 1: {two_model.wv.most_similar(text,topn=5)}')
print(f'model 2: {twenty_model.wv.most_similar(text,topn=5)}')
print(f'model 2: {eng.wv.most_similar(texteng,topn=5)}')

model 1: [('خواردووەتەوە', 0.9018285274505615), ('بەئەمەکی', 0.8709939122200012), ('نەخواردبوو', 0.8638148903846741), ('تایگەر', 0.8596477508544922), ('تێکدا', 0.8576906323432922)]
model 2: [('خۆپەرستی', 0.7607849836349487), ('ئازیزت', 0.7354581952095032), ('absoulte', 0.6752956509590149), ('خۆشترم', 0.6639035940170288), ('پیشەگەری', 0.6616527438163757)]
model 2: [('chewing', 0.7258464694023132), ('cats', 0.698319137096405), ('dog', 0.687409520149231), ('kid', 0.6554325222969055), ('toddler', 0.6033084392547607)]


In [101]:
text=["دۆلار","سەنت","شەربەت","پاوند"]

print(f'model 1: {two_model.wv.doesnt_match(text)}')
print(f'model 1: {twenty_model.wv.doesnt_match(text)}')

model 1: شەربەت
model 1: شەربەت


AttributeError: 'KeyedVectors' object has no attribute 'expandose_vectors'