# Building a vectorizer model with gensim

In [18]:
import gensim
import pandas as pd
from tqdm.notebook import tqdm

##### Importing translated dataset

In [19]:
# Read the translated review dataset and display its (rows, columns) shape.
df = pd.read_csv("../data/translated/kurdish-reviews9.csv")
df.shape

(75279, 11)

##### Replacing missing reviews with empty strings

In [20]:
# Missing reviews become empty strings so that every row holds a str the
# tokenizer can process; no rows are dropped.
df["KurdishText"] = df["KurdishText"].fillna("")

##### Tokenizing the Kurdish reviews

In [21]:
# Tokenize every review through a single .apply() call rather than an
# explicit Python loop; each row becomes a list of tokens.
review_text = df["KurdishText"].apply(gensim.utils.simple_preprocess)

In [23]:
# Sanity check: display the token list produced for the first review.
review_text[0]

['چەندین',
 'بەرهەمی',
 'خۆراکی',
 'سەگی',
 'قوتووکراوی',
 'ڤیتالیتیم',
 'کڕیوە',
 'بۆم',
 'دەرکەوتووە',
 'کە',
 'هەموویان',
 'کوالیتی',
 'باشیان',
 'هەیە',
 'بەرهەمەکە',
 'زیاتر',
 'لە',
 'قیمە',
 'دەچێت',
 'نەک',
 'گۆشتی',
 'پرۆسێس',
 'کراو',
 'بۆنێکی',
 'خۆشی',
 'هەیە',
 'لابرادۆرەکەم',
 'وردە',
 'وردە',
 'باشتر',
 'لە',
 'زۆربەی',
 'ئەم',
 'بەرهەمە',
 'قەدری',
 'دەزانێت']

### Building the model and training word2vec

In [25]:
# Create an untrained Word2Vec model. The vocabulary and the weights are
# filled in by the build_vocab() and train() calls in the following cells.
# Parameter meanings below follow gensim's Word2Vec documentation.
model = gensim.models.Word2Vec(
    window=10,     # maximum distance between the current and the predicted word
    min_count=2,   # ignore words whose total frequency is lower than this
    workers=4,     # number of worker threads used for training
)

In [26]:
# Scan the tokenized reviews to build the model's vocabulary.
# progress_per controls how often gensim reports scanning progress.
model.build_vocab(review_text, progress_per=100)

In [27]:
# Corpus size recorded on the model after build_vocab(); the training call
# passes this same value as total_examples.
model.corpus_count

75279

In [28]:
# Train on the same tokenized corpus the vocabulary was built from, reusing
# the corpus size and epoch count already stored on the model.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(20816612, 25180775)

In [17]:
# Persist the trained model to disk so it can be reloaded later with
# gensim.models.Word2Vec.load(). The path is a plain string: the original
# f-string prefix had no placeholders and was dropped.
# NOTE(review): the comparison section at the end of this notebook loads
# "../models/kurdish-word2vec-75279.model", not this file — confirm the
# intended output name.
model.save("../models/xendan7000.model")

##### Testing the model by finding the most similar words and the word that doesn't match

In [28]:
# Look up the words the model ranks closest to the query word; the result is
# a list of (word, similarity score) pairs.
# NOTE(review): execution counts in this notebook are out of order — confirm
# the recorded output came from the model trained above.
model.wv.most_similar("بێهۆش")

[('دایکی', 0.9240180850028992),
 ('پیاوێک', 0.92145836353302),
 ('هاوڕێکانی', 0.9199870824813843),
 ('ژنەکەی', 0.9163362979888916),
 ('روداوەکانیش', 0.9147975444793701),
 ('لێکەوتوەتەوە', 0.9103312492370605),
 ('لێدانی', 0.908986508846283),
 ('گەیەنرایە', 0.908150315284729),
 ('ئۆتۆمبێلەکەی', 0.9069851040840149),
 ('ساچمە', 0.9066586494445801)]

In [19]:
# Similarity score between two words according to the trained vectors.
model.wv.similarity(
    w1="دڵتەنگ",
    w2="دڵخۆش",
)

0.66419977

In [20]:
# Odd-one-out check: ask the model which word fits least with the others.
model.wv.doesnt_match(
    [
        "گونجاو",
        "هەرزان",
        "باش",
        "پرتەقاڵ",
    ]
)

'هەرزان'

In [36]:
# Odd-one-out check on a second word group.
model.wv.doesnt_match(
    [
        "ئاو",
        "چای",
        "پشیلە",
        "قاوە",
        "شیر",
    ]
)


'چای'

In [37]:
# Odd-one-out check on a third word group.
model.wv.doesnt_match(
    [
        "شێر",
        "سەگ",
        "پشیلە",
        "باڵندە",
        "پرتەقاڵ",
    ]
)


'شێر'

In [38]:
# Odd-one-out check on a fourth word group.
model.wv.doesnt_match(
    [
        "دڵخۆش",
        "گران",
        "دڵتەنگ",
        "خەمبار",
    ]
)


'گران'

In [40]:
# Analogy-style query: words in `positive` contribute positively to the
# lookup and words in `negative` contribute negatively (per gensim docs).
model.wv.most_similar(
    positive=[
        "ژن",
        "پاشا",
    ],
    negative=[
        "پیاو",
    ],
)

[('ریزەکانی', 0.8832405805587769),
 ('بینیویەتی', 0.8755570650100708),
 ('بۆردومانەکە', 0.873737096786499),
 ('شنگال', 0.8732637763023376),
 ('ئێندزێ', 0.870159924030304),
 ('تیرۆرستان', 0.8671587109565735),
 ('ئوردوگای', 0.8669210076332092),
 ('چەكدارانی', 0.8661324977874756),
 ('گارە', 0.8660470247268677),
 ('بانە', 0.8655515313148499)]

In [35]:
# Vector array (the learned embedding) of a single word
model.wv["باش"]

array([-0.90211964,  0.43025637, -0.9367391 , -2.6173587 ,  2.0945787 ,
       -1.5308189 , -2.3683774 , -3.3523567 ,  0.73924965, -1.1344695 ,
       -4.014437  ,  0.666944  ,  0.7020901 , -2.2726924 , -1.9118208 ,
       -3.3315017 , -2.0781443 , -2.1347258 ,  2.5213184 , -1.3058217 ,
       -0.13207778, -0.8528401 ,  1.4988103 , -2.875501  ,  0.42943385,
        1.2140238 ,  2.6804671 ,  4.4979696 ,  1.811867  , -1.1507096 ,
        2.1694405 ,  1.4167244 , -4.4290113 ,  2.549508  ,  2.0360394 ,
        0.52259445,  0.05791227,  1.8339542 , -2.5408585 , -0.6802383 ,
        2.1184626 , -0.44735515,  0.3888209 ,  0.2232721 ,  0.27532724,
       -1.837784  ,  0.23343086, -0.79494554,  0.52373177, -3.6053371 ,
        3.1700099 ,  4.9057856 ,  3.12903   ,  2.1287427 , -1.6651137 ,
        2.8429258 ,  0.28667638, -1.0683757 , -1.1722019 , -1.1205236 ,
        3.2309926 ,  2.531725  , -1.8850188 ,  0.33407545,  1.195811  ,
       -1.3354232 , -0.7482134 , -3.39835   ,  1.4679952 ,  4.01

In [36]:
# Distance between two words according to the trained vectors.
model.wv.distance(
    w1="ناوازە",
    w2="نایاب",
)

0.47279030084609985

## Comparison between the 3,000-row model and the 75,279-row model

In [31]:
# Reload the two previously saved Word2Vec models that the cells below compare.
first_model = gensim.models.Word2Vec.load("../models/kurdish-word2vec-3000.model")
second_model = gensim.models.Word2Vec.load("../models/kurdish-word2vec-75279.model")

##### Similarity comparison

In [32]:
# Show the five nearest neighbours of the same query word under each model.
text = "دڵتەنگ"
print(f"model 1: {first_model.wv.most_similar(text, topn=5)}")
print(f"model 2: {second_model.wv.most_similar(text, topn=5)}")

model 1: [('تارگێت', 0.9552736878395081), ('دەیفرۆشن', 0.9551293253898621), ('ئەوەندەی', 0.9550510048866272), ('فڕێدا', 0.9547895193099976), ('ستانداردی', 0.9546394348144531)]
model 2: [('بێهیوا', 0.9291748404502869), ('ناڕازی', 0.9205631017684937), ('کنجکاو', 0.9063196778297424), ('دەمارگیر', 0.8955897688865662), ('هیوادار', 0.8950649499893188)]


##### Finding the word that doesn't match

In [33]:
# Ask each model which word in the list fits least with the others.
text = [
    "دۆلار",
    "سەنت",
    "شەربەت",
    "پاوند",
]
print(f'model 1: {first_model.wv.doesnt_match(text)}')
# This line reports second_model, so it is labelled "model 2" (the original
# label repeated "model 1" for both lines).
print(f'model 2: {second_model.wv.doesnt_match(text)}')

model 1: شەربەت
model 1: شەربەت
