# Preprocessing & Tokenization

In [1]:
import gensim
import pandas as pd

In [2]:
df = pd.read_json("Cell_Phones_and_Accessories_5.json",lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [3]:
df.shape

(194439, 9)

In [4]:
df['reviewText'][0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [5]:
# verilen metni küçük harflere çevirir, noktalama işaretlerini kaldırır ve kelimeleri tokenize eder (yani, kelimeleri ayrı ayrı ele alır)

gensim.utils.simple_preprocess("They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again")

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [6]:
review_text = df['reviewText'].apply(gensim.utils.simple_preprocess)
review_text.head()

0    [they, look, good, and, stick, good, just, don...
1    [these, stickers, work, like, the, review, say...
2    [these, are, awesome, and, make, my, phone, lo...
3    [item, arrived, in, great, time, and, was, in,...
4    [awesome, stays, on, and, looks, great, can, b...
Name: reviewText, dtype: object

## Word2Vec Model eğitimi

In [8]:
model = gensim.models.Word2Vec(window=10,           # bir cümlenin içindeki kelime sayısı
                               vector_size=100,     # kelime vektörlerinin boyutu
                               min_count=2,         # modeldeki kelime sayısı
                               sg=1)                # skip-gram modeli

### kelime dağarcığını (vocab) oluştur

In [9]:
model.build_vocab(review_text) # modeli eğitmek için kelime haznesi oluşturur

In [10]:
model.epochs    # modeli eğitmek için kaç kez veri setini geçeceğini belirler

5

In [11]:
model.window    # bir cümlenin içindeki kelime sayısı

10

In [12]:
model.train(review_text, total_examples=model.corpus_count, epochs = model.epochs) # modeli eğitmek için veri setini kullanır

(61508824, 83868975)

In [14]:
model.corpus_count # modelin eğitim veri setindeki kelime sayısı

194439

In [13]:
model.save("./final_word2vec.model")


## Benzer Kelimeleri ve Kelimeler Arasındaki Benzerliği Bulun

In [15]:
model.wv.most_similar("bad")  # kelimenin en yakın kelimelerini bulur

[('terrible', 0.734882652759552),
 ('horrible', 0.7063153386116028),
 ('pathetic', 0.6608651876449585),
 ('good', 0.6555846333503723),
 ('poor', 0.6496292948722839),
 ('okay', 0.6443062424659729),
 ('guess', 0.641335129737854),
 ('liike', 0.640103816986084),
 ('crappy', 0.637427806854248),
 ('ok', 0.6323232054710388)]

In [16]:
model.wv.similarity(w1='cheap',w2 = 'inexpensive')   # iki kelime arasındaki benzerliği bulur

0.62215537

In [18]:
matrix = model.wv.vectors # kelime vektörlerini döndürür

print(f"shape of the word embedding matrix is {matrix.shape}")    # kelime vektörlerinin şeklini döndürür

shape of the word embedding matrix is (35561, 100)


In [19]:
W_out = model.syn1neg
W_out.shape # çıktı ağırlıklarının şeklini döndürür

(35561, 100)

In [21]:
#https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
import random
random_word = random.choice(model.wv.index_to_key) # rastgele bir kelime seçer
random_word

'spots'

In [22]:
vocab_list = list(model.wv.index_to_key)    

In [23]:
len(vocab_list)

35561

In [24]:
vocab_list  

['the',
 'it',
 'and',
 'to',
 'is',
 'this',
 'of',
 'for',
 'my',
 'that',
 'in',
 'on',
 'phone',
 'with',
 'you',
 'case',
 'but',
 'have',
 'not',
 'was',
 'as',
 'so',
 'one',
 'very',
 'are',
 'like',
 'if',
 'be',
 'can',
 'or',
 'great',
 'your',
 'at',
 'when',
 'use',
 'screen',
 'just',
 'good',
 'all',
 'they',
 'battery',
 'from',
 'would',
 'out',
 'will',
 'well',
 'an',
 'has',
 'iphone',
 'had',
 'get',
 'charge',
 'up',
 'no',
 'me',
 'than',
 'more',
 'only',
 'charger',
 'about',
 'product',
 'other',
 'there',
 'really',
 'time',
 'also',
 'off',
 'these',
 'which',
 'works',
 'does',
 'because',
 'do',
 'don',
 'them',
 'much',
 'back',
 'what',
 'nice',
 'little',
 'price',
 'love',
 'usb',
 'its',
 'some',
 'quality',
 'charging',
 'work',
 'fit',
 'any',
 'easy',
 'even',
 've',
 'device',
 'too',
 'after',
 'still',
 'used',
 'protector',
 'while',
 'power',
 'using',
 'got',
 'better',
 'am',
 'bought',
 'two',
 'now',
 'by',
 'cable',
 'first',
 'recommend'

In [25]:
model.wv ['bad'] # kelimenin vektörünü döndürür

<gensim.models.keyedvectors.KeyedVectors at 0x28c9d94e5d0>

In [33]:
vocab_len = len(vocab_list)

In [26]:
model.wv.index_to_key # kelime indekslerini döndürür

['the',
 'it',
 'and',
 'to',
 'is',
 'this',
 'of',
 'for',
 'my',
 'that',
 'in',
 'on',
 'phone',
 'with',
 'you',
 'case',
 'but',
 'have',
 'not',
 'was',
 'as',
 'so',
 'one',
 'very',
 'are',
 'like',
 'if',
 'be',
 'can',
 'or',
 'great',
 'your',
 'at',
 'when',
 'use',
 'screen',
 'just',
 'good',
 'all',
 'they',
 'battery',
 'from',
 'would',
 'out',
 'will',
 'well',
 'an',
 'has',
 'iphone',
 'had',
 'get',
 'charge',
 'up',
 'no',
 'me',
 'than',
 'more',
 'only',
 'charger',
 'about',
 'product',
 'other',
 'there',
 'really',
 'time',
 'also',
 'off',
 'these',
 'which',
 'works',
 'does',
 'because',
 'do',
 'don',
 'them',
 'much',
 'back',
 'what',
 'nice',
 'little',
 'price',
 'love',
 'usb',
 'its',
 'some',
 'quality',
 'charging',
 'work',
 'fit',
 'any',
 'easy',
 'even',
 've',
 'device',
 'too',
 'after',
 'still',
 'used',
 'protector',
 'while',
 'power',
 'using',
 'got',
 'better',
 'am',
 'bought',
 'two',
 'now',
 'by',
 'cable',
 'first',
 'recommend'

In [27]:
model.wv.key_to_index # kelime anahtarlarını döndürür

{'the': 0,
 'it': 1,
 'and': 2,
 'to': 3,
 'is': 4,
 'this': 5,
 'of': 6,
 'for': 7,
 'my': 8,
 'that': 9,
 'in': 10,
 'on': 11,
 'phone': 12,
 'with': 13,
 'you': 14,
 'case': 15,
 'but': 16,
 'have': 17,
 'not': 18,
 'was': 19,
 'as': 20,
 'so': 21,
 'one': 22,
 'very': 23,
 'are': 24,
 'like': 25,
 'if': 26,
 'be': 27,
 'can': 28,
 'or': 29,
 'great': 30,
 'your': 31,
 'at': 32,
 'when': 33,
 'use': 34,
 'screen': 35,
 'just': 36,
 'good': 37,
 'all': 38,
 'they': 39,
 'battery': 40,
 'from': 41,
 'would': 42,
 'out': 43,
 'will': 44,
 'well': 45,
 'an': 46,
 'has': 47,
 'iphone': 48,
 'had': 49,
 'get': 50,
 'charge': 51,
 'up': 52,
 'no': 53,
 'me': 54,
 'than': 55,
 'more': 56,
 'only': 57,
 'charger': 58,
 'about': 59,
 'product': 60,
 'other': 61,
 'there': 62,
 'really': 63,
 'time': 64,
 'also': 65,
 'off': 66,
 'these': 67,
 'which': 68,
 'works': 69,
 'does': 70,
 'because': 71,
 'do': 72,
 'don': 73,
 'them': 74,
 'much': 75,
 'back': 76,
 'what': 77,
 'nice': 78,
 'little

In [28]:
vocab_int2word = {model.wv.key_to_index[w]:model.wv.get_vector(w, norm=True) for w in vocab_list}
# . Bu sözlük, kelimelerin model tarafından öğrenilen vektörlerini anahtar-değer (key-value) çiftleri olarak saklar

In [29]:
vocab_int2word  # kelime indekslerini ve vektörlerini döndürür

{0: array([ 6.94181994e-02,  1.18129037e-01, -5.13313636e-02, -5.90617917e-02,
        -4.01573107e-02, -5.40364310e-02,  1.26240045e-01,  2.84647107e-01,
        -3.58697996e-02, -1.35244012e-01, -6.86006024e-02, -5.84004745e-02,
         7.10079297e-02,  1.00893117e-01, -6.77196011e-02, -1.01524107e-01,
        -3.97188887e-02,  1.18833646e-01,  4.64924462e-02, -8.67921934e-02,
         3.21885794e-02, -1.45799726e-01,  2.06812739e-01, -1.33129418e-01,
         5.43009453e-02, -9.64717492e-02, -4.11192365e-02, -9.98946875e-02,
        -1.74470603e-01,  5.40151633e-02,  9.49024260e-02, -5.71595170e-02,
        -1.65613852e-02, -1.69919655e-01,  3.47247384e-02,  1.27045423e-01,
         1.01733953e-01,  3.17393690e-02,  2.79076640e-02, -1.10930353e-01,
        -1.58430580e-02,  2.42862180e-02, -8.96868482e-02,  6.17096312e-02,
         2.09684130e-02,  5.04185148e-02, -1.87630042e-01, -5.73112294e-02,
        -1.94062829e-01,  1.25741661e-01,  1.09888412e-01, -1.12710036e-01,
         

In [31]:
vocab_int2word[4697]    

array([ 0.23371696,  0.14116225,  0.09403869,  0.07595507,  0.07984642,
        0.09919146,  0.22803007,  0.13634503, -0.03192644,  0.08368894,
       -0.08921527, -0.16313015,  0.11729302, -0.04957659,  0.03162155,
        0.14600936, -0.02600583,  0.14534283,  0.0943578 , -0.09583825,
        0.04792016,  0.15195857,  0.08200265, -0.00326954,  0.00822163,
        0.07344974, -0.13676184,  0.02685939, -0.13034421,  0.01353804,
        0.07293899,  0.12343916,  0.12155976, -0.08418179, -0.02828683,
        0.0433173 ,  0.25241476,  0.1002509 ,  0.06469642,  0.06961599,
        0.02228048, -0.06917268,  0.05447729, -0.00819838,  0.06355379,
       -0.12720214, -0.09989544, -0.0076817 , -0.09316429,  0.04955938,
        0.01400036,  0.04071441,  0.02077082,  0.24179651, -0.10321981,
       -0.04247403,  0.11631961, -0.04064941, -0.12609522, -0.05238934,
       -0.03572119,  0.02630859,  0.0271055 , -0.00171504, -0.20880963,
        0.04811369,  0.10763972,  0.06394096, -0.27094954,  0.10

In [35]:
import numpy as np
one_hot = np.zeros(shape=(1,vocab_len))

one_hot[0,model.wv.key_to_index['awesome']] = 1