##  (loss = :mrl, opt = :sgd, agg = :sum, constr = :maxval) 

In [244]:
import pandas as pd
import numpy as np 
import re
from collections import defaultdict
import nltk  

In [245]:
data = pd.read_csv('parserEcho8.csv', engine='python', encoding = 'utf-8')

In [246]:
data.columns = ['name', 'desc']
data = data.dropna(axis = 0)

In [14]:

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [521]:
from nltk.stem import WordNetLemmatizer

In [247]:
lemmatizer = WordNetLemmatizer()

In [248]:
vocab = defaultdict(int)

In [249]:
def tokenize(doc):
    doc = re.sub('\W', ' ', doc).lower().split()
    doc = [lemmatizer.lemmatize(word) for word in doc]
    d = defaultdict(int)
    for word in doc:
        d[word] += 1
    return d

In [250]:
def get_vocab(feature):
    feature = feature.map(tokenize)
    for d in feature:
        for w, v in d.items():
            vocab[w] += v
    return feature 

In [251]:
names = get_vocab(data.name)
desc = get_vocab(data.desc)

In [252]:
desc[2]

defaultdict(int,
            {'его': 1,
             'обвиняют': 1,
             'в': 2,
             'десяти': 1,
             'эпизодах': 1,
             'сексуального': 1,
             'насилия': 1,
             'том': 1,
             'числе': 1,
             'и': 1,
             'над': 1,
             'несовершеннолетними': 1})

In [254]:
vocab2 = [x for x in sorted(vocab.items(), key=lambda x: -x[1]) if x[1] > 15]

In [256]:
vocab3 = defaultdict(int)
count = 1
for value,ind in vocab2:
    vocab3[value] = count
    count=count+1
    

In [257]:
len(vocab3)

16482

In [258]:
def new_features(features):
    new_docs = list()
    for doc in features:
        vector = list()
        for word in doc:
            if vocab3[word]!=0:
                vector.append(vocab3[word])
        new_docs.append(vector)
    return new_docs

In [259]:
new_names = new_features(names)
new_desc = new_features(desc)

In [499]:
emb = np.random.randn(50, len(vocab2))

In [500]:
emb_size = 50

In [501]:
maxval = 10000

In [502]:
#emb = np.random.normal(0,1.0,(emb_size, len(vocab2)))

In [503]:
emb.shape

(50, 16482)

In [504]:
emb

array([[-0.31128327,  0.52594134,  0.3332039 , ...,  1.19904194,
         0.56444583,  1.0119893 ],
       [-0.30882589, -1.45049469,  0.1154132 , ..., -0.09901407,
        -1.3866041 ,  0.65642323],
       [ 0.59555886, -1.31364724, -1.0336686 , ..., -0.20656153,
        -0.36972137,  0.8361492 ],
       ...,
       [ 0.65025709, -0.49581775, -1.93419397, ...,  1.30710333,
         0.89416039,  1.45488112],
       [-1.25784098,  0.85298139, -1.50401125, ..., -0.73544384,
         1.92932706,  0.22883619],
       [ 0.93105808,  0.09798943, -0.73553983, ..., -1.16811624,
        -1.27944891, -0.89303161]])

In [505]:
test_names = new_names[:1000]
train_names = new_names[1000:3000]
test_desc = new_desc[:1000]
train_desc = new_desc[1000:3000]

In [506]:
len(train_names)

2000

In [507]:
test_data = []
test_data.append(test_names)
test_data.append(test_desc)
train_data = []
train_data.append(train_names)
train_data.append(train_desc)

In [508]:
def agg(m, idx):
    agg_arr = list()
    for text in idx:
        arr = np.zeros((1,emb_size))
        for word in text:
            arr = arr + m[:,word]
        agg_arr.append(arr)
    return agg_arr

In [509]:
def backward_hinge(u, v, vh, gamma = 1.0):
    loss = gamma - np.dot(u,v.transpose()) + np.dot(u,vh.transpose())
    if loss > 0:
        return (vh - v, -u, u)
    else:
        return

In [510]:
def update(embed, idx, Δ, η):
    d = -Δ*η
    for i in idx:
        emb[:, i] +=d[0,:]

In [511]:
def train_tuples(emb, idx, η):
    embs = agg(emb, idx)
    Δs = backward_hinge(embs[0],embs[1],embs[2])
    if(Δs == None):
        return
    for i, Δ in zip(idx, Δs):
        update(emb, i, Δ, η)

In [512]:
emb

array([[-0.31128327,  0.52594134,  0.3332039 , ...,  1.19904194,
         0.56444583,  1.0119893 ],
       [-0.30882589, -1.45049469,  0.1154132 , ..., -0.09901407,
        -1.3866041 ,  0.65642323],
       [ 0.59555886, -1.31364724, -1.0336686 , ..., -0.20656153,
        -0.36972137,  0.8361492 ],
       ...,
       [ 0.65025709, -0.49581775, -1.93419397, ...,  1.30710333,
         0.89416039,  1.45488112],
       [-1.25784098,  0.85298139, -1.50401125, ..., -0.73544384,
         1.92932706,  0.22883619],
       [ 0.93105808,  0.09798943, -0.73553983, ..., -1.16811624,
        -1.27944891, -0.89303161]])

In [513]:
#check_maxval(emb)

In [514]:
emb

array([[-0.31128327,  0.52594134,  0.3332039 , ...,  1.19904194,
         0.56444583,  1.0119893 ],
       [-0.30882589, -1.45049469,  0.1154132 , ..., -0.09901407,
        -1.3866041 ,  0.65642323],
       [ 0.59555886, -1.31364724, -1.0336686 , ..., -0.20656153,
        -0.36972137,  0.8361492 ],
       ...,
       [ 0.65025709, -0.49581775, -1.93419397, ...,  1.30710333,
         0.89416039,  1.45488112],
       [-1.25784098,  0.85298139, -1.50401125, ..., -0.73544384,
         1.92932706,  0.22883619],
       [ 0.93105808,  0.09798943, -0.73553983, ..., -1.16811624,
        -1.27944891, -0.89303161]])

In [515]:
def check_maxval(embed):
    for i in range(len(embed)):
        for j in range(len(embed[0])):
            if abs(emb[i,j])>maxval:
                emb[i,j]=maxval

In [516]:
def train_on_epoch(emb, data, η):
    first = np.random.permutation(len(data[0]))
    second = np.roll(first, 1)
    count=0
    for (f, s) in zip(first, second):
        if count>50:
            count = 0
            check_maxval(emb)
        u = data[0][f]
        v = data[1][f]
        train_tuples(emb, (u, v, data[1][s]), η)

In [517]:
def top_k(vec, i, k):
    count = 0
    for index in range(len(vec[0])):
        if vec[0,i] < vec[0,index]:
            count=count+1
    if count<k:
        return True
    else:
        return False

In [518]:
def recall_at_k(emb, data, k = 10):
    n_test = len(data[0])
    desc_emb = np.zeros((emb_size,n_test))
    recall=0
    
    for i in range(n_test):
        arr = np.zeros((1,emb_size))
        for word in data[1][i]:
            arr = arr + emb[:,word]
        desc_emb[:,i] = arr  
        
    for i in range(n_test):
        title_emb = np.zeros((1,emb_size))
        for word in data[0][i]:
            title_emb = title_emb + emb[:,word]
        
        if top_k(np.dot(title_emb, desc_emb), i, k)==True:
            recall=recall+1
            
    return recall / n_test


In [519]:
def train(emb, train_data, test_data, n_epochs, η):
    for epoch in range(n_epochs):
        t = train_on_epoch(emb, train_data, η)
        recall = recall_at_k(emb, test_data)
        print(epoch)
        print(recall)

In [520]:
train(emb, train_data, test_data, 20, 1.0)

  
  This is separate from the ipykernel package so we can avoid doing imports until
  


0
0.567
1
0.609


  
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  
  if __name__ == '__main__':
  if __name__ == '__main__':
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


2
0.984
3
0.991
4
0.989
5
0.99
6
0.99
7
0.994
8
0.995
9
0.995
10
0.995
11
0.995
12
0.995
13
0.995
14
0.995
15
0.995
16
0.995
17
0.996
18
0.996
19
0.996
