In [21]:
# Run this cell
import torch
from torch import nn
from torch.nn import functional as F
import torch.optim as optim

from time import time
import pandas as pd
import numpy as np

import tensorflow as tf

from keras.layers import Input, Embedding, LSTM, Dense, Lambda

# import by me
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.models import load_model
from keras.layers import Input, Embedding, LSTM #, Merge
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.callbacks import Callback

from gensim import models

In [22]:
# Run this cell
class ModelParam(object):
    """
    define the parameters of the model
    """
    def __init__(
        self, 
        input_size1, # sentence length of language1
        input_size2, # sentence length of language2
        vocab_size, 
        sent_size, 
        embedding_dim,
        sent_vector,
    ):
        self.input_size1 = input_size1
        self.input_size2 = input_size2
        self.vocab_size = vocab_size
        self.sent_size = sent_size
        self.embedding_dim = embedding_dim
        self.sent_vector = sent_vector
    
        

In [23]:
# Run this cell
# initialize some variables
w2v_model = KeyedVectors.load_word2vec_format('vectors_embeddings.bin', binary=True)

# Inpute size
w2v_dim = 200

n_units_1st_layer = 64

# Training epoch number
n_epoch = 100

# Model Optimization parameters
batch_size = 64
gradient_clipping_norm = 1.25

# File name (or the intact file path) which indicates the model you want to save.
saved_model = "embeddings_saved_model.hdf5"

# Whether use early stopping
# If you turn off early stopping the auc values after each epoch will not be computed.
early_stopping_or_not = True

# Control parameters of early stopping
min_delta_value = 1e-3
patience_steps_num =50

In [24]:
# Run this cell

TRAIN_CSV = 'train1.csv'
TEST_CSV = 'test.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

def text_to_word_list(text):
    
    text = str(text)
    text = text.upper()
    text = text.split()

    return text

In [25]:
# Run this cell
# This is the model for generating predictions
# Takes parameters from ModelParam class
class crossLingualModel(nn.Module):
    def __init__(self, model_param: ModelParam):
        super().__init__()
        #Ifty
        self.embedding = embeddings
        self.sent_vector = model_param.sent_vector
        
        self.sent = torch.randn(
            model_param.sent_size, 
            requires_grad=True, 
            dtype=torch.float
        )
        
        self.linear = nn.Linear(
            model_param.embedding_dim + model_param.sent_size,
            model_param.vocab_size
        )
        
        self.softmax = nn.LogSoftmax(dim=1)
        
    # x1: embeddings of context words in lang1
    # x2: embeddings of context words in lang2
    def forward(self, x1, x2):
        x1 = torch.FloatTensor(x1)
        x2= torch.FloatTensor(x2)
        embedding_output1 = x1 
        embedding_output1 = F.relu(embedding_output1)
        embedding_output2 = x2 
        embedding_output2 = F.relu(embedding_output2)
        sum_embedding1 = torch.Tensor([x.sum() for x in embedding_output1.transpose(0, -1)])
        sum_embedding2 = torch.Tensor([x.sum() for x in embedding_output2.transpose(0, -1)])
        sent = self.sent
        
        concat1 = torch.cat((sum_embedding1, torch.Tensor(self.sent_vector)), 0)
        concat2 = torch.cat((sum_embedding2, torch.Tensor(self.sent_vector)), 0)

        linear_output1 = self.linear(concat1)
        linear_output2 = self.linear(concat2)

        pred1 = self.softmax(linear_output1.reshape(1,-1))
        pred2 = self.softmax(linear_output2.reshape(1,-1))

        return [pred1, pred2]


In [26]:
# Run this cell
# For calculating word vector
def get_mean_vector(words):
    # remove out-of-vocabulary words #w2v_model.wv
    words = [word for word in words if word in w2v_model.wv]
    if len(words) >= 1:
        return np.mean(w2v_model.wv[words], axis=0)
    else:
        return []


In [27]:
# Run this cell
# For getting center word and surrounding words according to sliding window with window size C
def get_windows(words_lang1, words_lang2, C):
    i = C
    #get word_list with min length
    lang1_len = len(words_lang1)
    lang2_len = len(words_lang2)
    min_len = min(lang1_len, lang2_len)
    
    while i < min_len - C:
        center_word_lang1 = words_lang1[i]
        center_word_lang2 = words_lang2[i]
        context_words_lang1 = words_lang1[(i - C):i] + words_lang1[(i+1):(i+C+1)]
        context_words_lang2 = words_lang2[(i - C):i] + words_lang2[(i+1):(i+C+1)]
        yield context_words_lang1, context_words_lang2, center_word_lang1, center_word_lang2 
        i += 1


In [28]:
# Run this cell
# This is the main cell for preparing center words and cotext words and iterates over entire dataset
epochs = 3
window_size = 2
TRAIN_CSV = 'train1.csv'
train_df = pd.read_csv(TRAIN_CSV)

# 80% of total data in dataset is used for training and rest 20% will be used for testing
from sklearn.model_selection import train_test_split
train_X, test_X = train_test_split (train_df, test_size=0.2 )

questions_cols = ['lang1', 'lang2']
dataset = train_X

for epoch in range(epochs):
    epoch_losses = list()
    for index, row in dataset.iterrows():
        #calculate paragraph vector for entire row
        sent_vector = get_mean_vector(text_to_word_list(row))
        #print(sent_vector)

        #keeping vocab size same as embedding dim
        model_param = ModelParam(101, 101, 200, 200, 200, sent_vector)
        model = crossLingualModel(model_param)
        optimizer = optim.Adam(model.parameters(), lr=0.0001)
     
        for context_words_lang1, context_words_lang2, center_word_lang1, center_word_lang2 in get_windows(text_to_word_list(row['lang1']), text_to_word_list(row['lang2']), window_size):
            context_words_lang1_array = []
            context_words_lang2_array = []
            for word_lang1 in context_words_lang1:
                lang1_array = np.array(w2v_model.wv[word_lang1])
                context_words_lang1_array.append(lang1_array)
           
            for word_lang2 in context_words_lang2:
                lang2_array = np.array(w2v_model.wv[word_lang2])
                context_words_lang2_array.append(lang2_array)

            prediction1, prediction2 = model(np.array(context_words_lang1_array), np.array(context_words_lang2_array))
           
            #embedding for target word in lang1
            target1 = w2v_model.wv[center_word_lang1]
            target1 = torch.from_numpy(target1)
            target1 = torch.autograd.Variable(target1)
            target1 = target1.reshape(1,-1)
            
            loss_lang1 = nn.BCEWithLogitsLoss()(prediction1, target1)
           
            #embedding for target word in lang2
            target2 = w2v_model.wv[center_word_lang2]
            target2 = torch.from_numpy(target2)
            target2 = torch.autograd.Variable(target2)
            target2 = target2.reshape(1,-1)
            
            loss_lang2 = nn.BCEWithLogitsLoss()(prediction2, target2)

            summed_loss = loss_lang1 + loss_lang2 
            epoch_losses.append(summed_loss.item())
            model.zero_grad()
            summed_loss.backward()
            optimizer.step()
            
    print('epoch_losses')
    print(epoch_losses)
        
    
  

  after removing the cwd from sys.path.
  


epoch_losses
[0.21432772278785706, -0.017687208950519562, 0.18492980301380157, -0.1894727200269699, 0.1078895851969719, -0.09708941727876663, 0.011865071952342987, 0.41190579533576965, -0.1858416646718979, -0.3982364237308502, 0.24906781315803528, 0.8052610158920288, 0.33057349920272827, 0.1024564802646637, 0.1958761066198349, 0.41239234805107117, -0.12415763735771179, 0.07016605138778687, -0.013905540108680725, -0.140476793050766, 0.05203809589147568, -0.10878349095582962, -0.24183279275894165, 0.4411723017692566, -0.5538578033447266, -0.19453489780426025, 0.03746756166219711, -0.28095120191574097]
epoch_losses
[0.21443015336990356, 0.08213548362255096, 0.16635334491729736, -0.12901253998279572, 0.15858696401119232, -0.07123620063066483, 0.0021004769951105118, 0.4442633390426636, -0.12054423987865448, -0.42788925766944885, 0.2476448267698288, 0.811376690864563, 0.29557451605796814, 0.14119456708431244, 0.20867133140563965, 0.41332074999809265, -0.15096382796764374, 0.14994597434997559

## Model ENDS HERE