# Poincaré Embeddings for OOPs Dataset

## Mount Google Drive

In [25]:
# Mount Google Drive at /content/drive so the dataset files referenced by the
# absolute paths in later cells are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Make the dataset folder the working directory: later cells write "edges.tsv"
# and 'test_vectors' using relative paths, which resolve against this folder.
%cd /content/drive/MyDrive/Engineering/Curriculum/8th Semester/Internship/descriptive_evaluation_project/Hewlett Dataset

/content/drive/.shortcut-targets-by-id/17Gn89Edqfyxljr8tO09VdcQWGcUCa_Ua/descriptive_evaluation_project/Hewlett Dataset


## Generate Embeddings using Poincaré

### Import Packages and set variables

In [27]:
import pandas as pd
import numpy as np
import itertools
from sklearn.cluster import KMeans
import pprint
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
# Absolute path of the CSV holding the essays; the loading cell keeps only its
# 'EssayText' column.
DATASET_CSV = '/content/drive/MyDrive/Engineering/Curriculum/8th Semester/Internship/descriptive_evaluation_project/Hewlett Dataset/Hewlett Dataset.csv'

## Build word-pair edges and train the embeddings

In [29]:
# Load the essays and keep only the text column of the first 1000 rows.
# .copy() turns the subset into an independent frame, so the column assignments
# performed in later cells operate on real data rather than on a slice of the
# original frame (which is what triggers pandas' SettingWithCopyWarning).
oop_dataset = pd.read_csv(DATASET_CSV)[['EssayText']].iloc[:1000].copy()
oop_dataset

Unnamed: 0,EssayText
0,Some additional information that we would need...
1,"After reading the expirement, I realized that ..."
2,"What you need is more trials, a control set up..."
3,The student should list what rock is better an...
4,For the students to be able to make a replicat...
...,...
995,"In order to replicate this experiment, you nee..."
996,"How much liasis was used in each container,rin..."
997,One piece of additional information would be t...
998,They need to specify what the four different s...


In [30]:
# Import required libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

# Prerequisites for cleaning
nltk.download("stopwords")                      # Download stopwords from NLTK library
nltk.download('wordnet')                        # Download wordnet, a lexical database from NLTK library
# NOTE: the next line rebinds the name `stopwords` from the imported NLTK corpus
# object to a plain set of English stopwords. Every later use of `stopwords`
# in this notebook refers to that set, not to the corpus reader.
stopwords = set(stopwords.words('english'))     # Store stopwords
lemmatizer = WordNetLemmatizer()                # Create object for lemmatization

# Function for standard cleaning of text (remove punctuations, abbreviations, etc.) using regular expressions
def standard_clean(text):
  """Normalise raw text with an ordered series of regex substitutions.

  The input is converted to ``str`` and lower-cased, then every
  (pattern, replacement) rule below is applied in sequence with ``re.sub``.
  The order matters: e.g. "what's" must be expanded before the generic
  "'s" rule runs, and "can't" before the generic "n't" rule.

  Args:
    text: Any object; it is stringified with ``str`` first.

  Returns:
    The cleaned string. Whitespace is collapsed before the final hyphen
    rule, so the result can still contain runs of spaces and a trailing space.
  """
  rules = (
    # Inside the character class, "+-=" is a range ('+' .. '='), so the
    # characters ':', ';' and '<' are kept along with the digits.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    # Contractions (specific forms first, generic suffixes afterwards).
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    # Punctuation: dropped or padded with spaces so it becomes its own token.
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "<digits>k" -> "<digits>000".
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    # Abbreviations that were split apart by the punctuation rules above.
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # In Python's regex syntax "\0" is the NUL character, so this rule
    # targets NUL followed by "s".
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    # Collapse whitespace runs, then blank out any remaining hyphens.
    (r"\s{2,}", " "),
    (r"-", " "),
  )

  cleaned = str(text).lower()
  for pattern, replacement in rules:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned

# Function to remove stopwords from a sentence
def remove_stopwords(text):
  """Return `text` with every whitespace-separated word found in `stopwords` removed.

  The surviving words are re-joined with single spaces; an input with no
  surviving words yields the empty string.
  """
  kept_words = [word for word in text.split() if word not in stopwords]
  return " ".join(kept_words)

# Function to lemmatize words of a sentence using Lemmatizer object
def lemmatize(text):
  """Lemmatize each whitespace-separated word of `text` with the shared `lemmatizer`.

  The lemmatized words are re-joined with single spaces.
  """
  return " ".join(lemmatizer.lemmatize(word) for word in text.split())

# Function to clean the text
def clean(text):
  """Run the full cleaning pipeline on `text`.

  Applies, in order: regex normalisation (`standard_clean`), stopword
  removal (`remove_stopwords`) and lemmatization (`lemmatize`).
  """
  return lemmatize(remove_stopwords(standard_clean(text)))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
# Clean every essay, tokenize it, and store the tokens as one comma-joined
# string per essay.
essay_tokens = oop_dataset['EssayText'].map(clean).apply(word_tokenize)
oop_dataset['EssayText'] = essay_tokens.apply(
    lambda tokens: ','.join(str(token) for token in tokens))
oop_dataset.head()

Unnamed: 0,EssayText
0,"additional,information,would,need,replicate,ex..."
1,"reading,expirement,realized,additional,informa..."
2,"need,trial,control,set,exact,amount,vinegar,po..."
3,"student,list,rock,better,rock,worse,procedure"
4,"student,able,make,replicate,would,need,tell,us..."


In [32]:
tokenizer = {}

# Turn each comma-joined token string back into a list of token strings.
oop_dataset['EssayText'] = oop_dataset['EssayText'].apply(
    lambda joined_tokens: joined_tokens.split(','))

oop_dataset.head()

Unnamed: 0,EssayText
0,"[additional, information, would, need, replica..."
1,"[reading, expirement, realized, additional, in..."
2,"[need, trial, control, set, exact, amount, vin..."
3,"[student, list, rock, better, rock, worse, pro..."
4,"[student, able, make, replicate, would, need, ..."


In [33]:
# For every essay, enumerate all 2-combinations of its tokens (order preserved,
# repeated tokens included), drop essays that produce no pair, then stack all
# pairs into a two-column frame.
pair_lists = oop_dataset['EssayText'].apply(
    lambda tokens: list(itertools.combinations(tokens, 2)))
non_empty_pair_lists = pair_lists[pair_lists.apply(len) > 0]
pairs_df = pd.DataFrame(
    np.concatenate(non_empty_pair_lists.values),
    columns=['named_entity_1', 'named_entity_2'],
)
pairs_df.head(10)

Unnamed: 0,named_entity_1,named_entity_2
0,additional,information
1,additional,would
2,additional,need
3,additional,replicate
4,additional,experiment
5,additional,much
6,additional,vinegar
7,additional,placed
8,additional,identical
9,additional,container


In [34]:
# Write the word pairs as a header-less TSV. The file is consumed by gensim's
# PoincareRelations, which treats every line as a relation, so a header row
# would be ingested as a bogus edge "named_entity_1 -> named_entity_2".
pairs_df[['named_entity_1', 'named_entity_2']].to_csv(
    "edges.tsv", sep="\t", index=False, header=False)

In [35]:
from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.test.utils import datapath

# Path of the edge list written by the previous cell. gensim's datapath() is a
# helper for locating gensim's own bundled test data, so the path is passed to
# PoincareRelations directly instead of being routed through it.
file_path = '/content/drive/MyDrive/Engineering/Curriculum/8th Semester/Internship/descriptive_evaluation_project/Hewlett Dataset/edges.tsv'

# Stream the (word, word) relations from the TSV and train the Poincaré
# embeddings, drawing 2 negative samples per positive relation.
model = PoincareModel(PoincareRelations(file_path), negative=2)
model.train(epochs=25)

In [36]:
# Export the trained vectors to 'test_vectors' (relative to the working
# directory) in word2vec format. No `binary` argument is given, and the next
# cell parses the file as space-separated text.
model.kv.save_word2vec_format('test_vectors')

In [37]:
# Read the exported vectors back as a table: skip the word2vec header line,
# use the first column (the word) as the index and label it 'named_entity'.
emb_df = (
    pd.read_csv('test_vectors', sep=' ', skiprows=[0], header=None)
    .set_index(0)
    .rename_axis('named_entity')
)
emb_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,41,42,43,44,45,46,47,48,49,50
named_entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
would,-0.003274,-0.001131,0.002267,-0.006409,0.003045,-0.005283,0.005535,-0.002963,0.006273,-0.00852,...,0.001686,-0.002151,-0.000351,0.007821,0.00206,-0.003153,-0.003675,-0.002758,-0.00525,0.00279
need,0.001227,-0.005345,-0.000812,0.004171,0.007398,0.004175,0.00348,0.001964,-0.00073,0.007614,...,0.004464,0.002582,0.005171,-0.002372,-0.00429,0.000231,0.00218,0.006067,0.007143,-0.000791
sample,-0.002747,-0.002093,-0.002056,-0.00051,-0.005816,-0.005331,0.004405,-0.000592,0.00253,-0.001136,...,-0.001392,-0.003179,0.006303,0.001199,0.001468,0.004659,-0.001303,-0.001315,0.002756,0.000172
experiment,-0.006704,0.009086,-0.003726,-0.015376,-0.008842,-0.013178,3e-06,-0.010008,0.002492,-0.007909,...,-0.007558,-0.010468,-0.012677,-0.001435,0.001295,-0.008152,-0.002018,-0.00781,-0.012972,0.007889
know,-0.0016,-0.002294,-0.005362,-0.006881,-0.010403,-0.001012,0.003952,-0.004575,0.008281,-0.002963,...,0.004755,-0.006209,-0.00505,-0.003113,-0.007852,-0.005003,-0.001292,-0.000733,-0.00191,0.008214


## Manhattan LSTM

### Import Packages

In [38]:
from time import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import re
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
import datetime

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Lambda
import keras.backend as K
# from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [49]:
# Input files for the Manhattan LSTM stage.
# NOTE(review): TEST_CSV points at the same file as TRAIN_CSV (train_main.csv),
# so the "test" frame loaded below is identical to the training frame —
# confirm whether a separate test file was intended.
TRAIN_CSV = '/content/drive/MyDrive/Engineering/Curriculum/8th Semester/Internship/descriptive_evaluation_project/Hewlett Dataset/train_main.csv'
TEST_CSV = '/content/drive/MyDrive/Engineering/Curriculum/8th Semester/Internship/descriptive_evaluation_project/Hewlett Dataset/train_main.csv'
# Word vectors exported earlier with save_word2vec_format('test_vectors').
EMBEDDING_FILE = '/content/drive/MyDrive/Engineering/Curriculum/8th Semester/Internship/descriptive_evaluation_project/Hewlett Dataset/test_vectors'

In [50]:
def text_to_word_list(text):
  """Clean `text` with `standard_clean` and split it into a list of words.

  The cleaned string returned by `standard_clean` is what gets split.
  Because `standard_clean` stringifies its argument, non-string cells
  (e.g. a missing value read from the CSV) no longer crash on `.split()`.

  Args:
    text: The raw question text (any object; it is stringified).

  Returns:
    list[str]: The whitespace-separated words of the cleaned text.
  """
  return standard_clean(text).split()

In [51]:
# Prepare embedding
vocabulary = dict()             # word -> integer index
inverse_vocabulary = ['<unk>']  # '<unk>' will never be used, it is only a placeholder for the [0, 0, ....0] embedding

# EMBEDDING_FILE was written by save_word2vec_format() without binary=True and
# is parsed as space-separated text elsewhere in this notebook, so it must be
# loaded in text mode; binary=True would misread the text bytes as raw floats.
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=False)

questions_cols = ['question1', 'question2']

In [52]:
train_df = pd.read_csv(TRAIN_CSV)
train_df

Unnamed: 0,question1,question2,is_duplicate
0,First you would need to know what the samples ...,To replicate the experiments you will need all...,0
1,"You would need the starting mass, ending mass,...",1. Have a determined size for the starting sam...,0
2,To replicate these students experiment you wou...,After reading the groups procedure the additio...,0
3,To replicate the experiment you would need to ...,You will need the type of vinegar and know the...,0
4,The additonal information youwould need to kno...,"You need to add to number 4, it should say how...",0
...,...,...,...
1048570,The additional information I would need to kno...,1.)How much vinegar should go into each contai...,0
1048571,"On statement 2, They need to put the measureme...","You would need to know what materials to use, ...",0
1048572,You would need to include how much vinegar to ...,First they should of put the amount of vinegar...,0
1048573,Additional information that the group would ne...,This procedure is lacking important informatio...,0


In [53]:
test_df = pd.read_csv(TEST_CSV)
test_df

Unnamed: 0,question1,question2,is_duplicate
0,First you would need to know what the samples ...,To replicate the experiments you will need all...,0
1,"You would need the starting mass, ending mass,...",1. Have a determined size for the starting sam...,0
2,To replicate these students experiment you wou...,After reading the groups procedure the additio...,0
3,To replicate the experiment you would need to ...,You will need the type of vinegar and know the...,0
4,The additonal information youwould need to kno...,"You need to add to number 4, it should say how...",0
...,...,...,...
1048570,The additional information I would need to kno...,1.)How much vinegar should go into each contai...,0
1048571,"On statement 2, They need to put the measureme...","You would need to know what materials to use, ...",0
1048572,You would need to include how much vinegar to ...,First they should of put the amount of vinegar...,0
1048573,Additional information that the group would ne...,This procedure is lacking important informatio...,0


In [54]:


def _question_to_indices(text):
    """Encode one question as a list of vocabulary indices.

    Words are taken from `text_to_word_list`. A stopword that has no
    pretrained vector is skipped. Any other unseen word is appended to
    `inverse_vocabulary` and registered in `vocabulary` under its position
    there, so indices start at 1 (index 0 is the '<unk>' placeholder).
    """
    q2n = []  # q2n -> question numbers representation
    for word in text_to_word_list(text):
        # `word in word2vec` is used instead of `word2vec.vocab`: the `.vocab`
        # attribute no longer exists in gensim 4, while membership testing on
        # KeyedVectors works in both gensim 3.x and 4.x.
        if word in stopwords and word not in word2vec:
            continue

        if word not in vocabulary:
            vocabulary[word] = len(inverse_vocabulary)
            inverse_vocabulary.append(word)
        q2n.append(vocabulary[word])
    return q2n


# Replace the text of both question columns, in the training and the test
# frame, by its number representation. Rows are visited in order and
# question1 is encoded before question2, which fixes the index each word gets.
for dataset in [train_df, test_df]:
    for index, row in dataset.iterrows():
        for question in questions_cols:
            dataset.at[index, question] = _question_to_indices(row[question])
            


In [55]:
embedding_dim = 50

# Embedding matrix with one row per vocabulary index plus row 0 for padding.
# Rows start as standard-normal noise, so words without a pretrained vector
# keep a random embedding.
embeddings = np.random.randn(len(vocabulary) + 1, embedding_dim)
embeddings[0] = 0  # So that the padding will be ignored

# Overwrite the rows of words that have a pretrained vector.
# Membership and lookup go through the KeyedVectors object itself
# (`in` / `[]`) rather than `.vocab` / `.word_vec`, because `.vocab` was
# removed in gensim 4 while these operators work in both 3.x and 4.x.
for word, index in vocabulary.items():
    if word in word2vec:
        embeddings[index] = word2vec[word]

# Free the vectors; re-run the embedding loading cell before re-running this one.
del word2vec

In [56]:
# Length of the longest encoded question over both columns of both frames;
# every sequence is padded to this length.
max_seq_length = max(
    frame[column].map(len).max()
    for frame in (train_df, test_df)
    for column in questions_cols
)

# Split to train validation

validation_size = 40000
training_size = len(train_df) - validation_size

# Fixed seed so the train/validation split is the same on every run.
RANDOM_SEED = 42

X = train_df[questions_cols]
Y = train_df['is_duplicate']

X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=RANDOM_SEED)

# Split to dicts
X_train = {'left': X_train.question1, 'right': X_train.question2}
X_validation = {'left': X_validation.question1, 'right': X_validation.question2}
X_test = {'left': test_df.question1, 'right': test_df.question2}

# Convert labels to their numpy representations
Y_train = Y_train.values
Y_validation = Y_validation.values

# Zero padding (only the train and validation inputs are padded here;
# X_test keeps its unpadded sequences)
for dataset, side in itertools.product([X_train, X_validation], ['left', 'right']):
    dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

# Make sure everything is ok
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

In [None]:
# Model variables
from tensorflow.keras.optimizers import Adadelta
n_hidden = 50                   # Number of units of the shared LSTM
gradient_clipping_norm = 1.25   # Passed to the optimizer as `clipnorm`
batch_size = 64                 # Batch size used by fit()
n_epoch = 25                    # Number of training epochs

def exponent_neg_manhattan_distance(left, right):
    ''' Similarity estimate of the two LSTM outputs.

    Computes exp(-sum(|left - right|)), where the sum runs over axis 1 and
    that axis is kept with size 1 (keepdims=True).
    '''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

# The visible layer: one integer sequence of length max_seq_length per side
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

# Single embedding layer initialised from the `embeddings` matrix and frozen
# (trainable=False); it is applied to both inputs
embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)

# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model
# (the Lambda layer receives [left_output, right_output] and declares an
# output shape of (batch, 1))
malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model
malstm = Model([left_input, right_input], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
# NOTE(review): no learning rate is passed, so the optimizer's default is used;
# the tf.keras Adadelta default is presumably much smaller than the 1.0 of the
# original Adadelta formulation — confirm this is intended.
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

# Mean squared error between the predicted similarity and the is_duplicate label
malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

# Start training
training_start_time = time()

# Fit on the padded left/right training sequences and evaluate on the
# validation split after every epoch; the returned History object is kept in
# `malstm_trained` for the reporting cells below.
malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, epochs=n_epoch,
                            validation_data=([X_validation['left'], X_validation['right']], Y_validation))

# Report the wall-clock training duration
print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25

In [None]:
# A notebook cell only displays its last expression, so two bare expressions
# would show the validation accuracy alone. Put both per-epoch series side by
# side in one table instead.
pd.DataFrame({
    'accuracy': malstm_trained.history['accuracy'],
    'val_accuracy': malstm_trained.history['val_accuracy'],
})

In [None]:
# One figure per metric: training curve first, validation curve second.
# Each tuple is (train key, validation key, title, y-axis label, legend position).
history_plots = (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy', 'upper left'),   # Plot accuracy
    ('loss', 'val_loss', 'Model Loss', 'Loss', 'upper right'),                  # Plot loss
)

for train_key, validation_key, plot_title, y_label, legend_position in history_plots:
    plt.plot(malstm_trained.history[train_key])
    plt.plot(malstm_trained.history[validation_key])
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc=legend_position)
    plt.show()