In [14]:
"""Implementation of GraphNLI model."""

import math
from datetime import datetime
import pandas as pd
import numpy as np
import sys
from sentence_transformers import models
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.evaluation import LabelAccuracyEvaluator
from torch.utils.data import DataLoader
from SoftmaxLoss import *


model_name = 'distilroberta-base'

train_batch_size = 16
graph_walk_len = 5
random_walk_len = 4
num_epochs = 4

train_samples = []
test_samples = []

model_save_path = 'output/training_nli_' + model_name.replace("/", "-")

# Using RoBERTa model for mapping tokens to embeddings.
word_embedding_model = models.Transformer(model_name)

# Applying mean pooling to get one fixed sized sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


# trainset = pd.read_csv('../train_graph_set_walk.csv')
trainset = pd.read_csv('model_training/training_data/IBM30K_with_n_ACM.csv')
trainset = trainset.fillna('')

# Split train and test data
from sklearn.model_selection import train_test_split
trainset, testset = train_test_split(trainset, test_size=0.2, random_state=42)
trainset.reset_index(inplace=True, drop=True)
testset.reset_index(inplace=True, drop=True)

for i in range(len(devset)):
    texts = []
    for j in range(1, random_walk_len+1):
        texts.append(devset.iloc[i]['sent' + str(j)])
    test_samples.append(InputExample(texts=texts, label=int(devset.iloc[i]['label'])))

print(train_samples)


train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=2)

dev_dataloader = DataLoader(test_samples, shuffle=True, batch_size=train_batch_size)

dev_evaluator = LabelAccuracyEvaluator(dev_dataloader, name='sts-dev', softmax_model=train_loss)


warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path)


# Load the stored model and evaluate its performance on the test set.
test_dataloader = DataLoader(test_samples, shuffle=True, batch_size=train_batch_size)

# model = SentenceTransformer(model_save_path)

test_evaluator = LabelAccuracyEvaluator(test_dataloader, name='sts-test', softmax_model=train_loss)
test_evaluator(model, output_path=model_save_path)

       Unnamed: 0                                          argument1  \
0            8403  i think that we shouldn't abolish intellectual...   
1           17439  School prayer goes against the separation of t...   
2            3082  cash advances give certain employees an unfair...   
3            8832  If people were not allowed to practice their o...   
4           28303  we should stop urbanization because of the pol...   
...           ...                                                ...   
25507       29802  women are just as capable of being in combat w...   
25508        5390  everybody in this country deserves to be treat...   
25509         860  affirmative action punishes qualified candidat...   
25510       15795  polygamy could lead to unequal relationships a...   
25511       23654  they have taken god out of schools anyway, so ...   

                                            argument2 relationship  
0      We should abolish intellectual property rights            a

Epoch:   0%|                                              | 0/4 [00:00<?, ?it/s]
Iteration:   0%|                                       | 0/1595 [00:00<?, ?it/s][A
Epoch:   0%|                                              | 0/4 [00:00<?, ?it/s]


KeyError: 4313

In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(trainset, test_size=0.2, random_state=42)

In [5]:
train

Unnamed: 0.1,Unnamed: 0,argument1,argument2,relationship
8403,8403,i think that we shouldn't abolish intellectual...,We should abolish intellectual property rights,a
17439,17439,School prayer goes against the separation of t...,We should prohibit school prayer,s
3082,3082,cash advances give certain employees an unfair...,Payday loans should be banned,s
8832,8832,If people were not allowed to practice their o...,We should adopt atheism,a
28303,28303,we should stop urbanization because of the pol...,We should fight urbanization,s
...,...,...,...,...
29802,29802,women are just as capable of being in combat w...,We should prohibit women in combat,a
5390,5390,everybody in this country deserves to be treat...,We should adopt gender-neutral language,s
860,860,affirmative action punishes qualified candidat...,We should end affirmative action,s
15795,15795,polygamy could lead to unequal relationships a...,We should legalize polygamy,a
