In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import sys
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [2]:
# From utils.py
def read_data(f):
	with open(f) as inp:
		lines = inp.readlines()
	data = []
	for line in lines:
		line = line.strip().split()
		sentence = []
		for token in line:
			token = token.split('|')
			word = token[0]
			tag = token[1]
			sentence.append((word,tag))
		data.append(sentence)
	return data

In [3]:
# from utils.py
def convert_data_for_training(data):
	#for d in data:
	#	tokens = [t[0] for t in d]
	#	tags = [t[1] for t in d]
	return [([t[0] for t in d],[t[1] for t in d]) for d in data]

In [4]:
# from pytorch_tagging.py
TRAINING_FILE = "./irish.train"
training_data = convert_data_for_training(read_data(TRAINING_FILE))

In [5]:
# from pytorch_tagging.py 
def words_tags_indes(data):
	word_to_ix = {'PAD': 0 ,'UNK':1}
	ix_to_word = {0:'PAD',1:'UNK'}
	tag_to_ix = {'PAD':0}
	ix_to_tag = {0:'PAD'}
	for sent, tags in data:
		for word in sent:
			if word not in word_to_ix:
				word_to_ix[word] = len(word_to_ix)
				ix_to_word[word_to_ix[word]] = word
		for tag in tags:
			if tag not in tag_to_ix:
				tag_to_ix[tag] = len(tag_to_ix)
				ix_to_tag[tag_to_ix[tag]] = tag
	return word_to_ix,ix_to_word,ix_to_tag, tag_to_ix

In [6]:
word_to_ix,ix_to_word,ix_to_tag, tag_to_ix = words_tags_indes(training_data)

In [7]:
#pytorch_tagging.py
torch.manual_seed(42)
#Converts a sequence of words to a tensor of numerical values. 
def prepare_sequence(seq, to_ix):
	idxs = []
	for word in seq:
		if word in to_ix:
			idxs.append(to_ix[word])
		else:
			 idxs.append(to_ix['UNK'])
	return torch.tensor(idxs, dtype=torch.long)

In [8]:
# #pytorch_tagging.py
# # See what the scores are before training
# # Note that element i,j of the output is the score for tag j for word i.
# # Here we don't need to train, so the code is wrapped in torch.no_grad()
# with torch.no_grad():
#   #changes here
#   for index in range(len(training_data)):
#     #changes here
# 	  inputs = prepare_sequence(training_data[index][0], word_to_ix)
# 	  tag_scores = model(inputs)
# 	  print(tag_scores)
#    #changes here
# 	  for i,word in enumerate(training_data[index][0]):
# 		  j = int(np.argmax(tag_scores[i]))
# 		  print(f"\t{word}|{ix_to_tag[j]}")

In [9]:
def pad_collate(training_data):
	sentence_array = []
	tag_array = []
	for sentence, tags in training_data:
		sentence_in = prepare_sequence(sentence, word_to_ix)
		targets = prepare_sequence(tags, tag_to_ix)
		sentence_array.append(sentence_in)
		tag_array.append(targets)
	sentence_pad = pad_sequence(sentence_array, batch_first= True, padding_value= 0.0)
	tag_pad = pad_sequence(tag_array, batch_first= True, padding_value= 0.0)
	return sentence_pad, tag_pad 


In [10]:
train_data_loader = DataLoader(dataset=training_data, batch_size=32, shuffle=True, collate_fn=pad_collate)

In [11]:
#pytorch_tagging.py
class LSTMTagger(nn.Module):
	# Class that defines our model
	def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, batch_size):
		super(LSTMTagger, self).__init__()
		self.hidden_dim = hidden_dim

		self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

		# The LSTM takes word embeddings as inputs, and outputs hidden states
		# with dimensionality hidden_dim.
		self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first= True)

		# The linear layer that maps from hidden state space to tag space
		self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

	# This is the forward computation, which constructs the computation graph
	def forward(self, sentence):
		# Get the embeddings
		embeds = self.word_embeddings(sentence)
		# put them through the LSTM and get its output
		# lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
		# # pass that output through the linnear layer
		# tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
		# # convert the logits to a log probability distribution
		# tag_scores = F.log_softmax(tag_space, dim=1)
		# return tag_scores
		batch_size = sentence.size(0)
		sentence = sentence.long()
		embeds = self.word_embeddings(sentence)
		lstm_out, _ = self.lstm(embeds)
		lstm_out = lstm_out.contiguous().reshape(-1, self.hidden_dim)
		tag_space = self.hidden2tag(lstm_out)
		tag_scores = F.log_softmax(tag_space, dim=1)
		return tag_scores

In [12]:
#pytorch_tagging.py
# Hyperparameters
EMBEDDING_DIM = 32
HIDDEN_DIM = 32
BATCH_SIZE = 32
# DROPOUT = ?
# LAYERS = ?

In [13]:
#pytorch_taggin.py
# Initialize the model
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix),BATCH_SIZE)
# Loss function to use
loss_function = nn.NLLLoss()
# Optimizer to use during training
optimizer = optim.SGD(model.parameters(), lr=0.1)


In [14]:
#pytorch_taggin.py
# Training loop
def train_model(model,n_epochs, patience, training_data):
	for epoch in range(n_epochs):  # normally you would NOT do 100 epochs, it is toy data
		print(f"Starting epoch {epoch}...")
		training_losses = []
		for sentence, tags in training_data:
			# Step 1. Remember that Pytorch accumulates gradients.
			# We need to clear them out before each instance
			model.zero_grad()

			# Step 2. Get our inputs ready for the network, that is, turn them into
			# Tensors of word indices.
			# Eventually I suggest you use the DataLoader modules
			# The batching can take place here
			#sentence_in = prepare_sequence(sentence, word_to_ix)
			#targets = prepare_sequence(tags, tag_to_ix)
			# Step 3. Run our forward pass.
			tag_scores = model(sentence)
			# Step 4. Compute the loss, gradients, and update the parameters by
			#  calling optimizer.step()
			loss = loss_function(tag_scores, tags.flatten())
			training_losses.append(loss.item())
			loss.backward()
			optimizer.step()

In [15]:
 train_model(model,20, 8, train_data_loader)

Starting epoch 0...
Starting epoch 1...
Starting epoch 2...
Starting epoch 3...
Starting epoch 4...
Starting epoch 5...
Starting epoch 6...
Starting epoch 7...
Starting epoch 8...
Starting epoch 9...
Starting epoch 10...
Starting epoch 11...
Starting epoch 12...
Starting epoch 13...
Starting epoch 14...
Starting epoch 15...
Starting epoch 16...
Starting epoch 17...
Starting epoch 18...
Starting epoch 19...


In [16]:
print("Saving model here")
path = "./model_save_Batch.pth"
model_state = {'state_dict' : model.state_dict(),
						'optimizer' : optimizer.state_dict(),
						}
torch.save(model_state, path)

Saving model here


In [17]:
model_checkpoint = torch.load(path)
model.load_state_dict(model_checkpoint['state_dict'])
optimizer.load_state_dict(model_checkpoint['optimizer'])

In [18]:
TEST_FILE = "./irish.test"
test_data = convert_data_for_training(read_data(TEST_FILE))

with torch.no_grad():
	# this will be the file to write the outputs
	with open("mymodel_output_irish.txt", 'w') as op:
		for instance in test_data:
			# Convert the test sentence into a word ID tensor
			inputs = torch.LongTensor(prepare_sequence(instance[0], word_to_ix))
			#inputs = prepare_sequence(instance[0], word_to_ix)
			# Forward pass
			tag_scores = model(inputs.reshape(-1,inputs.shape[0]))
			# Find the tag with the highest probability in each position
			outputs = [int(np.argmax(ts)) for ts in tag_scores]
			# Prepare the output to be written in the same format as the test file (word|tag)
			formatted_output = ' '.join([f"{word}|{ix_to_tag[tag_id]}" for word,tag_id in zip(instance[0],outputs)])
			# Write the output
			op.write(formatted_output + '\n')

In [19]:
#compute_accuracy
def acc_read_data(f):
	with open(f) as inp:
		lines = inp.readlines()
	data = []
	for line in lines:
		line = line.strip().split()
		sentence = []
		for token in line:
			token = token.split('|')
			word = token[0]
			tag = token[1]
			sentence.append((word,tag))
		data.append(sentence)
	return data

In [20]:
def compute_accuracy(output, gold):
	try:
		assert(len(output) == len(gold))
	except:
		print("Different number of lines in the two files!")
		return -1

	count_correct = 0
	count_total_tokens = 0
	for o_sent,g_sent in zip(output,gold):
		try:
			assert(len(o_sent)==len(g_sent))
		except:
			print("Different number of tokens in the two lines!")
			return -1
		check = [o_token[1] == g_token[1] for o_token,g_token in zip(o_sent,g_sent)]
		count_correct += sum(check)
		count_total_tokens += len(check)
	return count_correct/count_total_tokens

In [21]:
output = acc_read_data("./mymodel_output_irish.txt")
gold = acc_read_data("./irish.test")
acc = compute_accuracy(output,gold)

In [22]:
acc

0.42922148580472846