In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

#Loading Data

In [0]:
#Loading data from given CSV files. (The given CSV files have been uploaded over G-drive)

train_df = pd.read_csv('https://drive.google.com/uc?id=17s-v7RkT7LTojqDDGkVa1dbi6GhukUu6', encoding = "ISO-8859-1")
val_df = pd.read_csv('https://drive.google.com/uc?id=1afH0fbRM8w9N41R0o9WHHvr1nEso8UEb', encoding = "ISO-8859-1")
test_df = pd.read_csv('https://drive.google.com/uc?id=1CRQv7ojJG0wSxaRZiotXo4Dwcb2ChKDH', encoding = "ISO-8859-1")

In [0]:
train_df.head()

Unnamed: 0,Sentence,NER
0,-docstart-,O
1,eu rejects german call to boycott british lamb .,I-ORG O I-MISC O O O I-MISC O O
2,peter blackburn,I-PER I-PER
3,brussels 1996-08-22,I-LOC O
4,the european commission said on thursday it di...,O I-ORG I-ORG O O O O O O I-MISC O O O O O I-M...


In [3]:
#Checking for missing values

train_df.isnull().values.any()

False

#Extracting lists of words and tags from data

In [4]:
train_data=[]
target_y_train=[]

for index, row in train_df.iterrows():
  text = [word for word in row['Sentence'].split()]
  train_data.append(text)
  labels = [label for label in row['NER'].split()]
  target_y_train.append(labels)
print(train_data)
print(target_y_train)

[['-docstart-'], ['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.'], ['peter', 'blackburn'], ['brussels', '1996-08-22'], ['the', 'european', 'commission', 'said', 'on', 'thursday', 'it', 'disagreed', 'with', 'german', 'advice', 'to', 'consumers', 'to', 'shun', 'british', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.'], ['germany', "'s", 'representative', 'to', 'the', 'european', 'union', "'s", 'veterinary', 'committee', 'werner', 'zwingmann', 'said', 'on', 'wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.'], ['"', 'we', 'do', "n't", 'support', 'any', 'such', 'recommendation', 'because', 'we', 'do', "n't", 'see', 'any', 'grounds', 'for', 'it', ',', '"', 'the', 'commission', "'s", 'chief', 'spokesman', 'nikolaus', 'van', 'der', 'pas', 'told', 'a', 'news', 'briefing', '.

In [5]:
validation_data=[]
target_y_validation=[]

for index, row in val_df.iterrows():
  text = [word for word in row['Sentence'].split()]
  validation_data.append(text)
  labels = [label for label in row['NER'].split()]
  target_y_validation.append(labels)
print(validation_data)
print(target_y_validation)

[['-docstart-'], ['cricket', '-', 'leicestershire', 'take', 'over', 'at', 'top', 'after', 'innings', 'victory', '.'], ['london', '1996-08-30'], ['west', 'indian', 'all-rounder', 'phil', 'simmons', 'took', 'four', 'for', '38', 'on', 'friday', 'as', 'leicestershire', 'beat', 'somerset', 'by', 'an', 'innings', 'and', '39', 'runs', 'in', 'two', 'days', 'to', 'take', 'over', 'at', 'the', 'head', 'of', 'the', 'county', 'championship', '.'], ['their', 'stay', 'on', 'top', ',', 'though', ',', 'may', 'be', 'short-lived', 'as', 'title', 'rivals', 'essex', ',', 'derbyshire', 'and', 'surrey', 'all', 'closed', 'in', 'on', 'victory', 'while', 'kent', 'made', 'up', 'for', 'lost', 'time', 'in', 'their', 'rain-affected', 'match', 'against', 'nottinghamshire', '.'], ['after', 'bowling', 'somerset', 'out', 'for', '83', 'on', 'the', 'opening', 'morning', 'at', 'grace', 'road', ',', 'leicestershire', 'extended', 'their', 'first', 'innings', 'by', '94', 'runs', 'before', 'being', 'bowled', 'out', 'for', '29

In [6]:
test_data=[]

for index, row in test_df.iterrows():
  text = [word for word in row['Sentence'].split()]
  test_data.append(text)
print(test_data)



In [0]:
print(test_df.iloc[:10])

                                            Sentence  NER
0                                         -docstart-  NaN
1  soccer - japan get lucky win , china in surpri...  NaN
2                                        nadim ladki  NaN
3           al-ain , united arab emirates 1996-12-06  NaN
4  japan began the defence of their asian cup tit...  NaN
5  but china saw their luck desert them in the se...  NaN
6  china controlled most of the match and saw sev...  NaN
7  oleg shatskiku made sure of the win in injury ...  NaN
8  the former soviet republic was playing in an a...  NaN
9  despite winning the asian games title two year...  NaN


#BiLSTM CRF Pre-processing

##Generate word_to_ix and tag_to_ix

In [0]:
word_to_ix = {}
a=[]
for sentence in train_data+validation_data+test_data:
    for word in sentence:
        a.append(word)
        word = word.lower()
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
word_list = list(word_to_ix.keys())

START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {START_TAG:0, STOP_TAG:1}
for tags in target_y_train+target_y_validation:
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)


##Generate embedding matrix

In [13]:
import gensim.downloader as api
import numpy as np
word_emb_model = api.load("glove-twitter-25") 

EMBEDDING_DIM = 25

embedding_matrix = []
for word in word_list:
    try:
        embedding_matrix.append(word_emb_model.wv[word])
    except:
        embedding_matrix.append([0]*EMBEDDING_DIM)
embedding_matrix = np.array(embedding_matrix)
embedding_matrix.shape


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  # Remove the CWD from sys.path while we load stuff.


(13972, 25)

##Convert dataset to idx

In [0]:
def to_index(data, to_ix):
    input_index_list = []
    for sent in data:
        input_index_list.append([to_ix[w] for w in sent])
    return input_index_list

train_input_index =  to_index(train_data,word_to_ix)
train_output_index = to_index(target_y_train,tag_to_ix)
val_input_index = to_index(validation_data,word_to_ix)
val_output_index = to_index(target_y_validation,tag_to_ix)
test_input_index = to_index(test_data,word_to_ix)

#Model

In [0]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        """Here we use the embedding matrix as the initial weights of nn.Embedding"""
        self.word_embeds.weight.data.copy_(torch.from_numpy(embedding_matrix))
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2 ,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2).to(device),
                torch.randn(2, 1, self.hidden_dim // 2).to(device))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1).to(device)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).to(device), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

##Function for accuracy

In [0]:
import numpy as np
def cal_acc(model, input_index, output_index):
    ground_truth = []
    predicted = []
    for i,idxs in enumerate(input_index):
        ground_truth += output_index[i]
        score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
        predicted += pred
    accuracy = sum(np.array(ground_truth) == np.array(predicted))/len(ground_truth)
    return ground_truth, predicted, accuracy

##Initialize Model

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
HIDDEN_DIM = 50

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, 25, HIDDEN_DIM).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

##Train the model

In [20]:
"""Each epoch will take about 1-2 minutes"""

import datetime

for epoch in range(70):  
    time1 = datetime.datetime.now()
    train_loss = 0

    model.train()
    for i, idxs in enumerate(train_input_index):
        tags_index = train_output_index[i]

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()

        train_loss+=loss.item()

    model.eval()
    _, _, train_acc = cal_acc(model,train_input_index,train_output_index)
    _, _, val_acc = cal_acc(model,val_input_index,val_output_index)

    val_loss = 0
    for i, idxs in enumerate(val_input_index):
        tags_index = val_output_index[i]
        sentence_in = torch.tensor(idxs, dtype=torch.long).to(device)
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = model.neg_log_likelihood(sentence_in, targets)
        val_loss+=loss.item()
    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, train acc: %.4f, val loss: %.2f, val acc: %.4f, time: %.2fs" %(epoch+1, train_loss,train_acc, val_loss, val_acc, (time2-time1).total_seconds()))


Epoch:1, Training loss: 11741.78, train acc: 0.9088, val loss: 2206.82, val acc: 0.8966, time: 114.96s
Epoch:2, Training loss: 7465.51, train acc: 0.9239, val loss: 1951.40, val acc: 0.9074, time: 112.97s
Epoch:3, Training loss: 5942.07, train acc: 0.9374, val loss: 1774.48, val acc: 0.9191, time: 111.32s
Epoch:4, Training loss: 4861.93, train acc: 0.9443, val loss: 1742.47, val acc: 0.9202, time: 111.72s
Epoch:5, Training loss: 4111.24, train acc: 0.9491, val loss: 1758.11, val acc: 0.9226, time: 111.26s
Epoch:6, Training loss: 3468.52, train acc: 0.9545, val loss: 1705.31, val acc: 0.9235, time: 112.28s
Epoch:7, Training loss: 3011.64, train acc: 0.9574, val loss: 1749.92, val acc: 0.9239, time: 112.48s
Epoch:8, Training loss: 2638.91, train acc: 0.9626, val loss: 1696.77, val acc: 0.9275, time: 112.86s
Epoch:9, Training loss: 2310.59, train acc: 0.9640, val loss: 1771.79, val acc: 0.9287, time: 111.88s
Epoch:10, Training loss: 2025.04, train acc: 0.9665, val loss: 1811.41, val acc: 

#Prediction of Test labels

In [21]:
test_output_index = []
for idxs in test_input_index:
    score, pred = model(torch.tensor(idxs, dtype=torch.long).to(device))
    test_output_index += pred
test_output_index

def decode_output(output_list):
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    return [ix_to_tag[output] for output in output_list]

test_output_decode = decode_output(test_output_index)


print(test_data)
print(test_input_index)
print(test_output_decode)

[[0], [1755, 636, 1673, 1993, 6418, 2302, 69, 341, 229, 3801, 3774, 9], [8819, 8820], [8821, 69, 816, 1073, 1074, 8822], [1673, 1311, 14, 3100, 159, 196, 5609, 1766, 1971, 22, 77, 6418, 1759, 2302, 742, 663, 229, 77, 387, 2099, 2003, 1768, 18, 1094, 9], [132, 341, 4531, 196, 3811, 4786, 1084, 229, 14, 497, 1768, 159, 14, 387, 69, 7705, 5, 77, 3801, 2324, 3774, 5, 8823, 8824, 9], [341, 1110, 935, 159, 14, 1768, 84, 4531, 3688, 3912, 6345, 26, 14, 2399, 2298, 1104, 8825, 2160, 8568, 8826, 764, 6481, 159, 77, 8827, 750, 8828, 5, 8829, 14, 6362, 775, 14, 3884, 364, 8830, 84, 297, 155, 7663, 1369, 9], [8831, 8832, 3390, 4643, 159, 14, 2302, 229, 3837, 367, 69, 6361, 155, 8833, 1152, 6318, 2936, 49, 1872, 919, 14, 3737, 9], [14, 310, 5953, 1122, 55, 3813, 229, 155, 5609, 1766, 3119, 2484, 68, 14, 648, 367, 9], [777, 2852, 14, 5609, 1835, 1971, 391, 1197, 2573, 69, 8824, 169, 229, 14, 3119, 141, 8834, 9], [391, 2353, 49, 750, 8835, 229, 14, 93, 992, 3093, 2995, 1673, 5, 3914, 49, 1568, 84, 88

##Converting Test ID to required shape (single list)

In [22]:
test_input=[]
for i,tag in enumerate(test_output_decode):
  test_input.append(i)
test_input[-5:]

[46661, 46662, 46663, 46664, 46665]

#Exporting results to CSV file

In [0]:
import csv
with open('Group58_W2V_baseline.csv', 'w') as f:
  writer = csv.writer(f)
  writer.writerows(zip(test_input, test_output_decode))