In [1]:
import pandas as pd

train = pd.read_csv(r'./data/TalkFile_ner_2.csv')

train.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [2]:
def lower_case(x, lower=False):
    if lower:
        return x.lower()
    else:
        return x

In [3]:
#train['Sentence'].split()

def split_sentence(row):
    data = {'split_sentence': [], 'POS': [], 'Tag': []}

    for column, value in row.items():
        if column == 'Sentence':
            data['split_sentence'].append(value.split())
        if column == 'POS':
            data['POS'].append(value[1:-1].split(","))
        if column == 'Tag':
            data['Tag'].append(value[1:-1].split(","))
            
    return pd.DataFrame(data)
        
new_train_df = pd.concat([split_sentence(row) for _, row in train.iterrows()], ignore_index=True)

new_train_df.head(5)

Unnamed: 0,split_sentence,POS,Tag
0,"[Thousands, of, demonstrators, have, marched, ...","['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', ..."
1,"[Families, of, soldiers, killed, in, the, conf...","['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'..."


In [4]:
def zip_array(row):
    data = {'word_pos_tag': []}
    for column, value in row.items():
        data['word_pos_tag'] = zip(row['split_sentence'], row['POS'], row['Tag']) 
    return pd.DataFrame(data)    

tokenized_ner_data = [zip_array(row) for _, row in new_train_df.iterrows()]
tokenized_ner_data

[                     word_pos_tag
 0         (Thousands, 'NNS', 'O')
 1               (of,  'IN',  'O')
 2   (demonstrators,  'NNS',  'O')
 3            (have,  'VBP',  'O')
 4         (marched,  'VBN',  'O')
 5          (through,  'IN',  'O')
 6      (London,  'NNP',  'B-geo')
 7               (to,  'TO',  'O')
 8          (protest,  'VB',  'O')
 9              (the,  'DT',  'O')
 10             (war,  'NN',  'O')
 11              (in,  'IN',  'O')
 12       (Iraq,  'NNP',  'B-geo')
 13             (and,  'CC',  'O')
 14          (demand,  'VB',  'O')
 15             (the,  'DT',  'O')
 16      (withdrawal,  'NN',  'O')
 17              (of,  'IN',  'O')
 18     (British,  'JJ',  'B-gpe')
 19         (troops,  'NNS',  'O')
 20            (from,  'IN',  'O')
 21            (that,  'DT',  'O')
 22         (country,  'NN',  'O')
 23                (.,  '.',  'O'),
                   word_pos_tag
 0       (Families, 'NNS', 'O')
 1            (of,  'IN',  'O')
 2     (soldiers,  'NNS',  '

In [5]:
# Create the following:
# dic_of_words, word_to_id, id_to_word
# dic_of_tags, tag_to_id, id_to_tag

def create_dic(item_list):
    dic = {}
    for items in item_list:
        for item in items:
            for word in item:
                if word not in dic:
                    dic[word] = 1
                else:
                    dic[word] += 1
    return dic

def create_mapping(dic):
    sorted_items = sorted(dic.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
    item_to_id = {v: k for k, v in id_to_item.items()}
    return item_to_id, id_to_item
    
def word_mapping(sentences_df):
    words = [ [ [ word[0].lower() for _, word in w.items()] for _, w in s.items()] for s in sentences_df]
    dic = create_dic(words)
    dic['<UNK>'] = 10000000 #UNK tag for unknown words
    word_to_id, id_to_word = create_mapping(dic)
    return dic, word_to_id, id_to_word

def tag_mapping(sentences_df):
    tags = [ [ [ word[-1] for _, word in w.items()] for _, w in s.items()] for s in sentences_df]
    dic = create_dic(tags)
    tag_to_id, id_to_tag = create_mapping(dic)
    return dic, tag_to_id, id_to_tag
    
    

dic_words, word_to_id, id_to_word = word_mapping(tokenized_ner_data)
dic_tags, tag_to_id, id_to_tag = tag_mapping(tokenized_ner_data)

In [6]:
# Prepare sequences
def prepare_dataset(sentences, word_to_id, tag_to_id):
    data = []

    for s in sentences:
        for _, str_words in s.items():
            strs = [w[0] for w in str_words]
            words = [word_to_id[w[0] if w[0] in word_to_id else '<UNK>'] for w in str_words]
            tags = [tag_to_id[w[-1]] for w in str_words]

            data.append({
                'str_words': strs,
                'words': words,
                'tags': tags
            })
    return data

train_data = prepare_dataset(tokenized_ner_data, word_to_id, tag_to_id)

In [7]:
import codecs
import numpy as np


all_word_embeds = {}
for i, line in enumerate(codecs.open("./data/glove.6B.100d.txt", 'r', 'utf-8')):
    s = line.strip().split()
    if len(s) == 100 + 1:
        all_word_embeds[s[0]] = np.array([float(i) for i in s[1:]])

#Intializing Word Embedding Matrix
word_embeds = np.random.uniform(-np.sqrt(0.06), np.sqrt(0.06), (len(word_to_id), 100))

for w in word_to_id:
    if w in all_word_embeds:
        word_embeds[word_to_id[w]] = all_word_embeds[w]
    elif w.lower() in all_word_embeds:
        word_embeds[word_to_id[w]] = all_word_embeds[w.lower()]

print('Loaded %i pretrained embeddings.' % len(all_word_embeds))


Loaded 400000 pretrained embeddings.


In [8]:
# mapping_file = './data/mapping.pkl'

# with open(mapping_file, 'wb') as f:
#     mappings = {
#         'word_to_id': word_to_id,
#         'tag_to_id': tag_to_id
#     }
#     cPickle.dump(mappings, f)

# print('word_to_id: ', len(word_to_id))

In [None]:
# Define the model
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence



class NamedEntityRecog(nn.Module):
    def __init__(self, vocab_size, word_embed_dim, word_hidden_dim, tag_num, dropout, pretrain_embed=None):
        super(NamedEntityRecog, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.input_dim = word_embed_dim

        self.embeds = nn.Embedding(vocab_size, word_embed_dim, padding_idx=0)
        self.embeds.weight.data.copy_(torch.from_numpy(pretrain_embed))

        self.lstm = nn.LSTM(self.input_dim, word_hidden_dim, batch_first=True, bidirectional=True)

        self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_num)

    def random_embedding(self, vocab_size, embedding_dim):
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(1, vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedding_dim])
        return pretrain_emb

    def neg_log_likelihood_loss(self, word_inputs, word_seq_lengths, batch_label, mask):
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        word_embeding = self.embeds(word_inputs)
        word_list = [word_embeding]
        word_embeding = torch.cat(word_list, 2)
        word_represents = self.drop(word_embeding)
        packed_words = pack_padded_sequence(word_represents, word_seq_lengths, True)
        hidden = None
        lstm_out, hidden = self.lstm(packed_words, hidden)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        lstm_out = lstm_out.transpose(0, 1)
        feature_out = self.drop(lstm_out)

        feature_out = self.hidden2tag(feature_out)

        loss_function = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
        feature_out = feature_out.contiguous().view(batch_size * seq_len, -1)
        total_loss = loss_function(feature_out, batch_label.contiguous().view(batch_size * seq_len))
        return total_loss

    def forward(self, word_inputs, word_seq_lengths, batch_label, mask):
        batch_size = word_inputs.size(0)
        seq_len = word_inputs.size(1)
        word_embeding = self.embeds(word_inputs)
        word_list = [word_embeding]
        word_embeding = torch.cat(word_list, 2)
        word_represents = self.drop(word_embeding)
        packed_words = pack_padded_sequence(word_represents, word_seq_lengths, True)
        hidden = None
        lstm_out, hidden = self.lstm(packed_words, hidden)
        lstm_out, _ = pad_packed_sequence(lstm_out)
        lstm_out = lstm_out.transpose(0, 1)
        feature_out = self.drop(lstm_out)

        feature_out = self.hidden2tag(feature_out)

        feature_out = feature_out.contiguous().view(batch_size * seq_len, -1)
        _, tag_seq = torch.max(feature_out, 1)
        tag_seq = tag_seq.view(batch_size, seq_len)
        tag_seq = mask.long() * tag_seq
        return tag_seq


In [None]:
model = NamedEntityRecog(dic_words.size(), 100, 2, tag_to_id.size(), 0.5, word_embeds)


In [None]:
# train
from torch.utils.tensorboard import SummaryWriter

optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)

train_begin = time.time()
print('train begin', '-' * 50)
print()
print()

writer = SummaryWriter('log')
batch_num = -1
best_f1 = -1
early_stop = 0

for epoch in range(args.epochs):
    epoch_begin = time.time()
    print('train {}/{} epoch'.format(epoch + 1, args.epochs))
    optimizer = lr_decay(optimizer, epoch, 0.05, args.lr)
    batch_num = train_model(train_dataloader, model, optimizer, batch_num, writer, use_gpu)
    new_f1 = evaluate(dev_dataloader, model, word_vocab, label_vocab, pred_file, score_file, eval_script, use_gpu)
    print('f1 is {} at {}th epoch on dev set'.format(new_f1, epoch + 1))
    if new_f1 > best_f1:
        best_f1 = new_f1
        print('new best f1 on dev set:', best_f1)
        early_stop = 0
        torch.save(model.state_dict(), model_name)
    else:
        early_stop += 1

    epoch_end = time.time()
    cost_time = epoch_end - epoch_begin
    print('train {}th epoch cost {}m {}s'.format(epoch + 1, int(cost_time / 60), int(cost_time % 60)))
    print()

    if early_stop > args.patience:
        print('early stop')
        break

train_end = time.time()
train_cost = train_end - train_begin
hour = int(train_cost / 3600)
min = int((train_cost % 3600) / 60)
second = int(train_cost % 3600 % 60)
print()
print()
print('train end', '-' * 50)
print('train total cost {}h {}m {}s'.format(hour, min, second))
print('-' * 50)

model.load_state_dict(torch.load(model_name))
test_acc = evaluate(test_dataloader, model, word_vocab, label_vocab, pred_file, score_file, eval_script, use_gpu)
print('test acc on test set:', test_acc)