# COLAB PRE-REQS

In [1]:
from google.colab import drive
drive.mount('/content/drive')

!unzip /content/drive/My\ Drive/datasets.zip 

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
Archive:  /content/drive/My Drive/datasets.zip
   creating: datasets/
  inflating: datasets/README.md      
   creating: datasets/dev-articles/
  inflating: datasets/dev-articles/article730081389.txt  
  inflating: datasets/dev-articles/article730093263.txt  
  inflating: datasets/dev-articles/article730246508.txt  
  inflating: datasets/dev-articles/article730269378.txt  
  inflating: datasets/dev-articles/article738028498.txt  
  inflating: datasets/dev-arti

In [2]:
!pip install flair==0.4.3

Collecting flair==0.4.3
[?25l  Downloading https://files.pythonhosted.org/packages/77/e3/389c2dd8d0e6ca1d8fad11aa4940e8df6909a26a5d954c0eff01f0d78b57/flair-0.4.3-py3-none-any.whl (180kB)
[K     |█▉                              | 10kB 27.6MB/s eta 0:00:01[K     |███▋                            | 20kB 6.0MB/s eta 0:00:01[K     |█████▌                          | 30kB 8.5MB/s eta 0:00:01[K     |███████▎                        | 40kB 5.5MB/s eta 0:00:01[K     |█████████                       | 51kB 6.7MB/s eta 0:00:01[K     |███████████                     | 61kB 7.9MB/s eta 0:00:01[K     |████████████▊                   | 71kB 9.0MB/s eta 0:00:01[K     |██████████████▋                 | 81kB 10.1MB/s eta 0:00:01[K     |████████████████▍               | 92kB 11.2MB/s eta 0:00:01[K     |██████████████████▏             | 102kB 8.9MB/s eta 0:00:01[K     |████████████████████            | 112kB 8.9MB/s eta 0:00:01[K     |█████████████████████▉          | 122kB 8.9MB/s e

In [3]:
!ls

datasets  drive  sample_data


# MODULE FUNCTIONS

In [0]:
import sys
sys.setrecursionlimit(10**6)

def read_dev_article(article_id):
    '''
    returns article raw text and span offsets given the article id
    '''
    
    article_fname = './datasets/dev-articles/article' + str(article_id) + '.txt'
    with open(article_fname, newline = '\n') as article:
        raw = article.read()

    return raw

#Word Level Data Processing
def getWordSpans(text):
    wordlist=[]
    def trans(text,pointer=0):
        if pointer==len(text)-1:
            return True
        else:
            while(not text[pointer].isalpha() and pointer<len(text)-1):
                pointer=pointer+1
            s=pointer
            while(text[pointer].isalpha() and pointer<len(text)-1):
                pointer=pointer+1
            wordlist.append([s,pointer])
            return trans(text,pointer)
    try:
        trans(text)
    except :
        return -1
    if(wordlist[-1][1]==wordlist[-1][0]):
        wordlist=wordlist[:-1]
    if(text[-1].isalpha()):
        wordlist[-1][1]+=1
    return wordlist

def getCharSpans(prediction,wordlist):
    charSpans=[]
    def getSpan(prediction,wordlist):
        for i in range(len(prediction)):
            if(i==0):
                if(prediction[i]==1):
                    charSpans.append(wordlist[0][0])
            elif(prediction[i]==0 and prediction[i-1]==1):
                charSpans.append(wordlist[i-1][1])
            elif(prediction[i]==1 and prediction[i-1]==0):
                charSpans.append(wordlist[i][0])
            if(i==len(prediction)-1 and prediction[i]==1):
                charSpans.append(wordlist[-1][1])
    getSpan(prediction,wordlist)
    return [[charSpans[i],charSpans[i+1]] for i in range(0,len(charSpans),2)]

def pred_span(text,prediction):
    wordlist=getWordSpans(text)
    return getCharSpans(prediction,wordlist)   

def getDevWords(article_id):
    text = read_dev_article(article_id)
    wordlist = getWordSpans(text)
    
    words = []
    for word in wordlist:
        words.append(text[word[0]:word[1]])
    
    return words

# IMPORTS

In [0]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

import os
import time
import matplotlib.pyplot as plt

from flair.embeddings import WordEmbeddings, BytePairEmbeddings, StackedEmbeddings, Sentence
%matplotlib inline

In [6]:
#SEED

seed = 1234
torch.manual_seed(seed)
np.random.seed(seed)

#GPU CHECK

device = ('cuda' if torch.cuda.is_available() else 'cpu')
device

'cuda'

# PREDICTION

In [0]:
article_directory = './datasets/dev-articles/'

In [0]:
article_fnames = os.listdir(article_directory)
article_ids = [int(fname[7:16]) for fname in article_fnames]

In [0]:
cols = ['article_id', 'span_start', 'span_end']
df = pd.DataFrame(columns=cols)

In [10]:
#TD-LSTM

class WordTDLSTM(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(WordTDLSTM, self).__init__()
        
        #Input Parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        #Defining Layers
        self.lstm_l = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.lstm_r = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc1 = nn.Linear(2*hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

        self.sigmoid = nn.Sigmoid()
        
    def forward(self, sequence_r, sequence_l):
        #LSTM Outputs
        __, (lstm_r_out, _) = self.lstm_r(sequence_r)                 #Right LSTM output
        __, (lstm_l_out, _) = self.lstm_l(sequence_l)                 #Left LSTM output

        #FC Layer
        fc_input = torch.cat((lstm_r_out, lstm_l_out), dim=2)    #Concatenating outputs from last cells of both LSTMs
        fc_input = fc_input.reshape(fc_input.shape[1], fc_input.shape[2])
        out = self.fc1(fc_input)
        out = self.fc2(out)
        out = self.sigmoid(out)                                  #Final sigmoid output
        
        return out

model = WordTDLSTM(input_size=400*2, hidden_size=256)
model = torch.load('/content/drive/My Drive/WordTDLSTM_fasttext_24_12_19.pth')
model = model.to(device)
model.eval()

WordTDLSTM(
  (lstm_l): LSTM(800, 256, batch_first=True)
  (lstm_r): LSTM(800, 256, batch_first=True)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [11]:
# init embedding
embedding = StackedEmbeddings(
    [
        # standard FastText word embeddings for English
        WordEmbeddings('en'),
        # Byte pair embeddings for English
        BytePairEmbeddings('en'),
    ]
)

2019-12-23 20:52:23,953 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/en-fasttext-news-300d-1M.vectors.npy not found in cache, downloading to /tmp/tmpqf1kxcw7


100%|██████████| 1200000128/1200000128 [00:13<00:00, 86287031.35B/s]

2019-12-23 20:52:38,053 copying /tmp/tmpqf1kxcw7 to cache at /root/.flair/embeddings/en-fasttext-news-300d-1M.vectors.npy





2019-12-23 20:52:52,308 removing temp file /tmp/tmpqf1kxcw7
2019-12-23 20:52:52,539 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.3/en-fasttext-news-300d-1M not found in cache, downloading to /tmp/tmpvznhsk_n


100%|██████████| 54600983/54600983 [00:07<00:00, 7419142.34B/s]

2019-12-23 20:53:00,069 copying /tmp/tmpvznhsk_n to cache at /root/.flair/embeddings/en-fasttext-news-300d-1M





2019-12-23 20:53:00,130 removing temp file /tmp/tmpvznhsk_n


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
100%|██████████| 1987533/1987533 [00:00<00:00, 27149353.57B/s]

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs100000.model



 19%|█▊        | 3581952/19357958 [00:00<00:00, 35815370.02B/s]

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs100000.d50.w2v.bin.tar.gz


100%|██████████| 19357958/19357958 [00:00<00:00, 60412207.17B/s]


In [0]:
max_len = 8
pad_embed = torch.zeros(1,400)

In [13]:
start_time = time.time()
article_id_list, span_start_list, span_end_list = [], [], []

for article_id in article_ids:
  words = getDevWords(article_id)
  word_sent = [(Sentence(word)) for word in words]
  for sent in word_sent:
      embedding.embed(sent)
  word_embeds = [sent[0].embedding.cpu().reshape(1, 400) for sent in word_sent]

  preds = []
  for i in range(len(word_embeds)):
    x_r = word_embeds[i:]
    x_l = word_embeds[:i+1]
        
    if len(x_r)>max_len:
      x_r = x_r[:max_len]
    elif len(x_r)<max_len:
      x_r = x_r + [pad_embed]*(max_len-len(x_r))
        
    if len(x_l)>max_len:
      x_l = x_l[-max_len:]
    elif len(x_l)<max_len:
      x_l = [pad_embed]*(max_len-len(x_l)) + x_l 
    
    x_r_ftxt, x_l_ftxt = torch.cat(x_r, dim=0), torch.cat(x_l, dim=0)
    x_r_ftxt, x_l_ftxt = x_r_ftxt.reshape(1, 8, 400), x_l_ftxt.reshape(1, 8, 400)
    x_target_ftxt = (torch.cat([x_r[-1]]*len(x_r))).reshape(1, 8, 400)
    x_r_ftxt, x_l_ftxt = torch.cat((x_r_ftxt, x_target_ftxt), dim=2), torch.cat((x_l_ftxt, x_target_ftxt), dim=2)

    out = model(x_r_ftxt.float().to(device), x_l_ftxt.float().to(device))
    out = out > 0.5

    preds.append(out.item())

  assert(len(words) == len(preds))
  spans = pred_span(read_dev_article(article_id),preds)

  for i in range(len(spans)):
    article_id_list.append(article_id)
    span_start_list.append(spans[i][0])
    span_end_list.append(spans[i][1])

print("Done in ", time.time()-start_time, " seconds")

Done in  63.765687704086304  seconds


In [0]:
df['article_id'] = article_id_list
df['span_start'] = span_start_list
df['span_end'] = span_end_list

In [15]:
df.head()

Unnamed: 0,article_id,span_start,span_end
0,772836731,97,123
1,772836731,130,134
2,772836731,143,150
3,772836731,224,241
4,772836731,249,263


In [16]:
len(df)

3002

In [0]:
df.to_csv('./fastText_24_12_2019.txt', header=None, index=None, sep='\t', mode='a')

In [0]:
from google.colab import files
files.download('./fastText_24_12_2019.txt')