In [1]:
import torch
import torch.nn as nn 

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


import nltk
import tqdm
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')


In [2]:
DATA_PATH = '../data/VUA_DATA_PROC/'
VUA_DATA_PATH = '../data/VUA_DATA_RAW/'

In [3]:
corpus = pd.read_csv(VUA_DATA_PATH+'vuamc_corpus_train.csv')

In [4]:
corpus

Unnamed: 0,txt_id,sentence_id,sentence_txt
0,a1e-fragment01,1,Latest corporate unbundler M_reveals laid-back...
1,a1e-fragment01,2,By FRANK KANE
2,a1e-fragment01,3,"IT SEEMS that Roland Franklin , the latest unb..."
3,a1e-fragment01,4,He has not properly investigated the M_target ...
4,a1e-fragment01,5,The 63-year-old M_head of Pembridge Investment...
...,...,...,...
12117,kcv-fragment42,4664,Perhaps when they come back they 've M_got six...
12118,kcv-fragment42,4665,I know .
12119,kcv-fragment42,4666,Some else 's sitting at their desk ?
12120,kcv-fragment42,4667,Well not you know cleaning so I do n't know wh...


In [5]:
os.makedirs(DATA_PATH, exist_ok = True)

In [6]:
def prepare_vua_data(data_file, save_file):
   data= pd.read_csv(VUA_DATA_PATH+data_file)

   sentences = []
   pos_tags = []
   labels = []

   for row in data.values:
      sen = str(row[2]).split(' ')
      sen_len = len(sen)

      cleaned_sen = [w.replace('M_', '') if w.startswith('M_') else w for w in sen]
      pos_tag = [nltk.tag.pos_tag([w], tagset='universal')[0][1] for w in cleaned_sen]
      labels_arr = [1 if w.startswith('M_') else 0 for w in sen]

      cleaned_sen = ' '.join(cleaned_sen).strip()
      sentences.append(cleaned_sen)
      labels.append(labels_arr)
      pos_tags.append(pos_tag)
      
      assert len(pos_tag) == sen_len , "number of position tags for sentence should be equal to the length of sentence"


   df = pd.DataFrame({'txt_id': data['txt_id'].values,
                      'sen_idx': data['sentence_id'].values,
                      'sentence': sentences,
                      'label_seq': labels,
                      'pos_seq': pos_tags,
                      'labeled_sentence': data['sentence_txt'].values
                    #   'genre': np.empty_like(labels)
                      })
   
   df.to_csv(DATA_PATH+save_file, index=False)

                   

In [7]:
def train_val_split(read_path):
    
    np.random.seed(42)
    df = pd.read_csv(DATA_PATH + read_path)
    df = df[df['sentence'].notna()]

    ratio = int(len(df)/10)
    df = df.sample(frac = 1, random_state= 42)

    val_df = df[:ratio]
    train_df = df[ratio:]

    val_path = read_path[:-4]+ '_val.csv'
    train_path = read_path[: -4] + '_train.csv'

    val_df.to_csv(DATA_PATH + val_path, index = False)
    train_df.to_csv(DATA_PATH + train_path, index= False)

    

In [8]:
prepare_vua_data('vuamc_corpus_train.csv', 'VUA_corpus.csv')
prepare_vua_data('vuamc_corpus_test.csv', 'VUA_corpus_test.csv')

In [9]:
train_val_split('VUA_corpus.csv')

In [10]:
train_df = pd.read_csv(f'{DATA_PATH}VUA_corpus_train.csv')
val_df = pd.read_csv(f'{DATA_PATH}VUA_corpus_val.csv')

In [11]:
train_df

Unnamed: 0,txt_id,sen_idx,sentence,label_seq,pos_seq,labeled_sentence
0,clp-fragment01,764,Task analysis can be very expensive in skilled...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, ...","['NOUN', 'NOUN', 'VERB', 'VERB', 'ADV', 'ADJ',...",Task analysis can be very expensive M_in skill...
1,kcc-fragment02,25,"Well I , I , I asked him what he wanted for Ch...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","['ADV', 'PRON', '.', 'PRON', '.', 'PRON', 'VER...","Well I , I , I asked him what he wanted for Ch..."
2,kcu-fragment02,1489,Now that 's right !,"[0, 1, 0, 0, 0]","['ADV', 'ADP', 'PRT', 'NOUN', '.']",Now M_that 's right !
3,ajf-fragment07,228,The NAACP denounces The Silence of the Lambs b...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","['DET', 'NOUN', 'NOUN', 'DET', 'NOUN', 'ADP', ...",The NAACP M_denounces The Silence of the Lambs...
4,amm-fragment02,1649,"The convex , middle part of the thorax contain...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","['DET', 'NOUN', '.', 'NOUN', 'NOUN', 'ADP', 'D...","The convex , middle part of the thorax contain..."
...,...,...,...,...,...,...
10894,kcv-fragment42,4524,With who ?,"[1, 0, 0]","['ADP', 'PRON', '.']",M_With who ?
10895,crs-fragment01,15,Between 1982 and 1988 provision gradually incr...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","['NOUN', 'NUM', 'CONJ', 'NUM', 'NOUN', 'ADV', ...",M_Between 1982 and 1988 provision gradually in...
10896,ew1-fragment01,12,Long was supported by the bulk of the English ...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","['ADV', 'VERB', 'VERB', 'ADP', 'DET', 'NOUN', ...",Long was M_supported by the M_bulk of the Engl...
10897,a3p-fragment09,183,"We were quite alone and the great church , the...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['PRON', 'VERB', 'ADV', 'ADV', 'CONJ', 'DET', ...","We were quite alone and the great church , the..."


In [12]:
val_df

Unnamed: 0,txt_id,sen_idx,sentence,label_seq,pos_seq,labeled_sentence
0,ab9-fragment03,810,"He was a sensible and capable boy , an eldest ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['PRON', 'VERB', 'DET', 'ADJ', 'CONJ', 'ADJ', ...","He was a sensible and capable boy , an eldest ..."
1,kbj-fragment17,1606,Write it out on a piece of paper and you 'll a...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['VERB', 'PRON', 'ADP', 'ADP', 'DET', 'NOUN', ...",Write it out on a piece of paper and you 'll a...
2,kbh-fragment09,1196,Alright stop it then keep that,"[0, 1, 0, 0, 0, 0]","['NOUN', 'NOUN', 'PRON', 'ADV', 'VERB', 'ADP']",Alright M_stop it then keep that
3,kbw-fragment04,2581,Jonathan .,"[0, 0]","['NOUN', '.']",Jonathan .
4,b1g-fragment02,806,Monitoring of the state of and the changes in ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'CONJ', ...",Monitoring of the state of and the changes in ...
...,...,...,...,...,...,...
1205,kbw-fragment04,2471,Ah .,"[0, 0]","['NOUN', '.']",Ah .
1206,as6-fragment02,416,"As pointed out earlier , the ‘ social causes ’...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","['ADP', 'VERB', 'ADP', 'ADV', '.', 'DET', 'NOU...","As M_pointed M_out earlier , the ‘ social caus..."
1207,g0l-fragment01,324,He burst out laughing .,"[0, 0, 0, 0, 0]","['PRON', 'NOUN', 'ADP', 'VERB', '.']",He burst out laughing .
1208,kbh-fragment03,285,"Yeah , the Robots .","[0, 0, 0, 0, 0]","['NOUN', '.', 'DET', 'NOUN', '.']","Yeah , the Robots ."


In [13]:
labels = pd.read_csv(VUA_DATA_PATH + 'all_pos_tokens.csv', header = None )

In [14]:
labels

Unnamed: 0,0,1
0,a1h-fragment06_114_1,0
1,a1h-fragment06_114_3,0
2,a1h-fragment06_114_4,0
3,a1h-fragment06_115_2,0
4,a1h-fragment06_115_3,0
...,...,...
72606,as6-fragment02_441_21,0
72607,as6-fragment02_441_25,1
72608,as6-fragment02_441_27,0
72609,as6-fragment02_441_30,0


In [15]:
def prepare_tokens_data(label_file, save_tokens_file):

    labels = pd.read_csv(VUA_DATA_PATH + label_file, header = None )

    dict = {}

    for row in labels.values:
        text_id = row[0].split('_')
        txt_id  = text_id[0]
        sen_id = text_id[1]

        offset = int(text_id[2])-1

        if txt_id not in dict:
            dict[txt_id] = {}

        if sen_id not in dict[txt_id]:
            dict[txt_id][sen_id] = [offset]
        else:
            dict[txt_id][sen_id].append(offset)

        with open(DATA_PATH + save_tokens_file, 'wb+') as f:
            pickle.dump(dict, f)

In [16]:
#tokens for test data
prepare_tokens_data('all_pos_tokens_test.csv', 'all_pos_tokens_test.pkl')
prepare_tokens_data('verb_tokens_test.csv', 'verb_tokens_test.pkl')

In [21]:
with open('../data/VUA_DATA_PROC/verb_tokens_test.pkl', 'rb') as pickle_file:
    content = pickle.load(pickle_file)

In [None]:
# def _elmo_vectors(read_path, save_path):

#     df = pd.read_csv(DATA_PATH + read_path)

#     dict = {}

#     txt_ids = df['txt_id'].values
#     sen_ids = df['sen_idx'].values
#     sentences = df['sentence'].values
#     assert len(txt_ids) == len(sentences)

#     batch_sentences = [sentences[i : min(i+64, len(sentences))] for i in range(0, len(sentences, 64))]
#     batch_txt_ids = [txt_ids[i:min(i+64, len(txt_ids))] for i in range(0, len(txt_ids), 64)]
#     batch_sen_ids = [sen_ids[i:min(i+64, len(sen_ids))] for i in range(0, len(sen_ids), 64)]
#     batch_sen_len = [[len(sen.split(' ')) for sen in batch_sen] for batch_sen in batch_sentences]

#     assert len(batch_sentences) == len(batch_txt_ids)

#     for i in tqdm(range(len(batch_sen_ids))):
#         sen = batch_sentences [i]
#         txt_id = batch_txt_ids [i]
#         sen_id = batch_sen_ids[i]
#         sen_len = batch_sen_len[i]
        