In [1]:
import numpy as np
import torch
from transformers import RobertaTokenizerFast, RobertaForTokenClassification
import pandas as pd


In [2]:
import ast
import json
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [72]:
def getData(tokenizer, token_data, span_start, span_end, maxLength=200):
    """Encode word-level tokens and claim spans as padded sub-word arrays.

    Every word is split with ``tokenizer.tokenize`` and the resulting
    sub-words are converted to ids; the separator token is appended after the
    last word. This version does not prepend a start-of-sequence token.

    Label scheme: 0 = padding, 1 = O, 2 = I, 3 = B. Every sub-word of the
    first word of a span is labelled B, every sub-word of the remaining words
    of the span is labelled I, and the separator token is labelled O.

    Args:
        tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``sep_token`` and ``pad_token``.
        token_data: list of datapoints, each a list of word strings.
        span_start: per datapoint, the list of first word indices of spans.
        span_end: per datapoint, the list of last word indices (inclusive).
        maxLength: fixed length of every output row. Longer sequences are
            truncated so that the separator token still fits; words that do
            not fit keep an entry in the map with fewer (possibly zero)
            positions.

    Returns:
        Tuple ``(input_ids, attn_mask, y_val, mapList)`` where ``input_ids``
        is ``np.array`` of the padded id lists, ``attn_mask`` and ``y_val``
        are float arrays of shape ``(len(token_data), maxLength)`` and
        ``mapList`` holds, per datapoint, a dict mapping each word index to
        the list of positions of its sub-words.

    Raises:
        ValueError: if ``maxLength`` is too small to hold the separator.
        KeyError: if a span refers to a word index the datapoint lacks.
    """
    if maxLength < 1:
        raise ValueError("maxLength must be at least 1")

    sep_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
    pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    # One slot is always kept free for the trailing separator.
    body_limit = maxLength - 1

    mapList = []
    tokenizedList = []
    for datapoint in token_data:
        curMap = {}
        curTokenList = []
        for i, word in enumerate(datapoint):
            curMap[i] = []
            for tk in tokenizer.tokenize(word):
                if len(curTokenList) >= body_limit:
                    # Sequence is full: drop the remaining sub-words instead
                    # of overflowing the fixed-size arrays built below.
                    break
                curMap[i].append(len(curTokenList))
                curTokenList.append(tokenizer.convert_tokens_to_ids(tk))
        curTokenList.append(sep_id)

        mapList.append(curMap)
        tokenizedList.append(curTokenList)

    # Attention mask is 1 over real tokens; ids are right-padded to maxLength.
    num_rows = len(tokenizedList)
    attn_mask = np.zeros((num_rows, maxLength))
    for row, ids in enumerate(tokenizedList):
        attn_mask[row, :len(ids)] = 1
        ids.extend([pad_id] * (maxLength - len(ids)))

    # y value - [pad - 0, O - 1, I - 2, B - 3]; everything starts as 'O'.
    y_val = np.ones((num_rows, maxLength))
    for dp in range(len(span_start)):
        for i in range(len(span_start[dp])):
            first_word = span_start[dp][i]
            for idx in range(first_word, span_end[dp][i] + 1):
                tag = 3 if idx == first_word else 2
                for k in mapList[dp][idx]:
                    y_val[dp][k] = tag

    # Padded positions get label 0. Span labels are only ever written at
    # positions taken from mapList, which all lie inside the attended region,
    # so this never overwrites a B/I label.
    y_val[attn_mask == 0] = 0

    return np.array(tokenizedList), attn_mask, y_val, mapList




def getIOData(fileName: str,
              definition_path: str,
              tokenizer):
    """Load the claim rows of a CSV file and turn them into model inputs.

    The CSV must provide the columns ``claim_label``, ``tokens``,
    ``span_start_index`` and ``span_end_index``. Only rows whose
    ``claim_label`` equals 1 are used; ``tokens`` is parsed as a Python
    literal and the two span columns as JSON.

    Args:
        fileName: path of the CSV file to read.
        definition_path: path of a pickle file read with ``pd.read_pickle``.
            Assumed to hold a 3-D torch tensor (it is unsqueezed and repeated
            along a new leading batch axis) -- TODO confirm.
        tokenizer: tokenizer forwarded unchanged to ``getData``.

    Returns:
        Tuple ``(model_inputs, mapList)``. ``model_inputs`` is a dict with
        the keys ``input_ids``, ``attention_mask`` and ``labels`` (long
        tensors on ``DEVICE``) and ``definition_inputs``; ``mapList`` is the
        word-to-sub-word map returned by ``getData``.
    """
    data = pd.read_csv(fileName)

    # Only claim rows carry span annotations; all other rows are ignored.
    claims = data[data['claim_label'] == 1]
    token_data = [ast.literal_eval(cell) for cell in claims['tokens']]
    span_start = [json.loads(cell) for cell in claims['span_start_index']]
    span_end = [json.loads(cell) for cell in claims['span_end_index']]

    tokenizedList, attn_mask, y_val, mapList = getData(tokenizer, token_data, span_start, span_end)

    # Convert the numpy arrays directly: building a tensor from a Python list
    # of ndarray rows is slow and makes PyTorch emit a warning.
    model_inputs = {}
    model_inputs['input_ids'] = torch.as_tensor(tokenizedList, dtype=torch.long, device=DEVICE)
    model_inputs['attention_mask'] = torch.as_tensor(attn_mask, dtype=torch.long, device=DEVICE)
    model_inputs['labels'] = torch.as_tensor(y_val, dtype=torch.long, device=DEVICE)

    # NOTE(security): unpickling executes arbitrary code -- only pass a
    # definition_path that comes from a trusted source.
    # The loaded object is not moved to DEVICE here.
    definitions = pd.read_pickle(definition_path)
    batch_size = model_inputs['input_ids'].shape[0]
    model_inputs['definition_inputs'] = torch.unsqueeze(definitions, 0).repeat(batch_size, 1, 1, 1)

    return model_inputs, mapList

In [4]:
# Pretrained 'roberta-base' fast tokenizer used by the cells below.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
# Raw training table; the `tokens` and span index columns hold stringified
# lists that later cells parse with ast.literal_eval / json.loads.
df = pd.read_csv('train.csv')

In [73]:
# Build the model inputs from train.csv; `kaam` is the
# (model_inputs, mapList) tuple returned by getIOData.
kaam = getIOData('train.csv', 'claim_desc_orig_vecs.pkl', tokenizer)

In [77]:
# Sanity check: which token string does vocabulary id 7 stand for?
tokenizer.convert_ids_to_tokens([7])

['Ġto']

In [80]:
# Raw (still stringified) token list of the first training row.
in1 = df['tokens'][0]

In [49]:
# Preview the first rows and the column layout of the training table.
df.head()

Unnamed: 0,tokens,claim_label,span_start_index,span_end_index
0,"['""who', ' may', ' (or', ' may', ' not', ') ha...",1,[43],[53]
1,"['RT', ' @Coach_Brod', ': If', ' you', ' have'...",1,[2],[17]
2,"['#Pharmacists', ' warn', ' against', ' #malar...",1,[0],[4]
3,"['You', ' got', ' to', ' boil', ' your', ' Clo...",1,"[0, 22]","[20, 33]"
4,"['There', ' is', ' no', ' virus', '. \nAnd', '...",1,[0],[3]


In [50]:
# First four characters of the raw cell: `in1` is a string, not a list.
in1[0:4]

"['#P"

In [82]:
# Parse the stringified list and show words 43..53: the span recorded for
# row 0 (span_start_index [43], span_end_index [53], end index inclusive).
ast.literal_eval(in1)[43:54]

[' if',
 ' we',
 ' have',
 ' not',
 ' had',
 ' the',
 ' virus',
 ', we',
 ' are',
 ' not',
 ' immune']

In [125]:
# Word-level B/I/O tags for every row of `df`.
# `labels[r][w]` is the tag of word `w` in row `r`; `texts[r]` is the parsed
# word list of that row. The first word of each span is 'B', the remaining
# words of the span (end index inclusive) are 'I', everything else is 'O'.
labels = []
texts = []
# NOTE(review): label_map is defined here but not used in this cell.
label_map = {'<pad>':0, 'O':1, 'I':2, 'B':3}
for row in range(len(df)):
    words = ast.literal_eval(df['tokens'][row])
    texts.append(words)
    span_starts = ast.literal_eval(df['span_start_index'][row])
    span_ends = ast.literal_eval(df['span_end_index'][row])
    tags = ['O'] * len(words)
    for first, last in zip(span_starts, span_ends):
        tags[first] = 'B'
        # Separate index name so the row counter is not overwritten.
        for pos in range(first + 1, last + 1):
            tags[pos] = 'I'
    labels.append(tags)
    

In [126]:
# Both lists get exactly one entry per row of `df`, so the counts must match.
len(labels), len(texts)

(6044, 6044)

In [127]:
# Step 1: word-level B/I/O tags for every row of `df`.
# The first word of each span is 'B', the remaining words of the span (end
# index inclusive) are 'I', everything else is 'O'.
labels = []
texts = []
# NOTE(review): label_map is defined here but not used in this cell.
label_map = {'<pad>':0, 'O':1, 'I':2, 'B':3}
for row in range(len(df)):
    words = ast.literal_eval(df['tokens'][row])
    texts.append(words)
    span_starts = ast.literal_eval(df['span_start_index'][row])
    span_ends = ast.literal_eval(df['span_end_index'][row])
    tags = ['O'] * len(words)
    for first, last in zip(span_starts, span_ends):
        tags[first] = 'B'
        for pos in range(first + 1, last + 1):
            tags[pos] = 'I'
    labels.append(tags)

# Step 2: split every word into sub-words and repeat the word's tag once per
# sub-word.
#   tokens[r]       - sub-word ids of row r, wrapped in separator tokens
#   labels_final[r] - one tag per sub-word (special tokens excluded)
#   map_list[r]     - word index -> positions of its sub-words in tokens[r]
tokens = []
labels_final = []
map_list = []

sep_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
for words, word_tags in zip(texts, labels):
    # NOTE(review): the sequence is opened with sep_token, whereas the later
    # getData definition opens with cls_token -- confirm which is intended.
    piece_ids = [sep_id]
    piece_tags = []
    word_to_pieces = {}
    for word_idx, word in enumerate(words):
        positions = []
        for piece in tokenizer.tokenize(word):
            positions.append(len(piece_ids))
            piece_ids.append(tokenizer.convert_tokens_to_ids(piece))
        word_to_pieces[word_idx] = positions
        piece_tags.extend([word_tags[word_idx]] * len(positions))
    piece_ids.append(sep_id)

    # NOTE(review): piece_tags has no entries for the two special tokens, so
    # it is two items shorter than piece_ids.
    labels_final.append(piece_tags)
    tokens.append(piece_ids)
    map_list.append(word_to_pieces)

In [129]:
# Decode the first encoded row back to sub-word strings to inspect the
# segmentation and the special tokens at both ends.
tokenizer.convert_ids_to_tokens(tokens[0])

['</s>',
 '"',
 'who',
 'Ġmay',
 'Ġ(',
 'or',
 'Ġmay',
 'Ġnot',
 ')',
 'Ġhave',
 'Ġit',
 '"',
 'Ġ-',
 'ĠSch',
 'rod',
 'inger',
 "'s",
 'ĠVirus',
 '.',
 'Ġ',
 'ĠWe',
 'Ġcan',
 'Ġnot',
 'Ġget',
 'Ġtested',
 ',',
 'Ġso',
 'Ġwe',
 'Ġhave',
 'Ġto',
 'Ġact',
 'Ġlike',
 'Ġwe',
 'Ġhave',
 'Ġthe',
 'Ġvirus',
 'Ġso',
 'Ġwe',
 'Ġdo',
 'Ġnot',
 'Ġspread',
 'Ġit',
 '.',
 'Ġ',
 'ĠWe',
 'Ġalso',
 'Ġhave',
 'Ġto',
 'Ġact',
 'Ġlike',
 'Ġwe',
 'Ġhave',
 'Ġnever',
 'Ġhad',
 'Ġthe',
 'Ġvirus',
 ',',
 'Ġbecause',
 'Ġif',
 'Ġwe',
 'Ġhave',
 'Ġnot',
 'Ġhad',
 'Ġthe',
 'Ġvirus',
 ',',
 'Ġwe',
 'Ġare',
 'Ġnot',
 'Ġimmune',
 '.#',
 'CO',
 'VID',
 '19',
 '</s>']

In [109]:
# NOTE(review): mid-notebook import; it belongs with the imports in the
# first cell so a fresh kernel run shows all dependencies up front.
from transformers import BertTokenizerFast
# Pretrained uncased BERT fast tokenizer, loaded next to the RoBERTa one.
btokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [118]:
# Print every vocabulary entry whose text contains 'unused'.
# NOTE(review): this scans the RoBERTa `tokenizer`, not the `btokenizer`
# loaded in the cell before it -- confirm which one was meant.
for vocab_entry in tokenizer.vocab:
    if 'unused' not in vocab_entry:
        continue
    print(vocab_entry)

Ġunused


In [120]:
# Vocabulary id of the single entry found by the scan above.
tokenizer.vocab['Ġunused']

23797

In [95]:
def getData(tokenizer, token_data, span_start, span_end, maxLength=200):
    """Encode word-level tokens and claim spans as padded sub-word arrays.

    Every row starts with the tokenizer's ``cls_token``; each word is then
    split with ``tokenizer.tokenize`` and its sub-words are converted to ids;
    the separator token is appended after the last word.

    Label scheme: 0 = padding, 1 = O, 2 = I, 3 = B. Every sub-word of the
    first word of a span is labelled B, every sub-word of the remaining words
    of the span is labelled I, and the two special tokens are labelled O.

    Args:
        tokenizer: object exposing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token``.
        token_data: list of datapoints, each a list of word strings.
        span_start: per datapoint, the list of first word indices of spans.
        span_end: per datapoint, the list of last word indices (inclusive).
        maxLength: fixed length of every output row. Longer sequences are
            truncated so that the separator token still fits; words that do
            not fit keep an entry in the map with fewer (possibly zero)
            positions.

    Returns:
        Tuple ``(input_ids, attn_mask, y_val, mapList)`` where ``input_ids``
        is ``np.array`` of the padded id lists, ``attn_mask`` and ``y_val``
        are float arrays of shape ``(len(token_data), maxLength)`` and
        ``mapList`` holds, per datapoint, a dict mapping each word index to
        the list of positions of its sub-words.

    Raises:
        ValueError: if ``maxLength`` cannot hold both special tokens.
        KeyError: if a span refers to a word index the datapoint lacks.
    """
    if maxLength < 2:
        raise ValueError("maxLength must be at least 2")

    cls_id = tokenizer.convert_tokens_to_ids(tokenizer.cls_token)
    sep_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
    pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    # One slot is always kept free for the trailing separator.
    body_limit = maxLength - 1

    mapList = []
    tokenizedList = []
    for datapoint in token_data:
        curMap = {}
        curTokenList = [cls_id]
        for i, word in enumerate(datapoint):
            curMap[i] = []
            for tk in tokenizer.tokenize(word):
                if len(curTokenList) >= body_limit:
                    # Sequence is full: drop the remaining sub-words instead
                    # of overflowing the fixed-size arrays built below.
                    break
                curMap[i].append(len(curTokenList))
                curTokenList.append(tokenizer.convert_tokens_to_ids(tk))
        curTokenList.append(sep_id)

        mapList.append(curMap)
        tokenizedList.append(curTokenList)

    # Attention mask is 1 over real tokens; ids are right-padded to maxLength.
    num_rows = len(tokenizedList)
    attn_mask = np.zeros((num_rows, maxLength))
    for row, ids in enumerate(tokenizedList):
        attn_mask[row, :len(ids)] = 1
        ids.extend([pad_id] * (maxLength - len(ids)))

    # y value - [pad - 0, O - 1, I - 2, B - 3]; everything starts as 'O'.
    y_val = np.ones((num_rows, maxLength))
    for dp in range(len(span_start)):
        for i in range(len(span_start[dp])):
            first_word = span_start[dp][i]
            for idx in range(first_word, span_end[dp][i] + 1):
                tag = 3 if idx == first_word else 2
                for k in mapList[dp][idx]:
                    y_val[dp][k] = tag

    # Padded positions get label 0. Span labels are only ever written at
    # positions taken from mapList, which all lie inside the attended region,
    # so this never overwrites a B/I label.
    y_val[attn_mask == 0] = 0

    return np.array(tokenizedList), attn_mask, y_val, mapList

In [96]:
# Second handle on the same 'roberta-base' tokenizer that `tokenizer` was
# loaded from, used for the quick check in the next cell.
rtok = RobertaTokenizerFast.from_pretrained('roberta-base')

In [99]:
# Check how a word without its own vocabulary entry is split into sub-words.
rtok.tokenize('sanya')

['s', 'anya']