In [None]:
import os, re, pickle
import numpy as np
import pandas as pd
import nltk
import re

In [None]:
## Note that this section of code is adapted from the clinicalBERT github repo: 
## https://github.com/EmilyAlsentzer/clinicalBERT/blob/169542dcdb6eda1fdbc6696e766818a3bb9601a4/downstream_tasks/run_ner.py#L157

def read_data(input_file):
    """Parse a token-per-line file into per-sentence token and label lists.

    Every non-blank line is split on whitespace; its first field is taken as
    the token and its last field as the label. Blank (or whitespace-only)
    lines separate sentences. Consecutive blank lines do not create empty
    sentences.

    Args:
        input_file: Path of the file to read.

    Returns:
        A tuple ``(all_words, all_labels, lines)``:
        ``all_words[i]`` is the token list of sentence ``i``,
        ``all_labels[i]`` is the matching label list, and
        ``lines[i]`` is the pair ``[words, labels]`` holding those same lists.
    """
    lines = []
    all_words = []
    all_labels = []

    def _store(sentence_words, sentence_labels):
        # Record one finished sentence in all three output containers.
        lines.append([sentence_words, sentence_labels])
        all_words.append(sentence_words)
        all_labels.append(sentence_labels)

    words = []
    labels = []
    with open(input_file) as f:
        for line in f:
            fields = line.split()
            if not fields:  # blank line: we are in between sentences
                if words:
                    _store(words, labels)
                    words = []
                    labels = []
                continue

            words.append(fields[0])
            labels.append(fields[-1])

    # A file that ends right after its last token line has no closing blank
    # line, so the sentence still being collected must be stored here;
    # otherwise it would be silently lost.
    if words:
        _store(words, labels)

    return all_words, all_labels, lines

In [None]:
def create_chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Slices are produced in order; the last one is shorter when ``len(lst)``
    is not a multiple of ``n``. An empty ``lst`` produces nothing.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
# sent_list = word_list output from read_data
# label_list = label_list output from read_data
# n = maximum sentence length to be considered for splitting

def process_data(sent_list, label_list, n):
    """Split sentences into chunks of at most ``n`` tokens, one output row per token.

    Args:
        sent_list: List of sentences, each a list of tokens
            (the first value returned by ``read_data``).
        label_list: List of label lists parallel to ``sent_list``
            (the second value returned by ``read_data``).
        n: Maximum number of tokens allowed in one chunk; must be positive.

    Returns:
        A ``pandas.DataFrame`` with a default integer index and these columns:
        ``UNIQUE_ID`` (running number of the chunk, starting at 0),
        ``INDEX_COL`` (position of the source sentence in ``sent_list``),
        ``SUB_INDEX_COL`` (position of the chunk within its sentence),
        ``SENTENCE_LENGTH`` (number of tokens in the chunk),
        ``SENTENCE`` (a single token) and ``LABELS`` (that token's label).
        Sentences without tokens contribute no rows.

    Raises:
        ValueError: If ``n`` is not positive, or if the sentences and labels
            do not line up one-to-one.
    """
    print("Length of sentence list: ",len(sent_list))
    print("Length of label list: ",len(label_list))

    if n <= 0:
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    if len(sent_list) != len(label_list):
        raise ValueError(
            "sent_list and label_list must have the same length ({} != {})".format(
                len(sent_list), len(label_list)
            )
        )

    # Rows are assembled directly instead of storing lists inside DataFrame
    # cells and stacking/exploding them: the result then does not depend on
    # how pandas aligns duplicated index labels, and the chunk length is the
    # real token count rather than a count derived from the list's string form.
    rows = []
    unique_id = 0  # unique identifier of each chunked sentence
    longest_chunk = 0
    for sent_idx, (words, labels) in enumerate(zip(sent_list, label_list)):
        if len(words) != len(labels):
            raise ValueError(
                "sentence {} has {} tokens but {} labels".format(
                    sent_idx, len(words), len(labels)
                )
            )
        chunk_pairs = zip(create_chunks(words, n), create_chunks(labels, n))
        for sub_idx, (word_chunk, label_chunk) in enumerate(chunk_pairs):
            chunk_len = len(word_chunk)
            longest_chunk = max(longest_chunk, chunk_len)
            for word, label in zip(word_chunk, label_chunk):
                rows.append((unique_id, sent_idx, sub_idx, chunk_len, word, label))
            unique_id += 1

    # Check that no chunk holds more than n words
    print("Maximum chunked sentence length: ", longest_chunk)

    explode_df = pd.DataFrame(
        rows,
        columns=['UNIQUE_ID',
                 'INDEX_COL',
                 'SUB_INDEX_COL',
                 'SENTENCE_LENGTH',
                 'SENTENCE',
                 'LABELS'],
    )

    return explode_df

## Reformat training set

In [None]:
# Parse the training split: one token list and one label list per sentence,
# plus the same data paired up as [words, labels] per sentence.
word_list_train, label_list_train, lines_list_train = read_data("./processed/merged/train.tsv")

In [None]:
# Chunk every training sentence to at most 126 tokens and flatten to one row per token.
# NOTE(review): 126 presumably leaves room for two special tokens in a 128-token model input — confirm.
reformat_train_df = process_data(word_list_train, label_list_train, 126)

In [None]:
# Show the reformatted training frame as the cell output.
reformat_train_df

In [None]:
# Spot-check the token rows of source sentence 187
# (change the value, e.g. to 188, to inspect another sentence).
reformat_train_df.loc[reformat_train_df['INDEX_COL'] == 187]

In [None]:
# Export
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('./processed_data_126', exist_ok=True)
reformat_train_df.to_csv('./processed_data_126/train.csv')

## Reformat dev set

In [None]:
# Parse the dev split: one token list and one label list per sentence,
# plus the same data paired up as [words, labels] per sentence.
word_list_dev, label_list_dev, lines_list_dev = read_data("./processed/merged/dev.tsv")

In [None]:
# Chunk every dev sentence to at most 126 tokens and flatten to one row per token.
# NOTE(review): 126 presumably leaves room for two special tokens in a 128-token model input — confirm.
reformat_dev_df = process_data(word_list_dev, label_list_dev, 126)

In [None]:
# Show the reformatted dev frame as the cell output.
reformat_dev_df

In [None]:
# Export
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('./processed_data_126', exist_ok=True)
reformat_dev_df.to_csv('./processed_data_126/dev.csv')

## Reformat testing set

In [None]:
# Parse the test split: one token list and one label list per sentence,
# plus the same data paired up as [words, labels] per sentence.
word_list_test, label_list_test, lines_list_test = read_data("./processed/merged/test.tsv")

In [None]:
# Chunk every test sentence to at most 126 tokens and flatten to one row per token.
# NOTE(review): 126 presumably leaves room for two special tokens in a 128-token model input — confirm.
reformat_test_df = process_data(word_list_test, label_list_test, 126)

In [None]:
# Show the reformatted test frame as the cell output.
reformat_test_df

In [None]:
# Export
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('./processed_data_126', exist_ok=True)
reformat_test_df.to_csv('./processed_data_126/test.csv')