In [1]:
import json
import pandas as pd

train = pd.read_csv('./datasets/train.csv')
paper_train_folder = './datasets/train'
# print(train.shape)
# print(train[train['pub_title'] == 'Risk factors and global cognitive status related to brain arteriolosclerosis in elderly individuals'])

train = train.groupby('Id').agg({
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join
}).reset_index()

# print(train.head())
# print(train.shape)

# train['dataset_title'].str.find('|')
train.iloc[3]['pub_title']

'Risk factors and global cognitive status related to brain arteriolosclerosis in elderly individuals'

In [2]:
papers = {}
for paper_id in train['Id'].unique():
    with open(f'{paper_train_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

In [3]:
papers[train['Id'][0]]

[{'section_title': 'Abstract',
  'text': "The aim of this study was to identify if acquiring ICT skills through DOT Lebanon's ICT training program (a local NGO) improved income generation opportunities after 3-months of completing the training. The target population was the NGO's vulnerable young beneficiaries. This study was completed in an effort to find creative and digital solutions to the high rate of youth unemployment in Lebanon (37%), one of the highest rates in the world. Results showed that 48% of beneficiaries who were unemployed at baseline, were exposed to at least one income generation opportunity 3 months after completing the DOT Lebanon training. Also, 49% of beneficiaries who were already employed at baseline were exposed to at least one income generation opportunity. Gender, English proficiency and governorate were variables that were found to be statistically significant. Males were more likely than females to be exposed to income generation opportunities. Those who 

In [4]:
def clean_training_text(txt):
    """
    similar to the default clean_text function but without lowercasing.
    """
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt)).strip()

def shorten_sentences(sentences):
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > MAX_LENGTH:
            for p in range(0, len(words), MAX_LENGTH - OVERLAP):
                short_sentences.append(' '.join(words[p:p+MAX_LENGTH]))
        else:
            short_sentences.append(sentence)
    return short_sentences

def find_sublist(big_list, small_list):
    all_positions = []
    for i in range(len(big_list) - len(small_list) + 1):
        if small_list == big_list[i:i+len(small_list)]:
            all_positions.append(i)
    
    return all_positions

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    sentence_words = sentence.split()
    
    if labels is not None and any(re.findall(f'\\b{label}\\b', sentence)
                                  for label in labels): # positive sample
        nes = ['O'] * len(sentence_words)
        for label in labels:
            label_words = label.split()

            all_pos = find_sublist(sentence_words, label_words)
            for pos in all_pos:
                nes[pos] = 'B'
                for i in range(pos+1, pos+len(label_words)):
                    nes[i] = 'I'

        return True, list(zip(sentence_words, nes))
        
    else: # negative sample
        nes = ['O'] * len(sentence_words)
        return False, list(zip(sentence_words, nes))


In [5]:
# Hyperparameters
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

MAX_SAMPLE = None # set a small number for experimentation, set None for production.

In [6]:
import re
from tqdm import tqdm
import random

cnt_pos, cnt_neg = 0, 0 # number of sentences that contain/not contain labels
ner_data = []

pbar = tqdm(total=len(train))
for i, id, dataset_label in train[['Id', 'dataset_label']].itertuples():
    # paper
    paper = papers[id]
    
    # labels
    labels = dataset_label.split('|')
    labels = [clean_training_text(label) for label in labels]
    
    # sentences
    sentences = set([clean_training_text(sentence) for section in paper 
                 for sentence in section['text'].split('.') 
                ])
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars
    
    # positive sample
    for sentence in sentences:
        is_positive, tags = tag_sentence(sentence, labels)
        if is_positive:
            cnt_pos += 1
            ner_data.append(tags)
        elif any(word in sentence.lower() for word in ['data', 'study']): 
            ner_data.append(tags)
            cnt_neg += 1
    
    # process bar
    pbar.update(1)
    pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")

# shuffling
random.shuffle(ner_data)


Training data size: 47202 positives + 514263 negatives: 100%|█| 14316/14316 [01:12<00

In [7]:
import nltk
from tqdm import tqdm

# You may need to download NLTK resources for POS tagging
# nltk.download('averaged_perceptron_tagger')

# The word2features function from earlier
def word2features(sent, i):
    word = sent[i]
    
    features = {
        'word': word,
        'is_upper': word.isupper(),
        'is_title': word.istitle(),
        'is_digit': word.isdigit(),
        'word_length': len(word),
        'prefix_1': word[:1],  
        'prefix_2': word[:2],  
        'suffix_1': word[-1:],  
        'suffix_2': word[-2:],  
    }
    
    pos_tag = nltk.pos_tag([word])[0][1]
    features['pos_tag'] = pos_tag
    
    if i > 0:
        prev_word = sent[i - 1]
        features.update({
            'prev_word': prev_word,
            'prev_is_upper': prev_word.isupper(),
            'prev_is_title': prev_word.istitle(),
            'prev_is_digit': prev_word.isdigit(),
        })
    else:
        features['BOS'] = True  
    
    if i < len(sent) - 1:
        next_word = sent[i + 1]
        features.update({
            'next_word': next_word,
            'next_is_upper': next_word.isupper(),
            'next_is_title': next_word.istitle(),
            'next_is_digit': next_word.isdigit(),
        })
    else:
        features['EOS'] = True  

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [8]:
# Extract tokens (words) from the data
tokenized_sentences = [[word for word, label in sentence] for sentence in ner_data]

# Feature extraction with a progress bar using tqdm
features = []
for sentence in tqdm(tokenized_sentences, desc="Extracting features", position=0, leave=True):
    features.append(sent2features(sentence))

# Example of showing the features for the first sentence
for token_features in features[0]:
    print(token_features)


Extracting features: 100%|█████████████████| 561465/561465 [06:29<00:00, 1443.13it/s]

{'word': 'A', 'is_upper': True, 'is_title': True, 'is_digit': False, 'word_length': 1, 'prefix_1': 'A', 'prefix_2': 'A', 'suffix_1': 'A', 'suffix_2': 'A', 'pos_tag': 'DT', 'BOS': True, 'next_word': 'medical', 'next_is_upper': False, 'next_is_title': False, 'next_is_digit': False}
{'word': 'medical', 'is_upper': False, 'is_title': False, 'is_digit': False, 'word_length': 7, 'prefix_1': 'm', 'prefix_2': 'me', 'suffix_1': 'l', 'suffix_2': 'al', 'pos_tag': 'JJ', 'prev_word': 'A', 'prev_is_upper': True, 'prev_is_title': True, 'prev_is_digit': False, 'next_word': 'evaluation', 'next_is_upper': False, 'next_is_title': False, 'next_is_digit': False}
{'word': 'evaluation', 'is_upper': False, 'is_title': False, 'is_digit': False, 'word_length': 10, 'prefix_1': 'e', 'prefix_2': 'ev', 'suffix_1': 'n', 'suffix_2': 'on', 'pos_tag': 'NN', 'prev_word': 'medical', 'prev_is_upper': False, 'prev_is_title': False, 'prev_is_digit': False, 'next_word': '28', 'next_is_upper': False, 'next_is_title': False, '




In [11]:
print(features[0])

[{'word': 'A', 'is_upper': True, 'is_title': True, 'is_digit': False, 'word_length': 1, 'prefix_1': 'A', 'prefix_2': 'A', 'suffix_1': 'A', 'suffix_2': 'A', 'pos_tag': 'DT', 'BOS': True, 'next_word': 'medical', 'next_is_upper': False, 'next_is_title': False, 'next_is_digit': False}, {'word': 'medical', 'is_upper': False, 'is_title': False, 'is_digit': False, 'word_length': 7, 'prefix_1': 'm', 'prefix_2': 'me', 'suffix_1': 'l', 'suffix_2': 'al', 'pos_tag': 'JJ', 'prev_word': 'A', 'prev_is_upper': True, 'prev_is_title': True, 'prev_is_digit': False, 'next_word': 'evaluation', 'next_is_upper': False, 'next_is_title': False, 'next_is_digit': False}, {'word': 'evaluation', 'is_upper': False, 'is_title': False, 'is_digit': False, 'word_length': 10, 'prefix_1': 'e', 'prefix_2': 'ev', 'suffix_1': 'n', 'suffix_2': 'on', 'pos_tag': 'NN', 'prev_word': 'medical', 'prev_is_upper': False, 'prev_is_title': False, 'prev_is_digit': False, 'next_word': '28', 'next_is_upper': False, 'next_is_title': False