In [1]:
import os, sys
sys.path.append(os.path.abspath('./src'))
import utils, json
from tqdm import tqdm
from utils import save_value, load_value, load_env_keys, match_labels, tokenize_string
import pandas as pd
import numpy as np

In [7]:
# PATHS
data_path = '/scratch/juanmoo1'
EMA_dump_path = os.path.join(data_path, './jsons/new_EMA_dump.json')
EMA_old_dump_path = os.path.join(data_path, './jsons/EMA_dump.json')

EMA_annotations_path = os.path.join(data_path, './bayer/VendorEMAforMIT/new_annotations/annotations.xlsx')
EMA_old_annotations_path = os.path.join(data_path, './bayer/VendorEMAforMIT/annotations.xlsx')

pickle_dumps_path = os.path.join(data_path, './pickle_dumps/')
checkpoint_path = os.path.join(pickle_dumps_path, 'checkpoint.pickle')

In [8]:
# Raw Data
'''
Format:
{
    document_name <str>: {
                            element_text: <str> (raw text),
                            element_tag: <str> (TEI XML tag)
                          },
                          
    ...
}
'''
data = json.loads(open(EMA_dump_path, 'r').read())
old_data = json.loads(open(EMA_old_dump_path, 'r').read())


# Labels
'''
Dict in form:
{
    file_name: {
        texts: [ <str>, ...],
        labels: [ <str>, ...]
    },
    
    ...
    
}
'''
annotations = utils.parse_spreadsheet(EMA_annotations_path)
old_annotations = utils.parse_spreadsheet(EMA_old_annotations_path)

## Matching Data to Labels

In [4]:
'''
Iterates through each document in the dataset and compares is to labels with the same file name. Matching is done using fuzzy string matching unless the exact_matching is set to True.
'''

labeled_raw_documents = match_labels(data, annotations, exact_match=False)
save_value('labeled_raw_documents', labeled_raw_documents, path=checkpoint_path)

100%|██████████| 5/5 [08:57<00:00, 107.40s/it]


In [9]:
labeled_raw_documents = load_value('labeled_raw_documents', path=checkpoint_path)

## Preprocessing

In [10]:
# Clean input text
processed_documents = dict()
processed_document_list = []


for doc_name in labeled_raw_documents:
    texts = [tokenize_string(raw) for raw in labeled_raw_documents[doc_name]['texts']]
    labels = labeled_raw_documents[doc_name]['labels']
    head1 = [tokenize_string(raw) for raw in labeled_raw_documents[doc_name]['head1']]
    head2 = [tokenize_string(raw) for raw in labeled_raw_documents[doc_name]['head2']]
    
    for i in range(len(texts)):
        processed_document_list.append([doc_name, head1[i], head2[i], labels[i], texts[i]])


data = pd.DataFrame(processed_document_list, columns=['document', 'head1', 'head2', 'label', 'text'])

In [11]:
pd.unique(data['label'])

array(['other', 'Populations - Geriatric', 'Populations - Paediatric',
       'Significant Findings - Pregnancy',
       'Significant Findings - Hepatic Impairment', 'Populations - Adult',
       'Significant Findings - Renal Impairment'], dtype=object)

# Multi-Class SVN

In [15]:
#pipeline of feature engineering and model
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import *
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

model = Pipeline([('vectorizer', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced')))])

## Box Search with Header augmented feature 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack, vstack
from functools import reduce

# Search Params
ngram_configs = [(1, 1), (2, 2), (1, 2), (1, 3), (1, 4), (1, 5)]
tfidf_configs = [True, False]
vectorizer_stopwords_configs = ['english', None]
min_df_configs = [0] + [10**(-n) for n in range(3, 4)]

total_config_count = len(ngram_configs) * len(tfidf_configs) * len(vectorizer_stopwords_configs) * len(min_df_configs)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import random
import numpy as np

def multi_svm_train(doc_list, config=None):
    train_texts = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['texts'] for doc_name in doc_list])
    train_header1 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header1'] for doc_name in doc_list])
    train_header2 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header2'] for doc_name in doc_list])
    train_labels = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['labels'] for doc_name in doc_list])
    
    if config is None:
        config = load_value('best_config_header', path=checkpoint_path)
        
    text_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    header_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    
    tokenized_texts = text_tokenizer.fit_transform(train_texts)
    tokenized_header1 = header_tokenizer.fit_transform(train_header1)
    tokenized_header2 = header_tokenizer.transform(train_header2)
    X_train = hstack([tokenized_texts, tokenized_header1, tokenized_header2])
    Y_train = train_labels

    model = Pipeline([('tfidf', TfidfTransformer(use_idf=config['tfidf_config'])), ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])
    model.fit(X_train, Y_train)
    
    return (model, text_tokenizer, header_tokenizer)

def multi_svm_test(model, text_tokenizer, header_tokenizer, doc_list):
    test_texts = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['texts'] for doc_name in doc_list])
    test_header1 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header1'] for doc_name in doc_list])
    test_header2 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header2'] for doc_name in doc_list])
    test_labels = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['labels'] for doc_name in doc_list])
    
    tokenized_texts = text_tokenizer.transform(test_texts)
    tokenized_header1 = header_tokenizer.transform(test_header1)
    tokenized_header2 = header_tokenizer.transform(test_header2)
    X_test = hstack([tokenized_texts, tokenized_header1, tokenized_header2])
    Y_test = test_labels
    pred = model.predict(X_test)
    cm = np.array(confusion_matrix(Y_test, pred))
    
    # Diagonal elemetns were correctly classified
    diagonal = cm.diagonal()
    
    # Input class Counts
    class_sum = cm.sum(axis=1)
    
    # Predicted class counts
    pred_sum = cm.sum(axis=0)
    
    # Per-class performance w/ no-examples -> 0 perf
    precision = np.where(class_sum == 0, 0, diagonal/class_sum)
    recall = np.where(pred_sum == 0, 0, diagonal/pred_sum)
    
    # Frequency Weighted Performance
    c_freq = cm.sum(axis=1)/cm.sum()
    pres = c_freq * precision
    rec = c_freq * recall
    
    # Remove 'other' Category
    c_freq = c_freq[0:1] + c_freq[2:]
    pres = pres[0:1] + pres[2:]
    rec = rec[0:1] + rec[2:]
    
    return pres.sum()/c_freq.sum(), rec.sum()/c_freq.sum()

def cross_validation(doc_list, train_algo, test_algo, k, verbose=False, config=None):
    N = len(doc_list)
    size = N//k
    indeces = list(range(N))
    random.shuffle(indeces)
    all_indeces = set(indeces)
    
    pres_list = []
    rec_list = []
    
    for j in range(N//size):
        train_indeces = indeces[j * size:(j + 1) * size] + indeces[size * k + j: size * k + j + 1]
        test_indeces = list(all_indeces - set(train_indeces))
        
        train_docs = [doc_list[i] for i in train_indeces]
        test_docs = [doc_list[i] for i in test_indeces]
        
        if verbose:
            print('Fold %d starting!'%(j + 1))
        
        m, tt, ht = train_algo(train_docs, config=config)
        pres, rec = test_algo(m, tt, ht, test_docs)
        
        pres_list.append(pres)
        rec_list.append(rec)
        
        if verbose:
            print('precision:', pres)
            print('recall:', rec)
            print('-' * 10 + '\n')
    
    return sum(pres_list)/k, sum(rec_list)/k

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import confusion_matrix, accuracy_score

warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=RuntimeWarning)

best_config = None
best_f1 = -1

count = 0
for stop_config in vectorizer_stopwords_configs:
    for ngram_config in ngram_configs:
        for min_df_config in min_df_configs:
            for tfidf_config in tfidf_configs:
                count += 1
                config = {
                            'stop_config': stop_config,
                            'ngram_config': ngram_config,
                            'tfidf_config': tfidf_config,
                            'min_df_config': min_df_config
                         }
                print('Progress: ' + str(count) + '/' + str(total_config_count), '\t =>', count/total_config_count)
                print('Testing configuration:', config)
                pres, rec = cross_validation(list(processed_documents), multi_svm_train, multi_svm_test, 10, verbose=False)
                f1 = 2 * (pres * rec)/(pres + rec)
                
                print('Precision: %f \t Recall: %f, F1: %f'%(pres, rec, f1))

                if f1 > best_f1:
                    best_config = config
                    best_f1 = f1


save_value('best_config_header', config, path=checkpoint_path)
save_value('best_score_header', f1, path=checkpoint_path)

In [None]:
config = load_value('best_config_header', path=checkpoint_path)
acc = load_value('best_score_header', path=checkpoint_path)
print(config)
print(acc)

In [None]:
config = load_value('best_config_header', path=checkpoint_path)

avg_precision, avg_recall = cross_validation(list(processed_documents), multi_svm_train, multi_svm_test, 10, verbose=True, config=config)
print('Average Precision:', avg_precision)
print('Average Recall:', avg_recall)

In [None]:
load_value('best_header_config', path=checkpoint_path)

# Single Concept Classification

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack, vstack
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import *
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

def svm_train(train_docs, data, label, config=None):
    train_data = data.loc[data['document'].isin(train_docs)]
    
    if config is None:
        config = {
            'ngram_config': (1, 4),
            'stop_config': 'english',
            'tfidf_config': True
        }
        
    text_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    head1_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    head2_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    
    tokenized_texts = text_tokenizer.fit_transform(train_data['text'])
    tokenized_head1 = head1_tokenizer.fit_transform(train_data['head1'])
    tokenized_head2 = head2_tokenizer.fit_transform(train_data['head2'])
    
    X_train = hstack([tokenized_texts, tokenized_head1, tokenized_head2])
    Y_train = (train_data['label'] == label)
    
    model = Pipeline([('tfidf', TfidfTransformer(use_idf=config['tfidf_config'])), ('clf', LinearSVC(class_weight="balanced"))])
    model.fit(X_train, Y_train)
    
    return {
        'model': model,
        'tt': text_tokenizer,
        'h1t': head1_tokenizer,
        'h2t': head2_tokenizer,
        'label': label
    }

def svm_test(test_docs, data, params):
    
    test_data = data.loc[data['document'].isin(test_docs)]
    
    tt = params['tt'].transform(test_data['text'])
    th1 = params['h1t'].transform(test_data['head1'])
    th2 = params['h2t'].transform(test_data['head2'])
    
    X_test = hstack([tt, th1, th2])
    Y_test = (test_data['label'] == params['label'])
   
    pred = params['model'].predict(X_test)
    cm = np.array(confusion_matrix(Y_test, pred))
    
    # Diagonal elemetns were correctly classified
    diagonal = cm.diagonal()
    
    # Input class Counts
    class_sum = cm.sum(axis=1)
    
    # Predicted class counts
    pred_sum = cm.sum(axis=0)
    
    # Per-class performance w/ no-examples -> 0 perf
    precision = np.where(class_sum == 0, 0, diagonal/class_sum)
    recall = np.where(pred_sum == 0, 0, diagonal/pred_sum)
    
    # Frequency Weighted Performance
    c_freq = cm.sum(axis=1)/cm.sum()
    pres = c_freq * precision
    rec = c_freq * recall
    
    # Remove 'other' Category
    c_freq = c_freq[1:]
    pres = pres[1:] 
    rec = rec[1:]
    
    return pres.sum()/c_freq.sum(), rec.sum()/c_freq.sum(), cm

In [21]:
labels = sorted(pd.unique(data['label']))
documents = pd.unique(data['document'])
train_docs = documents[:3]
test_docs = documents[3:]

data_train = data.loc[data['document'].isin(train_docs)]
data_test = data.loc[data['document'].isin(test_docs)]

In [22]:
test_docs

array(['abasaglar-previously-abasria-epar-product-information_en',
       'aclasta-epar-product-information_en'], dtype=object)

In [23]:
import warnings
# Ignore division by zero when calculating F1 score
warnings.filterwarnings(action='ignore', category=RuntimeWarning) 

for l in labels:
    if l != 'other':
        print('=' * 10 , 'Testing Label:', l, '=' * 10)
        print()
        
        
        params = svm_train(train_docs, data, l)
        precision, recall, cm = svm_test(test_docs, data, params)
        
        train_count = (data_train['label'] == l).sum()
        test_count = (data_test['label'] == l).sum()
        
        print('Confusion Matrix:')
        print(cm)
        print()
        
        print('Precision:', precision)
        print('Recall:', recall)
        print('F1:', 2 * (precision * recall)/(precision + recall))
        print()
        
        print('Training Examples Count:', train_count)
        print('Test Examples Count:', test_count)
        print()   


Confusion Matrix:
[[171   0]
 [  1   2]]

Precision: 0.6666666666666666
Recall: 1.0
F1: 0.8

Training Examples Count: 6
Test Examples Count: 3


Confusion Matrix:
[[163   6]
 [  4   1]]

Precision: 0.2
Recall: 0.14285714285714285
F1: 0.16666666666666666

Training Examples Count: 31
Test Examples Count: 5


Confusion Matrix:
[[172   0]
 [  2   0]]

Precision: 0.0
Recall: 0.0
F1: nan

Training Examples Count: 43
Test Examples Count: 2


Confusion Matrix:
[[171   0]
 [  3   0]]

Precision: 0.0
Recall: 0.0
F1: nan

Training Examples Count: 18
Test Examples Count: 3


Confusion Matrix:
[[161   4]
 [  7   2]]

Precision: 0.2222222222222222
Recall: 0.3333333333333333
F1: 0.26666666666666666

Training Examples Count: 33
Test Examples Count: 9


Confusion Matrix:
[[174]]

Precision: nan
Recall: nan
F1: nan

Training Examples Count: 3
Test Examples Count: 0


Confusion Matrix:
[[167   4]
 [  0   3]]

Precision: 1.0
Recall: 0.42857142857142855
F1: 0.6

Training Examples Count: 14
Test Examples C