In [None]:
import os, sys
sys.path.append(os.path.abspath('./src'))
import utils, json
from tqdm import tqdm
from utils import save_value, load_value, load_env_keys, match_labels

In [None]:
# PATHS
# Root directory that holds the EMA dump, the annotation spreadsheet and the
# checkpoints. The original machine-specific location remains the default;
# set the EMA_DATA_PATH environment variable to run the notebook elsewhere
# without editing this cell.
data_path = os.environ.get('EMA_DATA_PATH', '/scratch/juanmoo1')
EMA_dump_path = os.path.join(data_path, './jsons/EMA_dump.json')
EMA_xmls_path = os.path.join(data_path, './xmls/')
EMA_annotations_path = os.path.join(data_path, './bayer/VendorEMAforMIT/annotations.xlsx')
pickle_dumps_path = os.path.join(data_path, './pickle_dumps/')
# Single pickle file that save_value / load_value are pointed at throughout the notebook
checkpoint_path = os.path.join(pickle_dumps_path, 'checkpoint.pickle')

In [None]:
# Raw Data
# Format (as described by the notebook author):
# {
#     document_name <str>: {
#                             element_text: <str> (raw text),
#                             element_tag: <str> (TEI XML tag)
#                           },
#
#     ...
# }
# NOTE(review): a later cell iterates over element_text and strips each item,
# so it may actually be a list of strings per document — confirm.
#
# Read through a context manager so the file handle is closed as soon as the
# JSON has been parsed (the previous open(...).read() never closed it).
with open(EMA_dump_path, 'r') as dump_file:
    data = json.load(dump_file)


# Labels
# Dict in form:
# {
#     file_name: {
#         texts: [ <str>, ...],
#         labels: [ <str>, ...]
#     },
#
#     ...
#
# }
annotations = utils.parse_spreadsheet(EMA_annotations_path)

## Matching Data to Labels

In [None]:
'''
Iterates through each document in the dataset and compares it to the labels with the same file name. Matching is done using fuzzy string matching unless the exact_matching is set to True.
'''

# Pair the parsed documents (`data`) with the spreadsheet annotations.
# Later cells read the 'paragraphs', 'labels' and 'tags' entries of each result.
labeled_raw_documents = match_labels(data, annotations)
# Store the result under the 'labeled_raw_documents' key of the checkpoint file
# (presumably so the matching does not have to be recomputed — confirm in utils).
save_value('labeled_raw_documents', labeled_raw_documents, path=checkpoint_path)

In [None]:
# Read the matched documents back from the checkpoint file under the same key
# the previous cell saved them with (presumably lets a fresh kernel skip the
# matching step — confirm).
labeled_raw_documents = load_value('labeled_raw_documents', path=checkpoint_path)

### Conjecture:
The labeled text makes up only a small fraction of all the extracted text. Thus, we should expect to find no label for most of the extracted paragraphs.

In [None]:
from matplotlib.pyplot import title, hist
# Per document: (characters of labeled text) / (characters of parsed text)
data_ratios = []
for parsed_doc_name, parsed_doc in data.items():
    # Not every parsed document has an entry in the annotation spreadsheet;
    # skip those instead of failing with a KeyError.
    label_doc = annotations.get(parsed_doc_name)
    if label_doc is None:
        continue

    parsed_text = ''.join(e.strip() for e in parsed_doc['element_text'])
    label_text = ''.join(e.strip() for e in label_doc['texts'])

    # A document with no extracted text has no meaningful ratio and would
    # otherwise trigger a ZeroDivisionError.
    if not parsed_text:
        continue

    ratio = len(label_text)/len(parsed_text)
    data_ratios.append(ratio)
data_ratios.sort()

# Cut lowest 5% and top 5%
start = int(0.05 * len(data_ratios))
end = int(0.95 * len(data_ratios))

# The ratio is labeled/total, so the title names it in that order
title("Labeled Text to Total Text Ratio")
hist(data_ratios[start:end])

### There seem to be missing documents:

In [None]:
# Names known to each side: the annotation spreadsheet and the parsed dump
anames = [name for name in annotations.keys()]
fnames = [name for name in data.keys()]

# Differences in both directions
missing_ann = set(fnames) - set(anames)
missing_docs = set(anames) - set(fnames)

# There's extra files referenced in the annotation spreadsheet
summary_rows = (
    ('anotation names count:', anames),
    ('document count:', fnames),
    ('missing annotations count:', missing_ann),
    ('missing doc count:', missing_docs),
)
for caption, members in summary_rows:
    print(caption, len(members))

print('missing docs:')
for doc_name in missing_docs:
    print('-', doc_name)

## Preprocessing

In [None]:
import re 
def clean_str(string):
    """
    Normalise a raw string for the dataset.

    Line breaks (\\n, \\r) and both quote characters are removed, every
    ASCII digit is replaced by the word "digit", and the result is stripped
    of surrounding whitespace and lower-cased.

    NOTE(review): another `clean_str` is defined in a later cell; when the
    notebook runs top to bottom that later definition replaces this one.
    """
    # Delete the four unwanted characters in a single pass
    dropped = str.maketrans("", "", "\n\r'\"")
    without_noise = string.translate(dropped)
    # Each individual digit becomes the token "digit" (so "12" -> "digitdigit")
    with_digit_tokens = re.sub(r"[0-9]", "digit", without_noise)
    return with_digit_tokens.strip().lower()

In [None]:
import re, spacy

NLP = spacy.load('en_core_web_sm')
MAX_CHARS = 20000

def clean_str(comment):
    """
    Clean a piece of text and return it as space-separated, lower-cased tokens.

    The input is converted with str(), a set of punctuation characters is
    replaced by spaces, runs of spaces / '!' / ',' / '?' are collapsed, the
    text is cut to at most MAX_CHARS characters, and the remainder is split
    with the spaCy tokenizer. Tokens that are exactly a single space are
    dropped.
    """
    # (pattern, replacement) pairs, applied strictly in this order
    substitutions = (
        (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " "),
        (r"[ ]+", " "),
        (r"\!+", "!"),
        (r"\,+", ","),
        (r"\?+", "?"),
    )
    cleaned = str(comment)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Slicing is a no-op for text that is already short enough
    cleaned = cleaned[:MAX_CHARS]

    lowered_tokens = []
    for token in NLP.tokenizer(cleaned):
        if token.text != " ":
            lowered_tokens.append(token.text.lower())
    return ' '.join(lowered_tokens)

In [None]:
# Clean input text
# One entry per document: cleaned paragraph texts plus lower-cased labels and tags
processed_documents = {
    doc_name: {
        'texts': [clean_str(raw) for raw in raw_doc['paragraphs']],
        'labels': [label.lower() for label in raw_doc['labels']],
        'tags': [tag.lower() for tag in raw_doc['tags']],
    }
    for doc_name, raw_doc in labeled_raw_documents.items()
}

In [None]:
# Match text with the previous header
# For every document, two index lists (one slot per text) are computed and
# stored back on the document as 'header_index' and 'header_index_2'.
# A value of -1 means "no header assigned".
for doc_name in processed_documents:
    texts = processed_documents[doc_name]['texts']
    tags = processed_documents[doc_name]['tags']

    # First Header
    # header_index[i] = position of the most recent element tagged 'head'
    # strictly before i; positions up to and including the first 'head' stay -1.
    header_index = [-1] * len(texts)
    last_header = 0

    # Advance to the first element tagged 'head' (or past the end if none exists)
    while(last_header < len(tags) and tags[last_header] != 'head'):
        last_header += 1

    i = last_header + 1

    while i < len(tags):
        # Assign before updating, so a header element points at the header before it
        header_index[i] = last_header
        if tags[i] == 'head':
            last_header = i

        i += 1

    # Second Header
    header2 = [-1] * len(header_index)
    last_header = 0
    # Advance to the first position that received a header in the pass above
    while(last_header < len(tags) and header_index[last_header] == -1):
        last_header += 1

    i = last_header + 1
    while i < len(header2):
        header2[i] = header_index[last_header]
        # NOTE(review): assuming texts and tags have equal length, every position
        # after the first assigned one is also assigned, so last_header becomes i
        # on every iteration and header2[i] ends up equal to header_index[i - 1]
        # (header_index shifted by one). Confirm this is the intended meaning of
        # "second header".
        if header_index[i] != -1:
            last_header = i
        i += 1



    processed_documents[doc_name]['header_index'] = header_index
    processed_documents[doc_name]['header_index_2'] = header2


### Create testing and training sets

In [None]:
from sklearn.model_selection import ShuffleSplit
from functools import reduce
names = list(processed_documents.keys())

# Fixed seed so the document-level split is the same after every
# Restart Kernel -> Run All; change it to draw a different split.
SPLIT_SEED = 0
rs = ShuffleSplit(n_splits=1, test_size = 0.3, random_state=SPLIT_SEED)
split = next(rs.split(names))

# split[0] / split[1] hold the positions of the train / test documents in `names`
train_docs = [names[i] for i in split[0]]
test_docs = [names[i] for i in split[1]]


def _collect(doc_names, field):
    """Concatenate processed_documents[name][field] over doc_names, in order."""
    # Single flat comprehension instead of repeated list concatenation via reduce
    return [item for name in doc_names for item in processed_documents[name][field]]


X_train = _collect(train_docs, 'texts')
Y_train = _collect(train_docs, 'labels')


X_test = _collect(test_docs, 'texts')
Y_test = _collect(test_docs, 'labels')

X = X_train + X_test
Y = Y_train + Y_test

# Count Tokenization

In [None]:
# Text + Header
from sklearn.feature_extraction.text import *

# Count Tokenizer
# Gather the distinct paragraph texts and the distinct texts referenced as headers
all_texts = set()
all_headers = set()
for doc_name, doc in processed_documents.items():
    texts = doc['texts']
    labels = doc['labels']
    header_index = doc['header_index']

    all_texts.update(texts)
    all_headers.update(texts[i] for i in header_index if i != -1)

# Sorted lists give every text / header a stable position
all_texts = sorted(all_texts)
all_headers = sorted(all_headers)

# Reverse lookups: string -> position in the sorted list
text_index = dict(zip(all_texts, range(len(all_texts))))
header_index = dict(zip(all_headers, range(len(all_headers))))

In [None]:
# Two independent vectorizers with the same n-gram window (1- to 4-grams):
# one fitted on the paragraph texts, one on the header texts.
text_vectorizer, header_vectorizer = (
    CountVectorizer(ngram_range=(1, 4)) for _ in range(2)
)

# Each name is rebound from the sorted list of strings to the fitted
# vectorizer's output for that list.
all_texts = text_vectorizer.fit_transform(all_texts)
all_headers = header_vectorizer.fit_transform(all_headers)

## Remove data classes with insufficient examples

In [None]:
from collections import Counter

# Occurrences of every label across train + test, counted in one pass
# (the previous Y.count(...) per distinct label rescanned Y each time).
Y_count = Counter(Y)

min_count = 10


def _keep_frequent(samples, targets):
    """Return (samples, targets) as tuples, keeping only pairs whose label
    occurs at least min_count times in Y_count."""
    kept = [(x, y) for x, y in zip(samples, targets) if Y_count[y] >= min_count]
    # zip(*[]) produces nothing to unpack, so the "everything was filtered out"
    # case has to be handled explicitly.
    if not kept:
        return (), ()
    kept_x, kept_y = zip(*kept)
    return kept_x, kept_y


X_train, Y_train = _keep_frequent(X_train, Y_train)
X_test, Y_test = _keep_frequent(X_test, Y_test)

X = X_train + X_test
Y = Y_train + Y_test

In [None]:
# Distinct labels ordered from the rarest to the most frequent
labels = sorted(set(Y), key=Y.count)

# One line per label: its first 20 characters and how often it occurs in Y
for l in labels:
    occurrences = Y.count(l)
    print('-', l[:20], '| frequency:', occurrences)

# Multi-Class SVM

In [None]:
#pipeline of feature engineering and model
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import *
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

# Token counts -> TF-IDF weighting -> one-vs-rest linear SVM with balanced class weights
pipeline_steps = [
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced'))),
]
model = Pipeline(pipeline_steps)

## Param Search

In [None]:
# Parameter selection
# Params 1 #
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
               'tfidf__use_idf': [False, True],
               # An integer min_df has to be >= 1 in current scikit-learn releases.
               # 1 keeps every term, which is what the former value 0 expressed.
               'vectorizer__min_df': [1, 0.0001, 0.00001],
               'vectorizer__stop_words':[None, 'english']
             }
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
# Search on the training portion only: fitting on X/Y (train + test) would let
# the held-out test examples influence the hyper-parameter choice.
gs_clf_svm = gs_clf_svm.fit(X_train, Y_train)

print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

# Save best found configuration
save_value('best_params1', gs_clf_svm.best_params_, path=checkpoint_path)
save_value('best_score1', gs_clf_svm.best_score_, path=checkpoint_path)

In [None]:
# NOTE(review): the stored search results are loaded here but not applied;
# the pipeline below is configured by hand.
best_params = load_value('best_params1', path=checkpoint_path)
best_score = load_value('best_score1', path=checkpoint_path)

#Training of Final Model
# min_df=1 keeps every term; an integer 0 is rejected by current scikit-learn releases.
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,4), min_df = 1, stop_words=None)),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

model.fit(X_train, Y_train)
#Test
pred = model.predict(X_test)

from sklearn.metrics import confusion_matrix, accuracy_score
# Rows of cm are true classes, columns are predicted classes
cm = confusion_matrix(Y_test, pred)


accuracy = accuracy_score(Y_test, pred)

class_count = len(cm)

# Number of test examples per true class
row_totals = [sum(cm[i]) for i in range(class_count)]

# Fraction of each true class that was predicted correctly (0 for classes without examples)
class_accuracy = [cm[i][i]/row_totals[i] if row_totals[i] > 0 else 0 for i in range(class_count)]
# Per-class accuracy weighted back by class size
w_acc = [class_accuracy[i] * row_totals[i] for i in range(class_count)]

# Position of the 'other' class inside cm.
# NOTE(review): assumes 'other' is the second label in confusion_matrix's
# label ordering — confirm against the actual label set.
OTHER_CLASS_INDEX = 1
if class_count > OTHER_CLASS_INDEX:
    correct_no_other = sum(w_acc) - w_acc[OTHER_CLASS_INDEX]
    total_no_other = sum(row_totals) - row_totals[OTHER_CLASS_INDEX]
else:
    # Fewer classes than expected: there is nothing to exclude
    correct_no_other = sum(w_acc)
    total_no_other = sum(row_totals)
# Guard against an empty remainder instead of dividing by zero
acc_no_other = correct_no_other/total_no_other if total_no_other > 0 else 0

print(cm)
print('Class Acuracy:', class_accuracy)
print('Overall Accuracy:', accuracy)
print('Accuracy excluding other:', acc_no_other)

In [None]:
# Per true label: [number of test examples, number predicted correctly]
categories = dict()
for real, prediction in zip(Y_test, pred):
    tally = categories.setdefault(real, [0, 0])
    tally[0] += 1
    tally[1] += (real == prediction)

# Report accuracy and support for every label seen in the test set
for c, (tot, corr) in categories.items():
    acc = corr/tot
    print(c + ': acc', acc, 'total:', tot)

## Box Search with Header-Augmented Features

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack, vstack
from functools import reduce

# Search Params
ngram_configs = [(1, 1), (2, 2), (1, 2), (1, 3), (1, 4), (1, 5)]
tfidf_configs = [True, False]
vectorizer_stopwords_configs = ['english', None]
min_df_configs = [0, 10**(-3)]

# Size of the full grid = product of the number of options per parameter
total_config_count = 1
for options in (ngram_configs, tfidf_configs, vectorizer_stopwords_configs, min_df_configs):
    total_config_count *= len(options)

# Vocab & Data Lists
texts = set()
headers = set()

text_list = []
label_list = []
header1_list = []
header2_list = []

for doc_name, doc in processed_documents.items():
    doc_texts = doc['texts']
    texts |= set(doc_texts)
    headers.update(doc_texts[i] for i in doc['header_index'] if i != -1)

    # Resolve header positions to header text; -1 (no header) becomes ""
    header1 = [doc_texts[i] if i != -1 else "" for i in doc['header_index']]
    header2 = [doc_texts[i] if i != -1 else "" for i in doc['header_index_2']]

    # Stored on the document so the train / test helpers can read them later
    doc.update(header1=header1, header2=header2)

    text_list += doc_texts
    label_list += doc['labels']
    header1_list += header1
    header2_list += header2

save_value('processed_documents', processed_documents, path=checkpoint_path)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import random
import numpy as np

def multi_svm_train(doc_list, config=None):
    """
    Train a one-vs-rest linear SVM on text + header features of the given documents.

    Parameters
    ----------
    doc_list : iterable of document names (keys of `processed_documents`).
    config : dict or None
        Reads 'ngram_config', 'stop_config', 'tfidf_config' and, when present,
        'min_df_config'. When None, the configuration stored under
        'best_config_header' in the checkpoint file is loaded.

    Returns
    -------
    (model, text_tokenizer, header_tokenizer) : the fitted pipeline and the two
    fitted CountVectorizers needed to featurise new documents the same way.
    """
    def gather(field):
        # Flat list of `field` over all requested documents, in order
        # (one pass, instead of repeated list concatenation via reduce).
        return [item for doc_name in doc_list for item in processed_documents[doc_name][field]]

    train_texts = gather('texts')
    train_header1 = gather('header1')
    train_header2 = gather('header2')
    train_labels = gather('labels')

    if config is None:
        config = load_value('best_config_header', path=checkpoint_path)

    # min_df_config was part of the search grid but never reached the vectorizers.
    # A missing entry or 0 maps to 1 (keep every term), since an integer 0 is not
    # accepted by current scikit-learn releases.
    min_df = config.get('min_df_config') or 1

    text_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'], min_df=min_df)
    header_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'], min_df=min_df)

    tokenized_texts = text_tokenizer.fit_transform(train_texts)
    # The header vocabulary is learned from header1 only and reused for header2
    tokenized_header1 = header_tokenizer.fit_transform(train_header1)
    tokenized_header2 = header_tokenizer.transform(train_header2)
    # Feature layout: [text counts | header1 counts | header2 counts]
    X_train = hstack([tokenized_texts, tokenized_header1, tokenized_header2])
    Y_train = train_labels

    model = Pipeline([('tfidf', TfidfTransformer(use_idf=config['tfidf_config'])), ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])
    model.fit(X_train, Y_train)

    return (model, text_tokenizer, header_tokenizer)

def multi_svm_test(model, text_tokenizer, header_tokenizer, doc_list, other_index=1):
    """
    Evaluate a model produced by `multi_svm_train` on the given documents.

    Parameters
    ----------
    model, text_tokenizer, header_tokenizer : the tuple returned by the training helper.
    doc_list : iterable of document names (keys of `processed_documents`).
    other_index : int or None, default 1
        Row/column of the confusion matrix to leave out of the averages.
        NOTE(review): the default assumes the 'other' class is the second label
        in confusion_matrix's ordering for this fold — confirm. Pass None to
        keep every class.

    Returns
    -------
    (precision, recall) : per-class precision and recall, averaged with weights
    equal to each class's share of the true labels, over the classes kept.
    """
    def gather(field):
        # Flat list of `field` over all requested documents, in order
        return [item for doc_name in doc_list for item in processed_documents[doc_name][field]]

    test_texts = gather('texts')
    test_header1 = gather('header1')
    test_header2 = gather('header2')
    test_labels = gather('labels')

    tokenized_texts = text_tokenizer.transform(test_texts)
    tokenized_header1 = header_tokenizer.transform(test_header1)
    tokenized_header2 = header_tokenizer.transform(test_header2)
    X_test = hstack([tokenized_texts, tokenized_header1, tokenized_header2])
    Y_test = test_labels
    pred = model.predict(X_test)
    # Rows are true classes, columns are predicted classes
    cm = np.array(confusion_matrix(Y_test, pred))

    total = cm.sum()
    if total == 0:
        # Nothing was evaluated
        return 0.0, 0.0

    # Diagonal elements were correctly classified
    diagonal = cm.diagonal().astype(float)

    # True class counts (row sums) and predicted class counts (column sums)
    class_sum = cm.sum(axis=1)
    pred_sum = cm.sum(axis=0)

    # precision = correct / predicted, recall = correct / actual.
    # np.divide with `where` leaves 0 for empty classes without dividing by zero.
    precision = np.divide(diagonal, pred_sum, out=np.zeros_like(diagonal), where=pred_sum != 0)
    recall = np.divide(diagonal, class_sum, out=np.zeros_like(diagonal), where=class_sum != 0)

    # Share of the true labels belonging to each class
    c_freq = class_sum/total

    # Remove 'other' Category with a boolean mask. Adding two array slices with
    # `+` would add them element-wise instead of concatenating them.
    keep = np.ones(len(c_freq), dtype=bool)
    if other_index is not None and 0 <= other_index < len(keep):
        keep[other_index] = False

    kept_freq = c_freq[keep].sum()
    if kept_freq == 0:
        return 0.0, 0.0

    pres = (c_freq * precision)[keep].sum()/kept_freq
    rec = (c_freq * recall)[keep].sum()/kept_freq
    return pres, rec

def cross_validation(doc_list, train_algo, test_algo, k, verbose=False, config=None):
    """
    Run k-fold cross-validation over documents.

    The documents are shuffled and dealt into k folds whose sizes differ by at
    most one. Each fold is held out once for testing while the remaining
    documents are used for training.

    Parameters
    ----------
    doc_list : sequence of document identifiers.
    train_algo : callable(train_docs, config=...) -> (model, text_tok, header_tok)
    test_algo : callable(model, text_tok, header_tok, test_docs) -> (precision, recall)
    k : int, number of folds (2 <= k <= len(doc_list)).
    verbose : bool, print per-fold progress and scores.
    config : passed through unchanged to train_algo.

    Returns
    -------
    (mean precision, mean recall) over the k folds.

    Raises
    ------
    ValueError if k < 2 or there are fewer documents than folds.
    """
    N = len(doc_list)
    if k < 2:
        raise ValueError('cross_validation needs k >= 2, got %r' % (k,))
    if N < k:
        raise ValueError('cannot split %d documents into %d folds' % (N, k))

    indeces = list(range(N))
    random.shuffle(indeces)

    # Fold j takes every k-th shuffled index starting at j: exactly k disjoint
    # folds that together cover every document once.
    folds = [indeces[j::k] for j in range(k)]

    pres_list = []
    rec_list = []

    for j, test_indeces in enumerate(folds):
        # The fold is the held-out test set; everything else is used for training
        held_out = set(test_indeces)
        train_indeces = [i for i in indeces if i not in held_out]

        train_docs = [doc_list[i] for i in train_indeces]
        test_docs = [doc_list[i] for i in test_indeces]

        if verbose:
            print('Fold %d starting!'%(j + 1))

        m, tt, ht = train_algo(train_docs, config=config)
        pres, rec = test_algo(m, tt, ht, test_docs)

        pres_list.append(pres)
        rec_list.append(rec)

        if verbose:
            print('precision:', pres)
            print('recall:', rec)
            print('-' * 10 + '\n')

    # Average over the folds that were actually run
    return sum(pres_list)/len(pres_list), sum(rec_list)/len(rec_list)

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import confusion_matrix, accuracy_score

from itertools import product

warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=RuntimeWarning)

best_config = None
best_f1 = -1

# Same visiting order as four nested loops: stop words, n-grams, min_df, tf-idf
search_grid = product(vectorizer_stopwords_configs, ngram_configs, min_df_configs, tfidf_configs)

count = 0
for stop_config, ngram_config, min_df_config, tfidf_config in search_grid:
    count += 1
    config = {
                'stop_config': stop_config,
                'ngram_config': ngram_config,
                'tfidf_config': tfidf_config,
                'min_df_config': min_df_config
             }
    print('Progress: ' + str(count) + '/' + str(total_config_count), '\t =>', count/total_config_count)
    print('Testing configuration:', config)
    # The candidate must be handed to cross_validation explicitly; without it
    # the training helper falls back to the configuration stored in the
    # checkpoint and every grid point evaluates the same model.
    pres, rec = cross_validation(list(processed_documents), multi_svm_train, multi_svm_test, 10, verbose=False, config=config)
    # F1 is defined as 0 when both precision and recall are 0
    f1 = 2 * (pres * rec)/(pres + rec) if (pres + rec) > 0 else 0.0

    print('Precision: %f \t Recall: %f, F1: %f'%(pres, rec, f1))

    if f1 > best_f1:
        best_config = config
        best_f1 = f1


# Persist the best configuration found, not whichever was evaluated last
if best_config is not None:
    save_value('best_config_header', best_config, path=checkpoint_path)
    save_value('best_score_header', best_f1, path=checkpoint_path)

In [None]:
# Fetch the stored header-search configuration and its score, then show both
config, acc = (
    load_value(key, path=checkpoint_path)
    for key in ('best_config_header', 'best_score_header')
)
print(config, acc, sep='\n')

In [None]:
# Re-run 10-fold cross-validation with the stored configuration, printing per-fold scores
config = load_value('best_config_header', path=checkpoint_path)

cv_docs = list(processed_documents)
avg_precision, avg_recall = cross_validation(
    cv_docs,
    multi_svm_train,
    multi_svm_test,
    10,
    verbose=True,
    config=config,
)
for caption, value in (('Average Precision:', avg_precision), ('Average Recall:', avg_recall)):
    print(caption, value)

In [None]:
# The configuration is saved and read everywhere else under 'best_config_header';
# the previous key 'best_header_config' did not match any saved value.
load_value('best_config_header', path=checkpoint_path)