In [1]:
import os, sys
sys.path.append(os.path.abspath('./src'))
import utils, json
from tqdm import tqdm
from utils import *
from linear_model import *
import pandas as pd
import numpy as np

In [2]:
# PATHS
data_path = '/scratch/juanmoo1/bayer'

EMA_annotations_path = os.path.join(data_path, 'VendorEMAforMIT/newLabels/annotations.xlsx')
EMA_old_annotations_path = os.path.join(data_path, 'VendorEMAforMIT/annotations.xlsx')

EMA_old_parsed_path = os.path.join(data_path, './VendorEMAforMIT/Labels/parsed.json')
EMA_parsed_path = os.path.join(data_path, './VendorEMAforMIT/newLabels/parsed.json')

pickle_dumps_path = os.path.join(data_path, 'pickle_dumps/')
checkpoint_path = os.path.join(pickle_dumps_path, 'checkpoint.pickle')
shared_path = os.path.join('/scratch/juanmoo1/shared')

In [3]:
'''
Parsed Data

Format: 
{
    document_name <str> : {
        [
            {
                "section": <str>,
                "subsection": <str>,
                "header": <str>,
                "subheader": <str>,
                "text": <str>
            },
            
            ...
            
        ]
    },
    
    ...
    
}
'''
data = load_parsed_file(EMA_parsed_path)
old_data = load_parsed_file(EMA_old_parsed_path)


# Labels
'''
Dict in form:
{
    file_name: {
        texts: [ <str>, ...],
        labels: [ <str>, ...]
    },
    
    ...
    
}
'''

annotations = utils.parse_spreadsheet(EMA_annotations_path)
old_annotations = utils.parse_spreadsheet(EMA_old_annotations_path)

## Matching Data to Labels

In [4]:
'''
Iterates through each document in the dataset and compares is to labels with the same file name. Matching is done using fuzzy string matching unless the exact_matching is set to True.
'''
labels = match_labels(data, annotations, exact_match=True)
old_labels = match_labels(old_data, old_annotations, exact_match=True)

save_value('data', data, path=checkpoint_path)
save_value('old_data', old_data, path=checkpoint_path)
save_value('labels', labels, path=checkpoint_path)
save_value('old_labels', old_labels, path=checkpoint_path)

100%|██████████| 5/5 [00:02<00:00,  1.91it/s]
100%|██████████| 68/68 [00:13<00:00,  4.92it/s]


#### Load Precomputed

In [4]:
data = load_value('data', path=checkpoint_path)
old_data = load_value('old_data', path=checkpoint_path)
labels = load_value('labels', path=checkpoint_path)
old_labels = load_value('old_labels', path=checkpoint_path)

## Preprocessing

In [5]:
# Clean input text
data = tokenize_matches(data)
old_data = tokenize_matches(old_data)

# Single Concept Classification

In [6]:
documents = pd.unique(data['doc_name'])
train_docs = documents[:3]
test_docs = documents[3:]

In [7]:
data_train = data.loc[data['doc_name'].isin(train_docs)]
data_test = data.loc[data['doc_name'].isin(test_docs)]

import warnings
# Ignore division by zero when calculating F1 score
warnings.filterwarnings(action='ignore', category=RuntimeWarning) 
output_file = os.path.join(shared_path, 'trainNew_testNew.txt')

with open(output_file, 'w') as outFile:
    
    for l in labels:
        if l != 'other':
            summary = '=' * 20 + ' Testing Label: ' + str(l) + ' ' + '=' * 20 + '\n'
            out = ''
            

            train_count = data_train[l].sum()
            test_count = data_test[l].sum()
                        

            if train_count > 1:
                
                params = svm_train(data_train, l)
                output = svm_test(data_test, params, verbose=True)

                precision = output['precision']
                recall = output['recall']
                cm = output['cm']             

                all_predicted = output['all_predicted']
                actual_positive = output['actual_positive']
                true_positive = output['true_positive']
                false_positive = output['false_positive']
                false_negative = output['false_negative']


                summary += 'Confussion Matrix: \n'
                summary += str(cm) + '\n'
                
                
                
                summary += 'Precision: ' + str(precision) + '\n'
                summary += 'Recall: ' + str(recall) + '\n'
                summary += 'F1: ' + str(2 * (precision * recall)/(precision + recall)) + '\n'


                summary += 'Training Examples Count: ' + str(train_count) + '\n'
                summary += 'Test Examples Count: ' + str(test_count) + '\n'
                
                example_head = '-' * 20 + ' %s ' + '-' * 20 + '\n'
                example_format = '# %d. DOC: %s\nSECTION: %s \nSUBSECTION: %s\nSUBHEADER: %s \nTEXT: %s \n\n\n'
                
                
                out += example_head%('PREDICTED')
                for index, (doc, sec, subsec, subhead, text) in all_predicted.iterrows():
                    out += example_format%(index, doc, sec, subsec, subhead, text)
                out += '\n'
                
                out += example_head%('TRUE POSITIVE')
                for index, (doc, sec, subsec, subhead, text) in true_positive.iterrows():
                    out += example_format%(index, doc, sec, subsec, subhead, text)
                out += '\n'
                    
                out += example_head%('FALSE NEGATIVE')
                for index, (doc, sec, subsec, subhead, text) in false_negative.iterrows():
                    out += example_format%(index, doc, sec, subsec, subhead, text)
                out += '\n'
                    
                out += example_head%('FALSE POSITIVE')
                for index, (doc, sec, subsec, subhead, text) in false_positive.iterrows():
                    out += example_format%(index, doc, sec, subsec, subhead, text)
                out += '\n'
                
            else:
                summary += 'There were only ' + str(train_count) + ' training examples. 2 or more are needed to train the model.'
                summary += '\n'
            
            print(summary)
            outFile.write(summary + '\n')
            outFile.write(out)



Confussion Matrix: 
[[978   5]
 [  4   5]]
Precision: 0.5555555555555556
Recall: 0.5
F1: 0.5263157894736842
Training Examples Count: 4
Test Examples Count: 9

Confussion Matrix: 
[[899   5]
 [ 75  13]]
Precision: 0.14772727272727273
Recall: 0.7222222222222223
F1: 0.24528301886792456
Training Examples Count: 17
Test Examples Count: 88

Confussion Matrix: 
[[888   3]
 [ 92   9]]
Precision: 0.0891089108910891
Recall: 0.75
F1: 0.15929203539823011
Training Examples Count: 8
Test Examples Count: 101

Confussion Matrix: 
[[954   7]
 [ 26   5]]
Precision: 0.16129032258064516
Recall: 0.4166666666666667
F1: 0.2325581395348837
Training Examples Count: 13
Test Examples Count: 31

Confussion Matrix: 
[[903   5]
 [ 63  21]]
Precision: 0.25
Recall: 0.8076923076923077
F1: 0.38181818181818183
Training Examples Count: 28
Test Examples Count: 84

Confussion Matrix: 
[[989   0]
 [  3   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 2
Test Examples Count: 3

Confussion Matrix: 
[[956   4]


# Train Old / Test New

In [8]:
data_train = old_data
data_test = data.loc[data['doc_name'].isin(test_docs)]

import warnings
# Ignore division by zero when calculating F1 score
warnings.filterwarnings(action='ignore', category=RuntimeWarning) 
output_file = os.path.join(shared_path, 'trainOld_testNew.txt')

with open(output_file, 'w') as outFile:
    
    for l in labels:
        if l != 'other':
            summary = '=' * 20 + ' Testing Label: ' + str(l) + ' ' + '=' * 20 + '\n'
            out = ''
            

            train_count = data_train[l].sum()
            test_count = data_test[l].sum()
                        

            if train_count > 1:
                
                params = svm_train(data_train, l)
                output = svm_test(data_test, params, verbose=True)

                precision = output['precision']
                recall = output['recall']
                cm = output['cm']             

                all_predicted = output['all_predicted']
                actual_positive = output['actual_positive']
                true_positive = output['true_positive']
                false_positive = output['false_positive']
                false_negative = output['false_negative']


                summary += 'Confussion Matrix: \n'
                summary += str(cm) + '\n'
                
                
                
                summary += 'Precision: ' + str(precision) + '\n'
                summary += 'Recall: ' + str(recall) + '\n'
                summary += 'F1: ' + str(2 * (precision * recall)/(precision + recall)) + '\n'


                summary += 'Training Examples Count: ' + str(train_count) + '\n'
                summary += 'Test Examples Count: ' + str(test_count) + '\n'
                
                example_head = '-' * 20 + ' %s ' + '-' * 20 + '\n'
                example_format = '# %d. DOC: %s\nSECTION: %s \nSUBSECTION: %s\nSUBHEADER: %s \nTEXT: %s \n\n\n'
                
                
                out += example_head%('PREDICTED')
                for index, (doc, sec, subsec, subhead, text) in all_predicted.iterrows():
                    out += example_format%(index, doc, sec, subsec, subhead, text)
                out += '\n'
                
                out += example_head%('TRUE POSITIVE')
                for index, (doc, sec, subsec, subhead, text) in true_positive.iterrows():
                    out += example_format%(index, doc, sec, subsec, subhead, text)
                out += '\n'
                    
                out += example_head%('FALSE NEGATIVE')
                for index, (doc, sec, subsec, subhead, text) in false_negative.iterrows():
                    out += example_format%(index, doc, sec, subsec, subhead, text)
                out += '\n'
                    
                out += example_head%('FALSE POSITIVE')
                for index, (doc, sec, subsec, subhead, text) in false_positive.iterrows():
                    out += example_format%(index, doc, sec, subsec, subhead, text)
                out += '\n'
                
            else:
                summary += 'There were only ' + str(train_count) + ' training examples. 2 or more are needed to train the model.'
                summary += '\n'
            
            print(summary)
            outFile.write(summary + '\n')



Confussion Matrix: 
[[972  11]
 [  4   5]]
Precision: 0.5555555555555556
Recall: 0.3125
F1: 0.39999999999999997
Training Examples Count: 186
Test Examples Count: 9





Confussion Matrix: 
[[892  12]
 [  2  86]]
Precision: 0.9772727272727273
Recall: 0.8775510204081631
F1: 0.9247311827956989
Training Examples Count: 116
Test Examples Count: 88





Confussion Matrix: 
[[879  12]
 [ 10  91]]
Precision: 0.9009900990099009
Recall: 0.883495145631068
F1: 0.892156862745098
Training Examples Count: 100
Test Examples Count: 101





Confussion Matrix: 
[[961   0]
 [ 18  13]]
Precision: 0.41935483870967744
Recall: 1.0
F1: 0.5909090909090909
Training Examples Count: 37
Test Examples Count: 31





Confussion Matrix: 
[[895  13]
 [  3  81]]
Precision: 0.9642857142857143
Recall: 0.8617021276595745
F1: 0.9101123595505619
Training Examples Count: 144
Test Examples Count: 84





Confussion Matrix: 
[[984   5]
 [  1   2]]
Precision: 0.6666666666666666
Recall: 0.2857142857142857
F1: 0.4
Training Examples Count: 34
Test Examples Count: 3





Confussion Matrix: 
[[960   0]
 [  6  26]]
Precision: 0.8125
Recall: 1.0
F1: 0.896551724137931
Training Examples Count: 368
Test Examples Count: 32





Confussion Matrix: 
[[987   0]
 [  3   2]]
Precision: 0.4
Recall: 1.0
F1: 0.5714285714285715
Training Examples Count: 35
Test Examples Count: 5

Confussion Matrix: 
[[812  12]
 [ 80  88]]
Precision: 0.5238095238095238
Recall: 0.88
F1: 0.6567164179104478
Training Examples Count: 796
Test Examples Count: 168

