In [1]:
import os, sys
sys.path.append(os.path.abspath('./src'))
import utils, json
from tqdm import tqdm
from utils import *
from linear_model import *
import pandas as pd
import numpy as np

In [2]:
# PATHS
data_path = '/scratch/juanmoo1/bayer'

EMA_annotations_path = os.path.join(data_path, 'VendorEMAforMIT/newLabels/annotations.xlsx')
EMA_old_annotations_path = os.path.join(data_path, 'VendorEMAforMIT/annotations.xlsx')

EMA_old_parsed_path = os.path.join(data_path, './VendorEMAforMIT/Labels/parsed.json')
EMA_parsed_path = os.path.join(data_path, './VendorEMAforMIT/newLabels/parsed.json')

pickle_dumps_path = os.path.join(data_path, 'pickle_dumps/')
checkpoint_path = os.path.join(pickle_dumps_path, 'checkpoint.pickle')
shared_path = os.path.join('/scratch/juanmoo1/shared')

In [3]:
'''
Parsed Data

Format: 
{
    document_name <str> : {
        [
            {
                "section": <str>,
                "subsection": <str>,
                "header": <str>,
                "subheader": <str>,
                "text": <str>
            },
            
            ...
            
        ]
    },
    
    ...
    
}
'''
data = load_parsed_file(EMA_parsed_path)
old_data = load_parsed_file(EMA_old_parsed_path)


# Labels
'''
Dict in form:
{
    file_name: {
        texts: [ <str>, ...],
        labels: [ <str>, ...]
    },
    
    ...
    
}
'''

annotations = utils.parse_spreadsheet(EMA_annotations_path)
old_annotations = utils.parse_spreadsheet(EMA_old_annotations_path)

## Matching Data to Labels

In [10]:
'''
Iterates through each document in the dataset and compares is to labels with the same file name. Matching is done using fuzzy string matching unless the exact_matching is set to True.
'''
labels = match_labels(data, annotations, exact_match=False)
old_labels = match_labels(old_data, old_annotations, exact_match=False)

save_value('data', data, path=checkpoint_path)
save_value('old_data', old_data, path=checkpoint_path)
save_value('labels', labels, path=checkpoint_path)
save_value('old_labels', old_labels, path=checkpoint_path)

100%|██████████| 5/5 [05:20<00:00, 64.14s/it]
100%|██████████| 68/68 [16:26<00:00, 14.51s/it] 


#### Load Precomputed

In [4]:
data = load_value('data', path=checkpoint_path)
old_data = load_value('old_data', path=checkpoint_path)
labels = load_value('labels', path=checkpoint_path)
old_labels = load_value('old_labels', path=checkpoint_path)

## Preprocessing

In [5]:
# Clean input text
data = tokenize_matches(data)
old_data = tokenize_matches(old_data)

# Single Concept Classification

In [6]:
documents = pd.unique(data['doc_name'])
train_docs = documents[:3]
test_docs = documents[3:]

In [8]:
data_train = data.loc[data['doc_name'].isin(train_docs)]
data_test = data.loc[data['doc_name'].isin(test_docs)]

import warnings
# Ignore division by zero when calculating F1 score
warnings.filterwarnings(action='ignore', category=RuntimeWarning) 
output_file = os.path.join(shared_path, 'trainNew_testNew.txt')

with open(output_file, 'w') as outFile:
    
    for l in labels:
        if l != 'other':
            summary = '=' * 20 + ' Testing Label: ' + str(l) + ' ' + '=' * 20 + '\n'
            out = ''
            

            train_count = data_train[l].sum()
            test_count = data_test[l].sum()
                        

            if train_count > 1:
                
                params = svm_train(data_train, l)
                output = svm_test(data_test, params, verbose=True)

                precision = output['precision']
                recall = output['recall']
                cm = output['cm']             

                all_predicted = output['all_predicted']
                actual_positive = output['actual_positive']
                true_positive = output['true_positive']
                false_positive = output['false_positive']
                false_negative = output['false_negative']


                summary += 'Confussion Matrix: \n'
                summary += str(cm) + '\n'
                
                
                
                summary += 'Precision: ' + str(precision) + '\n'
                summary += 'Recall: ' + str(recall) + '\n'
                summary += 'F1: ' + str(2 * (precision * recall)/(precision + recall)) + '\n'


                summary += 'Training Examples Count: ' + str(train_count) + '\n'
                summary += 'Test Examples Count: ' + str(test_count) + '\n'
                
                example_head = '-' * 20 + ' %s ' + '-' * 20 + '\n'
                example_format = '# %d. DOC: %s\nSECTION: %s \nSUBSECTION: %s\n HEADER: %s\nSUBHEADER: %s \nTEXT: %s \n\n\n'
                
                
                out += example_head%('PREDICTED')
                for index, (doc, sec, subsec, head, subhead, text) in all_predicted.iterrows():
                    out += example_format%(index, doc, sec, subsec, head, subhead, text)
                out += '\n'
                
                out += example_head%('TRUE POSITIVE')
                for index, (doc, sec, subsec, head, subhead, text) in true_positive.iterrows():
                    out += example_format%(index, doc, sec, subsec, head, subhead, text)
                out += '\n'
                    
                out += example_head%('FALSE NEGATIVE')
                for index, (doc, sec, subsec, head, subhead, text) in false_negative.iterrows():
                    out += example_format%(index, doc, sec, subsec, head, subhead, text)
                out += '\n'
                    
                out += example_head%('FALSE POSITIVE')
                for index, (doc, sec, subsec, head, subhead, text) in false_positive.iterrows():
                    out += example_format%(index, doc, sec, subsec, head, subhead, text)
                out += '\n'
                
            else:
                summary += 'There were only ' + str(train_count) + ' training examples. 2 or more are needed to train the model.'
                summary += '\n'
            
            print(summary)
            outFile.write(summary + '\n')
            outFile.write(out)



Confussion Matrix: 
[[805   1]
 [  0   4]]
Precision: 0.8
Recall: 1.0
F1: 0.888888888888889
Training Examples Count: 3
Test Examples Count: 4

Confussion Matrix: 
[[730   3]
 [ 72   5]]
Precision: 0.625
Recall: 0.06493506493506493
F1: 0.11764705882352941
Training Examples Count: 12
Test Examples Count: 77

Confussion Matrix: 
[[727   1]
 [ 79   3]]
Precision: 0.75
Recall: 0.036585365853658534
F1: 0.0697674418604651
Training Examples Count: 6
Test Examples Count: 82

Confussion Matrix: 
[[786   1]
 [ 18   5]]
Precision: 0.8333333333333333
Recall: 0.21739130434782608
F1: 0.3448275862068965
Training Examples Count: 12
Test Examples Count: 23

Confussion Matrix: 
[[734   3]
 [ 61  12]]
Precision: 0.8
Recall: 0.1643835616438356
F1: 0.2727272727272727
Training Examples Count: 23
Test Examples Count: 73

Confussion Matrix: 
[[808   0]
 [  2   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 2
Test Examples Count: 2

Confussion Matrix: 
[[794   0]
 [  9   7]]
Precision: 1.0
Reca

# Train Old / Test New

In [9]:
data_train = old_data
data_test = data.loc[data['doc_name'].isin(test_docs)]

import warnings
# Ignore division by zero when calculating F1 score
warnings.filterwarnings(action='ignore', category=RuntimeWarning) 
output_file = os.path.join(shared_path, 'trainOld_testNew.txt')

with open(output_file, 'w') as outFile:
    
    for l in labels:
        if l != 'other':
            summary = '=' * 20 + ' Testing Label: ' + str(l) + ' ' + '=' * 20 + '\n'
            out = ''
            

            train_count = data_train[l].sum()
            test_count = data_test[l].sum()
                        

            if train_count > 1:
                
                params = svm_train(data_train, l)
                output = svm_test(data_test, params, verbose=True)

                precision = output['precision']
                recall = output['recall']
                cm = output['cm']             

                all_predicted = output['all_predicted']
                actual_positive = output['actual_positive']
                true_positive = output['true_positive']
                false_positive = output['false_positive']
                false_negative = output['false_negative']


                summary += 'Confussion Matrix: \n'
                summary += str(cm) + '\n'
                
                
                
                summary += 'Precision: ' + str(precision) + '\n'
                summary += 'Recall: ' + str(recall) + '\n'
                summary += 'F1: ' + str(2 * (precision * recall)/(precision + recall)) + '\n'


                summary += 'Training Examples Count: ' + str(train_count) + '\n'
                summary += 'Test Examples Count: ' + str(test_count) + '\n'
                
                example_head = '-' * 20 + ' %s ' + '-' * 20 + '\n'
                example_format = '# %d. DOC: %s\nSECTION: %s \nSUBSECTION: %s\n HEADER: %s\nSUBHEADER: %s \nTEXT: %s \n\n\n'
                
                
                out += example_head%('PREDICTED')
                for index, (doc, sec, subsec, head, subhead, text) in all_predicted.iterrows():
                    out += example_format%(index, doc, sec, subsec, head, subhead, text)
                out += '\n'
                
                out += example_head%('TRUE POSITIVE')
                for index, (doc, sec, subsec, head, subhead, text) in true_positive.iterrows():
                    out += example_format%(index, doc, sec, subsec, head, subhead, text)
                out += '\n'
                    
                out += example_head%('FALSE NEGATIVE')
                for index, (doc, sec, subsec, head, subhead, text) in false_negative.iterrows():
                    out += example_format%(index, doc, sec, subsec, head, subhead, text)
                out += '\n'
                    
                out += example_head%('FALSE POSITIVE')
                for index, (doc, sec, subsec, head, subhead, text) in false_positive.iterrows():
                    out += example_format%(index, doc, sec, subsec, head, subhead, text)
                out += '\n'
                
            else:
                summary += 'There were only ' + str(train_count) + ' training examples. 2 or more are needed to train the model.'
                summary += '\n'
            
            print(summary)
            outFile.write(summary + '\n')
            outFile.write(out)



Confussion Matrix: 
[[804   2]
 [  0   4]]
Precision: 0.6666666666666666
Recall: 1.0
F1: 0.8
Training Examples Count: 122
Test Examples Count: 4





Confussion Matrix: 
[[722  11]
 [  1  76]]
Precision: 0.8735632183908046
Recall: 0.987012987012987
F1: 0.9268292682926829
Training Examples Count: 99
Test Examples Count: 77





Confussion Matrix: 
[[718  10]
 [  8  74]]
Precision: 0.8809523809523808
Recall: 0.902439024390244
F1: 0.8915662650602411
Training Examples Count: 82
Test Examples Count: 82





Confussion Matrix: 
[[787   0]
 [ 14   9]]
Precision: 1.0
Recall: 0.391304347826087
F1: 0.5625
Training Examples Count: 27
Test Examples Count: 23





Confussion Matrix: 
[[724  13]
 [  2  71]]
Precision: 0.8452380952380952
Recall: 0.9726027397260275
F1: 0.9044585987261148
Training Examples Count: 109
Test Examples Count: 73

Confussion Matrix: 
[[803   5]
 [  0   2]]
Precision: 0.2857142857142857
Recall: 1.0
F1: 0.4444444444444445
Training Examples Count: 27
Test Examples Count: 2





Confussion Matrix: 
[[794   0]
 [  5  11]]
Precision: 1.0
Recall: 0.6875
F1: 0.8148148148148148
Training Examples Count: 252
Test Examples Count: 16





Confussion Matrix: 
[[806   0]
 [  2   2]]
Precision: 1.0
Recall: 0.5
F1: 0.6666666666666666
Training Examples Count: 28
Test Examples Count: 4

Confussion Matrix: 
[[709   5]
 [ 50  46]]
Precision: 0.9019607843137255
Recall: 0.47916666666666663
F1: 0.6258503401360543
Training Examples Count: 561
Test Examples Count: 96

