In [1]:
import os, sys
sys.path.append(os.path.abspath('./src'))
import utils, json
from tqdm import tqdm
from utils import save_value, load_value, load_env_keys, match_labels_div, match_labels_p, tokenize_string, clean_matches
import pandas as pd
import numpy as np

In [2]:
# PATHS
data_path = '/scratch/juanmoo1'
shared_path = os.path.join(data_path, './shared')

EMA_dump_path = os.path.join(data_path, './jsons/new_EMA_dump.json')
EMA_old_dump_path = os.path.join(data_path, './jsons/EMA_dump.json')

EMA_annotations_path = os.path.join(data_path, './bayer/VendorEMAforMIT/new_annotations/annotations.xlsx')
EMA_old_annotations_path = os.path.join(data_path, './bayer/VendorEMAforMIT/annotations.xlsx')

pickle_dumps_path = os.path.join(data_path, './pickle_dumps/')
checkpoint_path = os.path.join(pickle_dumps_path, 'checkpoint.pickle')

In [3]:
# Raw Data
'''
Format:
{
    document_name <str>: {
                            element_text: <str> (raw text),
                            element_tag: <str> (TEI XML tag)
                          },
                          
    ...
}
'''
raw_data = json.loads(open(EMA_dump_path, 'r').read())
old_raw_data = json.loads(open(EMA_old_dump_path, 'r').read())


# Labels
'''
Dict in form:
{
    file_name: {
        texts: [ <str>, ...],
        labels: [ <str>, ...]
    },
    
    ...
    
}
'''
annotations = utils.parse_spreadsheet(EMA_annotations_path)
old_annotations = utils.parse_spreadsheet(EMA_old_annotations_path)

## Matching Data to Labels

In [4]:
'''
Iterates through each document in the dataset and compares is to labels with the same file name. Matching is done using fuzzy string matching unless the exact_matching is set to True.
'''

matched_div = match_labels_div(raw_data, annotations, exact_match=True)
# save_value('labeled_raw_documents', labeled_raw_documents, path=checkpoint_path)

matched_div_old = match_labels_div(old_raw_data, old_annotations, exact_match=True)
# save_value('old_raw_labeled_documents', old_labeled_raw_documents, path=checkpoint_path)

100%|██████████| 5/5 [00:00<00:00, 98.08it/s]
100%|██████████| 72/72 [00:00<00:00, 340.80it/s]


In [5]:
matched_p = match_labels_p(raw_data, annotations, exact_match=True)
matched_p_old = match_labels_p(old_raw_data, old_annotations, exact_match=True)

100%|██████████| 5/5 [00:00<00:00, 136.72it/s]
100%|██████████| 72/72 [00:00<00:00, 275.78it/s]


#### Load Precomputed

In [4]:
labeled_raw_documents = load_value('labeled_raw_documents', path=checkpoint_path)
old_labeled_raw_documents = load_value('old_labeled_raw_documents', path=checkpoint_path)

## Preprocessing

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# Clean input text
data_div = clean_matches(matched_div)
old_data_div = clean_matches(matched_div_old)

data_p = clean_matches(matched_p)
old_data_p = clean_matches(matched_p_old)

In [7]:
# One-hot encode labels
mlb = MultiLabelBinarizer()

#Div Data
old_data_div = old_data_div.join(pd.DataFrame(mlb.fit_transform(old_data_div['label']), columns=mlb.classes_, index=old_data_div.index))
data_div = data_div.join(pd.DataFrame(mlb.transform(data_div['label']), columns=mlb.classes_, index=data_div.index))

#P Data
old_data_p = old_data_p.join(pd.DataFrame(mlb.transform(old_data_p['label']), columns=mlb.classes_, index=old_data_p.index))
data_p = data_p.join(pd.DataFrame(mlb.transform(data_p['label']), columns=mlb.classes_, index=data_p.index))

# Multi-Class SVN

In [15]:
#pipeline of feature engineering and model
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import *
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

model = Pipeline([('vectorizer', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('clf', OneVsRestClassifier(LinearSVC(class_weight='balanced')))])

## Box Search with Header augmented feature 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack, vstack
from functools import reduce

# Search Params
ngram_configs = [(1, 1), (2, 2), (1, 2), (1, 3), (1, 4), (1, 5)]
tfidf_configs = [True, False]
vectorizer_stopwords_configs = ['english', None]
min_df_configs = [0] + [10**(-n) for n in range(3, 4)]

total_config_count = len(ngram_configs) * len(tfidf_configs) * len(vectorizer_stopwords_configs) * len(min_df_configs)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import random
import numpy as np

def multi_svm_train(doc_list, config=None):
    train_texts = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['texts'] for doc_name in doc_list])
    train_header1 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header1'] for doc_name in doc_list])
    train_header2 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header2'] for doc_name in doc_list])
    train_labels = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['labels'] for doc_name in doc_list])
    
    if config is None:
        config = load_value('best_config_header', path=checkpoint_path)
        
    text_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    header_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    
    tokenized_texts = text_tokenizer.fit_transform(train_texts)
    tokenized_header1 = header_tokenizer.fit_transform(train_header1)
    tokenized_header2 = header_tokenizer.transform(train_header2)
    X_train = hstack([tokenized_texts, tokenized_header1, tokenized_header2])
    Y_train = train_labels

    model = Pipeline([('tfidf', TfidfTransformer(use_idf=config['tfidf_config'])), ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])
    model.fit(X_train, Y_train)
    
    return (model, text_tokenizer, header_tokenizer)

def multi_svm_test(model, text_tokenizer, header_tokenizer, doc_list):
    test_texts = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['texts'] for doc_name in doc_list])
    test_header1 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header1'] for doc_name in doc_list])
    test_header2 = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['header2'] for doc_name in doc_list])
    test_labels = reduce(lambda x, y: x + y, [[]] + [processed_documents[doc_name]['labels'] for doc_name in doc_list])
    
    tokenized_texts = text_tokenizer.transform(test_texts)
    tokenized_header1 = header_tokenizer.transform(test_header1)
    tokenized_header2 = header_tokenizer.transform(test_header2)
    X_test = hstack([tokenized_texts, tokenized_header1, tokenized_header2])
    Y_test = test_labels
    pred = model.predict(X_test)
    cm = np.array(confusion_matrix(Y_test, pred))
    
    # Diagonal elemetns were correctly classified
    diagonal = cm.diagonal()
    
    # Input class Counts
    class_sum = cm.sum(axis=1)
    
    # Predicted class counts
    pred_sum = cm.sum(axis=0)
    
    # Per-class performance w/ no-examples -> 0 perf
    precision = np.where(class_sum == 0, 0, diagonal/class_sum)
    recall = np.where(pred_sum == 0, 0, diagonal/pred_sum)
    
    # Frequency Weighted Performance
    c_freq = cm.sum(axis=1)/cm.sum()
    pres = c_freq * precision
    rec = c_freq * recall
    
    # Remove 'other' Category
    c_freq = c_freq[0:1] + c_freq[2:]
    pres = pres[0:1] + pres[2:]
    rec = rec[0:1] + rec[2:]
    
    return pres.sum()/c_freq.sum(), rec.sum()/c_freq.sum()

def cross_validation(doc_list, train_algo, test_algo, k, verbose=False, config=None):
    N = len(doc_list)
    size = N//k
    indeces = list(range(N))
    random.shuffle(indeces)
    all_indeces = set(indeces)
    
    pres_list = []
    rec_list = []
    
    for j in range(N//size):
        train_indeces = indeces[j * size:(j + 1) * size] + indeces[size * k + j: size * k + j + 1]
        test_indeces = list(all_indeces - set(train_indeces))
        
        train_docs = [doc_list[i] for i in train_indeces]
        test_docs = [doc_list[i] for i in test_indeces]
        
        if verbose:
            print('Fold %d starting!'%(j + 1))
        
        m, tt, ht = train_algo(train_docs, config=config)
        pres, rec = test_algo(m, tt, ht, test_docs)
        
        pres_list.append(pres)
        rec_list.append(rec)
        
        if verbose:
            print('precision:', pres)
            print('recall:', rec)
            print('-' * 10 + '\n')
    
    return sum(pres_list)/k, sum(rec_list)/k

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import confusion_matrix, accuracy_score

warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=RuntimeWarning)

best_config = None
best_f1 = -1

count = 0
for stop_config in vectorizer_stopwords_configs:
    for ngram_config in ngram_configs:
        for min_df_config in min_df_configs:
            for tfidf_config in tfidf_configs:
                count += 1
                config = {
                            'stop_config': stop_config,
                            'ngram_config': ngram_config,
                            'tfidf_config': tfidf_config,
                            'min_df_config': min_df_config
                         }
                print('Progress: ' + str(count) + '/' + str(total_config_count), '\t =>', count/total_config_count)
                print('Testing configuration:', config)
                pres, rec = cross_validation(list(processed_documents), multi_svm_train, multi_svm_test, 10, verbose=False)
                f1 = 2 * (pres * rec)/(pres + rec)
                
                print('Precision: %f \t Recall: %f, F1: %f'%(pres, rec, f1))

                if f1 > best_f1:
                    best_config = config
                    best_f1 = f1


save_value('best_config_header', config, path=checkpoint_path)
save_value('best_score_header', f1, path=checkpoint_path)

In [None]:
config = load_value('best_config_header', path=checkpoint_path)
acc = load_value('best_score_header', path=checkpoint_path)
print(config)
print(acc)

In [None]:
config = load_value('best_config_header', path=checkpoint_path)

avg_precision, avg_recall = cross_validation(list(processed_documents), multi_svm_train, multi_svm_test, 10, verbose=True, config=config)
print('Average Precision:', avg_precision)
print('Average Recall:', avg_recall)

In [None]:
load_value('best_header_config', path=checkpoint_path)

# Single Concept Classification

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack, vstack
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import *
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

def svm_train(train_data, label, config=None):
#     train_data = data.loc[data['document'].isin(train_docs)]
    
    if config is None:
        config = {
            'ngram_config': (1, 4),
            'stop_config': 'english',
            'tfidf_config': True
        }
        
    text_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    head1_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    head2_tokenizer = CountVectorizer(ngram_range=config['ngram_config'], stop_words=config['stop_config'])
    
    tokenized_texts = text_tokenizer.fit_transform(train_data['text'])
    tokenized_head1 = head1_tokenizer.fit_transform(train_data['head1'])
    tokenized_head2 = head2_tokenizer.fit_transform(train_data['head2'])
    
    X_train = hstack([tokenized_texts, tokenized_head1, tokenized_head2])
    Y_train = train_data[label]
    
    model = Pipeline([('tfidf', TfidfTransformer(use_idf=config['tfidf_config'])), ('clf', LinearSVC(class_weight="balanced"))])
    model.fit(X_train, Y_train)
    
    return {
        'model': model,
        'tt': text_tokenizer,
        'h1t': head1_tokenizer,
        'h2t': head2_tokenizer,
        'label': label
    }

def svm_test(test_data, params):
    
#     test_data = data.loc[data['document'].isin(test_docs)]
    
    tt = params['tt'].transform(test_data['text'])
    th1 = params['h1t'].transform(test_data['head1'])
    th2 = params['h2t'].transform(test_data['head2'])
    
    X_test = hstack([tt, th1, th2])
    Y_test = np.array((test_data[params['label']])).reshape(-1, 1) * 1.0
    
    
    pred = np.array(params['model'].predict(X_test)).reshape(-1, 1) * 1.0
    cm = np.array(confusion_matrix(Y_test, pred))
    
    
    # Diagonal elemetns were correctly classified
    diagonal = cm.diagonal()
    
    # Input class Counts
    class_sum = cm.sum(axis=1)
    
    # Predicted class counts
    pred_sum = cm.sum(axis=0)
    
    # Per-class performance w/ no-examples -> 0 perf
    precision = np.where(class_sum == 0, 0, diagonal/class_sum)
    recall = np.where(pred_sum == 0, 0, diagonal/pred_sum)
    
    # Frequency Weighted Performance
    c_freq = cm.sum(axis=1)/cm.sum()
    pres = c_freq * precision
    rec = c_freq * recall
    
    # Remove 'other' Category
    c_freq = c_freq[1:]
    pres = pres[1:] 
    rec = rec[1:]
    
    output = {
        'precision': pres.sum()/c_freq.sum(),
        'recall': rec.sum()/c_freq.sum(),
        'cm': cm,
        'all_predicted': test_data.loc[pred > 0][['document', 'head2', 'head1', 'text']],
        'actual_positive': test_data.loc[Y_test > 0][['document', 'head2', 'head1', 'text']],
        'true_positive': test_data.loc[Y_test * pred > 0][['document', 'head2', 'head1', 'text']],
        'false_positive': test_data.loc[pred * (1 - Y_test) > 0][['document', 'head2', 'head1', 'text']],
        'false_negative': test_data.loc[Y_test * (1 - pred) > 0][['document', 'head2', 'head1', 'text']]
    }
    
    return output

In [9]:
labels = sorted(mlb.classes_)
documents = pd.unique(data_div['document'])
train_docs = documents[:3]
test_docs = documents[3:]

# Train New / Test New
## Per-Div Classification

In [10]:
data = data_div
data_train = data.loc[data['document'].isin(train_docs)]
data_test = data.loc[data['document'].isin(test_docs)]

import warnings
# Ignore division by zero when calculating F1 score
warnings.filterwarnings(action='ignore', category=RuntimeWarning) 
output_file = os.path.join(shared_path, 'trainNew_testNew_div.txt')

with open(output_file, 'w') as outFile:
    
    for l in labels:
        if l != 'other':
            summary = '=' * 20 + ' Testing Label: ' + str(l) + ' ' + '=' * 20 + '\n'
            out = ''
            

            train_count = data_train[l].sum()
            test_count = data_test[l].sum()
                        

            if train_count > 1:
                
                params = svm_train(data_train, l)
                output = svm_test(data_test, params)

                precision = output['precision']
                recall = output['recall']
                cm = output['cm']             

                all_predicted = output['all_predicted']
                actual_positive = output['actual_positive']
                true_positive = output['true_positive']
                false_positive = output['false_positive']
                false_negative = output['false_negative']


                summary += 'Confussion Matrix: \n'
                summary += str(cm) + '\n'
                
                
                
                summary += 'Precision: ' + str(precision) + '\n'
                summary += 'Recall: ' + str(recall) + '\n'
                summary += 'F1: ' + str(2 * (precision * recall)/(precision + recall)) + '\n'


                summary += 'Training Examples Count: ' + str(train_count) + '\n'
                summary += 'Test Examples Count: ' + str(test_count) + '\n'
                
                example_head = '-' * 20 + ' %s ' + '-' * 20 + '\n'
                example_format = '# %d. DOC: %s\nHEAD2: %s \nHEAD1: %s\nTEXT: %s\n\n\n'
                
                
                out += example_head%('PREDICTED')
                for index, (doc, h2, h1, t) in all_predicted.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                
                out += example_head%('TRUE POSITIVE')
                for index, (doc, h2, h1, t) in true_positive.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                    
                out += example_head%('FALSE NEGATIVE')
                for index, (doc, h2, h1, t) in false_negative.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                    
                out += example_head%('FALSE POSITIVE')
                for index, (doc, h2, h1, t) in false_positive.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                
            else:
                summary += 'There were only ' + str(train_count) + ' training examples. 2 or more are needed to train the model.'
                summary += '\n'
            
            print(summary)
            outFile.write(summary + '\n')
            outFile.write(out)

Confussion Matrix: 
[[163   0]
 [  1   2]]
Precision: 0.6666666666666666
Recall: 1.0
F1: 0.8
Training Examples Count: 4
Test Examples Count: 3

Confussion Matrix: 
[[159   3]
 [  4   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 31
Test Examples Count: 4

Confussion Matrix: 
[[165   0]
 [  1   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 39
Test Examples Count: 1

Confussion Matrix: 
[[163   0]
 [  3   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 15
Test Examples Count: 3

Confussion Matrix: 
[[156   3]
 [  6   1]]
Precision: 0.14285714285714285
Recall: 0.25
F1: 0.18181818181818182
Training Examples Count: 33
Test Examples Count: 7

There were only 0 training examples. 2 or more are needed to train the model.

Confussion Matrix: 
[[166]]
Precision: nan
Recall: nan
F1: nan
Training Examples Count: 2
Test Examples Count: 0

There were only 0 training examples. 2 or more are needed to train the model.

Confussion Matrix: 
[[163   1]


## Per-P Classification

In [11]:
data = data_p
data_train = data.loc[data['document'].isin(train_docs)]
data_test = data.loc[data['document'].isin(test_docs)]

import warnings
# Ignore division by zero when calculating F1 score
warnings.filterwarnings(action='ignore', category=RuntimeWarning) 
output_file = os.path.join(shared_path, 'trainNew_testNew_p.txt')

with open(output_file, 'w') as outFile:
    
    for l in labels:
        if l != 'other':
            summary = '=' * 20 + ' Testing Label: ' + str(l) + ' ' + '=' * 20 + '\n'
            out = ''
            

            train_count = data_train[l].sum()
            test_count = data_test[l].sum()
                        

            if train_count > 1:
                
                params = svm_train(data_train, l)
                output = svm_test(data_test, params)

                precision = output['precision']
                recall = output['recall']
                cm = output['cm']             

                all_predicted = output['all_predicted']
                actual_positive = output['actual_positive']
                true_positive = output['true_positive']
                false_positive = output['false_positive']
                false_negative = output['false_negative']


                summary += 'Confussion Matrix: \n'
                summary += str(cm) + '\n'
                
                
                
                summary += 'Precision: ' + str(precision) + '\n'
                summary += 'Recall: ' + str(recall) + '\n'
                summary += 'F1: ' + str(2 * (precision * recall)/(precision + recall)) + '\n'


                summary += 'Training Examples Count: ' + str(train_count) + '\n'
                summary += 'Test Examples Count: ' + str(test_count) + '\n'
                
                example_head = '-' * 20 + ' %s ' + '-' * 20 + '\n'
                example_format = '# %d. DOC: %s\nHEAD2: %s \nHEAD1: %s\nTEXT: %s\n\n\n'
                
                
                out += example_head%('PREDICTED')
                for index, (doc, h2, h1, t) in all_predicted.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                
                out += example_head%('TRUE POSITIVE')
                for index, (doc, h2, h1, t) in true_positive.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                    
                out += example_head%('FALSE NEGATIVE')
                for index, (doc, h2, h1, t) in false_negative.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                    
                out += example_head%('FALSE POSITIVE')
                for index, (doc, h2, h1, t) in false_positive.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                
            else:
                summary += 'There were only ' + str(train_count) + ' training examples. 2 or more are needed to train the model.'
                summary += '\n'
            
            print(summary)
            outFile.write(summary + '\n')
            outFile.write(out)



Confussion Matrix: 
[[321   0]
 [  0   3]]
Precision: 1.0
Recall: 1.0
F1: 1.0
Training Examples Count: 4
Test Examples Count: 3

Confussion Matrix: 
[[313   3]
 [  7   1]]
Precision: 0.125
Recall: 0.25
F1: 0.16666666666666666
Training Examples Count: 57
Test Examples Count: 8

Confussion Matrix: 
[[323   0]
 [  1   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 62
Test Examples Count: 1

Confussion Matrix: 
[[321   0]
 [  3   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 21
Test Examples Count: 3

Confussion Matrix: 
[[306   3]
 [ 14   1]]
Precision: 0.06666666666666667
Recall: 0.25
F1: 0.10526315789473685
Training Examples Count: 55
Test Examples Count: 15

There were only 0 training examples. 2 or more are needed to train the model.

Confussion Matrix: 
[[324]]
Precision: nan
Recall: nan
F1: nan
Training Examples Count: 2
Test Examples Count: 0

There were only 0 training examples. 2 or more are needed to train the model.





Confussion Matrix: 
[[321   1]
 [  2   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 5
Test Examples Count: 2

There were only 1 training examples. 2 or more are needed to train the model.

Confussion Matrix: 
[[269  10]
 [  7  38]]
Precision: 0.8444444444444444
Recall: 0.7916666666666666
F1: 0.8172043010752689
Training Examples Count: 74
Test Examples Count: 45



# Train Old / Test New
## Per-Div Classification

In [12]:
old_data = old_data_div
new_data = data_div

data_train = old_data.loc[old_data['document'].isin(train_docs)]
data_test = new_data.loc[new_data['document'].isin(test_docs)]

import warnings
# Ignore division by zero when calculating F1 score
warnings.filterwarnings(action='ignore', category=RuntimeWarning) 
output_file = os.path.join(shared_path, 'trainOld_testNew_div.txt')

with open(output_file, 'w') as outFile:
    
    for l in labels:
        if l != 'other':
            summary = '=' * 20 + ' Testing Label: ' + str(l) + ' ' + '=' * 20 + '\n'
            out = ''
            

            train_count = data_train[l].sum()
            test_count = data_test[l].sum()
                        

            if train_count > 1:
                
                params = svm_train(data_train, l)
                output = svm_test(data_test, params)

                precision = output['precision']
                recall = output['recall']
                cm = output['cm']             

                all_predicted = output['all_predicted']
                actual_positive = output['actual_positive']
                true_positive = output['true_positive']
                false_positive = output['false_positive']
                false_negative = output['false_negative']


                summary += 'Confussion Matrix: \n'
                summary += str(cm) + '\n'
                
                
                
                summary += 'Precision: ' + str(precision) + '\n'
                summary += 'Recall: ' + str(recall) + '\n'
                summary += 'F1: ' + str(2 * (precision * recall)/(precision + recall)) + '\n'


                summary += 'Training Examples Count: ' + str(train_count) + '\n'
                summary += 'Test Examples Count: ' + str(test_count) + '\n'
                
                example_head = '-' * 20 + ' %s ' + '-' * 20 + '\n'
                example_format = '# %d. DOC: %s\nHEAD2: %s \nHEAD1: %s\nTEXT: %s\n\n\n'
                
                
                out += example_head%('PREDICTED')
                for index, (doc, h2, h1, t) in all_predicted.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                
                out += example_head%('TRUE POSITIVE')
                for index, (doc, h2, h1, t) in true_positive.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                    
                out += example_head%('FALSE NEGATIVE')
                for index, (doc, h2, h1, t) in false_negative.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                    
                out += example_head%('FALSE POSITIVE')
                for index, (doc, h2, h1, t) in false_positive.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                
            else:
                summary += 'There were only ' + str(train_count) + ' training examples. 2 or more are needed to train the model.'
                summary += '\n'
            
            print(summary)
            outFile.write(summary + '\n')
            outFile.write(out)

There were only 1 training examples. 2 or more are needed to train the model.

Confussion Matrix: 
[[159   3]
 [  4   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 32
Test Examples Count: 4

Confussion Matrix: 
[[165   0]
 [  1   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 34
Test Examples Count: 1

Confussion Matrix: 
[[163   0]
 [  3   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 10
Test Examples Count: 3

Confussion Matrix: 
[[156   3]
 [  6   1]]
Precision: 0.14285714285714285
Recall: 0.25
F1: 0.18181818181818182
Training Examples Count: 34
Test Examples Count: 7

There were only 1 training examples. 2 or more are needed to train the model.

Confussion Matrix: 
[[166]]
Precision: nan
Recall: nan
F1: nan
Training Examples Count: 6
Test Examples Count: 0

Confussion Matrix: 
[[166]]
Precision: nan
Recall: nan
F1: nan
Training Examples Count: 4
Test Examples Count: 0

Confussion Matrix: 
[[164   0]
 [  1   1]]
Precision: 0.5
Rec

## Per-P Classification

In [13]:
old_data = old_data_p
new_data = data_p

data_train = old_data.loc[old_data['document'].isin(train_docs)]
data_test = new_data.loc[new_data['document'].isin(test_docs)]

import warnings
# Ignore division by zero when calculating F1 score
warnings.filterwarnings(action='ignore', category=RuntimeWarning) 
output_file = os.path.join(shared_path, 'trainOld_testNew_p.txt')

with open(output_file, 'w') as outFile:
    
    for l in labels:
        if l != 'other':
            summary = '=' * 20 + ' Testing Label: ' + str(l) + ' ' + '=' * 20 + '\n'
            out = ''
            

            train_count = data_train[l].sum()
            test_count = data_test[l].sum()
                        

            if train_count > 1:
                
                params = svm_train(data_train, l)
                output = svm_test(data_test, params)

                precision = output['precision']
                recall = output['recall']
                cm = output['cm']             

                all_predicted = output['all_predicted']
                actual_positive = output['actual_positive']
                true_positive = output['true_positive']
                false_positive = output['false_positive']
                false_negative = output['false_negative']


                summary += 'Confussion Matrix: \n'
                summary += str(cm) + '\n'
                
                
                
                summary += 'Precision: ' + str(precision) + '\n'
                summary += 'Recall: ' + str(recall) + '\n'
                summary += 'F1: ' + str(2 * (precision * recall)/(precision + recall)) + '\n'


                summary += 'Training Examples Count: ' + str(train_count) + '\n'
                summary += 'Test Examples Count: ' + str(test_count) + '\n'
                
                example_head = '-' * 20 + ' %s ' + '-' * 20 + '\n'
                example_format = '# %d. DOC: %s\nHEAD2: %s \nHEAD1: %s\nTEXT: %s\n\n\n'
                
                
                out += example_head%('PREDICTED')
                for index, (doc, h2, h1, t) in all_predicted.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                
                out += example_head%('TRUE POSITIVE')
                for index, (doc, h2, h1, t) in true_positive.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                    
                out += example_head%('FALSE NEGATIVE')
                for index, (doc, h2, h1, t) in false_negative.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                    
                out += example_head%('FALSE POSITIVE')
                for index, (doc, h2, h1, t) in false_positive.iterrows():
                    out += example_format%(index, doc, h2, h1, t)
                out += '\n'
                
            else:
                summary += 'There were only ' + str(train_count) + ' training examples. 2 or more are needed to train the model.'
                summary += '\n'
            
            print(summary)
            outFile.write(summary + '\n')
            outFile.write(out)

There were only 1 training examples. 2 or more are needed to train the model.

Confussion Matrix: 
[[313   3]
 [  7   1]]
Precision: 0.125
Recall: 0.25
F1: 0.16666666666666666
Training Examples Count: 67
Test Examples Count: 8

Confussion Matrix: 
[[323   0]
 [  1   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 58
Test Examples Count: 1

Confussion Matrix: 
[[321   0]
 [  3   0]]
Precision: 0.0
Recall: 0.0
F1: nan
Training Examples Count: 13
Test Examples Count: 3

Confussion Matrix: 
[[306   3]
 [ 14   1]]
Precision: 0.06666666666666667
Recall: 0.25
F1: 0.10526315789473685
Training Examples Count: 65
Test Examples Count: 15

There were only 1 training examples. 2 or more are needed to train the model.

Confussion Matrix: 
[[324]]
Precision: nan
Recall: nan
F1: nan
Training Examples Count: 10
Test Examples Count: 0





Confussion Matrix: 
[[324]]
Precision: nan
Recall: nan
F1: nan
Training Examples Count: 4
Test Examples Count: 0

Confussion Matrix: 
[[322   0]
 [  1   1]]
Precision: 0.5
Recall: 1.0
F1: 0.6666666666666666
Training Examples Count: 4
Test Examples Count: 2

There were only 1 training examples. 2 or more are needed to train the model.

Confussion Matrix: 
[[278   1]
 [ 25  20]]
Precision: 0.4444444444444444
Recall: 0.9523809523809523
F1: 0.6060606060606061
Training Examples Count: 43
Test Examples Count: 45

