In [309]:
import torch
import torch.nn as nn
import torch.utils.data
import pandas as pd
from torch.utils import data
from numpy import array
from numpy import argmax
import argparse
from torch.autograd import Variable
from torch import optim
import numpy as np
import os
import logging
import pickle as pkl
import warnings
pd.options.mode.chained_assignment = None
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="10"
import glob
import random
random.seed(1612)
from sklearn.metrics.pairwise import cosine_similarity
import inflect
from gensim.models import Word2Vec, KeyedVectors
import gensim
import json
import inflect

In [310]:
inflection_engine = inflect.engine()

In [311]:
## PARAMETERS
constituent = 'mods'
dims = 300
use_frequency_information = True
if use_frequency_information:
    dims += 5
temporal = False

In [312]:
torch.manual_seed(1612)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [313]:
print(gensim.__version__)

4.1.2


In [314]:
#TODO: Steps
### 1 Load dev and test data
### 2 Use dev data to put together ten datasets, each consisting of dev data + corrupted compounds
# 3 Load corrupted modifiers for test as well, and put test + corrupted modifiers together into one dataset
# 4 Load word2vec model and get representations for each compound in each set (by concatenation)
# (4.2 - Turn representations for each compound into dataframes so they're compatible with Prajit's old code)
# 5 Turn sets into tensors etc
# 6 Use as inputs to training!!
# 7 Then: implement cosine similarity approach

In [315]:
with open('datasets/COCA_train_min3_no_doubles_filtered_new.txt', 'r') as infile:
    train_compounds = [line.strip('\n\r') for line in infile]

In [316]:
with open('datasets/COCA_dev_min3_no_doubles_filtered_new.txt', 'r') as infile:
    dev_compounds = [line.strip('\n\r') for line in infile]

In [317]:
with open('datasets/COCA_test_min3_no_doubles_filtered_new.txt', 'r') as infile:
    test_compounds = [line.strip('\n\r') for line in infile]

In [318]:
if temporal: 
    with open('encoded_vecs.json', 'rb') as infile:
        embedding_model = json.load(infile)
elif use_frequency_information:
    with open('vecs_with_freqs.json', 'rb') as infile:
        embedding_model = json.load(infile)
else: 
    if dims == 300:
        embedding_model = Word2Vec.load('word2vec_2009.model')
    elif dims == 100:
        embedding_model = Word2Vec.load('word2vec_2009_100.model')
    elif dims == 0:
        embedding_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    else: 
        raise ValueError('300 and 100 dims are the only vector sized that are supported at the moment!')

In [319]:
if dims==0 or use_frequency_information:
    train_compounds = [compound for compound in train_compounds 
                   if compound.split()[0] in embedding_model and compound.split()[1] in embedding_model]
else: 
    train_compounds = [compound for compound in train_compounds 
                   if compound.split()[0] in embedding_model.wv and compound.split()[1] in embedding_model.wv]

In [320]:
print(len(dev_compounds))
if dims==0 or use_frequency_information:
    dev_compounds = [compound for compound in dev_compounds 
                   if compound.split()[0] in embedding_model and compound.split()[1] in embedding_model]
else:
    dev_compounds = [compound for compound in dev_compounds 
                   if compound.split()[0] in embedding_model.wv and compound.split()[1] in embedding_model.wv]
print(len(dev_compounds))

98866
98866


In [321]:
print(len(test_compounds))
if dims==0 or use_frequency_information:
    test_compounds = [compound for compound in test_compounds 
                   if compound.split()[0] in embedding_model and compound.split()[1] in embedding_model]
else:
    test_compounds = [compound for compound in test_compounds
                   if compound.split()[0] in embedding_model.wv and compound.split()[1] in embedding_model.wv]
print(len(test_compounds))

21800
21799


In [322]:
def load_corrupted_compounds(constituent, data_name):
    assert(data_name in ['dev', 'test']), 'data_name must be either dev or test'
    assert(constituent in ['mods', 'heads']), 'constituent must be either mods or heads'
    if data_name == 'dev':
        corrupted_compound_lists = []
        for i in range(10):
            with open(f'corrupted_samples_filtered/corrupted_{constituent}_10_{i}.txt', 'r') as infile:
                corrupted_compounds = [line.strip('\n\r') for line in infile]
                corrupted_compound_lists.append(corrupted_compounds)
        return corrupted_compound_lists
    else:
        with open(f'corrupted_samples_filtered/corrupted_{constituent}_{data_name}.txt', 'r') as infile: 
            corrupted_compounds = [line.strip('\n\r') for line in infile]
        return corrupted_compounds

In [323]:
def generate_train_datasets(dev_compounds, corrupted_samples):
    datasets = []
    for corrupted_compound_list in corrupted_samples:
        datasets.append((dev_compounds, corrupted_compound_list))
    return datasets

In [324]:
def get_freq_info(word, freq_data):
    if word in list(freq_data['word']):
            word_freq = freq_data.loc[freq_data['word'] == word]
            word_freq = list(word_freq.iloc[0][-5:])
            return word_freq
    else:
        return [0.00]*5

In [325]:
def get_compound_representation(compound, model):
    mod, head = compound.split()
    if temporal or use_frequency_information:
        mod_vector = np.array(model[mod]) if mod in model else np.zeros(dims)
        head_vector = np.array(model[head]) if head in model else np.zeros(dims)
    else:
        mod_vector = model.wv[mod] if mod in model.wv else np.zeros(dims)
        head_vector = model.wv[head] if head in model.wv else np.zeros(dims)
    assert len(mod_vector) == len(head_vector), f'modifier and head vectors are not of same length: {len(mod_vector)} and {len(head_vector)}'
    
    # we need to check word membership in the embedding model slightly differently depending on
    # which embedding model we have loaded
    if temporal or use_frequency_information or dims == 0: 
        constituents_found = (mod in model, head in model)
    else: 
        constituents_found = (mod in model.wv, head in model.wv)
    
    return np.concatenate((mod_vector, head_vector)), constituents_found

In [326]:
# putting together train datasets (from dev data)
train_corrupted_compounds = load_corrupted_compounds(constituent, 'dev')
train_datasets = generate_train_datasets(dev_compounds, train_corrupted_compounds)

In [327]:
# getting corrupted samples for test data
corrupted_compounds_test = load_corrupted_compounds(constituent, 'test')

# making a tuple of positive and negative samples for test data
test_dataset = (test_compounds, corrupted_compounds_test)

In [328]:
def convert_dataset_to_tensors(data_tuple, embedding_model, shuffle=True):
    # TODO find out what to do with compounds that lack a representation (either partially or completely)
    representations_positive = []
    representations_negative = []
    
    mods_not_found = 0
    heads_not_found = 0
    comps_not_found = 0
    
    for compound in data_tuple[0]:
        representation, constituents_found = get_compound_representation(compound, embedding_model)
        mod_found, head_found = constituents_found
        if not mod_found: 
            mods_not_found += 1
        if not head_found: 
            heads_not_found += 1
        if not mod_found and not head_found:
            comps_not_found += 1
        representations_positive.append(torch.tensor(representation))
    
    for compound in data_tuple[1]:
        representation, constituents_found = get_compound_representation(compound, embedding_model)
        representations_negative.append(torch.tensor(representation))
    positive_tensors = torch.stack(representations_positive)
    negative_tensors = torch.stack(representations_negative)
    
    #print('positive mods not found: ', mods_not_found)
    #print('positive heads not found: ', heads_not_found)
    #print('positive comps not found: ', comps_not_found)
    
    positive_Y = torch.ones(positive_tensors.shape[0])
    negative_Y = torch.zeros(negative_tensors.shape[0])
    
    X = torch.cat((positive_tensors, negative_tensors))
    Y = torch.cat((positive_Y, negative_Y))
    
    assert len(X) == len(Y), 'X and Y are not of the same length'
    
    if shuffle:
        indices = torch.randperm(X.shape[0])
        X = torch.index_select(X, 0, indices)
        Y = torch.index_select(Y, 0, indices)
    
    return (X, Y)

In [329]:
def convert_compound_list_to_tensors(compound_list):
    tensors = []
    for comp in compound_list:
        representation, _ = get_compound_representation(comp, embedding_model)
        tensors.append(torch.tensor(representation))
    return torch.stack(tensors)
    

In [330]:
def generate_cosine_similarity_compounds(compound_list, 
                                         n, 
                                         embedding_model, 
                                         target_constituent, 
                                         train_compound_dict, 
                                         dev_compound_dict, dims):
    assert(target_constituent in ['mods', 'heads']), 'target_constituent must be either modifier or head'
    
    print('Generating cosine similarity compounds')
    print(len(compound_list))
    
    novel_compounds_origin_dict = {}
    
    def join_constituents(new_word, other_constituent):
        if target_constituent == 'mods':
            novel_compound = new_word + " " + other_constituent
        elif target_constituent == 'heads':
            novel_compound = other_constituent + " " + new_word
        else: raise ValueError('target_constituent must be either \'mods\' or \'heads\' ')
        return novel_compound
    
    def is_equivalent(word, word_to_compare):
        singular_word = inflection_engine.singular_noun(word)
        singular_word = singular_word if singular_word else word
        
        singular_comparison_word = inflection_engine.singular_noun(word_to_compare)
        singular_comparison_word = singular_comparison_word if singular_comparison_word else word_to_compare
        
        if singular_word == singular_comparison_word:
            return True
        else:
            return False
    
    all_novel_compounds = {}
    for compound in compound_list: 
        num_compounds_found = 0
        mod, head = compound.split()
        k = 4
        if target_constituent == 'mods':
            word_to_change = mod
            other_constituent = head
        else: 
            word_to_change = head
            other_constituent = mod
        #print('word_to_change, other_constituent: ', word_to_change, other_constituent)
        #print(f'getting top {n*k} similar words to {word_to_change} ({target_constituent})')
        if dims==0:
            print('dims == 0')
            similar_words = embedding_model.most_similar(word_to_change, topn=n*k)
        else:
            similar_words = embedding_model.wv.most_similar(word_to_change, topn=n*k)
            #print(f'top similar words: {similar_words}')
        for word in similar_words:
                
            current_novel_compounds=[]
            new_comp = join_constituents(word[0], other_constituent).lower()
            if new_comp not in train_compound_dict \
                and new_comp not in dev_compound_dict \
                and new_comp not in novel_compounds_origin_dict \
                and not is_equivalent(word[0], word_to_change) \
                and not '_' in new_comp:
                    
                current_novel_compounds.append(new_comp)
                #print(f'appended novel compound: {new_comp}')
                num_compounds_found += 1
                novel_compounds_origin_dict.update({new_comp: compound})
            if num_compounds_found >= n:
                break
                #print(f'found {len(current_novel_compounds)} feasible words')
            
        if num_compounds_found < n:
            print('increasing k by one')
            k += 1
        if k == 10:
            print('K too large, moving on to next compound')
            continue
            
    return list(all_novel_compounds.keys()), novel_compounds_origin_dict

In [331]:
novel_comp_dict_filename = f'novel_compound_origin_dict_{constituent}_300.json'
if os.path.exists(novel_comp_dict_filename):
    print('fetching already existing file')
    with open(novel_comp_dict_filename, 'rb') as infile:
        novel_compound_dict = json.load(infile)
else:
    train_dict = {comp : 0 for comp in train_compounds}
    dev_dict = {comp : 0 for comp in dev_compounds}
    novel_compounds, novel_compound_dict = generate_cosine_similarity_compounds(compound_list = dev_compounds,
                                                          n = 3, 
                                                          embedding_model=embedding_model,
                                                          target_constituent=constituent,
                                                          train_compound_dict=train_dict,
                                                          dev_compound_dict=dev_dict,
                                                            dims=dims)
    with open(f'novel_compound_origin_dict_{constituent}_{dims}.json', 'w') as outfile:
        json.dump(novel_compound_dict, outfile)
    print(novel_compounds[:10])
    print(len(dev_compounds))
    print(len(novel_compounds))

fetching already existing file


In [332]:

print(len(novel_compound_dict))
novel_compounds = [comp for comp in novel_compound_dict.keys()]


294577


In [333]:
#FP: 
print('FPS:')
fps = ['dogs run', 'melting cloud', 'riesling sauce', 
       'kevlar jacket', 'waistband blouse', 'gold tsunami', 
       'toes ring', 'boy food', 'cappuccino cherry', 
       'sequined glove', 'kneelength glove', 'lightemitting lamp', 
       'brain moisturizer', 'healthcare burden', 'misrepresentation campaign', 
       'hashish store', 'brain sculpting', 'jog yoga']
for comp in fps: 
    if comp in novel_compound_dict:
        print(comp, " : ", novel_compound_dict[comp])
    else: 
        print(f'{comp} not in dictionary')
print()
print('TPS:')
tps = ['vaccination law', 'loot box', 'pork burger', 
       'infection outbreak', 'authentication method', 'verification code', 
       'tilapia skin', 'horseradish juice']
for comp in tps: 
    if comp in novel_compound_dict:
        print(comp, " : ", novel_compound_dict[comp])
    else: 
        print(f'{comp} not in dictionary')

FPS:
dogs run not in dictionary
melting cloud not in dictionary
riesling sauce not in dictionary
kevlar jacket  :  nylon jacket
waistband blouse  :  sleeve blouse
gold tsunami  :  silver tsunami
toes ring not in dictionary
boy food  :  guy food
cappuccino cherry  :  coffee cherry
sequined glove  :  buckskin glove
kneelength glove  :  buckskin glove
lightemitting lamp  :  fluorescent lamp
brain moisturizer  :  body moisturizer
healthcare burden  :  health burden
misrepresentation campaign  :  disinformation campaign
hashish store not in dictionary
brain sculpting  :  body sculpting
jog yoga not in dictionary

TPS:
vaccination law  :  sterilization law
loot box  :  treasure box
pork burger  :  beef burger
infection outbreak  :  disease outbreak
authentication method  :  encryption method
verification code  :  application code
tilapia skin  :  salmon skin
horseradish juice not in dictionary


In [334]:
#with open(f'novel_compound_origins_{constituent}_{dims}.txt', 'w') as outfile:
#    for key, value in novel_compound_dict.items():
#        outfile.write(key + " : " + value + "\n")

In [335]:
#print('mushroom' in embedding_model.wv)

In [336]:
test_data_tuple = convert_dataset_to_tensors(test_dataset, embedding_model, shuffle=False)

In [337]:
print(len(test_data_tuple[0][0]))

610


In [338]:
train_data_tuples = []

for data_tuple in train_datasets:
    tensors = convert_dataset_to_tensors(data_tuple, embedding_model)
    train_data_tuples.append(tensors)

In [339]:
print(len(train_data_tuples))

10


In [340]:
input_size = dims*2
print(input_size)
hidden_size = 300
num_classes = 2
num_epochs = 50
batch_size = 72
learning_rate = 0.002

610


In [341]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes,bias=False)  
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

model = NeuralNet(input_size, hidden_size, num_classes).to(device)#.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 

In [342]:
def run_classifier(train_datasets, test_dataset, cosine_sim_novel_comp_tensors):
    total_accuracy=[]
    novel_comp_predictions=[]
    disambiguator_predictions=[]
    
    for train_dataset in train_datasets: 
        train_X = train_dataset[0].float().to(device)
        print('train_X shape:', train_X.shape)
        train_Y = train_dataset[1].long().to(device)

        test_X = test_dataset[0].float().to(device)
        test_Y = test_dataset[1].long().to(device)
        model = NeuralNet(input_size, hidden_size, num_classes).to(device)
        
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        n_examples=train_X.shape[0]
        for i in range(num_epochs):

            cost = 0.

            num_batches = n_examples // batch_size
            for k in range(num_batches):
                start, end = k * batch_size, (k + 1) * batch_size
                outputs = model(train_X[start:end])
                loss = criterion(outputs, train_Y[start:end])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            #if (k+1) % 100 == 0:
            #print ('Epoch [{}/{}], Loss: {:.4f}'.format(i+1, num_epochs, loss.item()))

        with torch.no_grad():
            correct=0
            total=0
            outputs = model(test_X)
            if cosine_sim_novel_comp_tensors is not None:
                novel_comp_outputs = model(cosine_sim_novel_comp_tensors.float())
                _, novel_predicted = torch.max(novel_comp_outputs.data, 1)
                novel_comp_predictions.append(novel_predicted)
            _, predicted = torch.max(outputs.data, 1)
            disambiguator_predictions.append(predicted)
            total += test_Y.size(0)
            correct += (predicted == test_Y).sum().item()
        curr_acc=100 * correct / total
        print(curr_acc)
        total_accuracy.append(curr_acc)
    return total_accuracy, disambiguator_predictions, novel_comp_predictions

In [343]:
def comp_exists_number_insensitive(compound, control_list):
    mod, head = compound.split()
            
    singular_head = inflection_engine.singular_noun(head)
    singular_head = singular_head if singular_head else head
    singular_compound = ' '.join((mod, singular_head)) 
            
    plural_head = inflection_engine.plural_noun(head)
    plural_head = plural_head if plural_head else head                
    plural_compound = ' '.join((mod, plural_head)) 
    return (singular_compound in control_list or plural_compound in control_list)

In [344]:
def evaluate_novel_compounds(cosine_novel_compounds, attested_novel_compounds, all_predictions):
    print('EVALUATING COSINE GENERATED COMPOUNDS')
    predictions_df = pd.DataFrame([tensr.tolist() for tensr in all_predictions])
    comp_predictions_per_round = [list(predictions_df[i]) for i in range(len(all_predictions[0]))]
    
    final_compound_predictions = [max(set(pred_list), key = pred_list.count) 
                              for pred_list in comp_predictions_per_round]
    
    comp_to_pred_dict = {compound: prediction 
                               for compound, prediction 
                               in zip(cosine_novel_compounds, final_compound_predictions)}
    true_positives = [comp for comp in comp_to_pred_dict.keys() 
                  if comp_exists_number_insensitive(comp, attested_novel_compounds) and comp_to_pred_dict[comp] == 1]
    false_positives = [comp for comp in comp_to_pred_dict.keys() 
                   if not comp_exists_number_insensitive(comp, attested_novel_compounds) and comp_to_pred_dict[comp] == 1]
    true_negatives = [comp for comp in comp_to_pred_dict.keys() 
                  if not comp_exists_number_insensitive(comp, attested_novel_compounds) and comp_to_pred_dict[comp] == 0]
    false_negatives = [comp for comp in comp_to_pred_dict.keys()
                  if comp_exists_number_insensitive(comp, attested_novel_compounds) and comp_to_pred_dict[comp] == 0]
    
    print(f'True positives: {len(true_positives)}')
    print(f'False positives: {len(false_positives)}')
    print(f'True negatives: {len(true_negatives)}')
    print(f'False negatives: {len(false_negatives)}')
    majority_vote_acc = (len(true_positives) + len(true_negatives)) / \
    (len(true_positives) + len(true_negatives) + len(false_positives) + len(false_negatives))
    print(f'Majority-vote accuracy: {majority_vote_acc}')
    print(f'Average accuracy: {round(np.mean(total_accuracy), 2)}')
    print(f'Average accuracy SD: {round(np.std(total_accuracy), 2)}')
    return true_positives, false_positives, true_negatives, false_negatives

In [345]:
def evaluate_disambiguated_compounds(test_data_compounds, test_Y, all_predictions):
    print('EVALUATING DISAMBIGUATED COMPOUNDS')
    predictions_df = pd.DataFrame([tensr.tolist() for tensr in all_predictions])
    comp_predictions_per_round = [list(predictions_df[i]) for i in range(len(all_predictions[0]))]
    
    # getting majority-vote results
    final_compound_predictions = [max(set(pred_list), key = pred_list.count) 
                              for pred_list in comp_predictions_per_round]
    assert(len(test_data_compounds) == len(test_Y) == len(final_compound_predictions)), 'test_data_compounds, test_Y and final_compound_predictions must be of the same length'
        
    true_positives = [test_data_compounds[i] for i in range(len(test_data_compounds)) 
                      if test_Y[i] == final_compound_predictions[i] == 1]
    false_positives = [test_data_compounds[i] for i in range(len(test_data_compounds)) 
                      if test_Y[i] == 0 and final_compound_predictions[i] == 1]
    true_negatives = [test_data_compounds[i] for i in range(len(test_data_compounds)) 
                      if test_Y[i] == final_compound_predictions[i] == 0]
    false_negatives = [test_data_compounds[i] for i in range(len(test_data_compounds)) 
                      if test_Y[i] == 1 and final_compound_predictions[i] == 0]
    
    print(f'True positives: {len(true_positives)}')
    print(f'False positives: {len(false_positives)}')
    print(f'True negatives: {len(true_negatives)}')
    print(f'False negatives: {len(false_negatives)}')
    majority_vote_acc = (len(true_positives) + len(true_negatives)) / \
    (len(true_positives) + len(true_negatives) + len(false_positives) + len(false_negatives))
    print(f'Majority-vote accuracy: {majority_vote_acc}')
    print(f'Average accuracy: {round(np.mean(total_accuracy), 2)}')
    print(f'Average accuracy SD: {round(np.std(total_accuracy), 2)}')
    return true_positives, false_positives, true_negatives, false_negatives

In [346]:
cosine_novel_tensors = convert_compound_list_to_tensors(novel_compounds)

In [347]:
total_accuracy, disambiguator_predictions, novel_compound_predictions = run_classifier(train_data_tuples, test_data_tuple, cosine_novel_tensors)

train_X shape: torch.Size([197732, 610])
71.51540172939747
train_X shape: torch.Size([197732, 610])
72.02229408931397
train_X shape: torch.Size([197732, 610])
71.85944631757609
train_X shape: torch.Size([197732, 610])
71.88467625404252
train_X shape: torch.Size([197732, 610])
71.84339090346108
train_X shape: torch.Size([197732, 610])
71.63696415055391
train_X shape: torch.Size([197732, 610])
71.6805431317232
train_X shape: torch.Size([197732, 610])
71.68283676231106
train_X shape: torch.Size([197732, 610])
71.85944631757609
train_X shape: torch.Size([197732, 610])
71.29980045413886


In [348]:
print(total_accuracy)

[71.51540172939747, 72.02229408931397, 71.85944631757609, 71.88467625404252, 71.84339090346108, 71.63696415055391, 71.6805431317232, 71.68283676231106, 71.85944631757609, 71.29980045413886]


In [349]:
all_test_compounds = test_dataset[0] + test_dataset[1]
test_Y = list(np.concatenate((np.ones(len(test_dataset[0])), np.zeros(len(test_dataset[1])))))
print(len(all_test_compounds))
print(len(test_Y))
print(len(disambiguator_predictions))

43599
43599
10


In [350]:
true_positives, false_positives, true_negatives, false_negatives = evaluate_disambiguated_compounds(all_test_compounds, test_Y, disambiguator_predictions)

EVALUATING DISAMBIGUATED COMPOUNDS
True positives: 14108
False positives: 3332
True negatives: 18468
False negatives: 7691
Majority-vote accuracy: 0.7471731003004656
Average accuracy: 71.73
Average accuracy SD: 0.2


In [351]:
with open(f'results_{dims}/disambiguated/true_positives_disambiguated_{constituent}_{hidden_size}hidden_{num_epochs}e_temp{temporal}.txt', 'w') as outfile:
    for compound in true_positives:
        outfile.write(compound + "\n")
with open(f'results_{dims}/disambiguated/true_negatives_disambiguated_{constituent}_{hidden_size}hidden_{num_epochs}e_temp{temporal}.txt', 'w') as outfile:
    for compound in true_negatives:
        outfile.write(compound + "\n")
with open(f'results_{dims}/disambiguated/false_positives_disambiguated_{constituent}_{hidden_size}hidden_{num_epochs}e_temp{temporal}.txt', 'w') as outfile:
    for compound in false_positives:
        outfile.write(compound + "\n")
with open(f'results_{dims}/disambiguated/false_negatives_disambiguated_{constituent}_{hidden_size}hidden_{num_epochs}e_temp{temporal}.txt', 'w') as outfile:
    for compound in false_negatives:
        outfile.write(compound + "\n")

In [352]:
test_dict = {comp for comp in test_compounds}

true_positives_cos, false_positives_cos, true_negatives_cos, false_negatives_cos = evaluate_novel_compounds(novel_compounds, test_dict, novel_compound_predictions)

EVALUATING COSINE GENERATED COMPOUNDS
True positives: 448
False positives: 76578
True negatives: 217350
False negatives: 201
Majority-vote accuracy: 0.7393584699416451
Average accuracy: 71.73
Average accuracy SD: 0.2


In [353]:
with open(f'results_{dims}/cosine_novel/true_positives_cosine_{constituent}_{hidden_size}hidden_{num_epochs}e_temp{temporal}.txt', 'w') as outfile:
    for compound in true_positives_cos:
        outfile.write(compound + "\n")

In [354]:
with open(f'results_{dims}/cosine_novel/true_negatives_cosine_{constituent}_{hidden_size}hidden_{num_epochs}e_temp{temporal}.txt', 'w') as outfile:
    for compound in true_negatives_cos:
        outfile.write(compound + "\n")

In [355]:
with open(f'results_{dims}/cosine_novel/false_positives_cosine_{constituent}_{hidden_size}hidden_{num_epochs}e_temp{temporal}.txt', 'w') as outfile:
    for compound in false_positives_cos:
        outfile.write(compound + "\n")

In [356]:
with open(f'results_{dims}/cosine_novel/false_negatives_cosine_{constituent}_{hidden_size}hidden_{num_epochs}e_temp{temporal}.txt', 'w') as outfile:
    for compound in false_negatives_cos:
        outfile.write(compound + "\n")

([67.0332633525299, 66.76746470463607, 66.5421136770739, 67.06215450990966, 66.80406017065043, 67.06215450990966, 67.021706889578, 66.41306650744428, 66.88110325699648, 66.71353454419383], [])

In [357]:
print(round(np.mean(total_accuracy), 2))

71.73


In [358]:
print(round(np.std(total_accuracy), 2))

0.2
