In [1]:
import torch
import torch.nn as nn
import torch.utils.data
import pandas as pd
from torch.utils import data
from numpy import array
from numpy import argmax
import argparse
from torch.autograd import Variable
from torch import optim
import numpy as np
import os
import logging
import pickle as pkl
import warnings
pd.options.mode.chained_assignment = None
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="10"
import glob
import random
random.seed(1991)
from sklearn.metrics.pairwise import cosine_similarity
import inflect
from gensim.models import Word2Vec
import gensim
import inflect

In [2]:
inflection_engine = inflect.engine()

In [3]:
## PARAMETERS
constituent = 'mods'
dims = 300

In [4]:
torch.manual_seed(1612)
if not torch.cuda.is_available():
    
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
device = 'cuda' if torch.cuda.is_available() else 'cpu'



In [5]:
print(gensim.__version__)

4.1.2


In [6]:
#TODO: Steps
### 1 Load dev and test data
### 2 Use dev data to put together ten datasets, each consisting of dev data + corrupted compounds
# 3 Load corrupted modifiers for test as well, and put test + corrupted modifiers together into one dataset
# 4 Load word2vec model and get representations for each compound in each set (by concatenation)
# (4.2 - Turn representations for each compound into dataframes so they're compatible with Prajit's old code)
# 5 Turn sets into tensors etc
# 6 Use as inputs to training!!
# 7 Then: implement cosine similarity approach

In [7]:
with open('datasets/COCA_train_min3.txt', 'r') as infile:
    train_compounds = [line.strip('\n\r') for line in infile]

In [8]:
with open('datasets/COCA_dev_min3.txt', 'r') as infile:
    dev_compounds = [line.strip('\n\r') for line in infile]

In [9]:
with open('datasets/COCA_test_min3.txt', 'r') as infile:
    test_compounds = [line.strip('\n\r') for line in infile]

In [10]:
if dims == 300:
    word2vec_model = Word2Vec.load('word2vec_2009.model')
elif dims == 100:
    word2vec_model = Word2Vec.load('word2vec_2009_100.model')
else: 
    raise ValueError('300 and 100 dims are the only vector sized that are supported at the moment!')

In [11]:
def load_corrupted_compounds(constituent, data_name):
    assert(data_name in ['dev', 'test']), 'data_name must be either dev or test'
    assert(constituent in ['mods', 'heads']), 'constituent must be either mods or heads'
    if data_name == 'dev':
        corrupted_compound_lists = []
        for i in range(10):
            with open(f'corrupted_samples/corrupted_{constituent}_10_{i}.txt', 'r') as infile:
                corrupted_compounds = [line.strip('\n\r') for line in infile]
                corrupted_compound_lists.append(corrupted_compounds)
        return corrupted_compound_lists
    else:
        with open(f'corrupted_samples/corrupted_{constituent}_{data_name}.txt', 'r') as infile: 
            corrupted_compounds = [line.strip('\n\r') for line in infile]
        return corrupted_compounds

In [12]:
def generate_train_datasets(dev_compounds, corrupted_samples):
    datasets = []
    for corrupted_compound_list in corrupted_samples:
        datasets.append((dev_compounds, corrupted_compound_list))
    return datasets

In [13]:
def get_compound_representation(compound, model):
    mod, head = compound.split()
    mod_vector = model.wv[mod] if mod in model.wv else np.zeros(dims)
    head_vector = model.wv[head] if head in model.wv else np.zeros(dims)
    assert len(mod_vector) == len(head_vector) == dims, 'length of vectors is wrong fsr'
    #if mod not in model.wv:
    #    print(f'modifier \'{mod}\' not found in word2vec model.')
    #if head not in model.wv:
    #    print(f'head \'{head}\' not found in word2vec model.')
    constituents_found = (mod in model.wv, head in model.wv)
    return np.concatenate((mod_vector, head_vector)), constituents_found

In [14]:
# putting together train datasets (from dev data)
train_corrupted_compounds = load_corrupted_compounds(constituent, 'dev')
train_datasets = generate_train_datasets(dev_compounds, train_corrupted_compounds)

In [15]:
# getting corrupted samples for test data
corrupted_compounds_test = load_corrupted_compounds(constituent, 'test')

# making a tuple of positive and negative samples for test data
test_dataset = (test_compounds, corrupted_compounds_test)

In [16]:
def convert_dataset_to_tensors(data_tuple, shuffle=True):
    # TODO find out what to do with compounds that lack a representation (either partially or completely)
    representations_positive = []
    representations_negative = []
    
    mods_not_found = 0
    heads_not_found = 0
    comps_not_found = 0
    
    for compound in data_tuple[0]:
        representation, constituents_found = get_compound_representation(compound, word2vec_model)
        mod_found, head_found = constituents_found
        if not mod_found: 
            mods_not_found += 1
        if not head_found: 
            heads_not_found += 1
        if not mod_found and not head_found:
            comps_not_found += 1
        representations_positive.append(torch.tensor(representation))
    
    for compound in data_tuple[1]:
        representation, constituents_found = get_compound_representation(compound, word2vec_model)
        representations_negative.append(torch.tensor(representation))
    positive_tensors = torch.stack(representations_positive)
    negative_tensors = torch.stack(representations_negative)
    
    #print('positive mods not found: ', mods_not_found)
    #print('positive heads not found: ', heads_not_found)
    #print('positive comps not found: ', comps_not_found)
    
    positive_Y = torch.ones(positive_tensors.shape[0])
    negative_Y = torch.zeros(negative_tensors.shape[0])
    
    X = torch.cat((positive_tensors, negative_tensors))
    Y = torch.cat((positive_Y, negative_Y))
    
    assert len(X) == len(Y), 'X and Y are not of the same length'
    
    if shuffle:
        indices = torch.randperm(X.shape[0])
        X = torch.index_select(X, 0, indices)
        Y = torch.index_select(Y, 0, indices)
    
    return (X, Y)

In [17]:
def convert_compound_list_to_tensors(compound_list):
    tensors = []
    for comp in compound_list:
        representation, _ = get_compound_representation(comp, word2vec_model)
        tensors.append(torch.tensor(representation))
    return torch.stack(tensors)
    

In [22]:
def generate_cosine_similarity_compounds(compound_list, 
                                         n, 
                                         word2vec_model, 
                                         target_constituent, 
                                         train_compound_dict, 
                                         dev_compound_dict):
    assert(target_constituent in ['modifier', 'head']), 'target_constituent must be either modifier or head'
    
    novel_compounds_origin_dict = {}
    
    def join_constituents(new_word, other_constituent):
        if target_constituent == 'modifier':
            novel_compound = new_word + " " + other_constituent
        elif target_constituent == 'head':
            novel_compound = other_constituent + " " + new_word
        else: raise ValueError('target_constituent must be either \'modifier\' or \'head\' ')
        return novel_compound
    
    all_novel_compounds = {}
    for compound in compound_list: 
        num_compounds_found = 0
        mod, head = compound.split()
        k = 3
        if target_constituent == 'modifier':
            word_to_change = mod
            other_constituent = head
        else: 
            word_to_change = head
            other_constituent = mod
        #print('word_to_change, other_constituent: ', word_to_change, other_constituent)
        #print(f'getting top {n*k} similar words to {word_to_change} ({target_constituent})')
        while num_compounds_found < n:
            similar_words = word2vec_model.wv.most_similar(word_to_change, topn=n*k)
            #print(f'top similar words: {similar_words}')
            current_novel_compounds = [join_constituents(word[0], other_constituent) for word in similar_words 
                                if join_constituents(word[0], other_constituent) not in train_compound_dict 
                                and join_constituents(word[0], other_constituent) not in dev_compound_dict
                                and join_constituents(word[0], other_constituent) not in all_novel_compounds][:n]
            num_compounds_found += len(current_novel_compounds)
            #print(f'found {len(current_novel_compounds)} feasible words')
            all_novel_compounds.update({comp:0 for comp in current_novel_compounds})
            #print('increasing k by one')
            k += 1
            novel_compounds_origin_dict.update({novel_compound: compound for novel_compound in current_novel_compounds})
    return list(all_novel_compounds.keys()), novel_compounds_origin_dict

In [23]:
train_dict = {comp : 0 for comp in train_compounds}
dev_dict = {comp : 0 for comp in dev_compounds}
novel_compounds, novel_compound_dict = generate_cosine_similarity_compounds(compound_list = dev_compounds,
                                                      n = 5, 
                                                      word2vec_model=word2vec_model,
                                                      target_constituent='modifier',
                                                      train_compound_dict=train_dict,
                                                      dev_compound_dict=dev_dict)
print(novel_compounds[:10])
print(len(dev_compounds))
print(len(novel_compounds))

['antonin principle', 'souter principle', 'rehnquist principle', 'ginsburg principle', 'retribution principle', 'mars today', 'krypton today', 'oceans today', 'pluto today', 'planets today']
100733
506353


In [26]:
import json
print(len(novel_compound_dict))
with open('novel_compound_origin_dict.json', 'w') as outfile:
    json.dump(novel_compound_dict, outfile)

506353


In [36]:
#FP: 
print('FPS:')
fps = ['dogs run', 'melting cloud', 'riesling sauce', 
       'kevlar jacket', 'waistband blouse', 'gold tsunami', 
       'toes ring', 'boy food', 'cappuccino cherry', 
       'sequined glove', 'kneelength glove', 'lightemitting lamp', 
       'brain moisturizer', 'healthcare burden', 'misrepresentation campaign', 
       'hashish store', 'brain sculpting', 'jog yoga']
for comp in fps: 
    if comp in novel_compound_dict:
        print(comp, " : ", novel_compound_dict[comp])
    else: 
        print(f'{comp} not in dictionary')
print()
print('TPS:')
tps = ['vaccination law', 'loot box', 'pork burger', 
       'infection outbreak', 'authentication method', 'verification code', 
       'tilapia skin', 'horseradish juice']
for comp in tps: 
    if comp in novel_compound_dict:
        print(comp, " : ", novel_compound_dict[comp])
    else: 
        print(f'{comp} not in dictionary')

FPS:
dogs run  :  dog run
melting cloud  :  boiling cloud
riesling sauce  :  wine sauce
kevlar jacket  :  nylon jacket
waistband blouse  :  sleeve blouse
gold tsunami  :  silver tsunami
toes ring  :  toe ring
boy food  :  man food
cappuccino cherry  :  coffee cherry
sequined glove  :  buckskin glove
kneelength glove  :  buckskin glove
lightemitting lamp  :  fluorescent lamp
brain moisturizer  :  body moisturizer
healthcare burden  :  health burden
misrepresentation campaign  :  misinformation campaign
hashish store  :  secondhand store
brain sculpting  :  body sculpting
jog yoga  :  walk yoga

TPS:
vaccination law  :  sterilization law
loot box  :  treasure box
pork burger  :  beef burger
infection outbreak  :  disease outbreak
authentication method  :  encryption method
verification code  :  application code
tilapia skin  :  salmon skin
horseradish juice  :  lime juice


In [32]:
with open('novel_compound_origins.txt', 'w') as outfile:
    for key, value in novel_compound_dict.items():
        outfile.write(key + " : " + value + "\n")

In [162]:
test_data_tuple = convert_dataset_to_tensors(test_dataset, shuffle=False)

In [101]:
print(test_data_tuple[1])

tensor([0., 0., 1.,  ..., 0., 1., 0.])


In [102]:
train_data_tuples = []

for data_tuple in train_datasets:
    tensors = convert_dataset_to_tensors(data_tuple)
    train_data_tuples.append(tensors)

positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0


In [163]:
print(len(train_data_tuples))

10


In [164]:
input_size = dims*2
print(input_size)
hidden_size = 300
num_classes = 2
num_epochs = 100
batch_size = 72
learning_rate = 0.002

600


In [165]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes,bias=False)  
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

model = NeuralNet(input_size, hidden_size, num_classes).to(device)#.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 

In [166]:
def run_classifier(train_datasets, test_dataset, cosine_sim_novel_comp_tensors):
    total_accuracy=[]
    novel_comp_predictions=[]
    disambiguator_predictions=[]
    
    for train_dataset in train_datasets:
        train_X = train_dataset[0].float().to(device)
        print('train_X shape:', train_X.shape)
        train_Y = train_dataset[1].long().to(device)

        test_X = test_dataset[0].float().to(device)
        test_Y = test_dataset[1].long().to(device)
        model = NeuralNet(input_size, hidden_size, num_classes).to(device)
        
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        n_examples=train_X.shape[0]
        for i in range(num_epochs):

            cost = 0.

            num_batches = n_examples // batch_size
            for k in range(num_batches):
                start, end = k * batch_size, (k + 1) * batch_size
                outputs = model(train_X[start:end])
                loss = criterion(outputs, train_Y[start:end])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            #if (k+1) % 100 == 0:
            #print ('Epoch [{}/{}], Loss: {:.4f}'.format(i+1, num_epochs, loss.item()))

        with torch.no_grad():
            correct=0
            total=0
            outputs = model(test_X)
            if cosine_sim_novel_comp_tensors is not None:
                novel_comp_outputs = model(cosine_sim_novel_comp_tensors.float())
                _, novel_predicted = torch.max(novel_comp_outputs.data, 1)
                novel_comp_predictions.append(novel_predicted)
            _, predicted = torch.max(outputs.data, 1)
            disambiguator_predictions.append(predicted)
            total += test_Y.size(0)
            correct += (predicted == test_Y).sum().item()
        curr_acc=100 * correct / total
        print(curr_acc)
        total_accuracy.append(curr_acc)
    return total_accuracy, disambiguator_predictions, novel_comp_predictions

In [167]:
def comp_exists_number_insensitive(compound, control_list):
    mod, head = compound.split()
            
    singular_head = inflection_engine.singular_noun(head)
    singular_head = singular_head if singular_head else head
    singular_compound = ' '.join((mod, singular_head)) 
            
    plural_head = inflection_engine.plural_noun(head)
    plural_head = plural_head if plural_head else head                
    plural_compound = ' '.join((mod, plural_head)) 
    return (singular_compound in control_list or plural_compound in control_list)

In [168]:
def evaluate_novel_compounds(cosine_novel_compounds, attested_novel_compounds, all_predictions):
    predictions_df = pd.DataFrame([tensr.tolist() for tensr in all_predictions])
    comp_predictions_per_round = [list(predictions_df[i]) for i in range(len(all_predictions[0]))]
    
    final_compound_predictions = [max(set(pred_list), key = pred_list.count) 
                              for pred_list in comp_predictions_per_round]
    
    comp_to_pred_dict = {compound: prediction 
                               for compound, prediction 
                               in zip(cosine_novel_compounds, final_compound_predictions)}
    true_positives = [comp for comp in comp_to_pred_dict.keys() 
                  if comp_exists_number_insensitive(comp, attested_novel_compounds) and comp_to_pred_dict[comp] == 1]
    false_positives = [comp for comp in comp_to_pred_dict.keys() 
                   if not comp_exists_number_insensitive(comp, attested_novel_compounds) and comp_to_pred_dict[comp] == 1]
    true_negatives = [comp for comp in comp_to_pred_dict.keys() 
                  if not comp_exists_number_insensitive(comp, attested_novel_compounds) and comp_to_pred_dict[comp] == 0]
    false_negatives = [comp for comp in comp_to_pred_dict.keys()
                  if comp_exists_number_insensitive(comp, attested_novel_compounds) and comp_to_pred_dict[comp] == 0]
    
    print(f'True positives: {len(true_positives)}')
    print(f'False positives: {len(false_positives)}')
    print(f'True negatives: {len(true_negatives)}')
    print(f'False negatives: {len(false_negatives)}')
    majority_vote_acc = (len(true_positives) + len(true_negatives)) / \
    (len(true_positives) + len(true_negatives) + len(false_positives) + len(false_negatives))
    print(f'Majority-vote accuracy: {majority_vote_acc}')
    print(f'Average accuracy: {round(np.mean(total_accuracy), 2)}')
    print(f'Average accuracy SD: {round(np.std(total_accuracy), 2)}')
    return true_positives, false_positives, true_negatives, false_negatives

In [169]:
def evaluate_disambiguated_compounds(test_data_compounds, test_Y, all_predictions):
    predictions_df = pd.DataFrame([tensr.tolist() for tensr in all_predictions])
    comp_predictions_per_round = [list(predictions_df[i]) for i in range(len(all_predictions[0]))]
    
    final_compound_predictions = [max(set(pred_list), key = pred_list.count) 
                              for pred_list in comp_predictions_per_round]
    assert(len(test_data_compounds) == len(test_Y) == len(final_compound_predictions)), 'test_data_compounds, test_Y and final_compound_predictions must be of the same length'
        
    true_positives = [test_data_compounds[i] for i in range(len(test_data_compounds)) 
                      if test_Y[i] == final_compound_predictions[i] == 1]
    false_positives = [test_data_compounds[i] for i in range(len(test_data_compounds)) 
                      if test_Y[i] == 0 and final_compound_predictions[i] == 1]
    true_negatives = [test_data_compounds[i] for i in range(len(test_data_compounds)) 
                      if test_Y[i] == final_compound_predictions[i] == 0]
    false_negatives = [test_data_compounds[i] for i in range(len(test_data_compounds)) 
                      if test_Y[i] == 1 and final_compound_predictions[i] == 0]
    
    print(f'True positives: {len(true_positives)}')
    print(f'False positives: {len(false_positives)}')
    print(f'True negatives: {len(true_negatives)}')
    print(f'False negatives: {len(false_negatives)}')
    majority_vote_acc = (len(true_positives) + len(true_negatives)) / \
    (len(true_positives) + len(true_negatives) + len(false_positives) + len(false_negatives))
    print(f'Majority-vote accuracy: {majority_vote_acc}')
    print(f'Average accuracy: {round(np.mean(total_accuracy), 2)}')
    print(f'Average accuracy SD: {round(np.std(total_accuracy), 2)}')
    return true_positives, false_positives, true_negatives, false_negatives

In [119]:
cosine_novel_tensors = convert_compound_list_to_tensors(novel_compounds)

In [170]:
total_accuracy, disambiguator_predictions, novel_compound_predictions = run_classifier(train_data_tuples, test_data_tuple, cosine_novel_tensors)

train_X shape: torch.Size([207136, 600])
66.96585065197712
train_X shape: torch.Size([207136, 600])
66.86376856256862
train_X shape: torch.Size([207136, 600])
66.87725110267917
train_X shape: torch.Size([207136, 600])
67.09874997592404
train_X shape: torch.Size([207136, 600])
66.90229010574164
train_X shape: torch.Size([207136, 600])
67.09297174444808
train_X shape: torch.Size([207136, 600])
66.50744428821818
train_X shape: torch.Size([207136, 600])
66.79057763053989
train_X shape: torch.Size([207136, 600])
67.06408058706832
train_X shape: torch.Size([207136, 600])
66.99666788651554


In [171]:
print(total_accuracy)

[66.96585065197712, 66.86376856256862, 66.87725110267917, 67.09874997592404, 66.90229010574164, 67.09297174444808, 66.50744428821818, 66.79057763053989, 67.06408058706832, 66.99666788651554]


In [172]:
all_test_compounds = test_dataset[0] + test_dataset[1]
test_Y = list(np.concatenate((np.ones(len(test_dataset[0])), np.zeros(len(test_dataset[1])))))
print(len(all_test_compounds))
print(len(test_Y))
print(len(disambiguator_predictions))

51919
51919
10


In [173]:
true_positives, false_positives, true_negatives, false_negatives = evaluate_disambiguated_compounds(all_test_compounds, test_Y, disambiguator_predictions)

True positives: 13361
False positives: 4813
True negatives: 23153
False negatives: 10592
Majority-vote accuracy: 0.7032878137098172
Average accuracy: 66.92
Average accuracy SD: 0.17


In [132]:
true_positives, false_positives, true_negatives, false_negatives = evaluate_novel_compounds(novel_compounds, test_compounds, novel_compound_predictions)

True positives: 805
False positives: 227806
True negatives: 277436
False negatives: 306
Majority-vote accuracy: 0.5495000523350311
Average accuracy: 66.83
Average accuracy SD: 0.22


In [174]:
with open('results_300/disambiguated/true_positives_disambiguated.txt', 'w') as outfile:
    for compound in true_positives:
        outfile.write(compound + "\n")

In [175]:
with open('results_300/disambiguated/true_negatives_disambiguated.txt', 'w') as outfile:
    for compound in true_negatives:
        outfile.write(compound + "\n")

In [176]:
with open('results_300/disambiguated/false_positives_disambiguated.txt', 'w') as outfile:
    for compound in false_positives:
        outfile.write(compound + "\n")

In [177]:
with open('results_300/disambiguated/false_negatives_disambiguated.txt', 'w') as outfile:
    for compound in false_negatives:
        outfile.write(compound + "\n")

([67.0332633525299, 66.76746470463607, 66.5421136770739, 67.06215450990966, 66.80406017065043, 67.06215450990966, 67.021706889578, 66.41306650744428, 66.88110325699648, 66.71353454419383], [])

In [56]:
print(round(np.mean(total_accuracy[0]), 2))

66.83


In [57]:
print(round(np.std(total_accuracy[0]), 2))

0.22
