In [23]:
import torch
import torch.nn as nn
import torch.utils.data
import pandas as pd
from torch.utils import data
from numpy import array
from numpy import argmax
import argparse
from torch.autograd import Variable
from torch import optim
import numpy as np
import os
import logging
import pickle as pkl
import warnings
pd.options.mode.chained_assignment = None
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="10"
import glob
import random
random.seed(1991)
from sklearn.metrics.pairwise import cosine_similarity
import inflect
from gensim.models import Word2Vec
import gensim

In [24]:
## PARAMETERS
constituent = 'mods'

In [25]:
torch.manual_seed(1612)
if not torch.cuda.is_available():
    
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
device = 'cuda' if torch.cuda.is_available() else 'cpu'



In [26]:
print(gensim.__version__)

4.1.2


In [27]:
#TODO: Steps
### 1 Load dev and test data
### 2 Use dev data to put together ten datasets, each consisting of dev data + corrupted compounds
# 3 Load corrupted modifiers for test as well, and put test + corrupted modifiers together into one dataset
# 4 Load word2vec model and get representations for each compound in each set (by concatenation)
# (4.2 - Turn representations for each compound into dataframes so they're compatible with Prajit's old code)
# 5 Turn sets into tensors etc
# 6 Use as inputs to training!!
# 7 Then: implement cosine similarity approach

In [28]:
with open('datasets/COCA_dev_min3.txt', 'r') as infile:
    dev_compounds = [line.strip('\n\r') for line in infile]

In [29]:
with open('datasets/COCA_test_min3.txt', 'r') as infile:
    test_compounds = [line.strip('\n\r') for line in infile]

In [30]:
word2vec_model = Word2Vec.load('word2vec_2009.model')

In [31]:
def load_corrupted_compounds(constituent, data_name):
    assert(data_name in ['dev', 'test']), 'data_name must be either dev or test'
    assert(constituent in ['mods', 'heads']), 'constituent must be either mods or heads'
    if data_name == 'dev':
        corrupted_compound_lists = []
        for i in range(10):
            with open(f'corrupted_samples/corrupted_{constituent}_10_{i}.txt', 'r') as infile:
                corrupted_compounds = [line.strip('\n\r') for line in infile]
                corrupted_compound_lists.append(corrupted_compounds)
        return corrupted_compound_lists
    else:
        with open(f'corrupted_samples/corrupted_{constituent}_{data_name}.txt', 'r') as infile: 
            corrupted_compounds = [line.strip('\n\r') for line in infile]
        return corrupted_compounds

In [32]:
def generate_train_datasets(dev_compounds, corrupted_samples):
    datasets = []
    for corrupted_compound_list in corrupted_samples:
        datasets.append((dev_compounds, corrupted_compound_list))
    return datasets

In [33]:
def get_compound_representation(compound, model):
    mod, head = compound.split()
    mod_vector = model.wv[mod] if mod in model.wv else np.zeros(300)
    head_vector = model.wv[head] if head in model.wv else np.zeros(300)
    assert len(mod_vector) == len(head_vector) == 300, 'length of vectors is wrong fsr'
    #if mod not in model.wv:
    #    print(f'modifier \'{mod}\' not found in word2vec model.')
    #if head not in model.wv:
    #    print(f'head \'{head}\' not found in word2vec model.')
    constituents_found = (mod in model.wv, head in model.wv)
    return np.concatenate((mod_vector, head_vector)), constituents_found

In [34]:
# putting together train datasets (from dev data)
train_corrupted_compounds = load_corrupted_compounds(constituent, 'dev')
train_datasets = generate_train_datasets(dev_compounds, train_corrupted_compounds)

In [35]:
# getting corrupted samples for test data
corrupted_compounds_test = load_corrupted_compounds(constituent, 'test')

# making a tuple of positive and negative samples for test data
test_dataset = (test_compounds, corrupted_compounds_test)

In [36]:
def convert_dataset_to_tensors(data_tuple, shuffle=True):
    # TODO find out what to do with compounds that lack a representation (either partially or completely)
    representations_positive = []
    representations_negative = []
    
    mods_not_found = 0
    heads_not_found = 0
    comps_not_found = 0
    
    for compound in data_tuple[0]:
        representation, constituents_found = get_compound_representation(compound, word2vec_model)
        mod_found, head_found = constituents_found
        if not mod_found: 
            mods_not_found += 1
        if not head_found: 
            heads_not_found += 1
        if not mod_found and not head_found:
            comps_not_found += 1
        representations_positive.append(torch.tensor(representation))
    
    for compound in data_tuple[1]:
        representation, constituents_found = get_compound_representation(compound, word2vec_model)
        representations_negative.append(torch.tensor(representation))
    positive_tensors = torch.stack(representations_positive)
    negative_tensors = torch.stack(representations_negative)
    
    print('positive mods not found: ', mods_not_found)
    print('positive heads not found: ', heads_not_found)
    print('positive comps not found: ', comps_not_found)
    
    positive_Y = torch.ones(positive_tensors.shape[0])
    negative_Y = torch.zeros(negative_tensors.shape[0])
    
    X = torch.cat((positive_tensors, negative_tensors))
    Y = torch.cat((positive_Y, negative_Y))
    
    assert len(X) == len(Y), 'X and Y are not of the same length'
    
    if shuffle:
        indices = torch.randperm(X.shape[0])
        X = torch.index_select(X, 0, indices)
        Y = torch.index_select(Y, 0, indices)
    
    return (X, Y)

In [37]:
test_data_tuple = convert_dataset_to_tensors(test_dataset)

positive mods not found:  0
positive heads not found:  0
positive comps not found:  0


In [38]:
print(test_data_tuple[1])

tensor([0., 0., 1.,  ..., 0., 1., 0.])


In [39]:
train_data_tuples = []

for data_tuple in train_datasets:
    tensors = convert_dataset_to_tensors(data_tuple)
    train_data_tuples.append(tensors)

positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0
positive mods not found:  0
positive heads not found:  0
positive comps not found:  0


In [40]:
input_size = 600
hidden_size = 300
num_classes = 2
num_epochs = 100
batch_size = 72
learning_rate = 0.002

In [41]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes,bias=False)  
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

model = NeuralNet(input_size, hidden_size, num_classes).to(device)#.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 

In [42]:
def run_classifier(train_datasets, test_dataset, novel_comp_mods=None, novel_comp_heads=None):
    total_accuracy=[]
    novel_comp_predictions=[]
    
    for train_dataset in train_datasets:
        train_X = train_dataset[0].float().to(device)
        print('train_X shape:', train_X.shape)
        train_Y = train_dataset[1].long().to(device)

        test_X = test_dataset[0].float().to(device)
        test_Y = test_dataset[1].long().to(device)
        model = NeuralNet(input_size, hidden_size, num_classes).to(device)
        
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        n_examples=train_X.shape[0]
        for i in range(num_epochs):

            cost = 0.

            num_batches = n_examples // batch_size
            for k in range(num_batches):
                start, end = k * batch_size, (k + 1) * batch_size
                outputs = model(train_X[start:end])
                loss = criterion(outputs, train_Y[start:end])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            #if (k+1) % 100 == 0:
            #print ('Epoch [{}/{}], Loss: {:.4f}'.format(i+1, num_epochs, loss.item()))

        with torch.no_grad():
            correct=0
            total=0
            outputs = model(test_X)
            if novel_comp_mods is not None:
                novel_comp_outputs = model(novel_compounds_mods_tensors.float())
                _, novel_predicted = torch.max(novel_comp_outputs.data, 1)
                novel_comp_predictions.append(novel_predicted)
            _, predicted = torch.max(outputs.data, 1)
            total += test_Y.size(0)
            correct += (predicted == test_Y).sum().item()
        curr_acc=100 * correct / total
        print(curr_acc)
        total_accuracy.append(curr_acc)
    return total_accuracy, novel_comp_predictions

In [43]:
total_accuracy = run_classifier(train_data_tuples, test_data_tuple)

train_X shape: torch.Size([207136, 600])
67.0332633525299
train_X shape: torch.Size([207136, 600])
66.76746470463607
train_X shape: torch.Size([207136, 600])
66.5421136770739
train_X shape: torch.Size([207136, 600])
67.06215450990966
train_X shape: torch.Size([207136, 600])
66.80406017065043
train_X shape: torch.Size([207136, 600])
67.06215450990966
train_X shape: torch.Size([207136, 600])
67.021706889578
train_X shape: torch.Size([207136, 600])
66.41306650744428
train_X shape: torch.Size([207136, 600])
66.88110325699648
train_X shape: torch.Size([207136, 600])
66.71353454419383


In [44]:
print(total_accuracy)

([67.0332633525299, 66.76746470463607, 66.5421136770739, 67.06215450990966, 66.80406017065043, 67.06215450990966, 67.021706889578, 66.41306650744428, 66.88110325699648, 66.71353454419383], [])


([67.0332633525299, 66.76746470463607, 66.5421136770739, 67.06215450990966, 66.80406017065043, 67.06215450990966, 67.021706889578, 66.41306650744428, 66.88110325699648, 66.71353454419383], [])

In [45]:
print(np.mean(total_accuracy[0]))

66.83006221229223


In [46]:
print(np.sd(total_accuracy[0]))

AttributeError: module 'numpy' has no attribute 'sd'