In [1]:
import pandas as pd
import numpy as np
import torch
from keras.preprocessing import text, sequence
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import csv
import codecs
from tqdm import tqdm

import nltk
nltk.download('stopwords')
import string
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


# True when PyTorch can see a CUDA device; batchify() reads this flag to decide
# whether to move the batched tensor to the GPU.
USE_CUDA = torch.cuda.is_available()
# Seed PyTorch's global RNG (weight initialisation, dropout).
# NOTE(review): NumPy's RNG is not seeded here.
torch.manual_seed(0)

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<torch._C.Generator at 0x7f055c679130>

In [2]:
# Download Training Data
# Fetch the archive, unpack the nested zips it contains, then delete the
# archives and the macOS metadata directory so only the extracted files remain.
!wget https://competitions.codalab.org/my/datasets/download/60e40c68-a85d-4320-bef1-d2fe26bb45ca
!unzip 60e40c68-a85d-4320-bef1-d2fe26bb45ca
!unzip training-v1.zip
!unzip trial-data.zip
!rm '60e40c68-a85d-4320-bef1-d2fe26bb45ca'
!rm training-v1.zip
!rm trial-data.zip
!rm -r __MACOSX

# Download Test Data
# Three separate archives; each one is fetched, unpacked and then removed.
# NOTE(review): presumably one archive per sub-task (a/b/c) - confirm.
!wget https://competitions.codalab.org/my/datasets/download/5cac0f56-bb6d-40fa-8041-caf8aa13d09d
!unzip 5cac0f56-bb6d-40fa-8041-caf8aa13d09d
!rm 5cac0f56-bb6d-40fa-8041-caf8aa13d09d

!wget https://competitions.codalab.org/my/datasets/download/bb373027-c8b7-48ab-9729-b1ab3fb51c17
!unzip bb373027-c8b7-48ab-9729-b1ab3fb51c17
!rm bb373027-c8b7-48ab-9729-b1ab3fb51c17

!wget https://competitions.codalab.org/my/datasets/download/38273e56-2ab0-4773-82bf-95aec51bba69
!unzip 38273e56-2ab0-4773-82bf-95aec51bba69
!rm 38273e56-2ab0-4773-82bf-95aec51bba69

--2019-03-01 22:32:36--  https://competitions.codalab.org/my/datasets/download/60e40c68-a85d-4320-bef1-d2fe26bb45ca
Resolving competitions.codalab.org (competitions.codalab.org)... 134.158.75.178
Connecting to competitions.codalab.org (competitions.codalab.org)|134.158.75.178|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://newcodalab.lri.fr/prod-private/dataset_data_file/None/787f6/start-kit.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=a00e46f1043d3a6b0e95bb649ea11ed0fbeb5453cbd22b3b372f1bccdaaecd7f&X-Amz-Date=20190301T223232Z&X-Amz-Credential=AZIAIOSAODNN7EX123LE%2F20190301%2Fnewcodalab%2Fs3%2Faws4_request [following]
--2019-03-01 22:32:37--  https://newcodalab.lri.fr/prod-private/dataset_data_file/None/787f6/start-kit.zip?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=a00e46f1043d3a6b0e95bb649ea11ed0fbeb5453cbd22b3b372f1bccdaaecd7f&X-Amz-Date=2019

In [0]:
def get_tokenized_corpus(corpus):
    """Tokenise every sentence of ``corpus`` by splitting on single spaces.

    Args:
        corpus: iterable of strings.

    Returns:
        A list with one list of tokens per input sentence. Because the split
        is on the literal ``' '`` character, an empty sentence yields ``['']``
        and consecutive spaces yield empty-string tokens.
    """
    return [sentence.split(' ') for sentence in corpus]

In [0]:
def get_word2idx(tokenized_corpus):
    """Build a token -> integer-id vocabulary from a tokenised corpus.

    Ids are assigned in order of first appearance, starting at 1. Id 0 is
    reserved for the ``'<pad>'`` placeholder used when padding sentences.

    Args:
        tokenized_corpus: iterable of token lists.

    Returns:
        dict mapping every distinct token to its id, plus ``'<pad>' -> 0``.
    """
    word2idx = {}
    for sentence in tokenized_corpus:
        for token in sentence:
            # Dict membership is O(1); the previous list-based vocabulary made
            # this loop quadratic in the vocabulary size.
            if token not in word2idx:
                word2idx[token] = len(word2idx) + 1

    # we reserve the 0 index for the placeholder token
    word2idx['<pad>'] = 0

    return word2idx

In [0]:
def get_model_inputs(tokenized_corpus, word2idx, labels, max_len):
    """Turn tokenised sentences and labels into padded tensors.

    Args:
        tokenized_corpus: iterable of token lists.
        word2idx: dict mapping tokens to integer ids; tokens that are not in
            it are dropped.
        labels: sequence of numeric labels, one per sentence.
        max_len: number of columns of the returned data tensor.

    Returns:
        ``(data_tensor, label_tensor)`` where ``data_tensor`` is a LongTensor
        of shape ``(len(tokenized_corpus), max_len)``, right-padded with 0,
        and ``label_tensor`` is ``torch.Tensor(labels)``.
    """
    # Index the sentences. Sentences longer than max_len are truncated;
    # previously they raised a shape-mismatch error on assignment below.
    vectorized_sents = [
        [word2idx[tok] for tok in sent if tok in word2idx][:max_len]
        for sent in tokenized_corpus
    ]

    # Fixed-size tensor filled with zeroes (the padding id).
    data_tensor = torch.zeros((len(vectorized_sents), max_len)).long()

    for idx, sent in enumerate(vectorized_sents):
        if sent:
            data_tensor[idx, :len(sent)] = torch.LongTensor(sent)

    label_tensor = torch.Tensor(labels)

    return data_tensor, label_tensor


def get_model_inputs_test(tokenized_corpus, word2idx, max_len):
    """Turn unlabelled tokenised sentences into a padded index tensor.

    Args:
        tokenized_corpus: iterable of token lists.
        word2idx: dict mapping tokens to integer ids; tokens that are not in
            it are dropped.
        max_len: number of columns of the returned tensor.

    Returns:
        LongTensor of shape ``(len(tokenized_corpus), max_len)``, right-padded
        with 0.
    """
    # Index the sentences. Sentences longer than max_len are truncated;
    # previously they raised a shape-mismatch error on assignment below
    # (this happens when max_len comes from a different corpus).
    vectorized_sents = [
        [word2idx[tok] for tok in sent if tok in word2idx][:max_len]
        for sent in tokenized_corpus
    ]

    # Fixed-size tensor filled with zeroes (the padding id).
    data_tensor = torch.zeros((len(vectorized_sents), max_len)).long()

    for idx, sent in enumerate(vectorized_sents):
        if sent:
            data_tensor[idx, :len(sent)] = torch.LongTensor(sent)

    return data_tensor

In [0]:
def batchify(data, batch_size):
    '''
    Reshape ``data`` so that it has ``batch_size`` rows.

    Rows of ``data`` beyond the largest multiple of ``batch_size`` are
    dropped, the remainder is viewed as ``(batch_size, -1)`` and, when the
    module-level ``USE_CUDA`` flag is set, moved to the GPU.
    '''
    full_batches = data.size(0) // batch_size
    kept_rows = full_batches * batch_size

    # narrow() discards the trailing rows that do not fill a whole batch
    reshaped = data.narrow(0, 0, kept_rows).view(batch_size, -1).contiguous()

    return reshaped.cuda() if USE_CUDA else reshaped
  

def getBatch(data, labels, seq_length, batch_size):
    '''
    Yield ``(inputs, targets)`` mini-batches from a tensor made by batchify().

    ``data`` is expected to have the layout batchify() produces from a
    ``(num_sentences, seq_length)`` tensor: row ``r`` holds sentences
    ``r * n_chunks .. (r + 1) * n_chunks - 1`` concatenated along dim 1.
    Column chunk ``k`` therefore contains sentence ``r * n_chunks + k`` in
    row ``r``, and its labels are ``labels[k::n_chunks]``.

    Args:
        data: 2-D tensor of shape ``(rows, n_chunks * seq_length)``.
        labels: 1-D sequence of per-sentence labels in original order.
        seq_length: width of one sentence (columns per chunk).
        batch_size: kept for interface compatibility; the number of targets
            per batch follows ``data.size(0)``.

    Yields:
        ``(inputs, targets)`` with ``inputs`` of shape ``(rows, seq_length)``
        and ``targets`` of length ``rows``, aligned row by row.

    The previous implementation used ``labels[i:i + batch_size]`` with ``i``
    being a column offset, which paired inputs with unrelated labels, and
    its range bound skipped the final chunk.
    '''
    n_rows = data.size(0)
    n_chunks = data.size(1) // seq_length
    for chunk in range(n_chunks):
        start = chunk * seq_length
        inputs = data[:, start:start + seq_length]
        # One label per row: sentence index is row * n_chunks + chunk.
        targets = labels[chunk:n_rows * n_chunks:n_chunks]
        yield (inputs, targets)


In [0]:
def calculate_metrics(real, preds):
    """Compute classification metrics for ``preds`` against ``real``.

    Returns:
        Tuple ``(accuracy, recall, precision, f1)``. Recall and precision are
        computed with the scorer's default averaging, whereas F1 is
        explicitly macro-averaged.
    """
    return (
        accuracy_score(real, preds),
        recall_score(real, preds),
        precision_score(real, preds),
        f1_score(real, preds, average='macro'),
    )

In [0]:
def get_train_test_dataset(X, y, test_split=0.1, shuffle_dataset=True, random_seed=None):
    """Split ``X`` and ``y`` into a train part and a held-out part.

    Args:
        X, y: index-able containers with the same first dimension.
        test_split: fraction of rows assigned to the held-out part
            (rounded down).
        shuffle_dataset: shuffle row indices before splitting.
        random_seed: value passed to ``np.random.seed`` before shuffling;
            ``None`` re-seeds NumPy's global RNG from fresh entropy.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    n_samples = X.shape[0]
    n_held_out = int(np.floor(test_split * n_samples))
    order = list(range(n_samples))

    if shuffle_dataset:
        # np.random.seed(None) is the same call as np.random.seed()
        np.random.seed(random_seed)
        np.random.shuffle(order)

    held_out_rows = order[:n_held_out]
    train_rows = order[n_held_out:]

    return X[train_rows], y[train_rows], X[held_out_rows], y[held_out_rows]

In [0]:
def clean_text(text_list):
    """Normalise raw strings into space-joined, stemmed, lower-case tokens.

    For every input string the function lower-cases it, removes English stop
    words and words shorter than three characters, expands a few
    contractions, isolates or removes punctuation, and stems each remaining
    word with the English Snowball stemmer.

    Args:
        text_list: iterable of strings.

    Returns:
        List of cleaned strings, one per input, in the same order.
    """
    # Loop-invariant resources: build them once, not once per input string.
    stops = set(stopwords.words("english"))
    stemmer = SnowballStemmer('english')

    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = [
        (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        (r"(\d+)(k)", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # Was r"\0s": in a regex "\0" is the NUL character, so the rule
        # could never match text such as "1990s".
        (r"0s", "0"),
        (r" 9 11 ", "911"),
        (r"e - mail", "email"),
        (r"j k", "jk"),
        (r"\s{2,}", " "),
        (r"[^\w]", " "),
    ]

    output = []
    for raw in text_list:
        # The former `text.translate(string.punctuation)` call was dropped:
        # given a plain string as table it removes no punctuation and instead
        # remaps control characters (tab -> '*', newline -> '+'), gluing
        # neighbouring words together. Punctuation is handled by the
        # substitutions above, after contractions have been expanded.

        ## Convert words to lower case and split them
        words = raw.lower().split()

        ## Remove stop words and very short words
        words = [w for w in words if w not in stops and len(w) >= 3]
        cleaned = " ".join(words)

        # Clean the text
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)

        stemmed_words = [stemmer.stem(word) for word in cleaned.split()]
        output.append(" ".join(stemmed_words))

    return output

# Data Processing

In [0]:
#######################
### DATA PROCESSING ###
#######################
# Path files for training and test data
train_path = 'offenseval-training-v1.tsv'
test_task_a_path = 'testset_taska.tsv'
test_task_b_path = 'testset_taskb.tsv'
test_task_c_path = 'test_set_taskc.tsv'

# Seed for the train/validation/test shuffles so that a fresh
# "Restart & Run All" reproduces the same splits.
SPLIT_SEED = 0

tweets = []
subtask_a, subtask_b, subtask_c = [], [], []
# newline='' is what the csv module expects from the file object it reads.
with open(train_path, encoding='utf-8', newline='') as tsvfile:
    reader = csv.DictReader(tsvfile, dialect='excel-tab')
    print(reader.fieldnames)
    for row in reader:
        tweets.append(row['tweet'])
        subtask_a.append(row['subtask_a'])
        subtask_b.append(row['subtask_b'])
        subtask_c.append(row['subtask_c'])

tweets_clean = clean_text(tweets)
train = tweets_clean

# Sub-task A labels: 'OFF' -> 1.0, 'NOT' -> 0.0
subtask_a_labels = np.array(subtask_a)
is_off = subtask_a_labels == 'OFF'
is_not = subtask_a_labels == 'NOT'
no_off = int(is_off.sum())
no_not = int(is_not.sum())
assert no_off + no_not == len(subtask_a_labels), "unexpected subtask_a label value"

# Per-class loss weights, inversely proportional to class frequency: each
# class is weighted by the share of the *other* class, so the rarer class
# contributes more to the loss. (They used to be the class's own share,
# which down-weighted the minority class.)
weight_off = no_not / (no_off + no_not)
weight_not = no_off / (no_off + no_not)

# np.float was removed from NumPy; build the float labels directly.
train_labels = is_off.astype(np.float64)

print("Length of train set: {}, Length of train labels: {}".format(len(train), len(train_labels)))

tokenized_corpus = get_tokenized_corpus(train)
print("Tokenized corpus size:", len(tokenized_corpus))

sent_lengths = [len(sent) for sent in tokenized_corpus]
max_len = int(max(sent_lengths))

word2idx = get_word2idx(tokenized_corpus)
print("Word2Idx size:", len(word2idx))

data, labels = get_model_inputs(tokenized_corpus, word2idx, train_labels, max_len)

print("Train Tensor shape: {}, Train labels shape: {}". format(data.shape, labels.shape))

### Hold out 10% as test set, then 10% of the remainder as validation set
X_training, y_training, X_test, y_test = get_train_test_dataset(data, labels, random_seed=SPLIT_SEED)
X_train, y_train, X_val, y_val = get_train_test_dataset(X_training, y_training, random_seed=SPLIT_SEED)
assert len(X_train) + len(X_val) + len(X_test) == len(data)

### splitting validation and test set into offensive and non-offensive
nots_test_idx = torch.nonzero(y_test == 0.0).squeeze(1)
offs_test_idx = torch.nonzero(y_test == 1.0).squeeze(1)
nots_val_idx = torch.nonzero(y_val == 0.0).squeeze(1)
offs_val_idx = torch.nonzero(y_val == 1.0).squeeze(1)

X_test_off = X_test[offs_test_idx]
X_test_not = X_test[nots_test_idx]
y_test_off = y_test[offs_test_idx]
y_test_not = y_test[nots_test_idx]

X_val_off = X_val[offs_val_idx]
X_val_not = X_val[nots_val_idx]
y_val_off = y_val[offs_val_idx]
y_val_not = y_val[nots_val_idx]

# Batchify the training set
# (128 has to match the BATCH_SIZE used by the training cell)
X_train_batched = batchify(X_train, 128)
print("X_train", X_train.shape)
print(X_train_batched.shape)

# Batchify the validation set
X_val_off_batched = batchify(X_val_off, 128)
X_val_not_batched = batchify(X_val_not, 128)
X_val_batched = batchify(X_val, 128)
print("X_val", X_val.shape)
print("X_val_not_batched", X_val_not_batched.shape)
print("X_val_batched", X_val_batched.shape)

# Batchify the test set
X_test_off_batched = batchify(X_test_off, 128)
X_test_not_batched = batchify(X_test_not, 128)
X_test_batched = batchify(X_test, 128)
print('X_test.shape:         {}'.format(X_test.shape))
print('X_test_off_batched.shape: {}'.format(X_test_off_batched.shape))
print('X_test_not_batched.shape: {}'.format(X_test_not_batched.shape))
print('X_test_batched.shape: {}'.format(X_test_batched.shape))

['id', 'tweet', 'subtask_a', 'subtask_b', 'subtask_c']


In [0]:
### Extract the GloVe vectors.
### NOTE(review): this cell only unzips - glove.6B.zip must already be in the
### working directory; nothing in this cell downloads it.
!unzip glove.6B.zip

In [0]:
# Dimensionality of the vectors stored in glove.6B.300d.txt
GLOVE_DIM = 300

# word -> pretrained vector, parsed from the GloVe text file
# (one "<word> <v1> ... <vN>" entry per line).
embeddings_index = {}
with open('glove.6B.300d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        val = line.split()
        embeddings_index[val[0]] = np.asarray(val[1:], dtype='float32')

# One row per vocabulary id. Words without a pretrained vector (and the
# '<pad>' id 0) keep an all-zero row. This matrix was previously used
# without ever being created, which raised a NameError.
embedding_matrix = np.zeros((len(word2idx), GLOVE_DIM), dtype='float32')
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Models

In [0]:
##############
### MODELS ###
##############

class Model(nn.Module):
    """Embedding -> single-layer LSTM -> max over time -> linear logit.

    Args:
        vocab_size: number of rows of the embedding table.
        embed_size: embedding dimensionality.
        hidden_size: LSTM hidden size.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super(Model, self).__init__()

        self.n_layers = 1
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Inputs are (batch, seq_len) index tensors, so the embedded tensor is
        # batch-major. Without batch_first=True the LSTM would treat the
        # batch dimension as time and recur across different samples.
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 1)
        self.init_weight()

    def forward(self, x, h=None, is_training=False):
        """Compute one logit per input row.

        Args:
            x: LongTensor of token ids, shape ``(batch, seq_len)``.
            h: accepted for interface compatibility; not used - the LSTM
                always starts from a zero state.
            is_training: accepted for interface compatibility; not used.

        Returns:
            ``(logits, h)`` with ``logits`` of shape ``(batch, 1)`` and ``h``
            the final LSTM hidden state.
        """
        x = self.embedding(x)
        x, (h, _) = self.lstm(x)
        # Max-pool the LSTM outputs over the sequence dimension
        x, _ = torch.max(x, 1)
        x = self.fc1(x)

        return x, h

    def init_weight(self):
        """Initialise embedding and output weights uniformly in (-0.1, 0.1)."""
        self.embedding.weight.data.uniform_(-0.1, 0.1)
        self.fc1.weight.data.uniform_(-0.1, 0.1)
        self.fc1.bias.data.zero_()

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (n_layers, batch_size, hidden_size).

        The tensor is created with the dtype and device of the model's first
        parameter.
        """
        weight = next(self.parameters())
        return weight.new_zeros(self.n_layers, batch_size, self.hidden_size)


class Model_lstm(nn.Module):
    """Embedding -> LSTM (hidden size 20) -> mean over time -> linear logit.

    Args:
        max_features: number of rows of the embedding table.
        embed_size: embedding dimensionality.
    """

    def __init__(self, max_features, embed_size):
        super(Model_lstm, self).__init__()

        self.embedding = nn.Embedding(max_features, embed_size)
        # Inputs are batch-major (batch, seq_len); without batch_first=True
        # the LSTM would recur across samples instead of across tokens.
        self.lstm = nn.LSTM(embed_size, 20, batch_first=True)
        self.fully_connected = nn.Linear(20, 1)

    def forward(self, x):
        """Return logits of shape ``(batch, 1)`` for ids ``x`` of shape ``(batch, seq_len)``."""
        x = self.embedding(x)
        x, _ = self.lstm(x)

        # Average the LSTM outputs over the sequence dimension
        x = x.mean(1)
        x = self.fully_connected(x)
        return x
      
      
class Model_conv_lstm_glove(nn.Module):
    """Frozen pretrained embedding -> Conv1d -> ReLU -> MaxPool1d -> 2-layer LSTM -> linear logit.

    The embedding weights are taken from the module-level ``embedding_matrix``
    and are not trained.

    Args:
        max_features: number of rows passed to ``nn.Embedding`` (the weight is
            replaced by ``embedding_matrix`` right after construction).
        embed_size: used as the embedding size of the placeholder table and
            as the number of input channels of the convolution.
        hidden_size: stored on the instance; the LSTM itself uses a fixed
            hidden size of 20.

    NOTE(review): Conv1d convolves over the last dimension and treats dim 1 as
    channels, while the embedding output has sequence positions in dim 1 -
    confirm that the sequence length equals ``embed_size`` and that the
    pooled length equals the LSTM input size of 99.
    """

    def __init__(self, max_features, embed_size, hidden_size):
        super(Model_conv_lstm_glove, self).__init__()

        self.n_layers = 1
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(max_features, embed_size)
        et = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding.weight = nn.Parameter(et)
        self.embedding.weight.requires_grad = False
        self.dropout = nn.Dropout2d(0.2)
        self.convolution = nn.Conv1d(embed_size, 5, 3)
        self.relu = nn.ReLU()
        self.max_pooling = nn.MaxPool1d(3)
        self.lstm = nn.LSTM(99, 20, 2)
        self.fully_connected = nn.Linear(20, 1)

    def forward(self, x, hidden_state=None, cell_state=None):
        """Run the network.

        Args:
            x: LongTensor of token ids.
            hidden_state, cell_state: optional initial LSTM state. When either
                is omitted the LSTM starts from zeros.

        Returns:
            ``(logits, hidden_state)``.
        """
        x = self.embedding(x)
        x = self.dropout(x)
        x = self.convolution(x)
        x = self.relu(x)
        x = self.max_pooling(x)

        # nn.LSTM takes its initial state as a single (h_0, c_0) tuple. The
        # old call passed the two tensors as separate positional arguments,
        # and the debug prints before it referenced an undefined `hidden`.
        if hidden_state is None or cell_state is None:
            x, (hidden_state, cell_state) = self.lstm(x)
        else:
            x, (hidden_state, cell_state) = self.lstm(x, (hidden_state, cell_state))

        # Max-pool the LSTM outputs over dim 1
        x, _ = torch.max(x, 1)
        x = self.fully_connected(x)
        return x, hidden_state

    def init_weight(self):
        """Initialise the output layer uniformly in (-0.1, 0.1).

        The embedding is left untouched because it holds the frozen
        pretrained vectors. (This method used to reference a non-existent
        ``self.fc1`` and raised AttributeError.)
        """
        self.fully_connected.weight.data.uniform_(-0.1, 0.1)
        self.fully_connected.bias.data.zero_()

    def init_hidden(self, batch_size):
        """Return zero ``(hidden_state, cell_state)`` tensors for the LSTM.

        Each has shape ``(num_layers, batch_size, hidden_size)`` of
        ``self.lstm`` and lives on the same device as the model parameters.
        """
        weight = next(self.parameters())
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        return weight.new_zeros(*shape), weight.new_zeros(*shape)


class Model_conv_lstm(nn.Module):
    """Embedding -> Dropout2d -> Conv1d(82, 128, 103) -> ReLU -> MaxPool1d(3) -> LSTM(1, 20) -> linear logit.

    Args:
        max_features: number of rows of the embedding table.
        embed_size: embedding dimensionality.
    """

    def __init__(self, max_features, embed_size):
        super().__init__()

        self.embedding = nn.Embedding(max_features, embed_size)
        self.dropout = nn.Dropout2d(0.2)
        self.convolution = nn.Conv1d(82, 128, 103)
        self.relu = nn.ReLU()
        self.max_pooling = nn.MaxPool1d(3)
        self.lstm = nn.LSTM(1, 20)
        self.fully_connected = nn.Linear(20, 1)

    def forward(self, x):
        """Map a LongTensor of token ids to one logit per row."""
        embedded = self.dropout(self.embedding(x))
        conv_features = self.max_pooling(self.relu(self.convolution(embedded)))

        recurrent_out, _ = self.lstm(conv_features)

        # Max over dim 1 of the LSTM output
        pooled, _ = torch.max(recurrent_out, 1)

        return self.fully_connected(pooled)
    

class Model_lstm_conv(nn.Module):
    """Embedding -> LSTM (hidden 20) -> Conv1d(82, 128, 3) -> ReLU -> MaxPool1d(3) -> mean -> linear logit.

    The convolution takes the 82 sequence positions as input channels, so
    inputs must be ``(batch, 82)`` index tensors.

    Args:
        max_features: number of rows of the embedding table.
        embed_size: embedding dimensionality (also the LSTM input size; it
            used to be hard-coded to 103).
    """

    def __init__(self, max_features, embed_size):
        super(Model_lstm_conv, self).__init__()

        self.embedding = nn.Embedding(max_features, embed_size)
        # batch_first=True: inputs are batch-major, so the recurrence must run
        # over dim 1 (tokens), not dim 0 (samples). The former dropout=0.5
        # argument was removed: on a single-layer LSTM it has no effect.
        self.lstm = nn.LSTM(embed_size, 20, batch_first=True)
        self.convolution = nn.Conv1d(82, 128, 3)
        self.relu = nn.ReLU()
        self.max_pooling = nn.MaxPool1d(3)
        self.fully_connected = nn.Linear(6, 1)

    def forward(self, x):
        """Return logits of shape ``(batch, 1)`` for ids ``x`` of shape ``(batch, 82)``."""
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.convolution(x)
        x = self.relu(x)
        x = self.max_pooling(x)

        # Average over the convolution's output channels
        x = x.mean(1)
        x = self.fully_connected(x)
        return x


class Model_conv_lstm_2linear(nn.Module):
    """Frozen pretrained embedding -> Dropout2d -> Conv1d -> MaxPool1d -> LSTM -> Linear -> ReLU -> max -> Linear.

    The embedding weights come from the module-level ``embedding_matrix`` and
    are excluded from training.

    Args:
        max_features: number of rows passed to ``nn.Embedding``.
        embed_size: embedding size passed to ``nn.Embedding``.
    """

    def __init__(self, max_features, embed_size):
        super().__init__()

        self.embedding = nn.Embedding(max_features, embed_size)
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding.weight = nn.Parameter(pretrained)
        self.embedding.weight.requires_grad = False
        self.dropout = nn.Dropout2d(0.2)
        self.convolution = nn.Conv1d(82, 128, 103)
        self.max_pooling = nn.MaxPool1d(3)
        self.lstm = nn.LSTM(1, 20)
        self.fully_connected0 = nn.Linear(20, 10)
        self.relu = nn.ReLU()
        self.fully_connected1 = nn.Linear(10, 1)

    def forward(self, x):
        """Map a LongTensor of token ids to one logit per row."""
        embedded = self.dropout(self.embedding(x))
        conv_features = self.max_pooling(self.convolution(embedded))

        recurrent_out, _ = self.lstm(conv_features)
        hidden = self.relu(self.fully_connected0(recurrent_out))

        # Max over dim 1 of the hidden activations
        pooled, _ = torch.max(hidden, 1)

        return self.fully_connected1(pooled)


class Model_conv_gru_2linear(nn.Module):
    """Frozen pretrained embedding -> Dropout2d -> Conv1d -> MaxPool1d -> GRU -> Linear -> ReLU -> max -> Linear.

    The embedding weights come from the module-level ``embedding_matrix`` and
    are excluded from training.

    Args:
        max_features: number of rows passed to ``nn.Embedding``.
        embed_size: embedding size passed to ``nn.Embedding``.
    """

    def __init__(self, max_features, embed_size):
        super().__init__()

        self.embedding = nn.Embedding(max_features, embed_size)
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding.weight = nn.Parameter(pretrained)
        self.embedding.weight.requires_grad = False
        self.dropout = nn.Dropout2d(0.2)
        self.convolution = nn.Conv1d(82, 128, 103)
        self.max_pooling = nn.MaxPool1d(3)
        self.gru = nn.GRU(66, 20, dropout=0.5)
        self.fully_connected0 = nn.Linear(20, 10)
        self.relu = nn.ReLU()
        self.fully_connected1 = nn.Linear(10, 1)

    def forward(self, x):
        """Map a LongTensor of token ids to one logit per row."""
        embedded = self.dropout(self.embedding(x))
        conv_features = self.max_pooling(self.convolution(embedded))

        recurrent_out, _ = self.gru(conv_features)
        hidden = self.relu(self.fully_connected0(recurrent_out))

        # Max over dim 1 of the hidden activations
        pooled, _ = torch.max(hidden, 1)

        return self.fully_connected1(pooled)

In [0]:
def accuracy(output, target):
    """Fraction of logits in ``output`` whose rounded sigmoid equals ``target``.

    Returns a 0-d tensor: number of matches divided by ``len(output)``.
    """
    predicted_labels = torch.sigmoid(output).round()
    hits = predicted_labels.eq(target).float()
    return hits.sum() / len(hits)


# Number of training epochs (full passes over the training batches)
epochs=10

# The input dimension is the vocabulary size (including the '<pad>' entry)
INPUT_DIM = len(word2idx)

# Embedding dimension passed to the model constructors
# NOTE(review): the GloVe-based models replace the embedding weight with
# embedding_matrix, so their actual embedding width is that matrix's - confirm
# that 103 is still the intended value for them.
EMBEDDING_DIM = 103

# Hidden size passed to the models that take a hidden_size argument
HIDDEN_DIM = 20

# The output dimension is the number of classes, 1 for binary classification
OUTPUT_DIM = 1

# Second dimension of X_train (padded sentence length); passed to getBatch
# as seq_length by the training cell
NUM_FEATURES = X_train.size(1)

# Batch size
BATCH_SIZE = 128

In [0]:
# Sanity check: vocabulary size and shape of the batchified training tensor
print('INPUT_DIM: {}'.format(INPUT_DIM))
print('X_train_batched.shape: {}'.format(X_train_batched.shape))

In [0]:
# Pick the architecture to train; the alternatives are left commented out.
# NOTE(review): unlike the alternatives that take only x, this model's forward
# returns a (logits, hidden_state) tuple - confirm that the training and
# evaluation cells handle that.
# model = Model(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM)
model = Model_conv_lstm_glove(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM)
# model = Model_conv_lstm(INPUT_DIM, EMBEDDING_DIM)
# model = Model_lstm_conv(INPUT_DIM, EMBEDDING_DIM)
# model = Model_lstm(INPUT_DIM, EMBEDDING_DIM)
# model = Model_conv_gru_2linear(INPUT_DIM, EMBEDDING_DIM)

# Count only the trainable parameters (those with requires_grad=True)
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Total number of parameters is: {}".format(params))
print(model)

# We use the Adam optimizer with a small L2 weight decay
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

# # we use the binary cross-entropy loss with sigmoid (applied to logits) 
# #Recall we did not apply any activation to our output layer, we need to make our outputs look like probality.
# loss_fn = nn.BCEWithLogitsLoss()

# Binary cross-entropy applied to raw logits (the sigmoid is part of the loss)
def loss_fn(output, targets, loss_weights=None):
    """Mean binary cross-entropy between logits ``output`` and ``targets``.

    Args:
        output: tensor of raw (pre-sigmoid) logits.
        targets: tensor of 0/1 targets with the same shape as ``output``.
        loss_weights: optional per-element rescaling weights; ``None`` means
            unweighted.

    Returns:
        0-d tensor holding the (weighted) mean loss.
    """
    # Pass the weights to the constructor and call the module itself rather
    # than assigning .weight afterwards and invoking .forward() directly.
    criterion = nn.BCEWithLogitsLoss(weight=loss_weights)
    return criterion(output, targets)

# Training

In [0]:
################
### TRAINING ###
################

# Run every tensor on the device the model's parameters live on. batchify()
# may have moved the inputs to the GPU while the model and labels stay on CPU.
device = next(model.parameters()).device

for epoch in range(1, epochs+1):
    # (Re-)enable dropout at the start of every epoch: the validation pass
    # below switches the model to eval mode, and a single model.train()
    # before the loop would leave dropout disabled from epoch 2 onwards.
    model.train()

    losses = []
    total_accuracy = 0
    total_precision = 0
    total_recall = 0
    total_f1_score = 0

    for inputs, targets in getBatch(X_train_batched, y_train, NUM_FEATURES, BATCH_SIZE):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # we zero the gradients as they are not removed automatically
        model.zero_grad()

        output = model(inputs)
        # Some models return (logits, hidden_state); keep only the logits.
        if isinstance(output, tuple):
            output = output[0]
        # predictions are (batch size, 1); drop the trailing dimension only,
        # so that a batch of one row stays 1-D
        output = output.squeeze(1)

        # Per-example loss weights chosen by class
        loss_weights = torch.ones(targets.shape[0], device=device)
        loss_weights[targets == 1.0] = weight_off
        loss_weights[targets == 0.0] = weight_not

        loss = loss_fn(output, targets, loss_weights)
        losses.append(loss.item())

        predictions = torch.round(torch.sigmoid(output))
        acc_sb, rec_sb, prec_sb, f1_sb = calculate_metrics(
            targets.cpu().numpy(), predictions.detach().cpu().numpy())

        total_accuracy += acc_sb
        total_precision += prec_sb
        total_recall += rec_sb
        total_f1_score += f1_sb

        # calculate the gradient of each parameter
        loss.backward()

        # update the parameters using the gradients and optimizer algorithm
        optimizer.step()

    # Guard against an empty epoch so the averages below cannot divide by zero
    n_batches = max(len(losses), 1)
    mean_train_loss = np.array(losses).mean() if losses else float('nan')
    print('Epoch: {:02} | Train Loss:      {:.3f} | F1: {:.4f} | Accuracy: {:.4f} | Precision: {:.4f} | Recall: {:.4f}'.format(epoch, mean_train_loss, total_f1_score/n_batches, total_accuracy/n_batches, total_precision/n_batches, total_recall/n_batches))

    # VALIDATION
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        val_inputs = X_val.to(device)
        val_targets = y_val.to(device)

        output = model(val_inputs)
        if isinstance(output, tuple):
            output = output[0]
        output = output.squeeze(1)

        loss_weights = torch.ones(val_targets.shape[0], device=device)
        loss_weights[val_targets == 1.0] = weight_off
        loss_weights[val_targets == 0.0] = weight_not

        loss = loss_fn(output, val_targets, loss_weights)

        # Save the first loss to compare in following epochs, to stop overfitting
        if epoch == 1:
            first_loss = loss
        elif (loss - first_loss) > 0.15:
            print("Loss: {}, First loss: {}".format(loss, first_loss))
            print("Loss is going up above your tolerance. So we stop training")
            break
        predictions = torch.round(torch.sigmoid(output))

        acc, recall, precision, f1 = calculate_metrics(
            val_targets.cpu().numpy(), predictions.cpu().numpy())

        print('            Validation Loss: {:.3f} | F1: {:.4f} | Accuracy: {:.4f} | Precision: {:.4f} | Recall: {:.4f}'.format(loss, f1, acc, precision, recall))


# Testing

In [0]:
###############
### TESTING ###
###############

# Check metrics on the held-out test split
model.eval()
with torch.no_grad():
    # Evaluate on whichever device the model's parameters live on
    device = next(model.parameters()).device
    test_inputs = X_test.to(device)
    test_targets = y_test.to(device)

    output_full = model(test_inputs)
    # Some models return (logits, hidden_state); keep only the logits.
    if isinstance(output_full, tuple):
        output_full = output_full[0]
    # predictions are (batch size, 1); remove the dimension of size 1
    output_full = output_full.squeeze(1)

    # Per-example loss weights chosen by class
    loss_weights = torch.ones(test_targets.shape[0], device=device)
    loss_weights[test_targets == 1.0] = weight_off
    loss_weights[test_targets == 0.0] = weight_not

    loss_full = loss_fn(output_full, test_targets, loss_weights)

    predictions_full = torch.round(torch.sigmoid(output_full))

    acc_full, recall_full, precision_full, f1_full = calculate_metrics(
        test_targets.cpu().numpy(), predictions_full.cpu().numpy())

print('FULL TEST SET: Test Loss: {:.3f} | F1: {:.4f} | Accuracy: {:.4f} | Precision: {:.4f} | Recall: {:.4f}\n'.format(loss_full, f1_full, acc_full, precision_full, recall_full))


# Exporting labels for the submission
Run the trained model on the provided test data, which does not contain any labels.

## Data Processing

In [0]:
# Path of the unlabelled sub-task A test file
# NOTE(review): the data-processing cell spells this 'testset_taska.tsv'
# (underscore) - confirm which file name is correct.
test_task_a_path = 'testset-taska.tsv'

test_tweets = []
test_ids = []
# newline='' is what the csv module expects from the file object it reads.
with open(test_task_a_path, encoding='utf-8', newline='') as tsvfile:
    reader = csv.DictReader(tsvfile, dialect='excel-tab')
    print(reader.fieldnames)
    for row in reader:
        test_ids.append(row['id'])
        test_tweets.append(row['tweet'])

test = clean_text(test_tweets)

tokenized_corpus = get_tokenized_corpus(test)
print("Tokenized corpus size:", len(tokenized_corpus))

sent_lengths = [len(sent) for sent in tokenized_corpus]

# Keep the vocabulary built from the training data. Rebuilding word2idx from
# the test tweets (as was done here before) assigns different ids to the
# same words, so the trained embedding rows would no longer match - and ids
# could exceed the embedding table size. Test words missing from the
# training vocabulary are dropped by get_model_inputs_test.
print("Word2Idx size:", len(word2idx))

# max_len comes from the training data; cut longer test sentences so they
# fit into the fixed-width tensor.
tokenized_corpus = [sent[:max_len] for sent in tokenized_corpus]

data = get_model_inputs_test(tokenized_corpus, word2idx, max_len)
assert data.shape[0] == len(test_ids)

## Forward Pass


In [0]:
# Predict labels for the unlabelled submission data
model.eval()
with torch.no_grad():
    _predictions = model(data.to(next(model.parameters()).device))
    # Some models return (logits, hidden_state); keep only the logits.
    if isinstance(_predictions, tuple):
        _predictions = _predictions[0]
    # predictions are (batch size, 1); drop only that trailing dimension so a
    # single-row input still yields a 1-D tensor
    _predictions = _predictions.squeeze(1)

    predictions = torch.round(torch.sigmoid(_predictions))
     

In [0]:
# Inspect the predicted labels and check that there is one per input row
print(predictions)
print(predictions.shape)
print(data.shape)

## Write to CSV file

In [0]:
# 'OFF' == 1.
# 'NOT' == 0.
# Anything that is neither 0 nor 1 (e.g. NaN) is written as "empty".
label_names = {1.0: 'OFF', 0.0: 'NOT'}
output = np.array([label_names.get(p, 'empty') for p in predictions.tolist()])

assert len(test_ids) == output.shape[0], "one prediction per test id expected"

# Two columns: tweet id, predicted label
output = np.vstack((test_ids, output)).T

# write to csv file
# newline='' stops the csv module from emitting blank lines between rows on
# platforms that translate line endings.
with open('taska_submission.csv', 'w', newline='') as myFile:
    writer = csv.writer(myFile)
    writer.writerows(output)

In [0]:
"""
- re-run training with lstm conv
- re-run submission generation
"""