### Read training, dev and unlabeled test data

The following provides a starting code (Python 3) of how to read the labeled training and dev cipher text, and unlabeled test cipher text, into lists.

In [103]:
train, dev, test = [], [], []

In [104]:
for x in open('./train_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    train.append(x)
print (len(train))
print (train[:3])

16220
[[0, 'lkêcê yoúc cêêö y#êjl lw mówám Újám j Úêê# ütlk Úol lkêú z#ê ctöé8ú ówl xoóóú éê#xw#öê#c .'], [0, '6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy .'], [0, 'tx lktc kw8t6jú öw2tê tc coééwcê6 lw Úê j ytxl , cwöêÚw6ú oóü#jééê6 tl êj#8ú , lwwm wol j88 lkê yww6 cloxx , jó6 8êxl Úêktó6 lkê á#jé ( 8tlê#j88ú ) .']]


In [105]:
for x in open('./dev_enc.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r').split('\t')
    # x[0] will be the label (0 or 1), and x[1] will be the ciphertext sentence.
    x[0] = int(x[0]) 
    dev.append(x)
print (len(dev))
print (dev[:3])

2027
[[1, 'ów8jó Ú#j2ê8ú l#êj6c ükê#ê xêü jöê#tájó xt8öc 6j#ê lw 6ê82ê 77 tólw lkê üw#86 wx jöÚt2j8êóáê jó6 jöÚtyotlú <<<'], [0, 'ê2êó öo#ékú zc ê+éê#l áwötá ltötóy jó6 xjöê6 ákj#tcöj áj ózl #êcáoê lktc êxxw#l .'], [1, 'üt88 jcco#ê68ú #jóm jc wóê wx lkê á8ê2ê#êcl , öwcl 6êáêélt2ê8ú jöoctóy áwöê6têc wx lkê úêj# .']]


#### Different from 'train' and 'dev' that are both list of tuples, 'test' will be just a list.

In [106]:
for x in open('./test_enc_unlabeled.tsv', encoding='utf-8'):
    x = x.rstrip('\n\r')
    test.append(x)
print (len(test))
print (test[:3])

2028
['j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú .', 'ówlktóy cltámc , #êj88ú , ê+áêél j 8tóyê#tóy á#êêétóêcc wóê xêê8c x#wö Úêtóy 6#jyyê6 lk#woyk j cj6 , cw#6t6 oót2ê#cê wx yoóc , 6#oyc , j2j#táê jó6 6jöjyê6 6#êjöc .', 'öo#ékú jó6 üt8cwó jáloj88ú öjmê j é#êllú yww6 lêjö <<< Úol lkê é#wvêál co##woó6tóy lkêö tc 6tcl#êcctóy8ú #wlê .']


#### You can split every sentence into lists of words by white spaces.

In [107]:
train_split = [[x[0], x[1].split(' ')] for x in train]
dev_split = [[x[0], x[1].split(' ')] for x in dev]
test_split = [[x.split(' ')] for x in test]

### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

In [108]:
import time
import numpy as np
import pandas as pd
import torch
from torchtext.legacy import data
import torch.nn as nn
import spacy

# import gensim
# from gensim.models import Word2Vec

In [7]:
# train_split1 = [x[1].split(' ') for x in train] # text 
# train_split2 = [x[0] for x in train] # labels
# dev_split1 = [x[1].split(' ') for x in dev]
# dev_split2 = [x[0] for x in dev]

# train_df = pd.DataFrame(train_split, columns = ['label', 'text'])
# dev_df = pd.DataFrame(dev_split, columns = ['label', 'text'])
# test_df = pd.DataFrame(test_split, columns = ['text'])

# train_df.T.to_dict().values()
# dev_df.T.to_dict().values()
# test_df.T.to_dict().values()

In [109]:
seed = 42
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [110]:
TEXT = data.Field(tokenize = 'spacy', include_lengths = True) # to process the text data, to convert text to tensors. Spacy will be used to tokenize the field objects.
LABEL = data.LabelField(dtype = torch.float) # to process the label data
fields = [('label',LABEL) , ('text',TEXT) ]

In [111]:
# to facilitate loading the text data and labels as a dataset.
class DataFrame_DataSet(data.Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        output = []
        # print(df)
        if is_test:
          for sentence in df:
            output.append(data.Example.fromlist([sentence], fields))
        else:
          for i, sentence in df:
            text = sentence if not is_test else None
            label = i
            # print('text::' , text)
            # print('label::' , label)
            output.append(data.Example.fromlist([label, text], fields))
        super().__init__(output, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

In [112]:

# training dataset
train_enc_ds = DataFrame_DataSet(train , fields)
print(vars(train_enc_ds[1]))

# validation/dev dataset
dev_enc_ds = DataFrame_DataSet(dev, fields)
print(vars(dev_enc_ds[1]))

# test
test_enc_ds = DataFrame_DataSet(test, [('text', TEXT)], True)
print(vars(test_enc_ds[1]))


{'label': 0, 'text': ['6êcétlê', 'jolêot8', 'zc', 'éê#xw#öjóáê', ',', 'tl', 'zc', 'j', '#', 'jlkê', '#', '8tcl8êcc', 'jöÚ8ê', '6wüó', 'lkê', 'öt668ê', 'wx', 'lkê', '#', 'wj6', ',', 'ükê#ê', 'lkê', 'lkêöjltá', 't#wótêc', 'j#ê', 'lww', 'wÚ2twoc', 'jó6', 'lkê', 'cê+oj8', 'éw8tltác', 'lww', 'cöoy', '.']}
{'label': 0, 'text': ['ê2êó', 'öo#ékú', 'zc', 'ê+éê#l', 'áwötá', 'ltötóy', 'jó6', 'xjöê6', 'ákj#tcöj', 'áj', 'ózl', '#', 'êcáoê', 'lktc', 'êxxw#l', '.']}
{'text': ['ówlktóy', 'cltámc', ',', '#', 'êj88ú', ',', 'ê+áêél', 'j', '8tóyê#tóy', 'á#êêétóêcc', 'wóê', 'xêê8c', 'x#wö', 'Úêtóy', '6#jyyê6', 'lk#woyk', 'j', 'cj6', ',', 'cw#6t6', 'oót2ê#cê', 'wx', 'yoóc', ',', '6#oyc', ',', 'j2j#táê', 'jó6', '6jöjyê6', '6#êjöc', '.']}


In [None]:
# # word2vec embeddings
# W2V_SIZE = 200
# W2V_WINDOW = 7
# W2V_EPOCH = 50
# W2V_MIN_COUNT = 1

# # build_vocab for text
# text_vocab = gensim.models.Word2Vec(size=W2V_SIZE, window=W2V_WINDOW, min_count=W2V_MIN_COUNT)
# text_vocab.build_vocab(train_split1)
# words = text_vocab.wv.vocab.keys()
# vocab_size = len(words)
# print(words)
# print(text_vocab)

# # word_model contains the vector that is given as an input to the network i.e. the LSTM encoder
# # 1.1.1 Train Word Embeddings
# text_vocab.train(train_split1, total_examples=len(train_split1), epochs=W2V_EPOCH)
# text_vocab.save('word_vec_embeddings.txt')

# # label_vocab = gensim.models.Word2Vec(size=W2V_SIZE, window=W2V_WINDOW, min_count=W2V_MIN_COUNT) # Word2Vec(vocab=7653, size=100, alpha=0.025) , min_count matched
# # label_vocab.build_vocab(train_ds)
# # words = label_vocab.wv.vocab.keys()
# # print(words)


In [115]:
# initialize GloVe embeddings
max_vocab_size = 25000
# build the vocabulary on the training dataset
TEXT.build_vocab(train_enc_ds, 
                 max_size = max_vocab_size, 
                 vectors = 'glove.6B.200d',
                 unk_init = torch.Tensor.zero_)
LABEL.build_vocab(train_enc_ds)

print("TEXT vocabulary size: ", len(TEXT.vocab)) # unique tokens in text
print("LABEL vocabulary size: ", len(LABEL.vocab)) # unique tokens in label
print(TEXT.vocab.stoi) # word dictionary

TEXT vocabulary size:  20464
LABEL vocabulary size:  2
defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x7fe803edc950>>, {'<unk>': 0, '<pad>': 1, '#': 2, '.': 3, 'lkê': 4, ',': 5, 'j': 6, 'jó6': 7, 'wx': 8, 'lw': 9, 'tc': 10, 'tl': 11, 'tó': 12, 'lkjl': 13, 'jc': 14, 'ütlk': 15, 'zc': 16, 'Úol': 17, 'xt8ö': 18, 'lktc': 19, 'xw': 20, 'tlc': 21, 'jó': 22, '<': 23, 'öw2tê': 24, 'úwo': 25, 'Úê': 26, 'wó': 27, 'Úú': 28, 'ówl': 29, 'j#ê': 30, 'jÚwol': 31, 'öw#ê': 32, 'wóê': 33, '8tmê': 34, 'kjc': 35, 'jl': 36, '77': 37, 'lkjó': 38, 'x#wö': 39, '!': 40, 'j88': 41, 'ktc': 42, 'kj2ê': 43, 'tlzc': 44, 'cw': 45, 't': 46, 'w': 47, 'tx': 48, "'": 49, 'clw#ú': 50, 'lww': 51, 'kê': 52, 'ükw': 53, 'ükjl': 54, 'vocl': 55, 'tólw': 56, 'ów': 57, 'öwcl': 58, 'wol': 59, 'ê2êó': 60, 'ózl': 61, 'öoák': 62, 'oé': 63, 'yww6': 64, 'áwöê6ú': 65, 'üt88': 66, 'ájó': 67, 'ltöê': 68, '"': 69, 'cwöê': 70, 'lkê#ê': 71, 'ákj#jálê#c': 72, 'üjú': 73, '8tll8ê': 74, 'wó8ú': 75,

In [116]:
BATCH_SIZE = 128  # BATCH_SIZE = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# preparing batches for training the model
train_enc_itr, dev_enc_itr, test_enc_itr = data.BucketIterator.splits(
                                (train_enc_ds, dev_enc_ds, test_enc_ds), batch_size = BATCH_SIZE, device = device)

In [117]:
class LSTMModule(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, 
                           bidirectional=bidirectional, dropout=dropout)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        # self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted= False) #pack sequence
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output) # unpack sequence
        # output over padding tokens are zero tensors
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))  # concat final forward and backward hidden layers and apply dropout
        output = self.fc1(hidden)
        output = self.dropout(self.fc2(output))       
        return output

In [118]:
# hyperparameters
num_epochs = 12
learning_rate = 0.008
input_dim = len(TEXT.vocab)
embed_dim = 200
hidden_dim = 512
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.2
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

In [119]:
# initialize the model
model = LSTMModule(input_dim, 
            embed_dim, 
            hidden_dim, 
            output_dim, 
            n_layers, 
            bidirectional, 
            dropout)

print(model)

LSTMModule(
  (embedding): Embedding(20464, 200)
  (rnn): LSTM(200, 512, num_layers=2, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=1024, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [120]:
# load the GLoVe embeddings created
pretrained_embed = TEXT.vocab.vectors
print(pretrained_embed.shape)
model.embedding.weight.data.copy_(pretrained_embed)

torch.Size([20464, 200])


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.8220,  0.6198, -0.6535,  ...,  0.8002,  0.1282,  0.0968],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [121]:
# to initialize zeros padding
model.embedding.weight.data[pad_idx] = torch.zeros(embed_dim)
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.8220,  0.6198, -0.6535,  ...,  0.8002,  0.1282,  0.0968],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])


In [None]:
# Loading word2vec pre-trained vector embeddings created 
# # loading the pretrained vectors into the embedding matrix
# # pretrained_embeddings = model.wv.vectors
# # print(pretrained_embeddings.shape)
# # model.embedding.weight.data.copy_(pretrained_embeddings)


# # 1.1.2 Load the word2vec embeddings into the PyTorch model
# text_vocab_model = gensim.models.Word2Vec.load('word_vec_embeddings.txt')
# pretrained_embeddings = torch.FloatTensor(text_vocab_model.wv.vectors)
# print(pretrained_embeddings.shape)
# text_vocab_model.embedding.weight.data.copy_(pretrained_embeddings)

# #  to initialize padded to zeros
# # model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# print('here' , model.embedding.weight.data)

In [122]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)
model.to(device)

# loss and optimizer
criterion = nn.BCEWithLogitsLoss()
# criterion = nn.BCELoss() - not used to resolve 'RuntimeError: all elements of input should be between 0 and 1'
# criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(optimizer)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.008
    weight_decay: 0
)


In [123]:
def calc_accuracy(predictions, y): # binary classification
    #round to closest integer
    round_predictions = torch.round(torch.sigmoid(predictions)) # [0 or 1]
    # print('calc_accuracy:: ' , round_predictions)
    correct_value = (round_predictions == y).float() # convert to float
    accuracy = correct_value.sum() / len(correct_value)
    return accuracy

In [124]:
# training function 
def train(model, iterator):
    epoch_loss = 0
    epoch_accuracy = 0
    model.train()
    for line in iterator:
        # print('line:' , line)
        text, text_lengths = line.text
        # print('text:' , text)
        # print('text_lengths' , text_lengths.shape)
        # print('text shape :' , text.shape)
        # print('text_length 0:' , text_lengths[0].shape)
        # print('text 0 :' , text[0].shape)
        optimizer.zero_grad()

        preds = model(text, text_lengths)
        # print('Shape: ' , preds.shape)
        
        predictions = preds.squeeze(1)
        # print('Squeezed shape: ', predictions.shape)
        loss = criterion(predictions, line.label)
        accuracy = calc_accuracy(predictions, line.label)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_accuracy += accuracy.item()
    return epoch_loss / len(iterator), epoch_accuracy / len(iterator)

In [125]:
# validation function
def evaluation(model, iterator):
    epoch_accuracy = 0
    epoch_loss = 0
    model.eval()
    # deactivates auto-grad
    with torch.no_grad():
        for line in iterator:
            text, text_lengths = line.text
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, line.label)
            accuracy = calc_accuracy(predictions, line.label)
            
            epoch_loss += loss.item()
            epoch_accuracy += accuracy.item()        
    return epoch_loss / len(iterator) , epoch_accuracy / len(iterator)

In [126]:
from torchtext.legacy.data import iterator
t = time.time()
validation_accuracy = []
loss = []
accuracy = []

for epoch in range(num_epochs):
    training_loss, training_acc = train(model, train_enc_itr)
    validation_loss, validation_acc = evaluation(model, dev_enc_itr) # evaluate
    
    print('\n==========================================================================')
    print(f'\tEpoch: {epoch + 1}')
    print(f'\tTrain Loss: {training_loss:.3f} and Train Accuracy: {training_acc*100:.2f}%')
    print(f'\tValidation Loss: {validation_loss:.3f} and Validation Accuracy: {validation_acc*100:.2f}%')
    print('==========================================================================')
    loss.append(training_loss)
    accuracy.append(training_acc)
    validation_accuracy.append(validation_acc)
    
print(f'time:{time.time()-t:.3f}')


	Epoch: 1
	Train Loss: 0.834 and Train Accuracy: 50.08%
	Validation Loss: 0.692 and Validation Accuracy: 53.09%

	Epoch: 2
	Train Loss: 0.700 and Train Accuracy: 50.96%
	Validation Loss: 0.688 and Validation Accuracy: 53.13%

	Epoch: 3
	Train Loss: 0.654 and Train Accuracy: 60.26%
	Validation Loss: 0.584 and Validation Accuracy: 70.12%

	Epoch: 4
	Train Loss: 0.495 and Train Accuracy: 74.34%
	Validation Loss: 0.481 and Validation Accuracy: 80.29%

	Epoch: 5
	Train Loss: 0.346 and Train Accuracy: 82.65%
	Validation Loss: 0.385 and Validation Accuracy: 84.96%

	Epoch: 6
	Train Loss: 0.286 and Train Accuracy: 85.24%
	Validation Loss: 0.402 and Validation Accuracy: 85.59%

	Epoch: 7
	Train Loss: 0.251 and Train Accuracy: 86.61%
	Validation Loss: 0.417 and Validation Accuracy: 86.94%

	Epoch: 8
	Train Loss: 0.225 and Train Accuracy: 87.19%
	Validation Loss: 0.455 and Validation Accuracy: 87.09%

	Epoch: 9
	Train Loss: 0.218 and Train Accuracy: 87.71%
	Validation Loss: 0.469 and Validation 

In [127]:
def epoch_time(start, end):
  elapse = end - start
  elapse_min = int(elapse/60)
  elapse_sec = int(elapse - (elapse_min * 60))
  return elapse_min, elapse_sec

In [None]:
# n_epochs = 15
# best_validation_loss = float('inf')

# for epoch in range(n_epochs):
#   start_time = time.time()

#   training_loss, training_acc = train(model, train_enc_itr)
#   validation_loss, validation_acc = evaluation(model, dev_enc_itr)

#   end_time = time.time()

#   epoch_mins, epoch_secs = epoch_time(start_time, end_time)

#   if validation_loss < best_validation_loss:
#     best_validation_loss = validation_loss
#     torch.save(model.state_dict(), 'result.pt')

#   print('\n==========================================================================')
#   print(f'Epoch {epoch+1}, Time: {epoch_mins} min, {epoch_secs} secs')
#   print(f'\tTrain Loss: {training_loss:.3f} and Train Accuracy: {training_acc*100:.2f}%')
#   print(f'\tBest Validation Loss: {best_validation_loss:.3f} , Validation Loss: {validation_loss:.3f} and Validation Accuracy: {validation_acc*100:.2f}%')
#   print('\n==========================================================================')



In [102]:
# nlp = spacy.load('en_core_web_sm')

# def predict_results(text, model, device):
#     model.eval()
#     tokenized = [i.text for i in nlp.tokenizer(text)]
#     ids = [TEXT.vocab.stoi[t] for t in tokenized]
    
#     length = torch.LongTensor([len(ids)])
#     # tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
    
#     # prediction = model(tensor, length)
#     # print('prediction:' , prediction)

#     # try 1
#     # if (prediction.item() >= 0.5):
#     #   predicted_class = 1
#     # else:
#     #   predicted_class = 0
    
#     # predicted_class = torch.round(torch.sigmoid(prediction)) # 2
#     # predicted_class = prediction.argmax(-1).item() # 3
#     # predicted_class = torch.round(torch.softmax(prediction, -1)) # 4
#     # predicted_class = torch.round(torch.logit(prediction, -1)) #5
#     # predicted_class = torch.round(torch.softmax(prediction, 0)) # 6
    
#     # print('predicted class:' , predicted_class)


#     tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
#     prediction = model(tensor, length)
#     predicted_class = torch.round(torch.tanh(prediction)) # 7
#     print('predicted class' , predicted_class)
#     if (predicted_class.item() < 0):
#       predicted_class = 0
#     else:
#       predicted_class = 1

#     # print('prediction:' , prediction)
#     # predicted_class = torch.softmax(prediction, dim=-1)
#     # predicted_class = prediction.argmax(-1).item()
#     # predicted_probability = probability[predicted_class].item()    
#     return predicted_class

# text1 = "6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy ."
# predict_results(text1, model, device)

predicted class tensor([[1.]], device='cuda:0', grad_fn=<RoundBackward0>)


1

In [128]:
results = []

for batch in test_enc_itr:
  text, text_length = batch.text
  # print('text:' , text)
  # print('text shape:' , text.shape, end='\t')
  predictions = model(text, text_length).squeeze(1)
  # print(predictions.shape)
  # predictions = torch.round(torch.sigmoid(predictions)).type(torch.IntTensor).tolist()
  # print(torch.round(torch.sigmoid(predictions)).shape)
  results.extend(torch.round(torch.sigmoid(predictions)).type(torch.IntTensor).tolist())
# print(len(results))

In [None]:
# # Predict the model on test data set
# nlp = spacy.load('en_core_web_sm')

# def predict_results(text, model, device):
#     tokenized = [i.text for i in nlp.tokenizer(text)]
#     print(tokenized)
#     # print(tokenized)
#     id = [TEXT.vocab.stoi[t] for t in tokenized]
#     text_lengths = torch.LongTensor([len(id)]).to(device)
  
#     tensor = torch.LongTensor(id).unsqueeze(0).to(device)
#     print('tensor :: ', tensor)
#     print('tensor shape' , tensor.shape)
#     print('text_length :: ', text_lengths)

#     prediction = model(tensor, text_lengths)
#     print('prediction:: ' , prediction.item())
#     # probability = torch.softmax(prediction, dim=1)
#     # print('probability::' , probability)
#     predicted_class = torch.round(torch.sigmoid(prediction))
#     # if (prediction.item() >= 0.5):
#     #   predicted_class = 1
#     # else:
#     #   predicted_class = 0
#     # predicted_class = prediction.argmax(dim=-1).item()
#     print('predicted class:' , predicted_class)
#     # predicted_probability = probability[predicted_class].item()
#     return predicted_class

# # test for these sentences
# text = "j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú ."
# text1 = "6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy ."
# text2 = "vocl ükêó úwo lktóm lkjl ê2ê#ú éwcctÚ8ê jóy8ê kjc Úêêó ê+kjoclê6 Úú 6wáoöêólj#tjóc , jówlkê# óêü xt8ö êöê#yêc ütlk úêl jówlkê# #êöj#mjÚ8ê úêl ckwámtóy8ú 8tll8ê7mówüó éê#céêált2ê ."
# predict_results(text2, model, device)

In [None]:
# nlp = spacy.load('en_core_web_sm')

# def predict_sentiment(text, model, device):
#     tokenized = [i.text for i in nlp.tokenizer(text)]
#     # tokens = tokenizer(text)
#     ids = [TEXT.vocab.stoi[t] for t in tokenized]
#     length = torch.LongTensor([len(ids)])
#     tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
#     # tensor = torch.LongTensor(ids).to(device)
#     prediction = model(tensor, length).squeeze(dim=0)
#     probability = torch.softmax(prediction, dim=-1)
#     predicted_class = prediction.argmax(dim=-1).item()
#     predicted_probability = probability[predicted_class].item()
#     return predicted_class, predicted_probability

# text = "j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú ."
# text1 = "6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy ."
# text2 = "vocl ükêó úwo lktóm lkjl ê2ê#ú éwcctÚ8ê jóy8ê kjc Úêêó ê+kjoclê6 Úú 6wáoöêólj#tjóc , jówlkê# óêü xt8ö êöê#yêc ütlk úêl jówlkê# #êöj#mjÚ8ê úêl ckwámtóy8ú 8tll8ê7mówüó éê#céêált2ê ."
# predict_sentiment(text2, model, device)

In [25]:
# Eventually, results need to be a list of 2028 0 or 1's
# results = []

In [96]:
# results = []
# input_file = open('./test_enc_unlabeled.tsv')
# lines = input_file.readlines()
# results = []
# for i in lines:
#     p = predict_results(i, model, device)
#     results.append(p)
# print(results)

[1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [129]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 2028)

In [86]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]


In [130]:
# write your prediction results to 'upload_predictions.txt' and upload that later
print(results)
print('Count of 0\'s :' , results.count(0))
print('Count of 1\'s :' , results.count(1))

with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')

[0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 