<a href="https://colab.research.google.com/github/kabirwalia8300/HMT_vs_NMT/blob/main/RNN_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Classify if a sentence was translated by a machine or a human

In [1]:
# mounting
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# General imports
import json
import math
import os
from pathlib import Path
import random
import time
from tqdm.notebook import tqdm, trange
from typing import Dict, List, Set, Tuple

import numpy as np
import torch
import torchtext
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from tqdm.notebook import tqdm, trange
import pandas as pd
from torch.nn import functional as F

In [3]:
# Lambda to switch to GPU if available
get_device = lambda : "cuda:0" if torch.cuda.is_available() else "cpu"

Parse in data


In [4]:
from collections import defaultdict
# training 
f=open('gdrive/MyDrive/HMT_vs_NMT/train.txt', 'r')
train_txt = []
temp = []
for line in f:
    if line=='\n':
        train_txt.append(temp)
        temp = []
        continue
    temp.append(line)

# training dataframe
train_df = pd.DataFrame(train_txt, columns=['orig', 'ref', 'candidate', 'score', 'label'])

# testing 
t=open('gdrive/MyDrive/HMT_vs_NMT/test.txt', 'r')
test_txt = []
temp = []
for line in t:
    if line=='\n':
        test_txt.append(temp)
        temp = []
        continue
    temp.append(line)

# testing dataframe
test_df = pd.DataFrame(test_txt, columns=['orig', 'ref', 'candidate', 'score', 'label'])
# training
train_df['orig']=train_df.orig.apply(lambda x: x.strip('\n'))
train_df['candidate']=train_df.candidate.apply(lambda x: x.strip('\n'))
train_df['label']=train_df.label.apply(lambda x: x.strip('\n'))
train_df['score']=train_df.score.apply(lambda x: x.strip('\n'))

# testing
test_df['orig']=test_df.orig.apply(lambda x: x.strip('\n'))
test_df['candidate']=test_df.candidate.apply(lambda x: x.strip('\n'))
test_df['label']=test_df.label.apply(lambda x: x.strip('\n'))
test_df['score']=test_df.score.apply(lambda x: x.strip('\n'))
msk = np.random.rand(len(train_df)) < 0.8
train = train_df[msk]
train = train.reset_index(drop=True)
val = train_df[~msk]
val = val.reset_index(drop=True)
# Add Score Here
train = train[['orig', 'candidate', 'score', 'label']]
val = val[['orig', 'candidate', 'score', 'label']]
train['label_t'] = train.label.apply(lambda x: 0 if x=='H' else 1)
val['label_t'] = val.label.apply(lambda x: 0 if x=='H' else 1)
del train['label']
del val['label']
# Add Score Here
test = test_df[['orig', 'candidate', 'score', 'label']]
test['label_t'] = test.label.apply(lambda x: 0 if x=='H' else 1)
train=train.rename(columns={'orig':'text_a', 'candidate': 'text_b', 'label_t':'labels'})
val =val.rename(columns={'orig':'text_a', 'candidate': 'text_b', 'label_t':'labels'})
test =test.rename(columns={'orig':'text_a', 'candidate': 'text_b', 'label_t':'labels'})
del test['label']
test.head()

Unnamed: 0,text_a,text_b,score,labels
0,"发言人 说 , 世人 共 知 , 中国 的 发展 不仅 符合 13 亿 中国人民 的 根本利...","the spokesman said , the world , a total of 13...",0.51,1
1,"在 国际 政治 舞台上 , 中国 是 一个 负责任 的 国家 , 对 维护 世界和平 与 稳...","in the international political arena , china i...",0.5996,0
2,"在 核 不 扩散 问题 上 , 中国 始终 严格 履行 有关 国际公约 , 坚决 反对 任何...",on the issue of non-proliferation of nuclear w...,0.6667,0
3,"发言人 指出 , 令人 遗憾 的 是 , 海德 先生 无视 客观 事实 , 在 中美 关系 ...",the spokesman stated that it was regretful tha...,0.5326,0
4,中国 最 大规模 的 海水 淡化 工程 落户 舟山,china's largest seawater desalination project ...,0.6014,0


In [5]:
max_sequence_length= train['text_b'].map(lambda x: len(str(x))).max()
def all_zeros(x):
  return torch.all(x.eq(torch.zeros(x.shape)))

In [6]:
class TranslationDatasetRNN(Dataset):
    def __init__(self, data):
        self.X = [X for X, _, _ in data]
        self.y = torch.LongTensor([y for _, _, y in data])
        self.data=data
        self.l = [l for _, l, _ in data]
        self.len = len(data)
    
    def __len__(self):
        return self.len
    
    def __getitem__(self, index):
        x = torch.stack(self.X[index])
        l= torch.tensor(int(self.l[index]))
        y= self.y[index]
        return x,l,y

def get_data_loaders_rnn(train, val, batch_size=16):
    # First we create the dataset given our train and validation lists
    dataset = TranslationDatasetRNN(train + val)

    # Then, we create a list of indices for all samples in the dataset
    train_indices = [i for i in range(len(train))]
    val_indices = [i for i in range(len(train), len(train) + len(val))]

    # Now we define samplers and loaders for train and val
    train_sampler = SubsetRandomSampler(train_indices)
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    
    val_sampler = SubsetRandomSampler(val_indices)
    val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

    return train_loader, val_loader

In [7]:
def rnn_preprocessor(data, max_seq_len, embed_model, test=False):
    vectorized_data = []
    # In test there are no labels
    if test:
      for i in range(len(data)):
        row=data.iloc[i]
        vectors=[]
        for word in row['text_b']:
          vector = embed_model[word] # Get the word embedding
          if not all_zeros(vector):  
            vectors.append(vector)
        # score = torch.tensor([float(row['score'])]*300)
        # vectors.append(score)
        # Pad with all 0 embeddings
        while (len(vectors) < max_seq_len+1):
          vectors.append(embed_model[""])

        # Add embedings and original doc length
        vectorized_data.append((vectors,len(row['text_b'])+1, row['labels']))

    else:
      for i in range(len(data)):
        row=data.iloc[i]
        vectors=[]
        for word in row['text_b']:
          vector = embed_model[word] # Will be zeros if unknown

          # Only add embedding if it is a known word
          if not all_zeros(vector):
            vectors.append(vector)
        # score = torch.tensor([float(row['score'])]*300)
        # vectors.append(score)
        # Pad with all 0 embeddings
        while (len(vectors)<max_seq_len+1):
          vectors.append(embed_model[""])
        # Add embeddings, original doc length, and true label
        vectorized_data.append((vectors,len(row['text_b'])+1, row['labels']))

    return vectorized_data

In [8]:
class RNN(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim, n_layers=1, non_linearity="relu", lstm=False, bi_dir=False):
    super(RNN, self).__init__()
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim
    self.lstm = lstm
    self.rnn = nn.RNN(input_dim, hidden_dim, n_layers, nonlinearity=non_linearity, batch_first=True, bidirectional=bi_dir)
    if self.lstm:
      self.rnn = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True, bidirectional=bi_dir)
     
    self.dropout = nn.Dropout(p=0.3)
    self.linear = nn.Linear(hidden_dim, output_dim)
    self.softmax = nn.LogSoftmax(dim=1)
    self.loss = nn.NLLLoss()

  def compute_Loss(self, predicted_vector, gold_label):
    return self.loss(predicted_vector, gold_label)

  def forward(self, inputs, lengths):

    # Pack inputs, which have been padded with 0s so they are all
    # the same length
    inputs = pack_padded_sequence(inputs, lengths, batch_first=True, enforce_sorted=False)

    output, hidden = self.rnn(inputs)
    hidden = self.dropout(hidden)
    # LSTM output is (hn, cn)
    if self.lstm:
      hidden = hidden[0]

    # Undo packing operation
    output,_ = pad_packed_sequence(output, batch_first=True)

    # Use the hidden layer of the last word in sequence as output to linear
    out = self.linear(hidden[-1])
    
    predicted_vector = self.softmax(out)
    return predicted_vector

  def load_model(self, save_path):
    self.load_state_dict(torch.load(save_path))

  def save_model(self, save_path):
    torch.save(self.state_dict(), save_path)

In [31]:
def train_epoch_rnn(model, train_loader, optimizer):
	model.train()
	total = 0
	loss = 0
	total_loss = 0
	correct = 0
	for (input_batch, input_len, expected_out) in tqdm(train_loader, leave=False, desc="Training Batches"):
		output = model(input_batch.to(get_device()), input_len)
		total += output.size()[0]
		_, predicted = torch.max(output, 1)
		correct += (expected_out == predicted.to("cpu")).cpu().numpy().sum()
		loss = model.compute_Loss(output, expected_out.to(get_device()))
		total_loss += loss
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()


def evaluation_rnn(model, val_loader, verbose=False):
	model.eval()
	loss = 0
	correct = 0
	total = 0
	for (input_batch, input_len, expected_out) in tqdm(val_loader, leave=False, desc="Validation Batches"):
		output = model(input_batch.to(get_device()), input_len)
		total += output.size()[0]
		_, predicted = torch.max(output, 1)
		correct += (expected_out.to("cpu") == predicted.to("cpu")).cpu().numpy().sum()
		loss += model.compute_Loss(output, expected_out.to(get_device()))
	loss /= len(val_loader)
	# Print validation metrics
	if verbose:
		print(f"\nEval ave loss: {loss}")
		print(f"Eval accuracy: {correct / total}")
 
	return loss, correct / total

def train_and_evaluate_rnn(number_of_epochs, model, train_loader, val_loader, learning_rate=0.001, momentum=0.9, verbose=False):
	optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

	losses = []
	accuracies = []

	for epoch in trange(number_of_epochs, desc="Epochs"):
		train_epoch_rnn(model, train_loader, optimizer)
		loss, correct =  evaluation_rnn(model, val_loader, verbose)
		losses.append(loss)
		accuracies.append(correct)

	return losses[-1], accuracies[-1]

In [None]:
!pip install torch



In [10]:
# read in word embeddings
import pickle
with open('gdrive/MyDrive/P4/embeddings_300d_840B.pickle', 'rb') as f:
     embed_model = pickle.load(f)

In [None]:
embed_model[0].shape

torch.Size([300])

In [11]:
train_vectorized_rnn = rnn_preprocessor(train, max_sequence_length, embed_model)
val_vectorized_rnn = rnn_preprocessor(val, max_sequence_length, embed_model)
test_vectorized_rnn = rnn_preprocessor(test, max_sequence_length, embed_model, test=True)

In [17]:
h=32
input_dim = len(train_vectorized_rnn[0][0][0])
n_layers = 2
output_dim = 2
n_epochs = 10
non_linearity = "tanh"
batch_size = 32

rnn_train_loader, rnn_val_loader =  get_data_loaders_rnn(train_vectorized_rnn, val_vectorized_rnn, batch_size=batch_size)

model = RNN(input_dim, h, output_dim, n_layers, non_linearity=non_linearity, bi_dir=True).to(get_device())
train_and_evaluate_rnn(n_epochs, model, rnn_train_loader, rnn_val_loader, learning_rate=0.0001, verbose=True)

HBox(children=(FloatProgress(value=0.0, description='Epochs', max=10.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=15.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Validation Batches', max=4.0, style=ProgressStyle(descrip…


Eval ave loss: 0.6930001974105835
Eval accuracy: 0.5284552845528455


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=15.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Validation Batches', max=4.0, style=ProgressStyle(descrip…


Eval ave loss: 0.6910090446472168
Eval accuracy: 0.5121951219512195


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=15.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Validation Batches', max=4.0, style=ProgressStyle(descrip…


Eval ave loss: 0.689914345741272
Eval accuracy: 0.5691056910569106


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=15.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Validation Batches', max=4.0, style=ProgressStyle(descrip…


Eval ave loss: 0.6867190599441528
Eval accuracy: 0.5528455284552846


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=15.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Validation Batches', max=4.0, style=ProgressStyle(descrip…


Eval ave loss: 0.6851322054862976
Eval accuracy: 0.5528455284552846


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=15.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Validation Batches', max=4.0, style=ProgressStyle(descrip…


Eval ave loss: 0.6831752061843872
Eval accuracy: 0.5447154471544715


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=15.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Validation Batches', max=4.0, style=ProgressStyle(descrip…


Eval ave loss: 0.6811241507530212
Eval accuracy: 0.5203252032520326


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=15.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Validation Batches', max=4.0, style=ProgressStyle(descrip…


Eval ave loss: 0.6794731020927429
Eval accuracy: 0.5121951219512195


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=15.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Validation Batches', max=4.0, style=ProgressStyle(descrip…


Eval ave loss: 0.6810262799263
Eval accuracy: 0.5447154471544715


HBox(children=(FloatProgress(value=0.0, description='Training Batches', max=15.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Validation Batches', max=4.0, style=ProgressStyle(descrip…


Eval ave loss: 0.6807591915130615
Eval accuracy: 0.5528455284552846



(tensor(0.6808, device='cuda:0', grad_fn=<DivBackward0>), 0.5528455284552846)

In [51]:
model.save_model('./rnn_hmt.pt')