**Using a Bi-LSTM Model for Language Detection**

**Drive Link (data, prediction and models) :** https://drive.google.com/drive/folders/1UWe1KH3Hyppc1U52b13k_v7P1uRwt16e?usp=sharing

**Dataset :** http://www.statmt.org/europarl/

I cleaned the data, generated a csv file from each language corpus and then merged these csv files to create a single (multi-label) dataset, so that we can use it in the supervised training.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd drive/MyDrive/lang_detect

/content/drive/.shortcut-targets-by-id/1UWe1KH3Hyppc1U52b13k_v7P1uRwt16e/lang_detect


In [None]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader

import re
import json
torch.autograd.set_detect_anomaly(False)
torch.autograd.profiler.profile(False)
torch.autograd.profiler.emit_nvtx(False)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def word_tokenizer(data, senlen):
    word_tokenizer = Tokenizer(oov_token="<OOV>")
    word_tokenizer.fit_on_texts(data)
    word_index = word_tokenizer.word_index
    train_word_sequences = word_tokenizer.texts_to_sequences(data)
    padded_word_sequences_dis = pad_sequences(train_word_sequences, padding='post', maxlen=senlen, truncating="post")
    return word_index, padded_word_sequences_dis

def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list, tuple)):

        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""

    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl:
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)


def get_data(sentences, labels, label_index, batch_size):
    sentence = torch.tensor(sentences, dtype=torch.long)
    label = torch.tensor(labels, dtype=torch.long)
    train_ds = TensorDataset(sentence, label)
    train_dl = DataLoader(train_ds, batch_size, shuffle=True)
    return DeviceDataLoader(train_dl, get_default_device())

def get_default_device():

    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        # raise SystemError("CUDA NOT FOUND")
        return torch.device('cpu')
        
def read_txt(file_path):
    sentence_list = []
    label_list = [] 
    with open(file_path, 'r', encoding='utf8') as f:
      lines = f.readlines()
      for line in lines:
        sentence = ""
        label = ""
        for i in range(len(line)):
          if(line[i] == ","):
            label = line[0 : i].strip()
            sentence = line[i+1 : ].strip()
            # print(label , " ", sentence)
            break
        sentence_list.append(sentence)
        label_list.append(label)
        
      
    return sentence_list, label_list

In [None]:
def load_data(train_file_path, valid_file_path, test_file_path,batch_size=32):
  train_sentences, train_labels = read_txt(train_file_path)
  valid_sentences, valid_labels = read_txt(valid_file_path)
  test_sentences, test_labels = read_txt(test_file_path)

  train_len = len(train_sentences)
  valid_len = len(valid_sentences)
  test_len = len(test_sentences)

  device = get_default_device()

  sentences = train_sentences + valid_sentences + test_sentences
  labels = train_labels + valid_labels + test_labels

  max_sen_len = -1

  for i in range(len(sentences)):
    max_sen_len = max(max_sen_len , len(sentences[i].strip().split(" ")))
    sentences[i] = sentences[i].strip().split(" ")
    
  # print(max_sen_len)
  max_sen_len =min(max_sen_len,500)


  word_index , tokenized_word_data = word_tokenizer(sentences, senlen=max_sen_len)
  label_index, tokenized_label_data = word_tokenizer(labels , senlen=max_sen_len)

  train_data_dl = get_data(tokenized_word_data[:train_len], tokenized_label_data[:train_len], label_index, batch_size)
  valid_data_dl = get_data(tokenized_word_data[train_len:train_len+valid_len], tokenized_label_data[train_len:train_len+valid_len], label_index, batch_size)
  test_data_dl = get_data(tokenized_word_data[-test_len:], tokenized_label_data[-test_len:], label_index, batch_size)
  data = {'train_data_dl': train_data_dl, 'valid_data_dl': valid_data_dl, 'test_data_dl': test_data_dl, 'word_index': word_index,'label_index': label_index,'max_sen_len':max_sen_len}
  
  return data


if __name__ == '__main__':
    main_data = load_data("data/europarl.pp.train" , "data/europarl.pp.eval","data/europarl_normalized.pp.test")


In [None]:
#Base Model
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
# import torchtext
import torch
# from torchtext.vocab import Vocab
import io
import matplotlib.pyplot as plt
import torch.nn.functional as F

import torch.nn as nn

# import Data_preprocesser

def get_default_device():

    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        # raise SystemError("CUDA NOT FOUND")
        return torch.device('cpu')

def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list, tuple)):

        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)
       

class Lang_Identify(nn.Module):
    def __init__(self, vocab_size, max_sentence_len, input_size=300, word_embed_dim=300, label_size=5):
        super().__init__()

        self.word_embedding = nn.Embedding(vocab_size, word_embed_dim)
        
        self.bilstm = nn.LSTM(input_size, hidden_size=128, bidirectional=True)
        self.bilstm_2 = nn.LSTM(128 * 2, hidden_size=128, bidirectional=True)
        self.lstm = nn.LSTM(128 * 2, hidden_size=128)
        self.fc = nn.Linear(128, label_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input1):

        self.sentence = input1
        
        we = self.word_embedding(self.sentence)  # (,123,300)
        # print("Word Embedding : ", we.shape)
    
        lstm_1, (hidden, cell) = self.bilstm(we)
        # print("LSTM_1 : ",lstm_1.shape)

        lstm_2, (hidden, cell) = self.lstm(lstm_1)
        # print("LSTM_2 : ",lstm_2.shape)

        final_fc = self.fc(lstm_2)
        # print("Final_FC " ,final_fc.shape)
        output = self.softmax(final_fc)
        # print("Output : ",output.shape)

        return output

In [None]:
import numpy 
from collections import Counter
import pandas as pd

def classification_report_org(y_true, y_pred, labels):
	'''Similar to the one in sklearn.metrics,
	reports per classs recall, precision and F1 score'''
	# print(y_pred)
	# print(y_true)
  
	y_true = numpy.asarray(y_true).ravel()
	y_pred = numpy.asarray(y_pred).ravel()

	corrects = Counter(yt for yt, yp in zip(y_true, y_pred) if yt == yp)
	y_true_counts = Counter(y_true)
	y_pred_counts = Counter(y_pred)
	report = ((lab,  # label
			   corrects[i] / max(1, y_true_counts[i]),  # recall
			   corrects[i] / max(1, y_pred_counts[i]),  # precision
			   y_true_counts[i]  # support
			   ) for i, lab in enumerate(labels))
	report = [(l, r, p, 2 * r * p / max(1e-9, r + p), s) for l, r, p, s in report]

	print('{:<15}{:>10}{:>10}{:>10}{:>10}\n'.format('',
													'recall',
													'precision',
													'f1-score',
													'support'))
	formatter = '{:<15}{:>10.4f}{:>10.4f}{:>10.4f}{:>10.4f}'.format
	for r in report:
		print(formatter(*r))
	print('')
	report2 = list(zip(*[(r * s, p * s, f1 * s) for l, r, p, f1, s in report]))
	N = len(y_true)
	print(formatter('avg / total',
					sum(report2[0]) / N,
					sum(report2[1]) / N,
					sum(report2[2]) / N, N) + '\n')
	actual = Counter(y_true)
	del actual[-1]
	accuracy = sum(corrects.values()) / sum(actual.values())
	print('Accuracy:', accuracy)

In [None]:
# !pip install fastai --upgrade

In [None]:
import torch
import argparse
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
import json
# import torchtext
import torch
# from torchtext.vocab import Vocab
import io
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.nn as nn

from fastai.test_utils import *
from fastai.text.all import *


data = main_data
# Arguments and data
train_data = data['train_data_dl']
valid_data = data['valid_data_dl']
test_data = data['test_data_dl']

word_index = data['word_index']
label_index = data['label_index']
max_sen_len = data['max_sen_len']

word_embedding_size = 300

device = get_default_device()

loss = nn.CrossEntropyLoss()

print("#################### Starting Training Model ####################")


model = Lang_Identify(vocab_size=len( word_index)+1,  max_sentence_len=max_sen_len, word_embed_dim = 300, label_size=len(label_index)+1)

model = to_device(model, device)

opt = torch.optim.Adam

dls = DataLoaders(train_data, valid_data)
learner = Learner(dls, model, loss_func=CrossEntropyLossFlat(flatten=True),metrics=[accuracy], lr=0.005, opt_func=Adam)
# learner.fit(arg["epochs"], lr = arg["lr"] )


# lr_min,lr_steep = learner.lr_find()
# print(f"Minimum/10: {lr_min:.2e}, steepest point: {lr_steep:.2e}")

learner.fit_one_cycle(10, lr_max = 0.005)


print("----Ending Training-------- \n Starting Validating Model:")

result_predic = []
result_orig = []
for *xb, yb in test_data:

    preds = model(*xb)
    preds = torch.argmax(preds, dim = -1)
    result_predic.append(preds)
    result_orig.append(yb)

predic_f = torch.cat(result_predic).cpu()
orig_f = torch.cat(result_orig).cpu()

classification_report_org(orig_f, predic_f, label_index )
print("#################### Saving ####################")

torch.save(model.state_dict()," bi-lstm_model/")

print("######################### End #########################")



epoch,train_loss,valid_loss,accuracy,time
0,3.174304,3.174028,0.991781,35:04
1,3.174232,3.174012,0.993948,35:08
2,3.174252,3.173991,0.996563,35:07
3,3.174207,3.173995,0.995737,35:06
4,3.174287,3.17407,0.968439,35:21
5,3.174207,3.173987,0.995023,35:21
6,3.174199,3.17399,0.993714,35:26
7,3.174184,3.173987,0.994759,35:24
8,3.174258,3.173986,0.996451,35:14
9,3.174238,3.173986,0.997149,35:01


----Ending Training-------- 
 Starting Validating Model:
                   recall precision  f1-score   support

<OOV>              0.9980    1.0000    0.999010372344.0000
label              0.0000    0.0000    0.0000    0.0000
en                 0.9985    0.9735    0.985820828.0000
nl                 0.3570    0.0488    0.0858 1000.0000
da                 0.0490    0.0469    0.0479 1000.0000
sv                 0.0280    0.0504    0.0360 1000.0000
pt                 0.0000    0.0000    0.0000 1000.0000
es                 0.0460    0.0510    0.0484 1000.0000
it                 0.3300    0.0448    0.0788 1000.0000
fr                 0.0000    0.0000    0.0000 1000.0000
de                 0.0760    0.0560    0.0645 1000.0000
el                 0.0100    0.0391    0.0159 1000.0000
bg                 0.0958    0.0522    0.0676  992.0000
fi                 0.0000    0.0000    0.0000 1000.0000
cs                 0.0020    0.0153    0.0035 1000.0000
sl                 0.0000    0.0000    0.00

IsADirectoryError: ignored

**I have used a 2 layer LSTM model in which the 1st layer ia a Bi-directional LSTM and the 2nd layer is a normal LSTM layer.
Using the following model I obtain the follwing results on the validation set :**

**1)Precision : 99.8%**

**2)Recall    : 99.6%**

**3)F1_Score  : 99.7%**

**Due to resource constraints, I was able to train the model for 10 epochs only.
If we train the model for more number of epochs and tune the hyperparameters properly, we'll definitely obtain better results.**

**Moreover, due to resource constraints I kept the maximum sentence length, on which the model is trained, as 500 when the actual the maximum sentence length is 15062. Nevertheless, the model was able to learn the structural composition of the words and phrases as well the dependancies between the words and gave good results.**