# Language Identification based on deep neural networks and ngrams
This approach of language identification follows the paper: Language Identification a Neural Network Approach, https://core.ac.uk/download/pdf/62918899.pdf

Feature extraction is partly inspired by:
https://github.com/conorosully/medium-articles/blob/master/src/language_classification.ipynb

## Dataset
The data can be downloaded from: https://downloads.tatoeba.org/exports/

In [1]:
# imports
import pandas as pd

In [2]:
# define constants
# TODO: Justify assumptions
MIN_LEN = 20
MAX_LEN = 200

LANG = ['deu', 'eng', 'fra']

DATA_SIZE = 5000
TEST_SIZE = 0.2

In [3]:
data = pd.read_csv('sentences.csv',
                  sep='\t',
                  encoding='utf8',
                  index_col=0,
                  names=['lang', 'text'])

In [4]:
# Filter text by length
filter_len = [True if MIN_LEN <= len(t) <= MAX_LEN else False for t in data['text']]
data = data[filter_len]

# Filter text by language
filter_lang = [True if l in LANG else False for l in data['lang']]
data = data[filter_lang]

In [5]:
# Shuffle and crop data
data_sample = data.sample(n=DATA_SIZE)

# Split data into test set and training set
offset = int(TEST_SIZE * DATA_SIZE)
data_test = data_sample[:offset]
data_train = data_sample[offset:]

## Feature extraction

In [6]:
# imports
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# get ngrams for a specific language
def get_ngrams(corpus, n, max_features):
    vectorizer = CountVectorizer(analyzer='char',
                                ngram_range=(n, n),
                                max_features=max_features)
    
    X = vectorizer.fit_transform(corpus)
    
    feature_names = vectorizer.get_feature_names()
    return X, feature_names

In [53]:
features = {}
feature_names = get_ngrams(data_train['text'], 4, 20)
feature_names

(<4000x20 sparse matrix of type '<class 'numpy.int64'>'
 	with 7765 stored elements in Compressed Sparse Row format>,
 [' is ',
  ' mar',
  ' tha',
  ' the',
  ' to ',
  ' tom',
  ' was',
  ' you',
  'and ',
  'ary ',
  'hat ',
  'ich ',
  'ing ',
  'mary',
  "n't ",
  'that',
  'the ',
  'tom ',
  'was ',
  'you '])

In [67]:
# get most frequent ngrams for every language
def get_feature_names(data, n, max_features):
    
    features = set()
        
    # get features for every language
    for l in LANG:
        corpus = data[data.lang==l]['text']
        _, ngrams = get_ngrams(corpus, n, 20)
        features.update(ngrams)
    
    return features

In [68]:
# get normalized frequency matrix for specified ngrams
def get_feature_matrix(data, n, ngrams):
    vocab = {}
    for i, fn in enumerate(ngrams):
        vocab[fn]=i
    
    vectorizer = CountVectorizer(analyzer='char',
                                ngram_range=(n, n),
                                vocabulary=vocab)
    
    X = vectorizer.transform(data['text'])
    feature_names = vectorizer.get_feature_names()
    
    feature_matrix = pd.DataFrame(data=X.toarray(), columns=feature_names)
    
    # normalize matrix
    count_min = feature_matrix.min()
    count_max = feature_matrix.max()
    
    feature_matrix = (feature_matrix - count_min) / (count_max - count_min)
    
    # add target variable
    target_var = data['lang']
    return feature_matrix, target_var

In [96]:
# generate feature matrix for test and train data
ngrams = get_feature_names(data_train, 4, 50)
train_X, train_Y = get_feature_matrix(data_train, 4, ngrams)
test_X, test_Y = get_feature_matrix(data_test, 4, ngrams)
print(test_X.shape, test_Y.shape)

(1000, 59) (1000,)


# Preprocessing

In [97]:
# Find the index of a language
def langToIndex(lang):
    return int(LANG.index(lang))

In [98]:
def lineToTensor(line):
    tensor = torch.zeros(len(line))
    for li, lang in enumerate(line):
        tensor[li] = langToIndex(lang)
        tensor = tensor.long()
    return tensor

# Modelling

In [99]:
# imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [100]:
# number of ngrams corresponds to the number of cols in the feature matrix minus 1 for the target variable
DIM_INPUT = train_X.shape[1]
DIM_OUTPUT = len(LANG)

N_EPOCHS = 1000
L_RATE = 0.001

In [101]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(DIM_INPUT, DIM_INPUT)
        self.fc2 = nn.Linear(DIM_INPUT, DIM_INPUT)
        self.fc3 = nn.Linear(DIM_INPUT, DIM_OUTPUT)
        
    def forward(self, x):
        x = F.sigmoid(self.fc1(x))
        x = F.sigmoid(self.fc2(x))
        x = F.softmax(self.fc3(x))
        return x

In [102]:
def train(model, x, y, optimizer, criterion):
    #model.zero_grad()
    y_pred = model(x)
    loss = criterion(y_pred, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss, y_pred

In [103]:
net = Net()
opt = torch.optim.Adam(net.parameters(), lr=L_RATE)
crit = nn.CrossEntropyLoss()

tensor_x = torch.tensor(train_X.values.astype(np.float32))
tensor_y = lineToTensor(train_Y)

In [104]:
for e in range(N_EPOCHS):
    loss, output = train(net, tensor_x, tensor_y, opt, crit)
    if e % 10 == 9:
        print(e, loss.item())


  x = F.softmax(self.fc3(x))


9 1.0343570709228516
19 0.9805341958999634
29 0.9593043327331543
39 0.952098548412323
49 0.9493088722229004
59 0.9479894638061523
69 0.9472411870956421
79 0.946749746799469
89 0.9463869333267212
99 0.9461105465888977
109 0.9458827972412109
119 0.9456905126571655
129 0.9455293416976929
139 0.9453927874565125
149 0.9452614784240723
159 0.9451424479484558
169 0.9450288414955139
179 0.9449203014373779
189 0.9448147416114807
199 0.9447058439254761
209 0.9445931911468506
219 0.9444803595542908
229 0.9443602561950684
239 0.9442311525344849
249 0.9440850019454956
259 0.9439287185668945
269 0.9437574744224548
279 0.9435518980026245
289 0.9433176517486572
299 0.9430469274520874
309 0.942733645439148
319 0.9423544406890869
329 0.9418937563896179
339 0.9413370490074158
349 0.9406454563140869
359 0.9397801160812378
369 0.9386893510818481
379 0.9372863173484802
389 0.9354479312896729
399 0.9330137372016907
409 0.929719090461731
419 0.9251969456672668
429 0.9189499616622925
439 0.9103779792785645
449

In [106]:
correct = 0
total = 0
with torch.no_grad():
    for i in range(tensor_x.shape[0]):
        inp = tensor_x[i]
        ground_truth = tensor_y[i]
        outputs = net(inp)
        _, predicted = torch.max(outputs.data, 0)
        total += 1
        correct += (predicted == ground_truth).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

  x = F.softmax(self.fc3(x))


Accuracy of the network on the 10000 test images: 94 %


In [33]:
print(tensor_x[0].shape)

torch.Size([58])
