# Language Identification based on deep neural networks and ngrams
This approach of language identification follows the paper: Language Identification a Neural Network Approach, https://core.ac.uk/download/pdf/62918899.pdf

Feature extraction is partly inspired by:
https://github.com/conorosully/medium-articles/blob/master/src/language_classification.ipynb

## Dataset
The data can be downloaded from: https://downloads.tatoeba.org/exports/

In [3]:
# imports
import pandas as pd

In [None]:
# define constants
# TODO: Justify assumptions
MIN_LEN = 20
MAX_LEN = 200

LANG = ['deu', 'eng', 'fra']

DATA_SIZE = 5000
TEST_SIZE = 0.2

In [7]:
data = pd.read_csv('sentences.csv',
                  sep='\t',
                  encoding='utf8',
                  index_col=0,
                  names=['lang', 'text'])

In [8]:
# Filter text by length
filter_len = [True if MIN_LEN <= len(t) <= MAX_LEN else False for t in data['text']]
data = data[filter_len]

# Filter text by language
filter_lang = [True if l in LANG else False for l in data['lang']]
data = data[filter_lang]

In [14]:
# Shuffle and crop data
data_sample = data.sample(n=DATA_SIZE)

# Split data into test set and training set
offset = int(TEST_SIZE * DATA_SIZE)
data_test = data_sample[:offset]
data_train = data_sample[offset:]

## Preprocessing

In [11]:
# imports
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# get ngrams for a specific language
def get_ngrams(corpus, n, max_features):
    vectorizer = CountVectorizer(analyzer='char',
                                ngram_range=(n, n),
                                max_features=max_features)
    
    X = vectorizer.fit_transform(corpus)
    
    feature_names = vectorizer.get_feature_names()
    return X, feature_names

In [18]:
features = {}
feature_names = get_ngrams(data_train['text'], 3, 20)
feature_names

[' de',
 ' ha',
 ' th',
 ' to',
 'as ',
 'at ',
 'e t',
 'en ',
 'er ',
 'hat',
 'he ',
 'ich',
 'ing',
 'is ',
 'om ',
 're ',
 'tha',
 'the',
 'to ',
 'tom']

In [29]:
# get most frequent ngrams for every language
def get_feature_names(data, n, max_features):
    
    features = set()
        
    # get features for every language
    for l in LANG:
        corpus = data[data.lang==l]['text']
        ngrams = get_ngrams(corpus, 3, 20)
        features.update(ngrams)
    
    return features

In [28]:
feat_train = get_feature_names(data_train, 3, 20)
len(feat_train)

58

In [33]:
# get normalized frequency matrix for specified ngrams
def get_feature_matrix(data, n, ngrams):
    vocab = {}
    for i, fn in enumerate(ngrams):
        vocab[fn]=i
    
    vectorizer = CountVectorizer(analyzer='char',
                                ngram_range=(n, n),
                                vocabulary=vocab)
    
    X = vectorizer.transform(data['text'])
    feature_names = vectorizer.get_feature_names()
    
    feature_matrix = pd.DataFrame(data=X.toarray(), columns=feature_names)
    
    # normalize matrix
    count_min = feature_matrix.min()
    count_max = feature_matrix.max()
    
    feature_matrix = (feature_matrix - count_min) / (count_max - count_min)
    
    # add target variable
    feature_matrix['lang'] = list(data['lang'])
    return feature_matrix

In [34]:
ngrams = get_feature_names(data_train, 3, 20)
feature_train = get_feature_matrix(data_train, 3, ngrams)



Unnamed: 0,er,ie,che,ing,le,en.,pas,cht,ine,tha,...,t t,us,hat,do,ch,in,e p,de,pa,lang
0,0.000,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,...,0.000000,0.000000,0.000000,0.0,0.0,0.25,0.0,0.0,0.0,eng
1,0.000,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,0.25,...,0.666667,0.000000,0.333333,0.0,0.0,0.00,0.0,0.0,0.0,eng
2,0.000,0.25,0.25,0.0,0.0,0.0,0.0,0.0,0.333333,0.00,...,0.000000,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,deu
3,0.125,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,...,0.000000,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,eng
4,0.000,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,...,0.000000,0.333333,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,eng
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.000,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,...,0.000000,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,eng
3996,0.000,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,...,0.333333,0.000000,0.333333,0.0,0.0,0.00,0.0,0.0,0.0,eng
3997,0.000,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,0.25,...,0.000000,0.000000,0.333333,0.0,0.0,0.00,0.0,0.0,0.0,eng
3998,0.125,0.00,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,0.00,...,0.333333,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,0.0,eng
