<a href="https://colab.research.google.com/github/iarroyof/ukp_app/blob/main/UKP_language_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
 
# Open Colab's upload widget and keep the returned mapping of
# file name -> uploaded content.
uploaded = files.upload()

# Report the name and length of every uploaded file.
for fn, content in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(content))
  print(message)

Saving Language Detection.csv to Language Detection.csv
User uploaded file "Language Detection.csv" with length 1911016 bytes


In [None]:
from collections import deque
import numpy as np
import pandas as pd
 
 
def tokenize(doc, ngram_range=(1, 3)):
    """Split ``doc`` into overlapping character n-grams.

    Parameters
    ----------
    doc : str
        Text to tokenize. It is scanned once per n-gram size, one
        character at a time.
    ngram_range : tuple of int, default (1, 3)
        Arguments forwarded to ``range``; the upper bound is therefore
        EXCLUSIVE, so the default produces 1-grams and 2-grams only.

    Returns
    -------
    list of str
        Every n-gram of the first size in order of appearance, followed by
        every n-gram of the next size, and so on. Duplicates are kept.
        Sizes longer than ``doc`` contribute nothing.
    """
    collected_ngrams = []
    for ng_size in range(*ngram_range):
        # A bounded deque acts as the sliding window: appending to a full
        # deque silently drops the oldest character.
        window = deque(maxlen=ng_size)
        for ch in doc:
            window.append(ch)
            # Emit only once the window is full; otherwise the pass for
            # size k would also emit the shorter prefixes of the document.
            if len(window) == ng_size:
                collected_ngrams.append(''.join(window))

    return collected_ngrams

In [7]:
# Load dataset
url = "https://raw.githubusercontent.com/iarroyof/ukp_app/main/Language%20Detection.csv"
SEED = 0            # fixed seed so the random sample is the same on every run
SAMPLE_FRAC = 0.01  # fraction of the rows kept for this experiment
TRAIN_FRAC = 0.7    # share of the sampled rows used for training


def _unique_ngrams(texts):
    """Map every document in ``texts`` to the list of its distinct n-grams."""
    # sorted() fixes the order of each list; plain set iteration order for
    # strings can differ from one interpreter session to the next.
    return texts.apply(lambda doc: sorted(set(tokenize(doc))))


def _flatten_pairs(ngram_lists, labels):
    """Expand (n-gram list, label) rows into two parallel flat lists.

    Each n-gram becomes one entry of the first list and its document's
    label is repeated at the same position of the second list.
    """
    flat_x, flat_y = [], []
    for ngrams, label in zip(ngram_lists, labels):
        flat_x.extend(ngrams)
        flat_y.extend([label] * len(ngrams))
    return flat_x, flat_y


dataset = pd.read_csv(url).sample(frac=SAMPLE_FRAC, random_state=SEED)
n_train = int(TRAIN_FRAC * len(dataset.index))
train_data = dataset.iloc[:n_train]
test_data = dataset.iloc[n_train:]

X_train_data = _unique_ngrams(train_data.Text)
Y_train_data = train_data.Language
X_train, Y_train = _flatten_pairs(X_train_data, Y_train_data)

X_test_data = _unique_ngrams(test_data.Text)
Y_test_data = test_data.Language
X_test, Y_test = _flatten_pairs(X_test_data, Y_test_data)

# Preparation for the contingency table.
# Each cell of that table is the sum, over the training pairs (x', y'), of the
# indicator function f(x, y) = 1 if x == x' and y == y', 0 otherwise.

# Sample spaces of both random variables (distinct n-grams and distinct
# languages) together with their marginal counts.
(omega_x, Tx) = np.unique(X_train, return_counts=True)
(omega_y, Ty) = np.unique(Y_train, return_counts=True)

print(len(X_train))
print(omega_y)

9308
['Arabic' 'Danish' 'Dutch' 'English' 'French' 'German' 'Greek' 'Hindi'
 'Italian' 'Kannada' 'Malayalam' 'Portugeese' 'Russian' 'Spanish'
 'Sweedish' 'Tamil' 'Turkish']


In [8]:
# Bayes training
# Contingency table: f_xy[(x, y)] is the number of training pairs equal to
# (x, y), i.e. the sum of the indicator (Kronecker delta) products.
# Every observed pair is counted in one pass over the training data instead
# of rescanning all of it for each cell of the table.
pair_counts = {}
for pair in zip(X_train, Y_train):
    pair_counts[pair] = pair_counts.get(pair, 0) + 1

# Materialise the full table, with an explicit 0 for pairs never observed.
f_xy = {(x, y): pair_counts.get((x, y), 0)
        for x in omega_x
        for y in omega_y}


# Posterior computations
# The normaliser (total count of n-gram x over all languages) depends on x
# only, so it is computed once per n-gram rather than once per (y, x) pair.
Zx = {x: sum(f_xy[(x, y_)] for y_ in omega_y) for x in omega_x}

# PYgX[(y, x)] is the relative frequency of language y among the training
# occurrences of n-gram x.
PYgX = {}
for y in omega_y:
    for x in omega_x:
        PYgX[(y, x)] = f_xy[(x, y)] / Zx[x]

In [9]:
def posterior(text):
    """Score ``text`` against every language in ``omega_y``.

    The text is tokenized into its distinct n-grams. For each n-gram that has
    entries in ``PYgX``, the per-language values ``PYgX[(y, x)]`` are
    multiplied element-wise into a running score vector that starts at 1.0
    for every language. N-grams without an entry in ``PYgX`` are skipped.

    Returns
    -------
    list of (language, score) tuples, one per element of ``omega_y`` and in
    the same order. The scores are raw products and are not renormalised;
    if no n-gram of ``text`` is known, every score is 1.0.
    """
    scores = [1.0] * len(omega_y)

    for ngram in set(tokenize(text)):
        try:
            per_language = [PYgX[(lang, ngram)] for lang in omega_y]
        except KeyError:
            # No table entry for this n-gram: it contributes nothing.
            continue
        scores = np.multiply(scores, per_language)

    return list(zip(omega_y, scores))

In [13]:
# Smoke test: score a short French sentence; the result holds one
# (language, score) pair per language in omega_y.
posterior("c'est une phrase")

[('Arabic', 0.0),
 ('Danish', 0.0),
 ('Dutch', 0.0),
 ('English', 0.0),
 ('French', 9.100752070995522e-17),
 ('German', 0.0),
 ('Greek', 0.0),
 ('Hindi', 0.0),
 ('Italian', 0.0),
 ('Kannada', 0.0),
 ('Malayalam', 0.0),
 ('Portugeese', 0.0),
 ('Russian', 0.0),
 ('Spanish', 0.0),
 ('Sweedish', 0.0),
 ('Tamil', 0.0),
 ('Turkish', 0.0)]