# Language detector

In [2]:
import numpy as np
import pandas as pd

In [4]:
data = pd.read_csv(r'C:/Users/Kishlay kumar/Downloads/sentences.csv', on_bad_lines='skip')

In [2]:
data = pd.read_csv(r'C:\Users\Kishlay kumar\Downloads\sentences.csv',
                            sep='\t', 
                            encoding='utf8', 
                            index_col=0,
                            names=['lang','text'])

In [4]:
data


Unnamed: 0,lang,text
1,cmn,我們試試看！
2,cmn,我该去睡觉了。
3,cmn,你在干什麼啊？
4,cmn,這是什麼啊？
5,cmn,今天是６月１８号，也是Muiriel的生日！
...,...,...
11229419,dan,Hvorfor er du så dum?
11229420,dan,Hvorfor er I så dumme?
11229421,epo,Kial vi estas tiel stultaj?
11229422,spa,En fin la gente se envidia por cosas muy tontas.


In [5]:
len_cond = [True if 20<=len(s)<=200 else False for s in data['text']]
data = data[len_cond]

In [6]:
data


Unnamed: 0,lang,text
5,cmn,今天是６月１８号，也是Muiriel的生日！
21,cmn,选择什么是“对”或“错”是一项艰难的任务，我们却必须要完成它。
67,cmn,我们看东西不是看其实质，而是以我们的主观意识看它们的。
71,cmn,生活就是當你忙著進行你的計劃時總有其他的事情發生。
77,deu,Lass uns etwas versuchen!
...,...,...
11229419,dan,Hvorfor er du så dum?
11229420,dan,Hvorfor er I så dumme?
11229421,epo,Kial vi estas tiel stultaj?
11229422,spa,En fin la gente se envidia por cosas muy tontas.


In [7]:
lang = ['deu', 'eng', 'fra', 'ita', 'por', 'spa']
data = data[data['lang'].isin(lang)]

In [8]:
data

Unnamed: 0,lang,text
77,deu,Lass uns etwas versuchen!
78,deu,Ich muss schlafen gehen.
81,deu,Heute ist der 18. Juni und das ist der Geburts...
82,deu,"Herzlichen Glückwunsch zum Geburtstag, Muiriel!"
83,deu,Muiriel ist jetzt 20.
...,...,...
11229415,spa,¿Me esperará mañana?
11229416,spa,¿Me va a esperar mañana?
11229418,spa,Nos mataste con tus historias.
11229422,spa,En fin la gente se envidia por cosas muy tontas.


In [9]:
data_trim = pd.DataFrame(columns=['lang','text'])

In [10]:
data_trim

Unnamed: 0,lang,text


In [11]:
for l in lang:
    lang_trim = data[data['lang'] ==l].sample(50000,random_state = 100)
    data_trim = data_trim.append(lang_trim)

  data_trim = data_trim.append(lang_trim)
  data_trim = data_trim.append(lang_trim)
  data_trim = data_trim.append(lang_trim)
  data_trim = data_trim.append(lang_trim)
  data_trim = data_trim.append(lang_trim)
  data_trim = data_trim.append(lang_trim)


In [12]:
data_trim

Unnamed: 0,lang,text
10012920,deu,"Sagt Tom, dass ich auf ihn zu Hause warte."
2761961,deu,"Eine eigene Meinung ist ein Luxus, den sich ni..."
2911708,deu,"Warten wir, bis wir dran sind!"
368528,deu,"Er war furchtbar beunruhigt, als er jene Gesch..."
2059859,deu,"Mein lieber Freund, ich bin zu dem geworden, w..."
...,...,...
5769332,spa,No estaremos aquí a partir de las dos y media.
1496876,spa,Tom vino a mi oficina esta mañana.
2413403,spa,"Entonces llegó la vecina, que se puso a gritar..."
5170543,spa,¿Estás seguro de que no te olvidaste de nada?


In [13]:
data_shuffle = data_trim.sample(frac = 1)

In [14]:
data_shuffle

Unnamed: 0,lang,text
2105924,por,Falou-se de literatura.
6810135,ita,Non sono sicura di aver tradotto correttamente...
1004398,por,O menino usa óculos.
8897575,ita,Non mi lascerà aiutarlo.
380229,fra,Nous sommes en bonne santé.
...,...,...
4031223,por,Isso é o que chamamos amor verdadeiro.
8564265,por,De que forma eu posso te ajudar hoje?
8539120,por,Eu não acredito no que o Tom me disse.
6172786,ita,È urgente che lo veda.


In [15]:
train = data_shuffle[0:210000]
valid = data_shuffle[210000:270000]
test = data_shuffle[270000:300000]

In [16]:
test

Unnamed: 0,lang,text
3162122,eng,I think Tom doesn't want our help.
6301522,ita,Puoi svegliarla per me?
11166463,fra,Je pense que tu es le seul à t'en soucier.
7698961,fra,Tom l'a fait tout seul.
5679149,eng,We've put the Christmas presents under the tree.
...,...,...
4031223,por,Isso é o que chamamos amor verdadeiro.
8564265,por,De que forma eu posso te ajudar hoje?
8539120,por,Eu não acredito no que o Tom me disse.
6172786,ita,È urgente che lo veda.


In [17]:
from sklearn.feature_extraction.text import CountVectorizer
def get_trigrams(corpus,n_feat=200):
    """
    Returns a list of the N most common character trigrams from a list of sentences
    params
    ------------
        corpus: list of strings
        n_feat: integer
    """
    
    #fit the n-gram model
    vectorizer = CountVectorizer(analyzer='char',
                            ngram_range=(3, 3)
                            ,max_features=n_feat)
    
    X = vectorizer.fit_transform(corpus)
    
    #Get model feature names
    feature_names = vectorizer.get_feature_names()
    
    return feature_names

In [18]:
#obtain trigrams from each language
features = {}
features_set = set()

for l in lang:
    
    #get corpus filtered by language
    corpus = train[train.lang==l]['text']
    
    #get 200 most frequent trigrams
    trigrams = get_trigrams(corpus)
#     print(trigrams)
    
    #add to dict and set
    features[l] = trigrams 
#     print(features)
    features_set.update(trigrams)
#     print(features_set)
    
#create vocabulary list using feature set
vocab = dict()
for i,f in enumerate(features_set):
    vocab[f]=i
print(vocab)



{'olt': 0, 'un ': 1, 'n v': 2, 'ndo': 3, 'esp': 4, 'hac': 5, 'ent': 6, 'a v': 7, 'oro': 8, ' th': 9, 'm a': 10, 'a d': 11, 'at.': 12, ' ha': 13, 'r d': 14, ', w': 15, ' nã': 16, 'ría': 17, 's n': 18, 'una': 19, 'car': 20, ' vi': 21, 'hey': 22, 'fai': 23, 'das': 24, 't d': 25, 'hat': 26, "'es": 27, 'um ': 28, 'em ': 29, 'ce ': 30, ' si': 31, 's m': 32, 'wer': 33, 'thi': 34, "n't": 35, 'str': 36, 'ne ': 37, 'can': 38, 'ouv': 39, 'ht ': 40, 'o n': 41, 'o a': 42, 'a e': 43, 'ung': 44, 'ade': 45, 'ly ': 46, 'plu': 47, 'del': 48, ' he': 49, 'mit': 50, 'a t': 51, 'ner': 52, 'of ': 53, 'und': 54, 'nno': 55, 'ues': 56, 'wie': 57, ' dé': 58, 'ano': 59, 'ine': 60, 'ère': 61, 'tom': 62, 'wit': 63, ' é ': 64, 'aci': 65, 'ink': 66, 'voi': 67, 'o p': 68, 'sso': 69, 'osa': 70, ', d': 71, 's p': 72, 'anc': 73, 'er.': 74, 'jou': 75, 'her': 76, 'ión': 77, 'e w': 78, ' sc': 79, 's e': 80, 'uie': 81, 'ace': 82, 'tie': 83, 'ige': 84, 'eit': 85, 'hin': 86, ' of': 87, ' as': 88, 'ary': 89, 's q': 90, 'a l': 9

In [19]:
#train count vectoriser using vocabulary
vectorizer = CountVectorizer(analyzer='char',
                             ngram_range=(3, 3),
                            vocabulary=vocab)

#create feature matrix for training set
corpus = train['text']   
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()

train_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
print(train_feat)
#Scale feature matrix 


        olt  un   n v  ndo  esp  hac  ent  a v  oro   th  ...  oul  are  tre  \
0         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
1         0    0    0    0    0    0    1    0    0    0  ...    0    0    0   
2         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
3         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
4         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
209995    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
209996    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
209997    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
209998    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
209999    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   

        ere  ato  iss   a   eme  et   o

In [20]:
#Scale feature matrix 
train_min = train_feat.min()
train_max = train_feat.max()
# train_feat = (train_feat - train_min)/(train_max-train_min)

#Add target variable 
train_feat['lang'] = list(train['lang'])
print(train_feat)

        olt  un   n v  ndo  esp  hac  ent  a v  oro   th  ...  are  tre  ere  \
0         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
1         0    0    0    0    0    0    1    0    0    0  ...    0    0    0   
2         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
3         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
4         0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
209995    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
209996    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
209997    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
209998    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   
209999    0    0    0    0    0    0    0    0    0    0  ...    0    0    0   

        ato  iss   a   eme  et   o e  l

In [21]:
#create feature matrix for validation set
corpus = valid['text']   
X = vectorizer.fit_transform(corpus)

valid_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
valid_feat = (valid_feat - train_min)/(train_max-train_min)
valid_feat['lang'] = list(valid['lang'])

#create feature matrix for test set
corpus = test['text']   
X = vectorizer.fit_transform(corpus)

test_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
test_feat = (test_feat - train_min)/(train_max-train_min)
test_feat['lang'] = list(test['lang'])

In [22]:

#create feature matrix for validation set
# corpus = valid['text']   
# X = vectorizer.fit_transform(corpus)

# https://towardsdatascience.com/deep-neural-network-language-identification-ae1c158f6a7d

# valid_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
# valid_feat = (valid_feat - train_min)/(train_max-train_min)
# valid_feat['lang'] = list(valid['lang'])

In [44]:
# pip install keras
# from collections.abc import Iterable
# pip install np_utils

In [31]:
from sklearn.preprocessing import LabelEncoder
# from keras.utils import np_utils
from keras.utils.np_utils import to_categorical

#Fit encoder
encoder = LabelEncoder()
encoder.fit(['deu', 'eng', 'fra', 'ita', 'por', 'spa'])

def encode(y):
    """
    Returns a list of one hot encodings
    Params
    ---------
        y: list of language labels
    """
    
    y_encoded = encoder.transform(y)
    y_dummy = np_utils.to_categorical(y_encoded)
    
    return y_dummy


# # from keras.utils import np_utils
# # import tensorflow as tf
# from keras.utils import to_categorical

In [42]:
import tensorflow as tf
from tf.keras.models import Sequential
from keras.layers import Dense

import tensorflow as tf
# model = tf.keras.Sequential()

#Get training data
x = train_feat.drop('lang',axis=1)
y = encode(train_feat['lang'])
print(y)

#Define model
model = Sequential()
model.add(Dense(500, input_dim=663, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(6, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#Train model
model.fit(x, y, epochs=5, batch_size=100)

ModuleNotFoundError: No module named 'tf'

In [33]:

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score,confusion_matrix

x_test = test_feat.drop('lang',axis=1)
y_test = test_feat['lang']

#Get predictions on test set
# predict_x=model.predict(X_test) 
# classes_x=np.argmax(predict_x,axis=1)
# labels = (model.predict(x_test) > 0.5).astype("int32")
# print(labels)
labels = np.argmax(model.predict(x_test), axis=-1)
print(labels)
# classes_x=np.argmax(labels,axis=1)
predictions = encoder.inverse_transform(labels)
print(predictions)

#Accuracy on test set
accuracy = accuracy_score(y_test,predictions)
print(accuracy)

#Create confusion matrix
lang = ['deu', 'eng', 'fra', 'ita', 'por', 'spa']
conf_matrix = confusion_matrix(y_test,predictions)
conf_matrix_df = pd.DataFrame(conf_matrix,columns=lang,index=lang)

#Plot confusion matrix heatmap
plt.figure(figsize=(10, 10), facecolor='w', edgecolor='k')
sns.set(font_scale=1.5)
sns.heatmap(conf_matrix_df,cmap='coolwarm',annot=True,fmt='.5g',cbar=False)
plt.xlabel('Predicted',fontsize=22)
plt.ylabel('Actual',fontsize=22)

NameError: name 'model' is not defined

In [None]:
pip install seaborn

In [None]:
import tensorflow as tf

In [None]:
for l in lang:
    
    #get corpus filtered by language
    corpus = train[train.lang==l]['text']
    print(corpus)
#     trigrams = get_trigrams(corpus)

In [None]:
corpus

In [None]:
a = [' a ', ' al', ' ca', ' co', ' cu', ' de', ' di', ' el', ' en', ' es', ' ha', ' in', ' la', ' le', ' lo', ' ma', ' me', ' mi', ' mu', ' no', ' pa', ' pe', ' po', ' pr', ' pu', ' qu', ' re', ' sa', ' se', ' si', ' so', ' su', ' ta', ' te', ' ti', ' to', ' tr', ' un', ' ve', ' vi', ' y ', 'a a', 'a c', 'a d', 'a e', 'a l', 'a m', 'a n', 'a p', 'a s', 'a t', 'a v', 'aba', 'ace', 'aci', 'ada', 'ado', 'al ', 'an ', 'and', 'ant', 'ar ', 'ara', 'as ', 'as.', 'cer', 'com', 'con', 'da ', 'dad', 'de ', 'des', 'do ', 'do.', 'dos', 'e a', 'e c', 'e d', 'e e', 'e h', 'e l', 'e m', 'e n', 'e p', 'e q', 'e s', 'e t', 'el ', 'ell', 'en ', 'end', 'ene', 'ent', 'er ', 'era', 'ere', 'ero', 'es ', 'es.', 'esp', 'est', 'go ', 'hab', 'hac', 'ida', 'ido', 'ien', 'ier', 'ión', 'la ', 'las', 'le ', 'lla', 'lo ', 'los', 'mar', 'me ', 'men', 'mi ', 'mo ', 'mos', 'más', 'n a', 'n c', 'n e', 'n l', 'n p', 'na ', 'ndo', 'no ', 'nos', 'nta', 'nte', 'nto', 'o a', 'o c', 'o d', 'o e', 'o l', 'o m', 'o p', 'o q', 'o s', 'o t', 'om ', 'on ', 'or ', 'os ', 'os.', 'par', 'per', 'por', 'pre', 'pue', 'que', 'qui', 'r a', 'r e', 'ra ', 'ran', 're ', 'rec', 'res', 'ro ', 'ría', 's a', 's c', 's d', 's e', 's l', 's m', 's p', 's s', 'se ', 'sta', 'ste', 'sto', 'stá', 'su ', 'ta ', 'tan', 'tar', 'te ', 'ten', 'tie', 'to ', 'tod', 'tom', 'tra', 'tá ', 'ue ', 'ued', 'uer', 'uie', 'un ', 'una', 'ver', 'ás ', 'él ', 'ía ']
print(len(a))

In [43]:
import tensorflow as tf;print(tf.__version__)

2.11.0


ModuleNotFoundError: No module named 'tf'