# Language Detection

In [57]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
sns.set()

import nltk
import nltk.corpus
import string
import re
import pickle
import os
import joblib
%config Completer.use_jedi = False

In [2]:
#loading the dataset
data = pd.read_csv('dataset.csv')
df = data.copy()
df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [3]:
df.describe()

Unnamed: 0,Text,language
count,22000,22000
unique,21859,22
top,haec commentatio automatice praeparata res ast...,Pushto
freq,48,1000


In [4]:
df.shape

(22000, 2)

In [5]:
df.language.value_counts()

Pushto        1000
Chinese       1000
Portugese     1000
Urdu          1000
Hindi         1000
Arabic        1000
Romanian      1000
Russian       1000
Indonesian    1000
Turkish       1000
Thai          1000
Dutch         1000
Korean        1000
Spanish       1000
French        1000
Swedish       1000
English       1000
Tamil         1000
Latin         1000
Estonian      1000
Persian       1000
Japanese      1000
Name: language, dtype: int64

In [6]:
df.isnull().sum()

Text        0
language    0
dtype: int64

In [7]:
df.iloc[1,0]

'sebes joseph pereira thomas  på eng the jesuits and the sino-russian treaty of nerchinsk  the diary of thomas pereira bibliotheca instituti historici s i --   rome libris '

In [8]:
'''
1. Tokenization
2. Stop words removal
3. Lower case conversion
4. Removing numeric digits
5. Removing Punctuations
6. Removing Character(for foriegn language)
7. Normalization
8. Stemming and Lemmatization
'''

def text_preprocessing(text):
    
    text_pre = []
    lang = []
    stp_words = nltk.corpus.stopwords.words('english')
    translate_table = dict((ord(char), None) for char in string.punctuation+string.digits)
    reg = re.compile(r'[0-9]')
    for i,val in text.iterrows():
        val['Text'] = re.sub(reg,'',val['Text'])
        val['Text'] = val['Text'].translate(translate_table)
        text_pre.append(val['Text'])
        lang.append(val['language'])
    return pd.DataFrame({'Text': text_pre, 'Language': lang})

df_pre = text_preprocessing(df)

In [10]:
X = df_pre['Text']
y = df_pre['Language']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
sss = StratifiedShuffleSplit(n_splits=5, random_state=0, test_size=0.2)

for train_index, test_index in sss.split(X, y):
    print('Train: ', train_index,'  Test: ', test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

Train:  [19001  8014 10020 ... 11116 18086 15323]   Test:  [ 5816 14740 17289 ...  3821 16220 16337]
Train:  [ 9367 16419  1072 ...  2319 21623  4872]   Test:  [ 5153 13411  5790 ... 18341 15807 17907]
Train:  [21242 20476 13646 ...  5662  7933  9124]   Test:  [20336 16385  1119 ... 10301 18813 14074]
Train:  [ 8964  2199  1324 ...  2401 21408 13382]   Test:  [12568  7823 10340 ... 21671  9831 16699]
Train:  [16673 11609 20491 ...  8953 16375  4646]   Test:  [20159 16474  9027 ...  6630 18618 19435]


In [37]:
pipe = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,2), analyzer='char')),
                 ('lsvc', LinearSVC())])

pipe.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('lsvc', LinearSVC())])

In [38]:
y_pred = pipe.predict(X_test)

In [39]:
print(accuracy_score(y_pred, y_test))

0.9854545454545455


In [40]:
print(confusion_matrix(y_pred, y_test))

[[200   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0 198   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0 198   0   1   0   0   0   0   0   1   0   1   0   0   0   0   0
    0   0   0   0]
 [  0   1   1 198   1   1   3   3   1   0   6   1   0   6   4   0   3   0
    2   3   2   1]
 [  0   0   0   1 198   0   0   0   0   0   2   0   0   0   0   1   0   0
    0   1   0   0]
 [  0   0   0   0   0 199   0   0   0   0   3   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   0   0   0   0 197   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   0   0   0   0   0 194   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   0   0   0   0   0   0 199   0   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 198   0   0   0   0   0   0   0   0
    0   0   0   0]
 [  0   1   0   0   0   0   0   2   0   1 188   0   1   0   1   0   0 

In [70]:
pipe_file = open('language_detect.pkl', 'wb')
pickle.dump(pipe, pipe_file)
pipe_file.close()

In [71]:
pickle.load(open('language_detect.pkl', 'rb'))

Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('lsvc', LinearSVC())])

In [67]:
joblib.load(open('language_detect.joblib', 'rb'))

EOFError: 