In [13]:
import string
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
df = pd.read_csv('Language Detection.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [6]:
def remove_pun(text):
    for pun in string.punctuation:
        text= text.replace(pun,"")
    text = text.lower()
    return(text)

In [19]:
 df['Text'] = df['Text'].apply(remove_pun)

In [20]:
df.head()

Unnamed: 0,Text,Language
0,nature in the broadest sense is the natural p...,English
1,nature can refer to the phenomena of the physi...,English
2,the study of nature is a large if not the only...,English
3,although humans are part of nature human activ...,English
4,1 the word nature is borrowed from the old fre...,English


In [5]:
from sklearn.model_selection import train_test_split

In [15]:
X= df.iloc[:,0]
Y= df.iloc[:,1]

In [16]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = .2)

In [17]:
X_train

3430    Il l’a notamment utilisée comme opérateur cent...
87               Water covers 71% of the Earth's surface.
4178        ça vous dérangerait de me prêter de l'argent?
9710    Sie können sagen, Sie sollten eine Extrameile ...
8028            ciddiye alma, o kadar üzülme, kalbe alma.
                              ...                        
798     [19]:25 Machine learning (ML), reorganized as ...
6214                   Последним на данный момент[когда?]
1124    don't worry no worries, which means don't worr...
3189             você pode esperar um momento, por favor?
6489    Ты можешь прикрыть меня, как ты можешь работат...
Name: Text, Length: 8269, dtype: object

In [18]:
from sklearn import feature_extraction

In [19]:
vec = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2),analyzer='char')

In [34]:
from sklearn import pipeline
import sklearn
from sklearn import linear_model

In [35]:
model_pipe =pipeline.Pipeline([('vec',vec),('clf',linear_model.LogisticRegression())])

In [36]:
model_pipe.fit(X_train,Y_train)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('clf', LogisticRegression())])

In [37]:
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [39]:
predict_val = model_pipe.predict(X_test)

In [40]:
from sklearn import metrics

In [45]:
metrics.accuracy_score(Y_test,predict_val)*100

97.96905222437138

In [46]:
metrics.confusion_matrix(Y_test,predict_val)

array([[115,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  79,   0,   1,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   6,   0,   0],
       [  0,   1, 118,   3,   1,   1,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   0,   0],
       [  0,   1,   0, 265,   1,   0,   0,   0,   2,   0,   0,   0,   0,
          1,   0,   0,   0],
       [  0,   0,   0,   1, 197,   1,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   0,   0],
       [  0,   0,   0,   1,   0,  88,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0,  77,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  18,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   1,   0,   0,   0,   0,   0, 132,   0,   0,   0,   0,
          3,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  68,   0,   0,   0,
         

In [48]:
model_pipe.predict(['My name is Aleena'])

array(['English'], dtype=object)

In [49]:
model_pipe.predict(['मेरा नाम अलीना है'])

array(['Hindi'], dtype=object)

In [50]:
import pickle

In [57]:
newfile = open('model.pckl','wb')
pickle.dump(model_pipe,newfile)
newfile.close()

In [55]:
import os

In [58]:
os.listdir()

['.android',
 '.cache',
 '.conda',
 '.condarc',
 '.config',
 '.continuum',
 '.idlerc',
 '.ipynb_checkpoints',
 '.ipython',
 '.jupyter',
 '.m2',
 '.matplotlib',
 '.node_repl_history',
 '.npmrc',
 '.VirtualBox',
 '.vscode',
 'anaconda3',
 'AppData',
 'Application Data',
 'Contacts',
 'Cookies',
 'Documents',
 'Downloads',
 'exp2.ipynb',
 'exp3.ipynb',
 'exp4.ipynb',
 'exp6.ipynb',
 'Favorites',
 'IdeaProjects',
 'IntelGraphicsProfiles',
 'Language Detection.csv',
 'Language_Detection.ipynb',
 'Links',
 'Local Settings',
 'model.pckl',
 'Music',
 'My Documents',
 'NetHood',
 'NTUSER.DAT',
 'ntuser.dat.LOG1',
 'ntuser.dat.LOG2',
 'NTUSER.DAT{06767dc1-59c6-11ed-95e4-8ab49598392b}.TM.blf',
 'NTUSER.DAT{06767dc1-59c6-11ed-95e4-8ab49598392b}.TMContainer00000000000000000001.regtrans-ms',
 'NTUSER.DAT{06767dc1-59c6-11ed-95e4-8ab49598392b}.TMContainer00000000000000000002.regtrans-ms',
 'ntuser.ini',
 'OneDrive',
 'Oracle',
 'PrintHood',
 'Recent',
 'Saved Games',
 'Searches',
 'SendTo',
 'source'