In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re

In [2]:
df = pd.read_csv("Language Detection.csv")

In [3]:
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
df['Language'].unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [5]:
def remove_pun(text):
    for i in string.punctuation:
        text = text.replace(i, "")
    text = text.lower()
    return text    

In [6]:
df['Text'] = df['Text'].apply(remove_pun)

In [7]:
df.head()

Unnamed: 0,Text,Language
0,nature in the broadest sense is the natural p...,English
1,nature can refer to the phenomena of the physi...,English
2,the study of nature is a large if not the only...,English
3,although humans are part of nature human activ...,English
4,1 the word nature is borrowed from the old fre...,English


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X = df.iloc[:, 0]
Y = df.iloc[:, 1]

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2)

In [11]:
from sklearn import feature_extraction

In [12]:
vec = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2), analyzer='char')

In [13]:
from sklearn import pipeline
from sklearn import linear_model

In [14]:
model_pipe = pipeline.Pipeline([('vec', vec),('clf',linear_model.LogisticRegression())])

In [15]:
model_pipe.fit(X_train, Y_train)

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', ngram_range=(1, 2))),
                ('clf', LogisticRegression())])

In [16]:
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [17]:
predict_val = model_pipe.predict(X_test)

In [18]:
from sklearn import metrics

In [19]:
metrics.accuracy_score(Y_test, predict_val)*100

97.5338491295938

In [20]:
metrics.confusion_matrix(Y_test, predict_val)

array([[108,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  82,   0,   2,   2,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0, 110,   1,   2,   2,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   1,   1, 279,   0,   0,   0,   0,   3,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   1, 201,   0,   0,   0,   3,   0,   0,   2,   0,
          1,   0,   0,   0],
       [  0,   0,   2,   2,   1,  92,   0,   0,   0,   0,   0,   0,   0,
          0,   1,   0,   1],
       [  0,   0,   0,   0,   0,   0,  77,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   9,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   3,   0,   1,   0,   0,   0,   0, 113,   0,   0,   0,   0,
          4,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  79,   0,   0,   0,
         

In [27]:
# model_pipe.predict(['My name is john'])
model_pipe.predict(['பிற மொழிகளில் உள்ள உள்ளீட்டு முறைகள்:'])

array(['Tamil'], dtype=object)

In [28]:
import pickle

In [29]:
new_file = open('model.pckl', 'wb')
pickle.dump(model_pipe, new_file)
new_file.close()

In [30]:
import os

In [31]:
os.listdir()

['.ipynb_checkpoints',
 'app.py',
 'app.py.bak',
 'Language Detection Tool.ipynb',
 'Language Detection.csv',
 'Language_detection-main',
 'model.pckl',
 'play_game',
 'requirement.txt',
 'requirement.txt.bak']