In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

In [2]:
# Load the dataset directly from the zipped CSV (path is relative to the notebook's working directory).
data = pd.read_csv("data/Language Detection.zip", compression='zip')

In [3]:
# Preview the first rows to check the loaded columns.
data.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
from sklearn.preprocessing import LabelEncoder

# Features are the sentences in the "Text" column.
x = data["Text"]

# Targets: the "Language" column encoded as integer class ids.
# `le` is kept around so that ids can be decoded back to language names.
le = LabelEncoder()
y = le.fit_transform(data["Language"])
y

array([3, 3, 3, ..., 9, 9, 9])

In [5]:
# Train test split (stratified 80/20 on the raw, uncleaned texts)
# NOTE(review): X_train, X_test, y_train and y_test are all reassigned by the
# later split over `data_list`, so the result of this split is not what the
# vectorizer and models below are fitted on.
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)

# Text preprocessing
- Remove digits, special characters, and square brackets, then lower-case the text

In [6]:
# Characters that are replaced by a single space: punctuation, newlines and
# ASCII digits.
symbols_and_digits = re.compile(r'[!@#$(),\n"%^*?\:;~`0-9]')
# Square brackets, with both characters escaped explicitly. The previous
# pattern r'[[ ]]' was parsed as "'[' or ' ' followed by ']'", which raised a
# "Possible nested set" FutureWarning and left stray '[' characters behind.
square_brackets = re.compile(r'[\[\]]')

# Clean every sentence: blank out symbols/digits, then brackets, then lower-case.
# Brackets become spaces (not '') so neighbouring words are never glued together.
data_list = [
    square_brackets.sub(' ', symbols_and_digits.sub(' ', text)).lower()
    for text in x
]

  text = re.sub(r'[[ ]]', '', text)


In [7]:
# Stratified 80/20 split of the cleaned texts, with a fixed seed so the
# partition is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data_list,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [8]:
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training texts only and reuse it for the test
# texts, so no information from the test split leaks into the features.
cv = CountVectorizer()

# Keep the SciPy sparse matrices returned by the vectorizer. Calling
# .toarray() would materialize a dense, mostly-zero array of roughly
# 8k x 34k counts (over 2 GB as int64); `.shape` and MultinomialNB both work
# on the sparse form directly.
x_train = cv.fit_transform(X_train)
x_test = cv.transform(X_test)

In [9]:
# (n_samples, vocabulary_size) for the train and test feature matrices, one per line.
print(x_train.shape, x_test.shape, sep="\n")

(8269, 34492)
(2068, 34492)


## Model

In [10]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the bag-of-words counts
# (fit() returns the estimator itself) and predict the held-out split.
model = MultinomialNB().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Overall accuracy plus per-class precision / recall / F1.
ac = accuracy_score(y_test, y_pred)
cr = classification_report(y_test, y_pred)

# Accuracy score
print("Accuracy is :", ac)

# Classification report
print(cr)

Accuracy is : 0.9787234042553191
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       107
           1       1.00      0.94      0.97        86
           2       0.99      0.96      0.98       109
           3       0.90      0.99      0.94       277
           4       0.99      0.99      0.99       203
           5       0.99      1.00      0.99        94
           6       1.00      1.00      1.00        73
           7       1.00      1.00      1.00        12
           8       0.99      0.99      0.99       140
           9       1.00      0.96      0.98        74
          10       1.00      0.99      1.00       119
          11       0.99      0.97      0.98       148
          12       0.99      0.95      0.97       138
          13       0.98      0.99      0.98       164
          14       0.98      0.96      0.97       135
          15       1.00      0.98      0.99        94
          16       1.00      0.97      0.98     

## Pipeline for production
- symbol digit remover
- vectorizer
- classifier

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [12]:
# Punctuation, newline and ASCII digit characters; each match becomes a space.
_SYMBOLS_AND_DIGITS = re.compile(r'[!@#$(),\n"%^*?\:;~`0-9]')
# Square brackets, with both characters escaped explicitly. The previous
# pattern r'[[ ]]' was parsed as "'[' or ' ' followed by ']'", which raised a
# "Possible nested set" FutureWarning and left stray '[' characters behind.
_SQUARE_BRACKETS = re.compile(r'[\[\]]')


def _clean_text(text):
    """Blank out symbols, digits and square brackets in one string, then lower-case it."""
    text = _SYMBOLS_AND_DIGITS.sub(' ', text)
    # Brackets become spaces (not '') so neighbouring words are never glued together.
    text = _SQUARE_BRACKETS.sub(' ', text)
    return text.lower()


def symbol_digit_remover(data):
    """Clean a collection of texts for vectorization.

    Every text has its punctuation, newlines, ASCII digits and square
    brackets replaced by spaces and is then lower-cased.

    Parameters
    ----------
    data : list, numpy.ndarray or pandas.Series of str
        Texts to clean. The input object is never modified.

    Returns
    -------
    list, numpy.ndarray or pandas.Series
        The cleaned texts in the same order and in the same kind of
        container as the input (a Series keeps its index and name; an
        ndarray comes back with object dtype so no text is truncated).
    """
    # Series are handled through .map so values are visited by position in
    # iteration order. The old `data[i]` lookup used the enumerate counter as
    # an index *label*, which raises KeyError on the shuffled index that
    # train_test_split produces.
    if isinstance(data, pd.Series):
        return data.map(_clean_text)

    cleaned = [_clean_text(text) for text in data]
    if isinstance(data, np.ndarray):
        return np.array(cleaned, dtype=object)
    return cleaned

# Wrap the cleaning function so it can be used as a step of a scikit-learn Pipeline.
# NOTE(review): pickle-based serializers store plain functions by reference, so
# whatever loads the saved pipeline presumably needs `symbol_digit_remover`
# defined under the same module name — confirm for the deployment target.
symbol_digit_transformer = FunctionTransformer(symbol_digit_remover)

In [13]:
# pipeline: text cleaning -> bag-of-words counts -> multinomial Naive Bayes
pipeline = Pipeline([
    ('preprocessor', symbol_digit_transformer),
    ('vectorizer', CountVectorizer()),
    ('multinomialNB', MultinomialNB())
])

# Split the *raw* texts with the same seed and stratification as the earlier
# splits. X_train / X_test hold texts that were already cleaned, so fitting on
# them would never exercise the preprocessor step on unprocessed input.
# list(x) hands the preprocessor a plain, positionally indexed list.
raw_train, raw_test, raw_y_train, raw_y_test = train_test_split(
    list(x), y, test_size=0.2, random_state=1, stratify=y
)

# fit pipeline with unprocessed data
pipeline.fit(raw_train, raw_y_train)

# predict results on the unprocessed held-out texts
y_pred = pipeline.predict(raw_test)
ac = accuracy_score(raw_y_test, y_pred)
cr = classification_report(raw_y_test, y_pred)

# accuracy score
print("Accuracy is :",ac)

# classification report
print(cr)

Accuracy is : 0.9787234042553191
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       107
           1       1.00      0.94      0.97        86
           2       0.99      0.96      0.98       109
           3       0.90      0.99      0.94       277
           4       0.99      0.99      0.99       203
           5       0.99      1.00      0.99        94
           6       1.00      1.00      1.00        73
           7       1.00      1.00      1.00        12
           8       0.99      0.99      0.99       140
           9       1.00      0.96      0.98        74
          10       1.00      0.99      1.00       119
          11       0.99      0.97      0.98       148
          12       0.99      0.95      0.97       138
          13       0.98      0.99      0.98       164
          14       0.98      0.96      0.97       135
          15       1.00      0.98      0.99        94
          16       1.00      0.97      0.98     

In [14]:
from pathlib import Path

# Persist the fitted pipeline. The target directory is created first so the
# cell also works on a fresh checkout where `model/` does not exist yet
# (open() would otherwise raise FileNotFoundError).
model_path = Path('model/trained_pipeline.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)

with open(model_path, 'wb') as f:
    joblib.dump(pipeline, f)

In [15]:
# Smoke test: classify one raw sentence and decode the class id to its language name.
test = 'Hello, how are you?'
y_ = pipeline.predict([test])
predicted_id = y_[0]
le.classes_[predicted_id]

'English'

In [16]:
# Smoke test: classify one raw sentence and decode the class id to its language name.
test = 'Ciao, come stai?'
y_ = pipeline.predict([test])
predicted_id = y_[0]
le.classes_[predicted_id]

'Italian'

In [17]:
# Smoke test: classify one raw sentence and decode the class id to its language name.
test = 'Bonjour, comment va?'
y_ = pipeline.predict([test])
predicted_id = y_[0]
le.classes_[predicted_id]

'French'