In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import  LogisticRegression
from sklearn.naive_bayes import  MultinomialNB 
from sklearn.svm import  SVC
from sklearn.ensemble import  RandomForestClassifier
from sklearn.metrics import  classification_report, accuracy_score, confusion_matrix
import pickle
import warnings
warnings.filterwarnings(action='ignore')

Load Data


In [2]:
# The file's first column is an unnamed row index (it otherwise loads as a
# stray "Unnamed: 0" column), so use it as the DataFrame index.
data = pd.read_csv("cleaned_language_detection.csv", index_col=0)

In [3]:
# Preview the first rows to sanity-check the columns that were loaded.
data.head()

Unnamed: 0,Text,Language
0,nature broadest natural physical material worl...,English
1,nature refer phenomenon physical world life ge...,English
2,the study nature large part science,English
3,although human part nature human activity unde...,English
4,the word nature borrowed old french nature der...,English


In [4]:
# Separate the raw sentences (model input) from their language names (target).
X, y = data["Text"], data["Language"]

In [5]:
# Let's encode our target labels (the language names) as integers.

In [6]:
# Encode the language names as integer class ids. The encoder is fit on the
# original column rather than on `y` itself, so re-running this cell cannot
# re-encode already-encoded integers and overwrite `le.classes_`, which
# `le.inverse_transform` needs to map predictions back to language names.
le = LabelEncoder()
y = le.fit_transform(data["Language"])

In [7]:
# Let's vectorise the text with TF-IDF so the models receive numeric features.

In [8]:
# Turn every sentence into a TF-IDF feature row; missing texts are replaced
# with empty strings before vectorising.
# The input is read from the original column instead of reassigning `X` from
# itself, so the cell stays safe to re-run: once `X` holds the vectorised
# matrix it no longer offers `.fillna`.
# NOTE(review): the vectorizer is fit on the whole corpus before the
# train/test split below, so its vocabulary and IDF weights also see the
# held-out rows -- consider fitting on the training split only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(data["Text"].fillna(''))

In [9]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=100,
)

In [10]:
# Let's define a function that builds all the models we want to train.
def data_model(random_state=100):
    """Build the untrained candidate classifiers, keyed by a short name.

    Parameters
    ----------
    random_state : int or None, default=100
        Seed handed to the random forest so repeated runs grow the same
        trees. Pass ``None`` to get a differently seeded forest on each call.

    Returns
    -------
    dict
        Maps ``"log_R"``, ``"nb"``, ``"supprt"`` and ``"rand_F"`` to fresh,
        unfitted logistic-regression, multinomial naive Bayes, SVC and
        random-forest estimators respectively.
    """
    return {
        "log_R": LogisticRegression(),
        "nb": MultinomialNB(),
        "supprt": SVC(),
        # Seeded so the forest's results are reproducible between runs.
        "rand_F": RandomForestClassifier(random_state=random_state),
    }

In [11]:
# Instantiate every candidate model, then fit each one on the training split.
model_train = data_model()

for model in model_train.values():
    model.fit(X_train, y_train)

In [12]:
# Score every fitted model on the held-out split and print its metrics.
for name, model in model_train.items():
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)

    print(f'EVALUATION:{name}')
    print(f'ACCURACY:{accuracy}')
    print(f'CLASSIFICATION REPORT:\n {report}')
    print(f'CONFUSION MATRIX:\n {matrix}')
    print("HERE WE GO............")

EVALUATION:log_R
ACCURACY:0.8411657559198543
CLASSIFICATION REPORT:
               precision    recall  f1-score   support

           0       0.97      0.59      0.73       146
           1       0.99      0.67      0.80       194
           2       0.55      0.99      0.71       481
           3       0.96      0.95      0.96       316
           4       0.99      0.71      0.83       140
           5       1.00      0.76      0.86       119
           6       1.00      0.90      0.95        20
           7       1.00      0.83      0.91       249
           8       0.97      0.88      0.93       240
           9       1.00      0.85      0.92       199
          10       0.92      0.85      0.88       262
          11       0.96      0.85      0.90       222
          12       1.00      0.68      0.81       157

    accuracy                           0.84      2745
   macro avg       0.95      0.81      0.86      2745
weighted avg       0.90      0.84      0.85      2745

CONFUSION 

In [13]:
# Hyperparameter tuning for Logistic Regression.
from sklearn.model_selection import GridSearchCV

In [14]:
# Default-configured base estimator handed to the grid search below.
log_R = LogisticRegression()

In [15]:
# Candidate solvers and C values to try: 3 x 4 = 12 combinations.
param_grid = {
    'solver': ['liblinear', 'newton-cg', 'lbfgs'],
    'C': [0.01, 0.1, 1, 10],
}

# 3-fold cross-validated search over the grid, scored by accuracy.
grid_search = GridSearchCV(
    estimator=log_R,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [16]:
# Hyperparameter combination that the grid search selected as best.
best_param = grid_search.best_params_
print(f'this is the best parameter:{best_param}')

this is the best parameter:{'C': 10, 'solver': 'liblinear'}


In [17]:
# The grid search is created without overriding `refit` (True by default), so
# it has already retrained the winning configuration on all of X_train.
# Reuse that fitted estimator instead of building and fitting an identical
# model a second time.
best_model = grid_search.best_estimator_
best_model

In [18]:
# Evaluate the tuned model on the held-out split.
y_predict = best_model.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_predict)
tuned_report = classification_report(y_test, y_predict)
tuned_matrix = confusion_matrix(y_test, y_predict)

print('BEST MODEL EVALUTION')
print(f'ACCURACY:{tuned_accuracy}')
print(f'CLASSIFICTION REPORT:\n {tuned_report}')
print(f'CONFUSSION MARIX:\n {tuned_matrix}')

BEST MODEL EVALUTION
ACCURACY:0.9045537340619307
CLASSIFICTION REPORT:
               precision    recall  f1-score   support

           0       0.91      0.79      0.85       146
           1       0.96      0.82      0.88       194
           2       0.71      0.98      0.82       481
           3       0.97      0.96      0.96       316
           4       0.99      0.85      0.92       140
           5       1.00      0.87      0.93       119
           6       1.00      0.95      0.97        20
           7       1.00      0.90      0.95       249
           8       0.96      0.90      0.93       240
           9       1.00      0.89      0.94       199
          10       0.92      0.89      0.91       262
          11       0.95      0.92      0.94       222
          12       1.00      0.87      0.93       157

    accuracy                           0.90      2745
   macro avg       0.95      0.89      0.92      2745
weighted avg       0.92      0.90      0.91      2745

CONFUSS

Let's test our model.


In [19]:
def model_test(text):
    """Predict, print and return the language of a single piece of text.

    Parameters
    ----------
    text : str
        Text to classify. It is vectorised with the notebook's fitted
        ``tfidf`` vectorizer and classified by ``best_model``.

    Returns
    -------
    The predicted label decoded by ``le.inverse_transform`` (one of
    ``le.classes_``). The same value is also printed.
    """
    # Keep the TF-IDF row sparse: the classifier predicts on sparse input
    # directly, so converting it with .toarray() only costs memory.
    features = tfidf.transform([text])
    # Named `encoded` so the local does not shadow the notebook's `data` frame.
    encoded = best_model.predict(features)
    language_prediction = le.inverse_transform(encoded)
    print('PREDICTING.....')
    print('LANGUAGE IS:', language_prediction[0])
    return language_prediction[0]

In [20]:
# Sample input used to smoke-test the prediction helper.
user_test = "hello"

In [21]:
# model_test prints the prediction itself, so no extra print is needed here;
# printing the function would only show its object repr.
# The trailing semicolon stops the cell from echoing any return value.
model_test(user_test);

PREDICTING.....
LANGUAGE IS: English
<function model_test at 0x000001EF50B13560>


In [22]:
# Persist the tuned classifier to disk.
with open('model_class.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [23]:
# Persist the fitted TF-IDF vectorizer to disk.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [24]:
# Persist the fitted label encoder to disk.
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)