<a href="https://colab.research.google.com/github/inamdarmihir/researchdocs/blob/main/LD_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing the libraries:

In [18]:
import string
import re
import codecs
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
import warnings
warnings.simplefilter("ignore")

In [19]:
!pip install nlp
%matplotlib inline

import tensorflow as tf
import nlp
import random


def show_history(h):
    """Draw training-vs-validation accuracy and loss curves side by side.

    Args:
        h: Object exposing a ``history`` mapping. ``history['loss']`` must
            exist (its length sets the number of epochs on the x-axis);
            ``'accuracy'``, ``'val_accuracy'`` and ``'val_loss'`` are looked
            up with ``.get``.
    """
    record = h.history
    x_values = range(0, len(record['loss']))
    panels = (
        # (training key, validation key, y-axis label, clamp y to [0, 1])
        ('accuracy', 'val_accuracy', 'Accuracy', True),
        ('loss', 'val_loss', 'Loss', False),
    )

    plt.figure(figsize=(16, 6))
    for slot, (train_key, val_key, y_label, clamp) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(x_values, record.get(train_key), label='Training')
        plt.plot(x_values, record.get(val_key), label='Validation')
        if clamp:
            # Accuracy is a fraction, so pin the axis to the full 0-1 range.
            plt.ylim([0., 1.])
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
    plt.show()

    
def show_confusion_matrix(y_true, y_pred, classes):
    """Plot a row-normalised confusion matrix as a colour-coded grid.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        classes: Tick labels, one per row/column of the matrix. Any number
            of classes is accepted; the tick positions are derived from
            ``len(classes)``.
    """
    from sklearn.metrics import confusion_matrix

    # normalize='true' divides every row by its total, so each cell is the
    # fraction of that true class that received the given prediction.
    cm = confusion_matrix(y_true, y_pred, normalize='true')

    # Tick positions must match the number of labels; a fixed count makes
    # matplotlib raise as soon as the label list has a different length.
    classes = list(classes)
    tick_positions = list(range(len(classes)))

    plt.figure(figsize=(8, 8))
    sp = plt.subplot(1, 1, 1)
    ctx = sp.matshow(cm)
    plt.xticks(tick_positions, labels=classes)
    plt.yticks(tick_positions, labels=classes)
    plt.colorbar(ctx)
    plt.show()

    
# Report the TensorFlow release loaded in this kernel.
print('Using TensorFlow version', tf.__version__)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Using TensorFlow version 2.8.2


Loading the Data:

In [20]:
# Read the labelled sentences from the CSV file in the working directory.
data = pd.read_csv("Language Detection.csv")
# Last expression of the cell: displays the number of rows per language.
data["Language"].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [21]:
# Split the frame into the model input (sentences) and the target (language names).
from sklearn.preprocessing import LabelEncoder

X = data["Text"]

# Replace each language name with an integer class id. The fitted encoder
# stays in scope as `le`.
le = LabelEncoder()
y = le.fit_transform(data["Language"])

Data pre-processing:

In [22]:
# Show every ASCII punctuation mark on one line, each followed by a space.
print(*string.punctuation, end=" ")
# Code point -> None for each mark: the mapping form str.translate() uses
# to delete characters.
translate_table = dict.fromkeys(map(ord, string.punctuation))

! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ 

In [23]:
data_list = []

# Selected punctuation, newlines and digits, each replaced by a space.
# The newline must be written as `\n`: a bare `n` inside the class would
# blank out every letter "n" in the text.
SYMBOLS_AND_DIGITS_RE = re.compile(r'[!@#$(),\n"%^*?:;~`0-9]')
# Either square bracket. Both are escaped so the class is unambiguous;
# written as `[[]]` the pattern would only match the two-character string "[]".
SQUARE_BRACKETS_RE = re.compile(r'[\[\]]')

# iterating through all the text
for text in X:
    text = SYMBOLS_AND_DIGITS_RE.sub(' ', text)   # removing the symbols and numbers
    text = SQUARE_BRACKETS_RE.sub(' ', text)      # removing square brackets
    text = text.lower()                           # converting the text to lower case
    data_list.append(text)

Vectorizer and Model Fitting Pipeline:

In [12]:
# Character-level TF-IDF over 1- to 3-character n-grams.
vectorizer = feature_extraction.text.TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
)

# Chain the vectorizer into a default-configured logistic regression so the
# pipeline can be fitted on, and predict from, raw strings.
pipe_lr_r13 = pipeline.Pipeline(
    steps=[
        ('vectorizer', vectorizer),
        ('clf', linear_model.LogisticRegression()),
    ]
)

Model Fitting/Training:

In [13]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
# NOTE(review): the split is taken from the raw `X` series, not from the
# cleaned `data_list` built in the pre-processing cell - confirm this is intended.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Fit the TF-IDF + logistic-regression pipeline on the training split. As the
# cell's last expression, the fitted pipeline is also echoed as output.
pipe_lr_r13.fit(x_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
                ('clf', LogisticRegression())])

Model Prediction/Evaluation:

In [14]:
# Predict the encoded language label for every sentence in the held-out split.
y_pred = pipe_lr_r13.predict(x_test)

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Raw-count confusion matrix (kept in `cm`) and the overall fraction of
# test sentences whose language was predicted correctly.
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)

# Printing the accuracy:
print("Accuracy is :", ac)

Accuracy is : 0.9840425531914894


In [17]:
# Confusion matrix on the test split, computed from the same inputs as `cm`
# in the accuracy cell. Rows are true labels, columns are predicted labels.
matrix = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Matrix : \n",matrix)

Confusion Matrix : 
 [[106   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  71   0   0   0   0   0   0   0   0   0   0   0   0   2   0   0]
 [  0   0 107   3   0   1   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 290   0   0   0   0   0   0   0   0   0   0   0   0   1]
 [  0   0   0   2 215   0   0   0   1   0   0   1   0   0   0   0   0]
 [  0   0   2   1   0  89   0   0   0   0   0   0   0   0   1   0   0]
 [  0   0   0   0   0   0  68   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  10   0   0   0   0   0   0   0   0   0]
 [  0   1   0   1   0   0   0   0 141   0   0   0   0   1   0   0   1]
 [  0   0   0   0   0   0   0   0   0  66   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0 120   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1   0   0 141   0   1   0   0   1]
 [  0   0   0   0   0   0   0   0   0   0   0   0 136   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1   0   0   2   1 15