# Language Detection Project:

Using different language data we will build model which shows language name after giving text

## Import required libraries:

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn import feature_extraction
from sklearn import pipeline
from sklearn import linear_model
from sklearn import metrics

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\d\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read dataset:

In [2]:
df = pd.read_csv('Language Detection.csv')

In [3]:
#top 5 rows
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [4]:
#shape of dataset
df.shape

(10337, 2)

In [5]:
df['Language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

## Data cleaning:

In [6]:
corpus = []
for i in range(0, 10337):
    Text= re.sub('[^a-zA-Z]', ' ', df['Text'][i])
    # make all text in lower case
    Text = Text.lower()
    Text = Text.split()
    # stamming and remove stopwords
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    Text = [ps.stem(word) for word in Text if not word in set(all_stopwords)]
    Text = ' '.join(Text)
    corpus.append(Text)

In [7]:
print(corpus)

['natur broadest sens natur physic materi world univers', 'natur refer phenomena physic world also life gener', 'studi natur larg not part scienc', 'although human part natur human activ often understood separ categori natur phenomena', 'word natur borrow old french natur deriv latin word natura essenti qualiti innat disposit ancient time liter meant birth', 'ancient philosophi natura mostli use latin translat greek word physi origin relat intrins characterist plant anim featur world develop accord', 'concept natur whole physic univers one sever expans origin notion began certain core applic word pre socrat philosoph though word dynam dimens especi heraclitu steadili gain currenc ever sinc', 'advent modern scientif method last sever centuri natur becam passiv realiti organ move divin law', 'industri revolut natur increasingli becam seen part realiti depriv intent intervent henc consid sacr tradit rousseau american transcendent mere decorum divin provid human histori hegel marx', 'howev

## Train-Test split:

In [8]:
X=df.iloc[:,0]
Y=df.iloc[:,1]

In [9]:
X_train,X_test,Y_train,Y_test=train_test_split(X, Y, test_size = 0.20, random_state = 0)

In [10]:
# vectorisation
vec=feature_extraction.text.TfidfVectorizer(ngram_range=(1,2),analyzer='char')

## Model Building:

### Logistic Regression:

In [11]:
model_pipe=pipeline.Pipeline([('vec',vec),('clf',linear_model.LogisticRegression())])

In [12]:
model_pipe.fit(X_train,Y_train)

In [13]:
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [14]:
predict_val=model_pipe.predict(X_test)

## Model Evalution:

In [15]:
metrics.accuracy_score(Y_test,predict_val)

0.9758220502901354

## Building a predictive system:

In [16]:
model_pipe.predict(['Nature, in the broadest sense, is the natural, physical, material world or universe.'])

array(['English'], dtype=object)