In [65]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import re


## Defining the stopwords, symbols and punctuations list

In [66]:
#stopwords_hi = ['तुम','मेरी','मुझे','क्योंकि','हम','प्रति','अबकी','आगे','माननीय','शहर','बताएं','कौनसी','क्लिक','किसकी','बड़े','मैं','and','रही','आज','लें','आपके','मिलकर','सब','मेरे','जी','श्री','वैसा','आपका','अंदर', 'अत', 'अपना', 'अपनी', 'अपने', 'अभी', 'आदि', 'आप', 'इत्यादि', 'इन', 'इनका', 'इन्हीं', 'इन्हें', 'इन्हों', 'इस', 'इसका', 'इसकी', 'इसके', 'इसमें', 'इसी', 'इसे', 'उन', 'उनका', 'उनकी', 'उनके', 'उनको', 'उन्हीं', 'उन्हें', 'उन्हों', 'उस', 'उसके', 'उसी', 'उसे', 'एक', 'एवं', 'एस', 'ऐसे', 'और', 'कई', 'कर','करता', 'करते', 'करना', 'करने', 'करें', 'कहते', 'कहा', 'का', 'काफ़ी', 'कि', 'कितना', 'किन्हें', 'किन्हों', 'किया', 'किर', 'किस', 'किसी', 'किसे', 'की', 'कुछ', 'कुल', 'के', 'को', 'कोई', 'कौन', 'कौनसा', 'गया', 'घर', 'जब', 'जहाँ', 'जा', 'जितना', 'जिन', 'जिन्हें', 'जिन्हों', 'जिस', 'जिसे', 'जीधर', 'जैसा', 'जैसे', 'जो', 'तक', 'तब', 'तरह', 'तिन', 'तिन्हें', 'तिन्हों', 'तिस', 'तिसे', 'तो', 'था', 'थी', 'थे', 'दबारा', 'दिया', 'दुसरा', 'दूसरे', 'दो', 'द्वारा', 'न', 'नहीं', 'ना', 'निहायत', 'नीचे', 'ने', 'पर', 'पर', 'पहले', 'पूरा', 'पे', 'फिर', 'बनी', 'बही', 'बहुत', 'बाद', 'बाला', 'बिलकुल', 'भी', 'भीतर', 'मगर', 'मानो', 'मे', 'में', 'यदि', 'यह', 'यहाँ', 'यही', 'या', 'यिह', 'ये', 'रखें', 'रहा', 'रहे', 'ऱ्वासा', 'लिए', 'लिये', 'लेकिन', 'व', 'वर्ग', 'वह', 'वह', 'वहाँ', 'वहीं', 'वाले', 'वुह', 'वे', 'वग़ैरह', 'संग', 'सकता', 'सकते', 'सबसे', 'सभी', 'साथ', 'साबुत', 'साभ', 'सारा', 'से', 'सो', 'ही', 'हुआ', 'हुई', 'हुए', 'है', 'हैं', 'हो', 'होता', 'होती', 'होते', 'होना', 'होने', 'अपनि', 'जेसे', 'होति', 'सभि', 'तिंहों', 'इंहों', 'दवारा', 'इसि', 'किंहें', 'थि', 'उंहों', 'ओर', 'जिंहें', 'वहिं', 'अभि', 'बनि', 'हि', 'उंहिं', 'उंहें', 'हें', 'वगेरह', 'एसे', 'रवासा', 'कोन', 'निचे', 'काफि', 'उसि', 'पुरा', 'भितर', 'हे', 'बहि', 'वहां', 'कोइ', 'यहां', 'जिंहों', 'तिंहें', 'किसि', 'कइ', 'यहि', 'इंहिं', 'जिधर', 'इंहें', 'अदि', 'इतयादि', 'हुइ', 'कोनसा', 'इसकि', 'दुसरे', 'जहां', 'अप', 'किंहों', 'उनकि', 'भि', 'वरग', 'हुअ', 'जेसा', 'नहिं']
symbols = {'~', ':', "'", '+', '[', '\\', '@', '^', '{', '%', '(', '-', '"', '*', '|', ',', '&', '<', '`', '}', '.', '_', '=', ']', '!', '>', ';', '?', '#', '$', ')', '/','€','§' }
punctuations=['\u200d','\xa0','\t','\n','|','।',':',';',',','.','\'','\\','/','-','‘','’','(',')','?','[',']','*','+','“','”','!','…','+','{','}','=']

In [67]:
import re

## Preprocess function to be applied on every line of the dataset in order to clean/structure it

In [68]:
def preprocess(line):
    splits = line.split()
    line = ' '.join(splits[1:-1])
    for punc in punctuations:
        line=line.replace(punc,"")
    tmp = line.split()
    tmp = [re.sub('[A-Za-z0-9]', '', word) for word in tmp]
    return  ' '.join([word.strip() for word in tmp if not word.isnumeric() and word not in symbols ])


## Creating a clean dataset to be used for training/testing

In [69]:
final = []
with open('hin_mixed_2019_1M-sentences.txt') as f:
    
    text = f.readlines()
    for line in text:
        if(len(final)==100000):
            break
        line = preprocess(line)
        tmp = {}
        tmp['sentence'] = line
        tmp['language'] = "hi"
        final.append(tmp)
        


In [70]:
df = pd.DataFrame(final)
df.loc[:, 'sentence'] = df.loc[:,'sentence'].apply(lambda x: x.strip())
print(df.shape)

(100000, 2)


In [29]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfTransformer

# dataset = list(df['sentence'])
# tfIdfTransformer = TfidfTransformer(use_idf=True)
# countVectorizer = CountVectorizer()
# wordCount = countVectorizer.fit_transform(dataset)
# newTfIdf = tfIdfTransformer.fit_transform(wordCount)
# df1 = pd.DataFrame(newTfIdf[0].T.todense(), index=countVectorizer.get_feature_names(), columns=["TF-IDF"])
# df1 = df1.sort_values('TF-IDF', ascending=False)
# print (df1.head(25))

          TF-IDF
पदक     0.552704
टर      0.442592
पलच     0.380346
तमग     0.343738
रजत     0.305919
लव      0.206939
मध      0.206155
वह      0.137801
वर      0.132025
रत      0.114927
और      0.085286
बरक     0.000000
बरइल    0.000000
बरईप    0.000000
बरए     0.000000
बरकर    0.000000
बरकत    0.000000
बरकतउल  0.000000
बरख     0.000000
बरग     0.000000
बरगढ    0.000000
बरगद    0.000000
बर      0.000000
बय      0.000000
बयभ     0.000000




In [71]:
from collections import Counter
import math

In [77]:
import numpy as np
from nltk.tokenize import  word_tokenize 

In [80]:
data = list(df.sentence)

In [90]:
def extract_features( document ):
   terms = tuple(document.lower().split())
   features = set()
   for i in range(len(terms)):
      for n in range(1,2):
          if i+n <= len(terms):
              features.add(terms[i:i+n])
   return features

documents = [
   "This article is about the Golden State Warriors",
   "This article is about the Golden Arches",
   "This article is about state machines",
   "This article is about viking warriors"]

def calculate_idf( documents ):
   N = len(documents)
   from collections import Counter
   tD = Counter()
   for d in documents:
      features = extract_features(d)
      for f in features:
          tD[" ".join(f)] += 1
   IDF = []
   import math
   for (term,term_frequency) in tD.items():
       term_IDF = math.log(float(N) / term_frequency)
       IDF.append(( term_IDF, term ))
   IDF.sort(reverse=True)
   return IDF

st = []
for (IDF, term) in sorted(calculate_idf(data), key=lambda x: x[0])[:100]:
    st.append(term)

In [98]:
st

['के',
 'में',
 'की',
 'को',
 'से',
 'का',
 'और',
 'है',
 'ने',
 'पर',
 'कि',
 'भी',
 'तो',
 'नहीं',
 'लिए',
 'कर',
 'अब',
 'एक',
 'ही',
 'हैं',
 'आप',
 'हो',
 'इस',
 'अपने',
 'आज',
 'अगर',
 'यह',
 'करने',
 'किया',
 'तक',
 'साथ',
 'अपनी',
 'रहे',
 'गया',
 'कहा',
 'बाद',
 'रहा',
 'आपको',
 'जो',
 'जा',
 'कुछ',
 'हुए',
 'कोई',
 'रही',
 'वह',
 'या',
 'दिया',
 'बात',
 'वाले',
 'होने',
 'किसी',
 'करते',
 'व',
 'सकते',
 'था',
 'क्या',
 'अभी',
 'ये',
 'पहले',
 'हम',
 'लोगों',
 'न',
 'भारत',
 'लेकिन',
 'गई',
 'बहुत',
 'सरकार',
 'दिन',
 'तरह',
 'समय',
 'देश',
 'होता',
 'जाता',
 'आपके',
 'काम',
 'सकता',
 'द्वारा',
 'रूप',
 'वाली',
 'जब',
 'करना',
 'लेकर',
 'साल',
 'मैं',
 'दी',
 'गए',
 'हुआ',
 'उनके',
 'लोग',
 'दो',
 'जाने',
 'फिर',
 'आ',
 'उन्हें',
 'वे',
 'कई',
 'करें',
 'नाम',
 'सभी',
 'बार']

In [96]:
def remove_stop(sent):
    return " ".join([w for w in sent.split() if w not in st])

In [99]:
df['sentence'] = df['sentence'].apply(lambda x: remove_stop(x))

In [100]:
df.head()

Unnamed: 0,sentence,language
0,मीटर दौड स्वर्ण पदक मीटर दौड रजत पदक दिलाया वह...,hi
1,जन्म घटकर हैजो महत्वपूर्ण उपलब्धि,hi
2,जिस व्यक्ति तुमने मदद उसका आभार प्रकट करो उसे ...,hi
3,ना मनुष्य ना भगवान विश्व अन्य दोष अच्छा प्रयत्न,hi
4,बदलाव नही,hi


## Repeating the above steps with English Dataset

In [102]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/joker/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [103]:
import nltk
from nltk.corpus import stopwords
sw = stopwords.words('english')

In [106]:
def preprocess_en(line):
    splits = line.split()
    line = ' '.join(splits[1:-1])
    for punc in punctuations:
        line=line.replace(punc,"")
    tmp = line.split()
    tmp = [re.sub('[0-9," ]', '', word) for word in tmp]
    return  ' '.join([word.strip().lower() for word in tmp if word not in sw and not word.isnumeric() and word not in symbols ])


In [107]:
final = []
with open('eng_news_2020_1M-sentences.txt') as f:
    text = f.readlines()
    for line in text:
        if(len(final)==100000):
            break
        line = preprocess_en(line)
        tmp = {}

        tmp['sentence'] = line
        tmp['language'] = "en"
        final.append(tmp)

In [108]:
df_en = pd.DataFrame(final)
df_en.shape

(100000, 2)

In [109]:
df_en.head(100)

Unnamed: 0,sentence,language
0,— nhl network pm,en
1,spent advertising sans rare times see customer...,en
2,cash seized joint,en
3,hospice santa cruz county jacobs heart unantic...,en
4,£ bonus may enough protect,en
...,...,...
95,vouchers redeemed gozitan outlets accommodati...,en
96,upper target monthly,en
97,kwp solar hybrid minigrid bed ikenne isolation...,en
98,adults data ends credit scoring,en


In [110]:
df_en['sentence'] = df_en['sentence'].str.encode('ascii', 'ignore').str.decode('ascii')

## Merging the two dataframes and shuffling the data points

In [111]:
df_new = df.append(df_en)

  df_new = df.append(df_en)


In [112]:
df_new = df_new.sample(frac=1, random_state=1)
df_new.reset_index(drop=True, inplace=True)
df_new

Unnamed: 0,sentence,language
0,बालों रंग बदल,hi
1,artists concerned paintings executed directly ...,en
2,पानी अधिक अधिक मात्रा,hi
3,अपना नेटवर्क लश्कर हिजबुल जैश सिमी फैलाता हज स...,hi
4,अन्त कहूँगा भगवान चाहा,hi
...,...,...
199995,आगे शैतान कहते उसको गॉड,hi
199996,a changing market rising costs title acquisiti...,en
199997,मसीही विश्वास होकर मनुष्य परमेश्वर मेलमिलाप मन...,hi
199998,उच्चतम न्यायालय आदेश अधिकारियों नोएडा सेक्टर स...,hi


## Training, testing and doing some predictions on MNB model

In [113]:
x = np.array(df_new["sentence"])
y = np.array(df_new["language"])

cv = CountVectorizer()
X = cv.fit_transform(x)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=27)

In [114]:
model = MultinomialNB()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.999975

In [None]:
# user = input("Enter a Text: ")
# data = cv.transform([user]).toarray()
# output = model.predict(data)
# print(output)

In [115]:
sentences = [
    "mera naam is there he she hum humko",
    "tera yours mine",
    "why so cool thanda",
    "after changing kapde uske baad turant he slept",
    "what when how kyun kab which noorjahan",
    "the last thing meri zindagi mein is to live poora",
    "kal mai movie dekhne jaa raha hun what are the reviews",
    "isko code mix vaakya kehte hai"
]

for s in sentences:
    for w in s.split():
        data = cv.transform([w]).toarray()
        output = model.predict(data)
        print(w, output)
    print('')

mera ['en']
naam ['en']
is ['en']
there ['en']
he ['en']
she ['en']
hum ['en']
humko ['hi']

tera ['en']
yours ['hi']
mine ['en']

why ['en']
so ['en']
cool ['en']
thanda ['hi']

after ['en']
changing ['en']
kapde ['hi']
uske ['hi']
baad ['en']
turant ['hi']
he ['en']
slept ['en']

what ['en']
when ['en']
how ['en']
kyun ['en']
kab ['en']
which ['en']
noorjahan ['hi']

the ['en']
last ['en']
thing ['en']
meri ['hi']
zindagi ['en']
mein ['en']
is ['en']
to ['en']
live ['en']
poora ['hi']

kal ['en']
mai ['en']
movie ['en']
dekhne ['hi']
jaa ['hi']
raha ['hi']
hun ['en']
what ['en']
are ['en']
the ['en']
reviews ['en']

isko ['hi']
code ['en']
mix ['en']
vaakya ['hi']
kehte ['hi']
hai ['en']



## Training, testing and doing some predictions on SVM model

In [116]:

from sklearn import svm

clf = svm.SVC()
clf.fit(X_train,y_train)
print(clf.score(X_test,y_test))


0.994875


In [117]:
for s in sentences:
    for w in s.split():
        data = cv.transform([w]).toarray()
        output = clf.predict(data)
        print(w, output)
    print('')

mera ['hi']
naam ['hi']
is ['hi']
there ['hi']
he ['hi']
she ['hi']
hum ['hi']
humko ['hi']

tera ['hi']
yours ['hi']
mine ['hi']

why ['hi']
so ['hi']
cool ['hi']
thanda ['hi']

after ['en']
changing ['hi']
kapde ['hi']
uske ['hi']
baad ['hi']
turant ['hi']
he ['hi']
slept ['hi']

what ['hi']
when ['hi']
how ['hi']
kyun ['hi']
kab ['hi']
which ['hi']
noorjahan ['hi']

the ['en']
last ['en']
thing ['en']
meri ['hi']
zindagi ['hi']
mein ['hi']
is ['hi']
to ['hi']
live ['hi']
poora ['hi']

kal ['hi']
mai ['hi']
movie ['hi']
dekhne ['hi']
jaa ['hi']
raha ['hi']
hun ['hi']
what ['hi']
are ['en']
the ['en']
reviews ['hi']

isko ['hi']
code ['hi']
mix ['hi']
vaakya ['hi']
kehte ['hi']
hai ['hi']



### Getting in the real data


In [None]:
data_og = pd.read_csv('data_100k.csv')

In [None]:
data_og.columns

Index(['idx', 'regno', 'name', 'state', 'gender', 'channel', 'source',
       'district', 'regdate', 'receivedate', 'disposedate', 'closedate',
       'category', 'pmocategory', 'description', 'rating', 'comments',
       'finalreply', 'initforward', 'reminders', 'resolvetime', 'deo', 'freq',
       'spam', 'pending', 'class', 'seen'],
      dtype='object')

In [None]:
data_og.description[3]

'Sir mai virendra pratap singh uttar pradesh ka rahne wala hu sir mai  faridabad mai rent pr rahkr majdoori karta hu sir 02/12/2020 ko mr. Akram ke yaha kam karne ke liye labour chok se gay joki ghar ka saman shift karna tha jis ke majdoori maine rupay 20000(twntiy thousand rupay) tay ke the maine kiray pr truk our 4 labour kr maine kam kar diya saman shift diya tab maine rupay mage to Akram ki maa kahne lage yaha se bhagja nahi mai tujhe bahut maruge  mai mahila ayog ke member hu sir  esa kahetehuye akaram ne mujhe maa bahan galigaloj ke mai chala aya fir  mai 2days bad gaya to mujhe jan se marne ki dhamkedene lage our 3 other log bulaliye sir AKRAM B UNKE MAA kahne lage tu hamara kuch nahi kr paye bhag  ga sir mai majooro ko b truck wale ko rupay kaha se do sir uttar pradesh ke labour department mai ragisterd hu mera registration number UPBOCW 09170500087501 hai sir mai rent pr rehkr apna b apne pariwar ka majduri kr gujara karta hu sir Akaram ka mobile number 9599807071 hai sir yeh 

In [None]:
!pip install transliterate


Collecting transliterate
  Downloading transliterate-1.10.2-py2.py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 95 kB/s eta 0:00:011
Installing collected packages: transliterate
Successfully installed transliterate-1.10.2


In [None]:
from transliterate import translit, get_available_language_codes


LanguageCodeError: ``language_code`` is optional with ``reversed`` set to True only.