In [163]:
import numpy as np 
import pandas as pd 
import string
import re
import json

In [151]:
# Load the raw parallel corpus from the working directory and report its
# (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='Hindi_English_Truncated_Corpus.csv')
print(df.shape)

(127607, 3)


In [152]:
# Discard exact duplicate rows, then keep only the rows whose `source`
# column is 'ted'.
df = df.drop_duplicates()
is_ted = df['source'].eq('ted')
df = df[is_ted]
print(df.shape)

(38803, 3)


In [153]:
# Draw a fixed-seed random sample of 25000 rows, drop the `source` column
# (it is constant after the filter above) and renumber the rows from 0.
df = (
    df.sample(n=25000, random_state=42)
      .drop(columns=['source'])
      .reset_index(drop=True)
)
df.head(n=10)

Unnamed: 0,english_sentence,hindi_sentence
0,"We still don't know who her parents are, who she is.","हम अभी तक नहीं जानते हैं कि उसके माता-पिता कौन हैं, वह कौन है,"
1,"no keyboard,","कोई कुंजीपटल नहीं,"
2,"But as far as being a performer,",लेकिन एक कलाकार होने के साथ
3,"And this particular balloon,","और यह खास गुब्बारा,"
4,"and it's not as hard as you think. Integrate climate solutions into all of your innovations,","और जितना आपको लगता है, यह उतना कठिन नहीं है.अपने सभी नवाचारों में जलवायु समाधान को एकीकृत करें,"
5,and saw the demo by Jeff Han,और जेफ़ हान द्वारा प्रदर्शन देखा होगा
6,This baby is fully electric.,यह बच्चा पूरी तरह से बिजली से चलता है.
7,"kids have no, or very little, say in making the rules,","फिर भी बच्चों को नियम बनाने का बिलकुल नहीं, या बहुत कम, मौका मिलता है"
8,I'm going to add a little bit to my description of aging.,मै अपने द्वारा दिए गए उम्र के बढ़्ने के विवर्ण की कुछ व्याख्या करूंगा.
9,"expands and cools until it gets to the point where it becomes transparent,","फैलने लगता है, फिर ये ठंडा होकर उस अवस्था तक पहुँच जाता है, जब ये पारदर्शी बन जाता है,"


### Data cleaning

In [154]:
# Translation table that deletes every ASCII punctuation character plus the
# Devanagari danda (U+0964) and double danda (U+0965). string.punctuation is
# ASCII-only, so without the two extra characters Hindi sentence terminators
# would stay glued to the last word of a sentence.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + '\u0964\u0965')


def remove_punctuations(x):
    """Return ``x`` with all punctuation characters deleted.

    Punctuation is removed, not replaced by a space, so the characters on
    either side of a punctuation mark end up adjacent (``"don't"`` becomes
    ``"dont"``).

    Parameters
    ----------
    x : str
        Text to strip.

    Returns
    -------
    str
        ``x`` without ASCII punctuation and without the Devanagari danda
        and double danda.
    """
    # One pass over the string instead of one str.replace per character.
    return x.translate(_PUNCT_TABLE)
def clean_data(df):
    """Return a cleaned copy of an English/Hindi sentence-pair frame.

    Each sentence is lower-cased, stripped of punctuation (via
    ``remove_punctuations``) and of digits, and has its whitespace collapsed
    to single spaces. Hindi sentences are then wrapped in ``<start>`` and
    ``<end>`` tokens.

    The input frame is not modified; use the returned frame
    (``df = clean_data(df)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must have the columns ``english_sentence`` and ``hindi_sentence``.
        Every value in both columns must be a ``str``, because ``.lower()``
        is called on each one.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two sentence columns cleaned; any other
        columns are carried over unchanged.
    """
    def _normalise(sentence, digit_pattern):
        # Same order as the original pipeline:
        # lower-case -> punctuation -> digits -> whitespace.
        sentence = remove_punctuations(sentence.lower())
        sentence = re.sub(digit_pattern, "", sentence)
        # split()/join collapses whitespace runs and trims both ends.
        return " ".join(sentence.split())

    # Work on a copy so the caller's DataFrame is left untouched.
    cleaned = df.copy()

    # English: only ASCII digits are removed.
    cleaned['english_sentence'] = cleaned['english_sentence'].apply(
        lambda x: _normalise(x, "[0123456789]"))

    # Hindi: Devanagari and ASCII digits are removed, then the sentence is
    # wrapped in start/end tokens. The tokens go on last so that punctuation
    # removal does not strip their angle brackets.
    cleaned['hindi_sentence'] = cleaned['hindi_sentence'].apply(
        lambda x: '<start> ' + _normalise(x, "[२३०८१५७९४६0123456789]") + ' <end>')

    return cleaned

In [155]:
# Run the cleaning step and rebind `df` to its result.
# Run this cell once per fresh `df`: cleaning already-cleaned text would
# strip the angle brackets from the <start>/<end> tokens and add new ones.
df = clean_data(df)
df.head(n=5)

Unnamed: 0,english_sentence,hindi_sentence
0,we still dont know who her parents are who she is,<start> हम अभी तक नहीं जानते हैं कि उसके मातापिता कौन हैं वह कौन है <end>
1,no keyboard,<start> कोई कुंजीपटल नहीं <end>
2,but as far as being a performer,<start> लेकिन एक कलाकार होने के साथ <end>
3,and this particular balloon,<start> और यह खास गुब्बारा <end>
4,and its not as hard as you think integrate climate solutions into all of your innovations,<start> और जितना आपको लगता है यह उतना कठिन नहीं हैअपने सभी नवाचारों में जलवायु समाधान को एकीकृत करें <end>


In [156]:
# Persist the cleaned sentence pairs; the row index is not written.
df.to_csv(path_or_buf='cleaned_data.csv', index=False)

In [157]:
# Reload the cleaned sentence pairs.
# keep_default_na=False keeps every field as text. With the default NA
# parsing, an empty field, or a sentence that is exactly "nan", "null" or
# "na", would come back as a float NaN instead of a string.
df = pd.read_csv('cleaned_data.csv', keep_default_na=False)

### Vocab

In [158]:
# Collect the distinct whitespace-separated tokens of each language.
# Missing cells (NaN) are skipped. Converting them with str() would add the
# bogus token "nan" to the vocabulary, so the printed sizes can differ by one
# from a run that included it.
eng_vocab = set()
hindi_vocab = set()
for sentence in df['english_sentence'].dropna():
    eng_vocab.update(str(sentence).split())
for sentence in df['hindi_sentence'].dropna():
    hindi_vocab.update(str(sentence).split())
print(len(eng_vocab), len(hindi_vocab))

14031 17540


In [162]:
# Map every vocabulary word to a 1-based integer index and build the inverse
# lookups. The vocabularies are sorted first because set iteration order for
# strings can change between interpreter runs (hash randomisation); without
# sorting, re-running the notebook could assign different indices to the
# same words and the saved mappings would not be reproducible.
# Index 0 is left unassigned, presumably reserved for padding (confirm
# against the model code).
word_to_idx_english = {w: i for i, w in enumerate(sorted(eng_vocab), start=1)}
word_to_idx_hindi = {w: i for i, w in enumerate(sorted(hindi_vocab), start=1)}
idx_to_word_english = {i: w for w, i in word_to_idx_english.items()}
idx_to_word_hindi = {i: w for w, i in word_to_idx_hindi.items()}

In [167]:
# Write each lookup table to its own JSON file in the working directory.
# json.dump stores dictionary keys as strings, so the integer keys of the
# idx_to_word_* tables will be strings when the files are loaded again.
mappings_by_file = (
    ('word_to_idx_english.json', word_to_idx_english),
    ('word_to_idx_hindi.json', word_to_idx_hindi),
    ('idx_to_word_english.json', idx_to_word_english),
    ('idx_to_word_hindi.json', idx_to_word_hindi),
)
for file_name, mapping in mappings_by_file:
    with open(file_name, 'w') as out_file:
        json.dump(mapping, out_file)