In [25]:
import pandas as pd
import re
from tqdm import tqdm
import copy
from sklearn.model_selection import train_test_split

# Code to remove sentence and clean text

In [5]:
import re
import copy

# Importing contractions

with open("contractions.txt", "r") as inp_cont:
    contractions_list = inp_cont.read()
contractions_list = [re.sub('["]', '', x).split(":") for x in re.sub(r"\s+", " ", re.sub(r"(.*{)|(}.*)", '', contractions_list)).split(',')]
contractions_dict = dict((k.lower().strip(), re.sub('/.*', '', v).lower().strip()) for k, v in contractions_list)

def remove_sc(_line, lang="en"):
    # _line = copy.deepcopy(_line)
    if lang == "hi":
        _line = re.sub(r'[+\-*/#@%>=;~{}×–`’"()_]', "", _line)
        _line = re.sub(r"(?:(\[)|(\])|(‘‘)|(’’))", '', _line)
    elif lang == "en":
        _line = re.sub(r'[+\-*/#@%>=;~{}×–`’"()_|:]', "", _line)
        _line = re.sub(r"(?:(\[)|(\])|(‘‘)|(’’))", '', _line)
    return _line


def clean_text(_text, lang="en"):
    # _text = copy.deepcopy(_text)
    if lang == "en":
        _text = remove_sc(_line=_text, lang=lang)
        for cn in contractions_dict:
            _text = re.sub(cn, contractions_dict[cn], _text)
    elif lang == "hi":
        _text = remove_sc(_line=_text, lang=lang)
    return _text

In [7]:
df = pd.read_csv("cleaned_hindi_english_parallel.csv")

hindi_text = df['hindi']
english_text = df['english']

In [8]:
print(hindi_text.head)
print(english_text.head)

<bound method NDFrame.head of 0           अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
1                           एक्सेर्साइसर पहुंचनीयता अन्वेषक
2                     निचले पटल के लिए डिफोल्ट प्लग-इन खाका
3                      ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
4         उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...
                                ...                        
953721    भारत अपनी 64 प्रतिशत कार्यशील आयु समूह आबादी क...
953722                      members making oath/affirmation
953723    मैंने गौर किया है कि यह न केवल अपने महत्त्वपूर...
953724    उन्होंने मेरे समक्ष जो प्रदर्शन किया उसमें से ...
953725    खाद्य और जल सुरक्षा; पर्यावरण की दृष्टि से वहन...
Name: hindi, Length: 953726, dtype: object>
<bound method NDFrame.head of 0            give your application an accessibility workout
1                         accerciser accessibility explorer
2            the default plugin layout for the bottom panel
3               the default plugin layout for the top pa

In [9]:
# Removing English sentences from hindi sentences
ids_to_remove = {}
for _id, _t in tqdm(enumerate(hindi_text)):
    if len(re.findall(r'[a-zA-Z]', _t)) > 0:
        ids_to_remove[_id] = _t
    else:
        pass

953726it [00:01, 482827.51it/s]


In [10]:
ids_to_keep = [i for i in range(len(hindi_text)) if i not in ids_to_remove.keys()]

filtered_eng_text = []
filtered_hindi_text = []
for _id in tqdm(ids_to_keep):
    filtered_eng_text.append(english_text[_id].lower())
    filtered_hindi_text.append(hindi_text[_id])

100%|██████████| 904975/904975 [00:04<00:00, 219939.63it/s]


In [15]:
#Treating english sentences
clean_eng_text = []
for sent in tqdm(filtered_eng_text):
    clean_eng_text.append(clean_text(_text=copy.deepcopy(sent), lang="en"))
    


100%|██████████| 904975/904975 [02:42<00:00, 5571.89it/s]


In [19]:
#Treating hindi sentences
clean_hindi_text = []
for sent in tqdm(filtered_hindi_text):
    clean_hindi_text.append(clean_text(_text=copy.deepcopy(sent), lang="hi"))

100%|██████████| 904975/904975 [00:08<00:00, 109276.45it/s]


In [20]:
clean_data = pd.DataFrame({'eng_text':clean_eng_text, 'hindi_text':clean_hindi_text})

In [21]:
clean_data.head

<bound method NDFrame.head of                                                  eng_text  \
0          give your application an accessibility workout   
1                       accerciser accessibility explorer   
2          the default plugin layout for the bottom panel   
3             the default plugin layout for the top panel   
4          a list of plugins that are disabled by default   
...                                                   ...   
904970  कते हैं। opening of a finnode centre in delhi ...   
904971  प्राप्त होगी। by 2020, india is set to become ...   
904972  है। i note that this is a landmark meeting  no...   
904973  है। in the presentations that they made before...   
904974  ्त है। issues such as food and water security ...   

                                               hindi_text  
0         अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें  
1                         एक्सेर्साइसर पहुंचनीयता अन्वेषक  
2                    निचले पटल के लिए डिफोल्ट प्लगइन खाका

## Filtering data based on sentence length

In [22]:
clean_data["eng_len"] = clean_data.eng_text.str.count("")
clean_data['hindi_len'] = clean_data.hindi_text.str.count("")
small_len_data = clean_data.query('eng_len < 50 & hindi_len < 50')

In [26]:
#Train_Val split
#Full set

train_set, val_set = train_test_split(small_len_data.loc[:, ['eng_text','hindi_text']], test_size=0.1)
train_set.to_csv("train.csv", index=False)
val_set.to_csv("val.csv", index=False)

In [27]:
small_data = small_len_data.loc[:, ["eng_text", "hindi_text"]].sample(n=150000)
train_set_sm, val_set_sm = train_test_split(small_data, test_size=0.3)
train_set_sm.to_csv("train_sm.csv", index=False)
val_set_sm.to_csv("val_sm.csv", index=False)