In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

In [2]:
with open('spa.txt', encoding='utf-8') as translation_file:
    lines = [x.strip() for x in translation_file.readlines()]
df = pd.DataFrame(lines, columns=['English'])
df['Spanish'] = df['English'].str.split('\t').str[1]
df['English'] = df['English'].str.split('\t').str[0]
df

Unnamed: 0,English,Spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.
...,...,...
118959,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118960,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118961,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118962,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


In [3]:
# Lower casing
df['English'] = df['English'].str.lower()
df

Unnamed: 0,English,Spanish
0,go.,Ve.
1,go.,Vete.
2,go.,Vaya.
3,go.,Váyase.
4,hi.,Hola.
...,...,...
118959,there are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118960,there are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118961,a carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118962,since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


In [4]:
# Stop words removal
stop_words = set(stopwords.words('english')) 
df['English'] = df['English'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [5]:
# remove punctuation
def remove_punctuation(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    return s

df['English'] = df['English'].apply(remove_punctuation)

In [6]:
# remove digits
df['English'] = df['English'].str.replace('\d+', '')

  df['English'] = df['English'].str.replace('\d+', '')


In [7]:
df

Unnamed: 0,English,Spanish
0,go,Ve.
1,go,Vete.
2,go,Vaya.
3,go,Váyase.
4,hi,Hola.
...,...,...
118959,four main causes alcoholrelated death injury c...,Hay cuatro causas principales de muertes relac...
118960,mothers fathers lie awake children fall asleep...,Hay madres y padres que se quedan despiertos d...
118961,carbon footprint amount carbon dioxide polluti...,Una huella de carbono es la cantidad de contam...
118962,since usually multiple websites given topic us...,Como suele haber varias páginas web sobre cual...


In [8]:
# Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
def lemmatize_text(text):
    return " ".join([wordnet_lemmatizer.lemmatize(w, pos="v") for w in w_tokenizer.tokenize(text)])

df['English'] = df['English'].apply(lemmatize_text)

In [9]:
df

Unnamed: 0,English,Spanish
0,go,Ve.
1,go,Vete.
2,go,Vaya.
3,go,Váyase.
4,hi,Hola.
...,...,...
118959,four main cause alcoholrelated death injury ca...,Hay cuatro causas principales de muertes relac...
118960,mother father lie awake children fall asleep w...,Hay madres y padres que se quedan despiertos d...
118961,carbon footprint amount carbon dioxide polluti...,Una huella de carbono es la cantidad de contam...
118962,since usually multiple websites give topic usu...,Como suele haber varias páginas web sobre cual...


In [10]:
# df[df['English'].apply(lambda x:len(x)>=5 & len(x)<=20)]
# df
mask = (df['English'].str.len() >= 5) & (df['English'].str.len() <= 20)
df = df.loc[mask]
df = df[:10000]
df

Unnamed: 0,English,Spanish
21,go on,Continúa.
22,go on,Continúe.
23,hello,Hola.
28,oh no,"¡Oh, no!"
29,relax,Tomátelo con soda.
...,...,...
11626,thats house,Esa es nuestra casa.
11627,thats hotel,Ese es el hotel.
11628,thats offer,He ahí la oferta.
11629,thats trouble,Eso es problemático.


In [11]:
df.to_csv('ready_dataset.csv')