In [33]:
import re
import sys
import nltk
import json
import warnings
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('wordnet')

warnings.filterwarnings('ignore')
np.set_printoptions(threshold=sys.maxsize)

[nltk_data] Downloading package stopwords to /home/amogha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/amogha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
# Load all the data
en_trainpath = 'data/train-en.parquet'
en_testpath = 'data/test-en.parquet'
en_valpath = 'data/dev-en.parquet'

es_trainpath = 'data/train-es.parquet'
es_testpath = 'data/test-es.parquet'
es_valpath = 'data/dev-es.parquet'

en_traindata = pd.read_parquet(en_trainpath)
en_testdata = pd.read_parquet(en_testpath)
en_valdata = pd.read_parquet(en_valpath)

es_traindata = pd.read_parquet(es_trainpath)
es_testdata = pd.read_parquet(es_testpath)
es_valdata = pd.read_parquet(es_valpath)

print(en_traindata[:2])
print(es_traindata[:2])

                         sentence1                    sentence2  \
0           A plane is taking off.  An air plane is taking off.   
1  A man is playing a large flute.    A man is playing a flute.   

   similarity_score  
0               5.0  
1               3.8  
                                 sentence1  \
0                Un avión está despegando.   
1  Un hombre está tocando una gran flauta.   

                            sentence2  similarity_score  
0           Un avión está despegando.               5.0  
1  Un hombre está tocando una flauta.               3.8  


In [35]:
# Stop words, Lemmatization and Stemming
en_stop_words = set(stopwords.words('english'))
es_stop_words = set(stopwords.words('spanish'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [36]:
def preprocess_text_en(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', 'num', text)
    # convert to lower case
    text = text.lower()
    sentence = nltk.word_tokenize(text, language='english')
    # remove stop words
    sentence = [word for word in sentence if word not in en_stop_words]
    # apply lemmatize
    sentence = [lemmatizer.lemmatize(word) for word in sentence]
    # apply stemming
    sentence = [stemmer.stem(word) for word in sentence]
    return sentence

def preprocess_text_es(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', 'num', text)
    # convert to lower case
    text = text.lower()
    sentence = nltk.word_tokenize(text, language='spanish')
    # remove stop words
    sentence = [word for word in sentence if word not in es_stop_words]
    # apply lemmatize
    sentence = [lemmatizer.lemmatize(word) for word in sentence]
    # apply stemming
    sentence = [stemmer.stem(word) for word in sentence]
    return sentence

def process_data(data1, data2):
    data1['sentence1'] = data1['sentence1'].apply(lambda x: preprocess_text_en(x))
    data1['sentence2'] = data2['sentence1'].apply(lambda x: preprocess_text_es(x))
    return data1
    
def get_vocab(data):
    vocab = set()
    for _, row in data.iterrows():
        for word in row['sentence2']:
            vocab.add(word)
    return vocab

In [37]:
traindata = process_data(en_traindata, es_traindata)
testdata = process_data(en_testdata, es_testdata)
valdata = process_data(en_valdata, es_valdata)

# Create spanish vocab and word2idx
vocab = get_vocab(traindata)
word2idx = {word: idx for idx, word in enumerate(sorted(vocab))}

file_path = 'data/word2idx-en-es.json'

# Save the dictionary to a JSON file
with open(file_path, 'w') as file:
    json.dump(word2idx, file)

In [38]:
# Store the processed datasets
traindata.to_csv('data/train-en-es.csv', index=False)
testdata.to_csv('data/test-en-es.csv', index=False)
valdata.to_csv('data/validation-en-es.csv', index=False)

In [39]:
print(len(vocab))

8935


In [40]:
print(traindata[:5])

                             sentence1  \
0                        [plane, take]   
1             [man, play, larg, flute]   
2  [man, spread, shrede, chees, pizza]   
3            [three, men, play, chess]   
4                   [man, play, cello]   

                                 sentence2  similarity_score  
0                      [avión, despegando]              5.00  
1           [hombr, tocando, gran, flauta]              3.80  
2  [hombr, untando, queso, rallado, pizza]              3.80  
3           [tre, hombr, jugando, ajedrez]              2.60  
4            [hombr, tocando, violonchelo]              4.25  
