### This notebook reduces the english-spanish dataset so that any sentences that contain tokens that are not contained in the english or spanish word2vec dictionaries are removed. It also reduces the size of the english and spanish word2vec dictionaries to only contain needed tokens.

In [25]:
import pandas as pd
import numpy as np
from preprocess import sentence_to_tokens

In [26]:
# load en-es string pair data
data = pd.read_csv("en_es_data.csv")
data.head()

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


In [27]:
# tokenize strings into lists of tokens
# see preprocess.py for function definition
data["en_tokens"] = data["english"].apply(lambda sent: sentence_to_tokens(sent, "en"))
data["es_tokens"] = data["spanish"].apply(lambda sent: sentence_to_tokens(sent, "es"))

In [28]:
data.head()

Unnamed: 0,english,spanish,en_tokens,es_tokens
0,Go.,Ve.,"[<s>, go, ., <e>]","[<s>, ve, ., <e>]"
1,Go.,Vete.,"[<s>, go, ., <e>]","[<s>, vete, ., <e>]"
2,Go.,Vaya.,"[<s>, go, ., <e>]","[<s>, vaya, ., <e>]"
3,Go.,Váyase.,"[<s>, go, ., <e>]","[<s>, váyase, ., <e>]"
4,Hi.,Hola.,"[<s>, hi, ., <e>]","[<s>, hola, ., <e>]"


In [29]:
# load en and es word2vec dictionaries
en_vec_df = pd.read_csv("cc.en.300.csv")
es_vec_df = pd.read_csv("cc.es.300.csv")

In [30]:
# these should all be empty
print(en_vec_df[en_vec_df["word"] == "<s>"])
print(en_vec_df[en_vec_df["word"] == "<e>"])
print(es_vec_df[es_vec_df["word"] == "<s>"])
print(es_vec_df[es_vec_df["word"] == "<e>"])

Empty DataFrame
Columns: [word, vector]
Index: []
Empty DataFrame
Columns: [word, vector]
Index: []
Empty DataFrame
Columns: [word, vector]
Index: []
Empty DataFrame
Columns: [word, vector]
Index: []


In [31]:
# add special start <s> and end <e> tokens
vec_size = 300
start_token = "<s>"
end_token = "<e>"

# these were decided arbitrarily
start_vec = np.array([1.0 for i in range(vec_size)])
end_vec = np.array([0.1 for i in range(vec_size)])

# add to english
en_vec_df = pd.concat([en_vec_df, pd.DataFrame({"word": [start_token, end_token], "vector": [start_vec, end_vec]})])
en_vec_df.reset_index(inplace=True, drop=True)

# add to spanish
es_vec_df = pd.concat([es_vec_df, pd.DataFrame({"word": [start_token, end_token], "vector": [start_vec, end_vec]})])
es_vec_df.reset_index(inplace=True, drop=True)

In [32]:
en_vec_df.tail()

Unnamed: 0,word,vector
1999997,hvm,[-0.0634 -0.0375 -0.2048 -0.0199 0.2529 0.2086...
1999998,GorceyBearTerritory.netSaturday,[0.0142 0.0230 -0.0099 -0.0223 -0.0068 -0.0091...
1999999,Zwicke,[-0.0499 0.0152 0.0038 -0.0695 -0.0220 -0.0079...
2000000,<s>,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2000001,<e>,"[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, ..."


In [33]:
es_vec_df.tail()

Unnamed: 0,word,vector
1999997,blaciones,[0.0032 0.0005 0.0574 -0.0039 0.0159 0.0112 -0...
1999998,LDSInfantiles,[-0.0071 0.0046 0.0645 0.0001 -0.0075 -0.0288 ...
1999999,TEDxQuito,[-0.0197 -0.0194 0.0922 0.0005 -0.0080 -0.0649...
2000000,<s>,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2000001,<e>,"[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, ..."


In [34]:
def get_unique_tokens(data, tokenized_col):
    """
    Gets the unique set of tokens contained in all tokenized lists in
    data[tokenized_col]
    
    data (pd.DataFrame): en-es dataset
    tokenized_col: name of column containing tokenized lists
    """
    
    # this might not be the most efficient method, but it works fine
    all_tokens_ls = data[tokenized_col].values.tolist()
    all_tokens = []

    for token_ls in all_tokens_ls:
        for token in token_ls:
            all_tokens.append(token)

    unique_tokens = list(set(all_tokens))
    
    return unique_tokens


def check_token(invalid_tokens, token_ls):
    """
    Checks if any tokens in token_ls is in invalid_tokens.
    Returns True at first match, else if no match found.
    """
    
    for token in token_ls:
        if token in invalid_tokens:
            return True    
    return False


def reduce_data(data, vec_df, unique_tokens, tokenized_col):
    """
    Reduces data to only contain rows where all tokens are also found
    in vec_df. Creates and returns a new reduced dataframe.
    """
    
    # create dataframe out of list of unique tokens
    set_df = pd.DataFrame({"word": unique_tokens})
    
    # get column of True/False if token in word2vec dict
    set_df["in_vec"] = set_df["word"].isin(vec_df["word"])
    
    # get dataframe containing only words NOT contained in word2vec dict
    invalid_indexes = set_df[set_df["in_vec"] == False].index
    invalid_df = set_df.iloc[invalid_indexes, :]
    
    # convert to list for search purposes (maybe not that clever)
    invalid_tokens = invalid_df["word"].values.tolist()
    
    # put boolean in og dataframe indicating whether tokenized list 
    # contains a word not contained 
    data[f"{tokenized_col}_has_invalid_token"] = data[tokenized_col].apply(lambda token_ls: check_token(invalid_tokens, token_ls))

    # create new dataframe w/o any rows containing invalid tokens in tokenized_col
    reduced_data = data[data[f"{tokenized_col}_has_invalid_token"] == False]

    return reduced_data

In [35]:
# get list of unique tokens for each language across entire dataset
en_unique_tokens = get_unique_tokens(data, "en_tokens")
es_unique_tokens = get_unique_tokens(data, "es_tokens")

In [36]:
print(f"{len(en_unique_tokens)} unique english tokens")
print(f"{len(es_unique_tokens)} unique spanish tokens")

13598 unique english tokens
26110 unique spanish tokens


In [37]:
# remove all rows in data where there are tokens in either en or es
# tokenization that are not contained in corresponding word2vec dict
# reduce based on english strings first
print(f"data shape before any reduction: {data.shape}")
reduced_data = reduce_data(data, en_vec_df, en_unique_tokens, "en_tokens")
print(f"data shape after english reduction: {reduced_data.shape}")

# now reduce based on spanish strings
reduced_data = reduce_data(reduced_data, es_vec_df, es_unique_tokens, "es_tokens")
print(f"data shape after spanish reduction: {reduced_data.shape}")

data shape before any reduction: (118964, 4)
data shape after english reduction: (111713, 5)
data shape after spanish reduction: (111184, 6)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[f"{tokenized_col}_has_invalid_token"] = data[tokenized_col].apply(lambda token_ls: check_token(invalid_tokens, token_ls))


In [38]:
def reduce_vec(unique_tokens, vec_df):
    """
    Returns a df that is a reduced version of vec_df. Only tokens
    contained in unique_tokens will remain in this reduced version.
    """
    
    # set bool (not to be confused with set) if word in vec (should already be true always)
    set_df = pd.DataFrame({"word": unique_tokens})
    set_bool = set_df["word"].isin(vec_df["word"])
    
    # sanity check (sizes should be the same)
    print(f"unique tokens count: {len(unique_tokens)}")
    print(f"true count: {set_bool[set_bool == True].size}")
    
    # set bool if word in vec is in the set of unique tokens
    vec_bool = vec_df["word"].isin(set_df["word"])
    
    # only keep rows in vec where word is used
    red_vec_df = vec_df.iloc[vec_bool[vec_bool == True].index, :]
    
    # sanity check again (should be same as above)
    print(f"num rows in reduced vec_df: {red_vec_df.shape[0]}")
    
    return red_vec_df

In [39]:
# get new lists of unique tokens 
en_red_unique = get_unique_tokens(reduced_data, "en_tokens")
es_red_unique = get_unique_tokens(reduced_data, "es_tokens")

# reduce vector dictionaries
en_red_vec = reduce_vec(en_red_unique, en_vec_df)
es_red_vec = reduce_vec(es_red_unique, es_vec_df)

unique tokens count: 12664
true count: 12664
num rows in reduced vec_df: 12664
unique tokens count: 24672
true count: 24672
num rows in reduced vec_df: 24672


In [42]:
# write reduced dataset (NOTE: keeping tokenization as a separate process, 
# entire purpose of this nb was just to reduce cost of dataset and
# word2vec dictionaries, even though the code sort of "skips ahead"
# into the actual data transformation steps)
reduced_data[["english", "spanish"]].to_csv("en_es_reduced_data.csv", index=False)

# write reduced vector dictionaries
en_red_vec.to_csv("cc.en.300.reduced.csv", index=False)
es_red_vec.to_csv("cc.es.300.reduced.csv", index=False)