In [67]:
import urllib.request
import numpy as np
import json

In [68]:
# URL to retrieve the pre-trained 300-dimensional GloVe embedding
embedding_300_url = "http://www.cs.virginia.edu/~tw8cb/word_embeddings/vectors300.txt"
# Alternative embedding file, currently disabled
#embedding_300_url = "http://www.cs.virginia.edu/~tw8cb/word_embeddings/vectors.txt"

def read_embedding(url):
    """Function to read out an embedding

    Every non-empty line of the resource is expected to hold a word followed
    by the components of its vector, all separated by whitespace. Blank
    lines are skipped.

    Input: url: url to embedding

    Returns: vocab: list of words in the embedding
             word2id: dictionary mapping words to ids
             word_vectors: array storing the embeddings,
                           row corresponds to word id

    Raises: ValueError: if a vector component cannot be parsed as a float"""
    vocab = []
    word_vectors = []

    # Open url; the context manager closes the connection even if parsing fails
    with urllib.request.urlopen(url) as data:
        # Each line contains one word and its embedding
        for line in data:
            # Decode the raw bytes and split by whitespace
            split = line.decode().split()
            # Blank lines (e.g. a trailing newline) carry no word: skip them
            # instead of failing on split[0]
            if not split:
                continue
            # First element(== the word) is added to vocabulary
            vocab.append(split[0])
            # All other elements(embedding vectors) are added to vectors
            word_vectors.append([float(elem) for elem in split[1:]])

    # Create a dictionary with word-id pairs based on the order
    word2id = {w: i for i, w in enumerate(vocab)}
    # Vectors are converted into a float array
    word_vectors = np.array(word_vectors, dtype=float)

    return vocab, word2id, word_vectors
    
# Download and parse the embedding behind embedding_300_url
(
    embedding_300_vocab,
    embedding_300_word2id,
    embedding_300_word_vector,
) = read_embedding(embedding_300_url)

In [69]:
# URL of the female word file
female_words_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/female_word_file.txt"
female_words_list = []
# The context manager closes the connection once the file has been read
with urllib.request.urlopen(female_words_url) as female_words_data:
    for line in female_words_data:
        # Decode the raw bytes and split by whitespace
        tokens = line.decode().split()
        # Keep the first token of each line; blank lines have none and are
        # skipped instead of raising an IndexError
        if tokens:
            female_words_list.append(tokens[0])

In [70]:
# URL of the male word file
male_words_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/male_word_file.txt"
male_words_list = []
# The context manager closes the connection once the file has been read
with urllib.request.urlopen(male_words_url) as male_words_data:
    for line in male_words_data:
        # Decode the raw bytes and split by whitespace
        tokens = line.decode().split()
        # Keep the first token of each line; blank lines have none and are
        # skipped instead of raising an IndexError
        if tokens:
            male_words_list.append(tokens[0])

In [71]:
def exclude_vocab(vocab, word2id, word_vector, exclude):
    """Function to exclude specific words from vocabulary

    The inputs are left untouched; new objects are returned. The ids in the
    returned dictionary are re-assigned so that they match the row positions
    in the returned array.

    Input: vocab: list of words in the embedding; position i belongs to
                  row i of word_vector
           word2id: dictionary mapping words to ids (kept for interface
                    compatibility; the positions in vocab determine the rows)
           word_vector: array storing the embedding vectors
           exclude: list of words to exclude from the vocabulary; words that
                    are not in vocab are ignored

    Returns: limited_vocab: vocab without the words in exclude
             limited_word2id: dictionary mapping the remaining words to
                              their row in limited_word_vector
             limited_word_vector: word_vector without the rows of the words
                                  in exclude"""
    # Set for constant-time membership tests
    excluded = set(exclude)

    # True for every row that is kept. Building the mask once from the original
    # positions avoids deleting rows one by one: after the first deletion all
    # following rows shift up, so the original ids would point at wrong rows.
    keep = np.array([word not in excluded for word in vocab], dtype=bool)

    # Remaining words in their original order
    limited_vocab = [word for word in vocab if word not in excluded]
    # Boolean indexing returns a new array, word_vector is not modified
    limited_word_vector = word_vector[keep]
    # Ids are re-assigned so that they agree with the rows of the new array
    limited_word2id = {word: i for i, word in enumerate(limited_vocab)}

    return limited_vocab, limited_word2id, limited_word_vector

In [82]:
# URLs to the files storing different gender specific words
female_words_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/female_word_file.txt"
male_words_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/male_word_file.txt"
gender_specific_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/gender_specific_full.json"

# Accumulator for all gender specific words
gender_specific = []

# Plain text files: the male file is read first, then the female one;
# every line is decoded and stripped of surrounding whitespace
for word_file_url in (male_words_url, female_words_url):
    with urllib.request.urlopen(word_file_url) as response:
        gender_specific.extend(raw_line.decode().strip() for raw_line in response)

# The JSON file contributes further gender specific words
with urllib.request.urlopen(gender_specific_url) as response:
    gender_specific.extend(json.load(response))

# The collected words are the ones to exclude (same list object, not a copy)
exclude_words = gender_specific

In [73]:
# Build a copy of the embedding that no longer contains the words in exclude_words
(
    embedding_300_vocab_limit,
    embedding_300_word2id_limit,
    embedding_300_word_vector_limit,
) = exclude_vocab(
    embedding_300_vocab,
    embedding_300_word2id,
    embedding_300_word_vector,
    exclude_words,
)
# Report the vocabulary size before and after the exclusion
print("vocab size: ", len(embedding_300_vocab))
print("limited vocab size: ", len(embedding_300_vocab_limit))

vocab size:  322636
limited vocab size:  322099


In [86]:
# Location of the JSON file holding the definitional pairs
definitial_pairs_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/definitional_pairs.json"

# Download the file and collect its entries in a list
definitional_pairs = []
with urllib.request.urlopen(definitial_pairs_url) as response:
    definitional_pairs += json.load(response)

# Flatten the pairs into a single list of lower-cased words
definitional_words = [
    member.lower()
    for pair in definitional_pairs
    for member in pair
]