In [1]:
import urllib.request
import numpy as np
import json

In [2]:
# URL to retrieve the pre-trained 300-dimensional GloVe embedding
# (text file; each line holds one word followed by its embedding values)
embedding_300_url = "http://www.cs.virginia.edu/~tw8cb/word_embeddings/vectors.txt"

def read_embedding(url):
    """Download and parse a whitespace-separated text embedding.

    Each non-empty line of the resource is expected to hold a word followed
    by its vector components. Blank lines are skipped.

    Input: url: url to embedding

    Returns: vocab: list of words in the embedding
             word2id: dictionary mapping words to ids
             word_vectors: array storing the embeddings,
                           row corresponds to word id

    Raises: ValueError: if a vector component cannot be parsed as a float
            urllib.error.URLError: if the url cannot be opened"""
    vocab = []
    word_vectors = []

    # The context manager closes the response even if parsing fails
    with urllib.request.urlopen(url) as data:
        # Each line contains one word and its embedding
        for line in data:
            # Split by whitespace
            split = line.decode().split()
            # Blank lines (e.g. a trailing newline) carry no word; skip them
            # instead of failing on split[0]
            if not split:
                continue
            # First element (== the word) is added to the vocabulary
            vocab.append(split[0])
            # All other elements (embedding values) are added to the vectors
            word_vectors.append([float(elem) for elem in split[1:]])

    # Create a dictionary with word-id pairs based on the order
    # (if a word occurs more than once, the last occurrence's id wins)
    word2id = {w: i for i, w in enumerate(vocab)}
    # Vectors are converted into a float array
    word_vectors = np.array(word_vectors, dtype=float)

    return vocab, word2id, word_vectors
    
# Download and parse the 300-dimensional embedding (requires network access)
embedding_300_vocab, embedding_300_word2id, embedding_300_word_vector = read_embedding(embedding_300_url)

In [14]:
def exclude_vocab(vocab, exclude):
    """Function to exclude specific words from vocabulary

    The input vocabulary is not modified; a new list is returned that keeps
    the original order. Every occurrence of an excluded word is dropped.

    Input: vocab: list of words in the embedding
           exclude: iterable of words to exclude from the vocabulary

    Returns: limited_vocab: vocab without the words in exclude"""
    # A set gives constant-time membership tests, so the whole filter is a
    # single pass over vocab instead of one list scan per excluded word
    excluded = set(exclude)
    limited_vocab = [word for word in vocab if word not in excluded]

    return limited_vocab

In [4]:
# URL to female specific words
female_words_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/female_word_file.txt"

# List of female words (first whitespace-separated token of each line)
female_words = []
# The context manager closes the HTTP response once the file has been read
with urllib.request.urlopen(female_words_url) as female_words_data:
    for line in female_words_data:
        tokens = line.decode().split()
        # Skip blank lines instead of failing on tokens[0]
        if tokens:
            female_words.append(tokens[0])

In [5]:
# URL to male specific words
male_words_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/male_word_file.txt"

# List of male words (first whitespace-separated token of each line)
male_words = []
# The context manager closes the HTTP response once the file has been read
with urllib.request.urlopen(male_words_url) as male_words_data:
    for line in male_words_data:
        tokens = line.decode().split()
        # Skip blank lines instead of failing on tokens[0]
        if tokens:
            male_words.append(tokens[0])

In [6]:
# Pair every female word with the male word at the same position.
# Indexing (rather than zip) is deliberate: a male list shorter than the
# female list raises IndexError instead of silently dropping pairs.
female_male_pairs = [
    [female_word, male_words[position]]
    for position, female_word in enumerate(female_words)
]

In [7]:
# URL to the JSON file storing gender specific words
gender_specific_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/gender_specific_full.json"

# Download the JSON document and keep its entries in their original casing
with urllib.request.urlopen(gender_specific_url) as f:
    gender_specific_original = list(json.load(f))

# Lower-cased version of every gender specific word
gender_specific = [entry.lower() for entry in gender_specific_original]

In [8]:
# URL to the JSON file storing the definitional pairs
definitial_pairs_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/definitional_pairs.json"

# Download the JSON document and keep the pairs in their original casing
with urllib.request.urlopen(definitial_pairs_url) as f:
    definitional_pairs_original = list(json.load(f))

# Lower-cased copy of every pair; unpacking requires exactly two words per pair
definitional_pairs = [
    [first.lower(), second.lower()]
    for first, second in definitional_pairs_original
]

# Flatten the pairs into a single list of words
definitional_words = [
    member for couple in definitional_pairs for member in couple
]
        

In [9]:
# URL to the JSON file storing the equalize pairs
equalize_pairs_url = "https://raw.githubusercontent.com/uvavision/Double-Hard-Debias/master/data/equalize_pairs.json"

# Download the JSON document and keep the pairs in their original casing
with urllib.request.urlopen(equalize_pairs_url) as f:
    equalize_pairs_original = list(json.load(f))

# Lower-cased copy of every pair; unpacking requires exactly two words per pair
equalize_pairs = [
    [first.lower(), second.lower()]
    for first, second in equalize_pairs_original
]

# Flatten the pairs into a single list of words
equalize_words = [
    member for couple in equalize_pairs for member in couple
]

In [10]:
# List of all gender specific words included in
# female words, male words, gender specific words, equalize words and definitional words.
# Duplicates are removed via the set; sorting makes the list order reproducible
# across kernel restarts (string hashing, and thus set order, varies per process).
exclude_words = sorted(set(female_words + male_words + gender_specific + definitional_words + equalize_words))

In [16]:
# Remove gender specific words from the embedding to obtain vocabulary of neutral words
embedding_300_vocab_neutral = exclude_vocab(embedding_300_vocab, exclude_words)
# Report the vocabulary size before and after the exclusion
print("vocab size: ", len(embedding_300_vocab))
print("limited vocab size: ", len(embedding_300_vocab_neutral))

vocab size:  322636
limited vocab size:  321977
