In [1]:
import gensim.models.keyedvectors as word2vec
import time
import pandas as pd
import numpy as np
from mlxtend.evaluate import permutation_test
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

import os

In [2]:
class Weat:
    """Score how strongly a word associates with female versus male attribute words.

    The score of a word is the mean cosine similarity between its embedding and
    the female attribute vectors minus the mean cosine similarity between its
    embedding and the male attribute vectors.
    """

    def __init__(self, w2v_model, female_attr, male_attr):
        """
        :param w2v_model: embedding model exposing ``get_vector(word)``, a ``vocab``
            mapping whose keys are the model's words, and ``init_sims`` (used by
            :meth:`normalize`)
        :param female_attr: iterable of female attribute words; each must be in the model
        :param male_attr: iterable of male attribute words; each must be in the model
        """
        self.model = w2v_model
        self.female_attr = self.get_vecs(female_attr)
        self.male_attr = self.get_vecs(male_attr)
        # self.vocab keeps the lowercased keys in vocabulary order (public attribute).
        # _key_by_lower maps each lowercased form back to the FIRST original key that
        # produces it, so the case-insensitive fallback in get_association is a dict
        # lookup instead of a linear scan over the whole vocabulary.
        self.vocab = []
        self._key_by_lower = {}
        for key in self.model.vocab.keys():
            lowered = key.lower()
            self.vocab.append(lowered)
            self._key_by_lower.setdefault(lowered, key)

    def get_vecs(self, word_list):
        """
        :param word_list: iterable of words to look up in the model
        :return: list with one vector per word, in the order given
        :raises KeyError: propagated from ``get_vector`` when a word is not in the model
        """
        return [self.model.get_vector(word) for word in word_list]

    def _lookup_vector(self, word):
        """Return the embedding for ``word``, preferring its capitalised spelling."""
        try:
            return self.model.get_vector(word.capitalize())
        except KeyError:
            pass
        # Fall back to a case-insensitive match. The query is lowercased as well,
        # otherwise mixed-case input could never match the lowercased vocabulary.
        key = self._key_by_lower.get(word.lower())
        if key is None:
            raise ValueError('%r is not in the model vocabulary' % (word,))
        return self.model.get_vector(key)

    @staticmethod
    def _mean_similarity(attr_vecs, word_vec):
        """Return the mean cosine similarity between ``word_vec`` and ``attr_vecs``."""
        if len(attr_vecs) == 0:
            # Mean of nothing: keep the NaN result an empty attribute list always gave.
            return float('nan')
        # One batched call per attribute set instead of one call per attribute vector.
        sims = cosine_similarity(np.vstack(attr_vecs), word_vec.reshape(1, -1))
        return np.average(np.asarray(sims, dtype=np.float64))

    def get_association(self, word):
        """

        :param word: brand to get association of
        :return: float. If positive, word is more female, if negative, word is more male.
        :raises ValueError: if neither the capitalised form of ``word`` nor any
            case-insensitive match of it is in the model vocabulary
        """
        word_vec = self._lookup_vector(word)
        female_mean = self._mean_similarity(self.female_attr, word_vec)
        male_mean = self._mean_similarity(self.male_attr, word_vec)
        return female_mean - male_mean

    def normalize(self, female_attr, male_attr):
        """Normalise the model's vectors in place and refresh the attribute vectors.

        Calls ``init_sims(replace=True)`` on the model, which overwrites the stored
        vectors, so every association computed afterwards uses the normalised
        embeddings. The attribute vectors are re-fetched for the same reason.

        :param female_attr: iterable of female attribute words
        :param male_attr: iterable of male attribute words
        """
        self.model.init_sims(replace=True)
        self.female_attr = self.get_vecs(female_attr)
        self.male_attr = self.get_vecs(male_attr)

In [3]:
# Load the pretrained word vectors from the binary word2vec-format file
# 'GoogleNews-vectors-negative300.bin' (relative path, resolved against the working directory)
model = word2vec.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)


In [5]:
# Keys of the model's vocab mapping, i.e. the words exactly as the model spells them
vocab = model.vocab.keys()

In [13]:
# Ad-hoc inspection: display the 1000 vocabulary keys starting at position idx
idx = 290000
list(vocab)[idx:idx+1000]

['Frogster',
 'Craig_Stebic',
 'Methodist_Episcopal_Church',
 'Castillian',
 'Bayleys',
 'Candy_Ride',
 'BROOKINGS',
 'Chris_Shuker',
 '##.#mm_thick',
 'Mamadi_Diane',
 'staunchest_opponents',
 'Bishop_Hendricken',
 'bl',
 'Balkin',
 'Yablonski',
 'Youre',
 'ultra_luxurious',
 'tog',
 'Broadbridge',
 'CIM_Standards',
 'outer_fringes',
 'By_DAVID_MURPHY',
 'universally_acclaimed',
 'Kaveh',
 'procreative',
 'Dumm',
 'Servcorp',
 'Kim_Young_sam',
 'Compliment',
 'Magnetic_Field',
 'NYSE_LDK',
 'Menangle',
 'Bramham',
 'SHOUT',
 'Briski',
 'Linthwaite',
 'captain_Steve_Borthwick',
 'Dunelm',
 'Raheen',
 'ROGERS_Ark.',
 'Abbottabad_compound',
 'Surat_Basin',
 'LIMBAUGH',
 'GERRY',
 'Hamdard',
 'Responsible_Gaming',
 'Future_Generations',
 'Kalima',
 'Psalmist',
 'Yell.com',
 'McCutchan',
 'Barcroft',
 'EQUAL',
 'votive',
 'Pacing_Series',
 'T.Stewart',
 'baseman_Bret_Boone',
 'PurchasePro',
 'OpEd',
 'hybrid_propulsion',
 'EXIM_Bank',
 'X_FAB',
 'Formulate',
 'Philip_Zelikow',
 'semiconduc

In [None]:
# Lowercase every key of the model vocabulary (all capital letters, wherever they
# occur) so that brand names can be matched against an all-lowercase word list
vocab = list(map(str.lower, model.vocab.keys()))

In [4]:
# Load brand name data frame, keep brands that are in word2vec model
# (membership is tested against `vocab`, so a BRAND value is kept only if it
# appears in that collection exactly as written)
df = (
    pd.read_csv('../Brand-Name-Gender-Prediction/brand_names_coded.csv')
    # Filter rows and pick columns in a single .loc call, then take an explicit
    # copy: later cells add columns to df, and doing that on the result of a
    # chained df[mask] / df[[...]] selection raises SettingWithCopyWarning.
    .loc[lambda brands: brands['BRAND'].isin(vocab), ['BRAND', 'CATEGORY']]
    .copy()
)

In [5]:
# Get female and male attributes
# Each non-blank line of pairs_paper.txt is read as "<female word>, <male word>".
with open('pairs_paper.txt', 'r') as file:
    # Blank lines carry no pair; splitting one yields [''] and indexing sub[1]
    # would raise IndexError, so they are dropped while reading.
    data = [line.strip() for line in file if line.strip()]

female = []
male = []
for line in data:
    sub = line.split(', ')
    female.append(sub[0])
    male.append(sub[1])

weat = Weat(w2v_model=model, female_attr=female, male_attr=male)
print(female)
print(male)

['she', 'her', 'woman', 'Mary', 'herself', 'daughter', 'mother', 'gal', 'girl', 'female']
['he', 'his', 'man', 'John', 'himself', 'son', 'father', 'guy', 'boy', 'male']


In [6]:
# Unnormalized cosine similarities
# Score every brand, label it 'F' for a strictly positive score and 'M' otherwise,
# then persist the frame (index included) to unnormalized_weat.csv
df['UNNORMALIZED WEAT'] = df['BRAND'].apply(weat.get_association)
df['UNNORMALIZED WEAT GENDER'] = (df['UNNORMALIZED WEAT'] > 0).map({True: 'F', False: 'M'})
df.to_csv('unnormalized_weat.csv')


In [11]:
# Tally the predicted gender labels, counting each distinct BRAND value once
(
    df.drop_duplicates(subset='BRAND')
    .loc[:, 'UNNORMALIZED WEAT GENDER']
    .value_counts()
)

F    959
M    512
Name: UNNORMALIZED WEAT GENDER, dtype: int64