### Initialize

In [2]:
# Install a few python packages using pip
#from w266_common import utils
#utils.require_package("wget")      # for fetching dataset

# Standard python helper libraries.
import os, sys, re, json, time
import itertools, collections
from IPython.display import display

# NumPy and SciPy for matrix ops
import numpy as np
import scipy.sparse

# NLTK for NLP utils
import nltk

# Helper libraries
# from w266_common import utils, vocabulary, tf_embed_viz

# from __future__ import print_function, division
#%matplotlib inline
# from matplotlib import pyplot as plt
import random
import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
# from debiaswe.data import load_professions

### Import GloVe dataset
Contains 6B tokens from Wikipedia 2014 + Gigawords 5.

(will take a while to run)

In [3]:
import vector_math; reload(vector_math)

def show_nns(hands, word, k=10):
    """Helper function to print neighbors of a given word."""
    word = word.lower()
    print("Nearest neighbors for '{:s}'".format(word))
    v = hands.get_vector(word)
    for i, sim in zip(*vector_math.find_nn_cos(v, hands.W, k)):
        target_word = hands.vocab.id_to_word[i]
        print("{:.03f} : '{:s}'".format(sim, target_word))
    print("")
    
def show_analogy(hands, a, b, c, k=5):
    """Compute and print a vector analogy."""
    a, b, c = a.lower(), b.lower(), c.lower()
    va = hands.get_vector(a)
    vb = hands.get_vector(b)
    vc = hands.get_vector(c)
    print("'{a:s}' is to '{b:s}' as '{c:s}' is to ___".format(**locals()))
    for i, sim in zip(*vector_math.analogy(va, vb, vc, hands.W, k)):
        target_word = hands.vocab.id_to_word[i]
        print("{:.03f} : '{:s}'".format(sim, target_word))
    print("")


In [5]:
# April's embedding
E = WordEmbedding("./embeddings/glove/glove.6B/glove.6B.300d.txt") # 50, 100, 200, 300 dim are available

*** Reading data from ./embeddings/glove/glove.6B/glove.6B.300d.txt
(400000, 300)
400000 words of dimension 300 : the, ,, ., of, ..., kronik, rolonda, zsombor, sandberger
400000 words of dimension 300 : the, ,, ., of, ..., kronik, rolonda, zsombor, sandberger


In [8]:
# load (modified) professions file
with open("./data/professions_parties.json", 'r') as f:
    professions = json.load(f)
profession_words = set([p[0] for p in professions])

# show party bias in word vectors associated with professions
# sort by projection score in the direction of gender
# equivalent to Fig 1
# sp = sorted([(E.v(w).dot(lean_direction), w) for w in profession_words])

political_terms = ["progressive","neo","left","reformist","free","generous","leftist",
"liberalism","tolerant","loose","broad","handsome",
"big","giving","socialized",
"conservative",
"adult","politics","libertarian",
"democratic","centrist","ideology","socialist","populist","evangelical",
"liberty","freely","freedom",
"independent","welfare","government","equality",
"enlightened","lenient","freedoms",
"caucus","lax","liberalization","liberalized",
"reactionary","right","moderate",
"cautious","bourgeois",
"conventional","liberal",
"liberalism",
"conservatism","moderates","centrist","populist","libertarian",
"secular","progressive","radical","hardline","evangelical","reformist",
"politics","conservatives","nationalism",
"tradition","culture","civilization","restoration",
"orthodox","minimalist"]
lib_terms = ["progressive","neo","left","reformist","free","generous","leftist",
"liberalism","tolerant","loose","broad","handsome",
"big","giving","socialized",
"conservative",
"adult","politics","libertarian",
"democratic","centrist","ideology","socialist","populist","evangelical",
"liberty","freely","freedom",
"independent","welfare","government","equality",
"enlightened","lenient","freedoms",
"caucus","lax","liberalization","liberalized"]
conservative_terms = ["reactionary","right","moderate",
"cautious","bourgeois",
"conventional","liberal",
"liberalism","conservatism","moderates","centrist","populist","libertarian",
"secular","progressive","radical","hardline","evangelical","reformist",
"politics","conservatives","nationalism",
"tradition","culture","civilization","restoration",
"orthodox","minimalist"]

# political direction
vs = [sum(E.v(w) for w in political_terms) for political_terms in (conservative_terms, lib_terms)]
vs = [v / np.linalg.norm(v) for v in vs]

v_political = vs[1] - vs[0]
v_political = v_political / np.linalg.norm(v_political)
sp = sorted([(E.v(w).dot(v_political), w) for w in profession_words])

print('Extreme democrat occupations')
print(sp[-20:])

print('\n\nExtreme republican occupations')
print(sp[0:20])

# This looks suspicious - liberal/dem professions are at end of alphabet, conserative/rep professions are at the beginning 

Extreme democrat occupations
[(0.1449885, u'advocate'), (0.14499615, u'accountant'), (0.14649448, u'manager'), (0.15306641, u'lawyer'), (0.1555304, u'prisoner'), (0.16315494, u'inspector'), (0.16373248, u'receptionist'), (0.16558486, u'director'), (0.16989838, u'pilot'), (0.17002457, u'secretary'), (0.17524724, u'president'), (0.17728889, u'attorney'), (0.19358177, u'officer'), (0.20143299, u'driver'), (0.205292, u'investigator'), (0.21291083, u'commissioner'), (0.21375966, u'citizen'), (0.22395705, u'student'), (0.28910327, u'worker'), (0.38026524, u'employee')]


Extreme republican occupations
[(-0.23575318, u'archaeologist'), (-0.23276556, u'philosopher'), (-0.21155836, u'historian'), (-0.19257726, u'anthropologist'), (-0.184031, u'cleric'), (-0.18385674, u'radiologist'), (-0.18248703, u'saint'), (-0.17012866, u'disc-jockey'), (-0.16695467, u'poet'), (-0.16127391, u'sociologist'), (-0.1546754, u'painter'), (-0.15095007, u'cardiologist'), (-0.14952688, u'sculptor'), (-0.14919002, u'f

In [17]:
lean_direction = E.diff("progressives", "conservatives")
leans_direction = E.diff("liberals", "conservatives")
party_direction = E.diff("democrats", "republicans")

# apparently this works now?!

In [19]:
party_analogies = E.best_analogies_dist_thresh(party_direction)

In [31]:
# print leaning analogies
for (a,b,c) in lean_analogies:
    print(a+"-"+b)

progressives-conservatives
anxieties-fears
mid-2000s-1990s
progressive-conservative
d'état-coup
anyhow-anyway
speculations-speculation
tipper-gore
mid-1950s-1980s
jihadists-extremists
evidences-evidence
goverment-government
outlays-spending
conceivably-could
dae-kim
apprehensive-worried
entombed-buried
260,000-200,000
frictions-tensions
mackintosh-cameron
chided-criticized
plebiscite-referendum
anti--anti
unparalleled-unprecedented
4,400-2,000
’m-'m
indignant-angry
gotta-'ll
purposefully-deliberately
pharmacists-doctors
2,250-1,200
reimbursement-payments
elated-surprised
panoramic-view
e.u.-eu
innumerable-numerous
adversely-affect
mazen-abbas
world-wide-worldwide
monies-funds
astonishingly-surprisingly
1919-november
someplace-else
palestine-israel
penitentiary-prison
québec-quebec
divulged-disclosed
’d-'d
unbelievably-extremely
medications-drugs
commences-begins
hopelessness-frustration
9.3-percent
marvelous-remarkable
seething-anger
spiritual-religious
enacting-legislation
unanticipat

In [16]:
for (a,b,c) in leans_analogies:
    print(a+"-"+b)

liberals-conservatives
liberal-conservative
rosset-henman
democratic-republican
liberalism-conservatism
d-r
wolverines-longhorns
chretien-chirac
incumbents-gop
lieberman-mccain
canada-britain
czechoslovakia-poland
kucera-rafter
mustangs-smu
abkhazia-georgia
bandits-sayyaf
akron-ohio
wanderers-bolton
caf-fifa
1918-april
sana-irna
petrova-sharapova
azerbaijan-iran
halfback-tailback
communists-hardliners
jayasuriya-dravid
firemen-firefighters
woodforde-philippoussis
hungarians-poles
precipitated-triggered
germans-germany
ideologies-beliefs
ronaldinho-ronaldo
canoes-rafts
fatigued-visibly
zürich-basel
catalonia-spain
mazen-abbas
limousine-hearse
canadian-british
syrians-iranians
unitarian-evangelical
14.2-17.8
vitamins-nutritional
yorkers-giuliani
accrington-stockport
sánchez-sanchez
matisse-paintings
shostakovich-mahler
wawrinka-roddick
anymore-sure
independents-republicans
damascus-tehran
speculations-speculation
1919-february
lungs-tissue
catastrophe-disaster
rollicking-rousing
limeligh

In [20]:
for (a,b,c) in party_analogies:
    print(a+"-"+b)

democrats-republicans
democratic-republican
democrat-gop
d-r
communists-revolutionaries
candidate-dole
centrist-moderates
gephardt-gingrich
kristina-brandi
greens-fairways
lu-shi
1806-1799
merkel-sarkozy
germany-france
german-french
gerhard-jacques
kohl-mitterrand
hungarians-macedonians
parliament-congress
becker-agassi
fdp-svp
lawmaker-congressman
governing-rules
daschle-lott
langer-waugh
slovak-bulgarian
lieberman-mccain
socialists-nationalists
huber-majoli
parliamentary-legislative
massimo-moratti
16-year-14-year
steinmeier-lavrov
livni-mofaz
speaker-newt
opposition-supporters
unseeded-roddick
redwood-redwoods
beilin-rabbo
badgers-buckeyes
marlene-garbo
baucus-grassley
schumer-d'amato
1631-1562
peres-arafat
0.56-0.52
1525-1565
olmert-erekat
beer-whiskey
mahendra-sourav
1.34-2.14
schroeder-blair
11,500-2,200
germans-americans
ahead-going
brahms-berlioz
measles-typhoid
duchy-vassal
nasional-bhd.
senator-gramm
jens-lehmann
party-members
kohlschreiber-coria
christian-faith
riesling-zinf

In [25]:
# load professions file - RAN WITH P2
with open("./data/professions_parties.json", 'r') as f:
    professions = json.load(f)
profession_words_glove = set([p[0] for p in professions])

sp1 = sorted([(E1.v(w).dot(lean_direction), w) for w in profession_words_glove])

print('Extreme progressive occupations')
print(sp1[-25:])

print('\n\nExtreme conservative occupations')
print(sp1[0:25])

Extreme progressive occupations
[(0.12166341, u'trumpeter'), (0.123858474, u'nun'), (0.12425645, u'bookkeeper'), (0.124681965, u'electrician'), (0.125307, u'mobster'), (0.12793238, u'trucker'), (0.12981793, u'waiter'), (0.12994574, u'paralegal'), (0.13134617, u'medic'), (0.13181081, u'plumber'), (0.13445294, u'infielder'), (0.13738659, u'disc-jockey'), (0.14198512, u'realtor'), (0.14223564, u'receptionist'), (0.1464186, u'ballerina'), (0.14818361, u'artiste'), (0.14871094, u'patrolman'), (0.15682077, u'janitor'), (0.15682685, u'bartender'), (0.15813881, u'sportsman'), (0.17138809, u'cabbie'), (0.20436387, u'sportswriter'), (0.20571944, u'cellist'), (0.21850221, u'ballplayer'), (0.26412624, u'welder')]


Extreme conservative occupations
[(-0.36460102, u'minister'), (-0.31761765, u'deputy'), (-0.29493612, u'dean'), (-0.28072485, u'judge'), (-0.27158898, u'chancellor'), (-0.25018385, u'officer'), (-0.24489602, u'president'), (-0.23984142, u'critic'), (-0.23661672, u'director'), (-0.228858

In [23]:
party_direction = E.diff("democrats", "republicans")
# load professions file - RAN WITH P2
with open("./data/professions_parties.json", 'r') as f:
    professions = json.load(f)
profession_words_glove = set([p[0] for p in professions])

sp1 = sorted([(E.v(w).dot(lean_direction), w) for w in profession_words_glove])

print('Extreme democrat occupations')
print(sp1[-25:])

print('\n\nExtreme republican occupations')
print(sp1[0:25])

Extreme democrat occupations
[(0.12166341, u'trumpeter'), (0.123858474, u'nun'), (0.12425645, u'bookkeeper'), (0.124681965, u'electrician'), (0.125307, u'mobster'), (0.12793238, u'trucker'), (0.12981793, u'waiter'), (0.12994574, u'paralegal'), (0.13134617, u'medic'), (0.13181081, u'plumber'), (0.13445294, u'infielder'), (0.13738659, u'disc-jockey'), (0.14198512, u'realtor'), (0.14223564, u'receptionist'), (0.1464186, u'ballerina'), (0.14818361, u'artiste'), (0.14871094, u'patrolman'), (0.15682077, u'janitor'), (0.15682685, u'bartender'), (0.15813881, u'sportsman'), (0.17138809, u'cabbie'), (0.20436387, u'sportswriter'), (0.20571944, u'cellist'), (0.21850221, u'ballplayer'), (0.26412624, u'welder')]


Extreme republican occupations
[(-0.36460102, u'minister'), (-0.31761765, u'deputy'), (-0.29493612, u'dean'), (-0.28072485, u'judge'), (-0.27158898, u'chancellor'), (-0.25018385, u'officer'), (-0.24489602, u'president'), (-0.23984142, u'critic'), (-0.23661672, u'director'), (-0.22885832, u

In [26]:
gender_direction = E1.diff("she", "he")

# load professions file - RAN WITH P2
with open("./data/professions_parties.json", 'r') as f:
    professions = json.load(f)
profession_words_glove = set([p[0] for p in professions])

sp1 = sorted([(E1.v(w).dot(gender_direction), w) for w in profession_words_glove])

print('Extreme she occupations')
print(sp1[-25:])

print('\n\nExtreme he occupations')
print(sp1[0:25])

Extreme she occupations
[(0.14705601, u'dermatologist'), (0.1479655, u'swimmer'), (0.1506968, u'pediatrician'), (0.15392463, u'lifeguard'), (0.15917873, u'teenager'), (0.16122326, u'paralegal'), (0.17006998, u'publicist'), (0.1855857, u'therapist'), (0.19313464, u'singer'), (0.20385656, u'hairdresser'), (0.20465359, u'dancer'), (0.2060614, u'nun'), (0.22145087, u'receptionist'), (0.23408638, u'stylist'), (0.23600248, u'housekeeper'), (0.23718882, u'nanny'), (0.24481945, u'nurse'), (0.24694887, u'maid'), (0.26713684, u'homemaker'), (0.29754266, u'socialite'), (0.2991577, u'housewife'), (0.2998004, u'waitress'), (0.3178956, u'ballerina'), (0.33363873, u'businesswoman'), (0.40011856, u'actress')]


Extreme he occupations
[(-0.21125026, u'businessman'), (-0.19674721, u'midfielder'), (-0.19482087, u'colonel'), (-0.17713188, u'footballer'), (-0.1767638, u'manager'), (-0.17547831, u'cleric'), (-0.1751523, u'surveyor'), (-0.16728704, u'inventor'), (-0.16190764, u'coach'), (-0.15954459, u'congr

In [27]:
race_direction = E1.diff("white", "black")

# load professions file - RAN WITH P2
with open("./data/professions_parties.json", 'r') as f:
    professions = json.load(f)
profession_words_glove = set([p[0] for p in professions])

sp1 = sorted([(E1.v(w).dot(race_direction), w) for w in profession_words_glove])

print('Extreme white occupations')
print(sp1[-25:])

print('\n\nExtreme black occupations')
print(sp1[0:25])

Extreme white occupations
[(0.09865982, u'curator'), (0.108171836, u'deputy'), (0.11369758, u'manager'), (0.11478043, u'attorney'), (0.11942616, u'bureaucrat'), (0.1204211, u'receptionist'), (0.12231331, u'diplomat'), (0.1227509, u'gardener'), (0.12763932, u'administrator'), (0.12964483, u'architect'), (0.1299957, u'ambassador'), (0.13897009, u'caretaker'), (0.14045586, u'strategist'), (0.14170404, u'baker'), (0.14186844, u'infielder'), (0.1430281, u'president'), (0.14610942, u'senator'), (0.16301455, u'solicitor'), (0.16587652, u'inspector'), (0.17507976, u'negotiator'), (0.19350114, u'undersecretary'), (0.21487606, u'envoy'), (0.23560712, u'counselor'), (0.30653942, u'secretary'), (0.33467785, u'aide')]


Extreme black occupations
[(-0.18507917, u'entrepreneur'), (-0.16398565, u'gangster'), (-0.15014957, u'musician'), (-0.14196123, u'cabbie'), (-0.13984782, u'sociologist'), (-0.128691, u'tycoon'), (-0.12578337, u'artiste'), (-0.12251607, u'soloist'), (-0.12165736, u'performer'), (-0.

In [28]:
race_direction = E1.diff("rich", "poor")

# load professions file - RAN WITH P2
with open("./data/professions_parties.json", 'r') as f:
    professions = json.load(f)
profession_words_glove = set([p[0] for p in professions])

sp1 = sorted([(E1.v(w).dot(race_direction), w) for w in profession_words_glove])

print('Extreme rich occupations')
print(sp1[-25:])

print('\n\nExtreme poor occupations')
print(sp1[0:25])

Extreme rich occupations
[(0.11554874, u'lyricist'), (0.11559882, u'mobster'), (0.11878678, u'biologist'), (0.11914566, u'historian'), (0.12047925, u'editor'), (0.12273921, u'drummer'), (0.12753421, u'songwriter'), (0.12766533, u'anthropologist'), (0.12826373, u'astronomer'), (0.12908998, u'filmmaker'), (0.12996648, u'tycoon'), (0.13249454, u'infielder'), (0.13521942, u'councilor'), (0.1365053, u'socialite'), (0.13822272, u'restaurateur'), (0.14169464, u'columnist'), (0.14291015, u'businessman'), (0.14699751, u'photojournalist'), (0.15514232, u'industrialist'), (0.15900847, u'jeweler'), (0.16316113, u'comedian'), (0.16750655, u'financier'), (0.16826342, u'curator'), (0.1716395, u'geologist'), (0.20620294, u'archaeologist')]


Extreme poor occupations
[(-0.15411526, u'worker'), (-0.15089017, u'driver'), (-0.1473218, u'laborer'), (-0.14393033, u'nanny'), (-0.13397142, u'nurse'), (-0.12707894, u'missionary'), (-0.12488432, u'teacher'), (-0.12483972, u'nun'), (-0.12144119, u'electrician'),

In [12]:
race_direction = E.diff("old", "young")

# load professions file - RAN WITH P2
with open("./data/professions_parties.json", 'r') as f:
    professions = json.load(f)
profession_words_glove = set([p[0] for p in professions])

sp1 = sorted([(E.v(w).dot(race_direction), w) for w in profession_words_glove])

print('Extreme ols occupations')
print(sp1[-25:])

print('\n\nExtreme young occupations')
print(sp1[0:25])

Extreme elderly occupations
[(0.121213645, u'handyman'), (0.12238066, u'custodian'), (0.122520305, u'caretaker'), (0.12291695, u'cabbie'), (0.12464468, u'paralegal'), (0.12625685, u'receptionist'), (0.12719621, u'worker'), (0.12758507, u'landlord'), (0.13417637, u'jeweler'), (0.13888592, u'pharmacist'), (0.14399216, u'firefighter'), (0.1464987, u'salesman'), (0.14908053, u'hairdresser'), (0.15608409, u'clerk'), (0.16073078, u'medic'), (0.17892808, u'dentist'), (0.18171959, u'plumber'), (0.18551646, u'technician'), (0.19900104, u'laborer'), (0.20589532, u'bookkeeper'), (0.2097608, u'homemaker'), (0.21859434, u'electrician'), (0.22226101, u'welder'), (0.24492887, u'shopkeeper'), (0.24591333, u'mechanic')]


Extreme young occupations
[(-0.16361435, u'soloist'), (-0.16106634, u'protagonist'), (-0.1582798, u'vocalist'), (-0.15160933, u'comic'), (-0.13176918, u'pianist'), (-0.12505761, u'composer'), (-0.12370213, u'violinist'), (-0.12338391, u'tutor'), (-0.11953864, u'artist'), (-0.11733991,

In [42]:
# E1.neighbors(E1.v("republican") - E1.v("democrat") + E1.v("liberal"))
E1.neighbors("progressives")
find_nn_cos(E1.v("democrat"), E1)

NameError: name 'find_nn_cos' is not defined

In [7]:
# load professions file - RAN WITH P3
with open("./data/professions_parties.json", 'r') as f:
    professions = json.load(f)
profession_words_glove = set([p[0] for p in professions])

sp1 = sorted([(E1.v(w).dot(lean_direction), w) for w in profession_words_glove])

print('Extreme progressive occupations')
print(sp1[-20:])

print('\n\nExtreme conservative occupations')
print(sp1[0:20])

Extreme progressive occupations
[(3.689349e+19, 'bodyguard'), (3.689349e+19, 'boxer'), (3.689349e+19, 'cardiologist'), (3.689349e+19, 'cartoonist'), (3.689349e+19, 'designer'), (3.689349e+19, 'environmentalist'), (3.689349e+19, 'hairdresser'), (3.689349e+19, 'judge'), (3.689349e+19, 'mobster'), (3.689349e+19, 'officer'), (3.689349e+19, 'photographer'), (3.689349e+19, 'plumber'), (3.689349e+19, 'sailor'), (3.689349e+19, 'screenwriter'), (3.689349e+19, 'sculptor'), (3.689349e+19, 'servant'), (3.689349e+19, 'solicitor'), (3.689349e+19, 'solicitor-general'), (3.689349e+19, 'tutor'), (3.689349e+19, 'warden')]


Extreme conservative occupations
[(-3.689349e+19, 'advocate'), (-3.689349e+19, 'biologist'), (-3.689349e+19, 'bishop'), (-3.689349e+19, 'campaigner'), (-3.689349e+19, 'caretaker'), (-3.689349e+19, 'commander'), (-3.689349e+19, 'commentator'), (-3.689349e+19, 'dermatologist'), (-3.689349e+19, 'educator'), (-3.689349e+19, 'employee'), (-3.689349e+19, 'firebrand'), (-3.689349e+19, 'insp