### Initialize

In [1]:
# Install a few python packages using pip
from w266_common import utils
utils.require_package("wget")      # for fetching dataset

# Standard python helper libraries.
import os, sys, re, json, time
import itertools, collections
from IPython.display import display

# NumPy and SciPy for matrix ops
import numpy as np
import scipy.sparse

# NLTK for NLP utils
import nltk

from importlib import reload

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz

# from __future__ import print_function, division
#%matplotlib inline
# from matplotlib import pyplot as plt
import random
import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
# from debiaswe.data import load_professions

from gensim.models import Word2Vec
import gensim.utils

  from ._conv import register_converters as _register_converters


### Import GloVe dataset
Contains 6B tokens from Wikipedia 2014 + Gigaword 5.

(will take a while to run)

In [9]:
# Re-import the helper module on every run of this cell.
import glove_helper
reload(glove_helper)

# 300-dimensional GloVe vectors; loading and parsing the archive takes a while.
hands = glove_helper.Hands(ndim=300)

Loading vectors from data/glove/glove.6B.zip
Parsing file: data/glove/glove.6B.zip:glove.6B.300d.txt
Found 400,000 words.
Parsing vectors... Done! (W.shape = (400003, 300))


In [13]:
import vector_math; reload(vector_math)

def find_nn_cos(v, Wv, k=10):
    """Find the k rows of Wv that are most cosine-similar to v.

    Args:
      v: 1-D query vector of length d.
      Wv: 2-D matrix of shape (n, d), one candidate vector per row.
      k: number of neighbors to return; k=0 yields empty results and a k
        larger than n returns all n rows.

    Returns:
      (nns, ds): nns is an array of row indices into Wv and ds is a list of
      the matching cosine similarities. Both are ordered from least to most
      similar, so the best match comes last.
    """
    dots = np.dot(Wv, v)
    denom = np.linalg.norm(Wv, axis=1) * np.linalg.norm(v, axis=-1)

    # Rows with zero norm (or a zero query vector) have no defined cosine.
    # A plain division yields NaN there, and argsort places NaN last, so such
    # rows would be reported as the *best* matches. Score them 0 instead.
    sims = np.zeros(dots.shape, dtype=np.result_type(dots, denom))
    np.divide(dots, denom, out=sims, where=denom > 0)

    order = np.argsort(sims)
    # Slice from an explicit start index: order[-k:] returns everything for k=0.
    nns = order[max(len(order) - k, 0):]
    ds = [sims[n] for n in nns]
    return nns, ds

def analogy(vA, vB, vC, Wv, k=5):
    """Solve "A is to B as C is to ?" by nearest-neighbor search.

    Shifts vC by the offset from vA to vB and returns whatever
    find_nn_cos reports for the resulting vector against the rows of Wv.
    """
    offset = vB - vA
    target = vC + offset
    return find_nn_cos(target, Wv, k)

def show_nns(hands, word, k=10):
    """Print the k nearest neighbors of `word`, one "similarity : 'word'" line each.

    The word is lower-cased before lookup; neighbors come from
    vector_math.find_nn_cos over hands.W and are mapped back to strings
    through hands.vocab.id_to_word.
    """
    word = word.lower()
    print("Nearest neighbors for '{:s}'".format(word))
    query_vec = hands.get_vector(word)
    result = vector_math.find_nn_cos(query_vec, hands.W, k)
    for idx, score in zip(*result):
        neighbor = hands.vocab.id_to_word[idx]
        print("{:.03f} : '{:s}'".format(score, neighbor))
    print("")
    
def show_analogy(hands, a, b, c, k=5):
    """Print the top-k completions of "a is to b as c is to ___".

    All three words are lower-cased, looked up with hands.get_vector, and
    passed to vector_math.analogy against hands.W; each result row is printed
    as "similarity : 'word'".
    """
    a, b, c = (w.lower() for w in (a, b, c))
    va, vb, vc = [hands.get_vector(w) for w in (a, b, c)]
    print("'{a:s}' is to '{b:s}' as '{c:s}' is to ___".format(a=a, b=b, c=c))
    result = vector_math.analogy(va, vb, vc, hands.W, k)
    for idx, score in zip(*result):
        candidate = hands.vocab.id_to_word[idx]
        print("{:.03f} : '{:s}'".format(score, candidate))
    print("")


In [14]:
# Neighbors are listed in ascending similarity in the output below, so the
# query word itself (1.000) appears last.
show_nns(hands, "democrat")

Nearest neighbors for 'democrat'
0.610 : 'incumbent'
0.616 : 'senate'
0.642 : 'congressman'
0.661 : 'rep.'
0.699 : 'sen.'
0.711 : 'democrats'
0.720 : 'republican'
0.736 : 'democratic'
0.739 : 'senator'
1.000 : 'democrat'



In [None]:
# Top-5 neighborhoods for the left-leaning terms first, then the right-leaning ones.
for _term in ("democrat", "liberal", "progressive", "republican", "conservative"):
    show_nns(hands, _term, 5)

In [15]:
# Apply the democrat -> republican offset to a set of left-leaning terms.
# (Each of these probes was annotated "works" by the original author.)
for _term in ("liberal", "liberalism", "progressive", "anti-federalist",
              "left-wing", "left-of-center", "left-leaning", "far-left"):
    show_analogy(hands, "democrat", "republican", _term)


'democrat' is to 'republican' as 'liberal' is to ___
0.618 : 'gop'
0.662 : 'conservatives'
0.682 : 'republican'
0.731 : 'conservative'
0.764 : 'liberal'

'democrat' is to 'republican' as 'liberalism' is to ___
0.486 : 'nationalism'
0.492 : 'ideology'
0.586 : 'republicanism'
0.687 : 'conservatism'
0.747 : 'liberalism'

'democrat' is to 'republican' as 'progressive' is to ___
0.442 : 'party'
0.485 : 'conservative'
0.497 : 'gop'
0.539 : 'republican'
0.770 : 'progressive'

'democrat' is to 'republican' as 'anti-federalist' is to ___
0.355 : 'adams-clay'
0.357 : 'anticlerical'
0.366 : 'anti-masonic'
0.368 : 'anti-clericalism'
0.780 : 'anti-federalist'

'democrat' is to 'republican' as 'left-wing' is to ___
0.462 : 'anti-communist'
0.464 : 'far-left'
0.554 : 'far-right'
0.685 : 'right-wing'
0.771 : 'left-wing'

'democrat' is to 'republican' as 'left-of-center' is to ___
0.355 : 'folkeparti'
0.356 : 'pro-republican'
0.362 : 'pro-serbian'
0.365 : 'gryf'
0.713 : 'left-of-center'

'democrat' is 

In [16]:
# education
# Each triple (a, b, c) is evaluated as "a is to b as c is to ___".
_education_probes = [
    ("democrat", "republican", "higher-education"),
    ("democrat", "republican", "college"),
    ("democrat", "republican", "university"),
    ("democrat", "republican", "intelligent"),
    ("democrat", "republican", "scholar"),
    ("democrat", "republican", "phd"),
    ("democrat", "republican", "blue-collar"),  # !!
    ("democrat", "republican", "employed"),
    ("democrat", "republican", "elitist"),  # !!!
    ("democrat", "republican", "union"),
    ("republican", "democrat", "uneducated"),
    # NOTE(review): identical to the previous probe; kept because the original
    # cell runs it twice. Possibly one was meant to use the opposite direction.
    ("republican", "democrat", "uneducated"),
]
for _a, _b, _c in _education_probes:
    show_analogy(hands, _a, _b, _c)

'democrat' is to 'republican' as 'higher-education' is to ___
0.343 : 'deposit-taking'
0.343 : 'ottoman-era'
0.346 : 'teacher-training'
0.354 : 'agronomical'
0.766 : 'higher-education'

'democrat' is to 'republican' as 'college' is to ___
0.522 : 'graduate'
0.551 : 'university'
0.552 : 'colleges'
0.573 : 'school'
0.803 : 'college'

'democrat' is to 'republican' as 'university' is to ___
0.536 : 'graduate'
0.564 : 'professor'
0.573 : 'faculty'
0.584 : 'college'
0.809 : 'university'

'democrat' is to 'republican' as 'intelligent' is to ___
0.451 : 'articulate'
0.460 : 'sophisticated'
0.486 : 'thoughtful'
0.488 : 'smart'
0.725 : 'intelligent'

'democrat' is to 'republican' as 'scholar' is to ___
0.446 : 'literature'
0.452 : 'professor'
0.478 : 'historian'
0.541 : 'scholars'
0.756 : 'scholar'

'democrat' is to 'republican' as 'phd' is to ___
0.625 : 'doctorate'
0.667 : 'doctoral'
0.681 : 'ph.d'
0.723 : 'ph.d.'
0.789 : 'phd'

'democrat' is to 'republican' as 'blue-collar' is to ___
0.372 : 

In [18]:
# race
# Apply the democrat -> republican offset to race / ethnicity related terms.
for _term in ("african-american", "black", "latino", "hispanic",
              "oppressed", "immigrants", "minorities"):
    show_analogy(hands, "democrat", "republican", _term)

'democrat' is to 'republican' as 'african-american' is to ___
0.427 : 'female'
0.436 : 'latino'
0.467 : 'african-americans'
0.480 : 'hispanic'
0.760 : 'african-american'

'democrat' is to 'republican' as 'black' is to ___
0.433 : 'republican'
0.436 : 'blue'
0.448 : 'red'
0.650 : 'white'
0.768 : 'black'

'democrat' is to 'republican' as 'latino' is to ___
0.437 : 'gop'
0.525 : 'hispanics'
0.537 : 'latinos'
0.750 : 'hispanic'
0.809 : 'latino'

'democrat' is to 'republican' as 'hispanic' is to ___
0.506 : 'gop'
0.551 : 'latinos'
0.594 : 'hispanics'
0.722 : 'latino'
0.814 : 'hispanic'

'democrat' is to 'republican' as 'oppressed' is to ___
0.463 : 'oppression'
0.485 : 'marginalized'
0.503 : 'disenfranchised'
0.521 : 'persecuted'
0.774 : 'oppressed'

'democrat' is to 'republican' as 'immigrants' is to ___
0.544 : 'immigration'
0.582 : 'undocumented'
0.633 : 'immigrant'
0.640 : 'migrants'
0.792 : 'immigrants'

'democrat' is to 'republican' as 'minorities' is to ___
0.441 : 'muslims'
0.477 : 

In [19]:
# demographics
# Each triple (a, b, c) is evaluated as "a is to b as c is to ___"; only the
# first probe uses the democrat -> republican direction.
_demographic_probes = [
    ("democrat", "republican", "woman"),
    ("republican", "democrat", "redneck"),
    ("republican", "democrat", "southern"),
    ("republican", "democrat", "pro-life"),
    ("republican", "democrat", "elderly"),
    ("republican", "democrat", "older"),
]
for _a, _b, _c in _demographic_probes:
    show_analogy(hands, _a, _b, _c)

'democrat' is to 'republican' as 'woman' is to ___
0.536 : 'women'
0.545 : 'man'
0.563 : 'female'
0.576 : 'girl'
0.738 : 'woman'

'republican' is to 'democrat' as 'redneck' is to ___
0.395 : 'bumpkin'
0.398 : 'snooty'
0.410 : 'proto'
0.410 : 'hillbilly'
0.747 : 'redneck'

'republican' is to 'democrat' as 'southern' is to ___
0.496 : 'southwestern'
0.499 : 'eastern'
0.501 : 'northern'
0.505 : 'south'
0.730 : 'southern'

'republican' is to 'democrat' as 'pro-life' is to ___
0.393 : 'lgbt'
0.414 : 'macovei'
0.454 : 'anti-abortion'
0.595 : 'pro-choice'
0.761 : 'pro-life'

'republican' is to 'democrat' as 'elderly' is to ___
0.407 : 'woman'
0.422 : 'sick'
0.450 : 'frail'
0.452 : 'infirm'
0.717 : 'elderly'

'republican' is to 'democrat' as 'older' is to ___
0.443 : 'siblings'
0.460 : 'sister'
0.466 : 'age'
0.568 : 'younger'
0.715 : 'older'



In [None]:
# religion
# Each triple (a, b, c) is evaluated as "a is to b as c is to ___".
_religion_probes = [
    ("democrat", "republican", "secular"),
    ("democrat", "republican", "atheist"),
    ("democrat", "republican", "agnostic"),
    ("republican", "democrat", "evangelical"),
    ("republican", "democrat", "christian"),
    ("republican", "democrat", "religious"),
]
for _a, _b, _c in _religion_probes:
    show_analogy(hands, _a, _b, _c)




In [None]:
# Print the neighbors of the token "yda" (show_nns defaults to k=10).
show_nns(hands, "yda")

In [None]:
# sexuality
# Apply the democrat -> republican offset to sexuality-related terms.
# (The last probe, "homosexual", was flagged "!!!" by the original author.)
for _term in ("gay", "lesbian", "transgender", "bisexual", "homosexual"):
    show_analogy(hands, "democrat", "republican", _term)

In [None]:
# income / work
# Each triple (a, b, c) is evaluated as "a is to b as c is to ___".
_income_probes = [
    ("democrat", "republican", "poor"),
    ("democrat", "republican", "lower-class"),
    ("democrat", "republican", "union"),
    ("democrat", "republican", "unionized"),
    ("democrat", "republican", "welfare"),
    ("republican", "democrat", "wealthy"),
    ("republican", "democrat", "rich"),
    ("republican", "democrat", "upper-class"),
    ("republican", "democrat", "high-class"),
    ("republican", "democrat", "nonunion"),
    ("republican", "democrat", "non-union"),
]
for _a, _b, _c in _income_probes:
    show_analogy(hands, _a, _b, _c)


In [None]:
# Default-size (k=10) neighborhoods for a few of the probe words used above.
for _term in ("homosexual", "man", "southern"):
    show_nns(hands, _term)

In [None]:
# NOTE(review): `sp` is only assigned in the professions cell further down
# (a sorted list of (score, word) pairs); this prints the entire list.
print (sp)

In [None]:
# April's direction calc
# democrat-republican party direction on GloVe
# NOTE(review): `E` is only assigned in the "April's embedding" cell further
# down (E = WordEmbedding(...)); on a fresh kernel that cell must run first.
party_direction = E.diff("democrats", "republicans")

# appendix A: generating analogies
party_analogies = E.best_analogies_dist_thresh(party_direction)

In [44]:
# April's embedding
# Binds `E` to a debiaswe WordEmbedding built from the 300-d GloVe 6B text file.
E = WordEmbedding("./embeddings/glove/glove.6B/glove.6B.300d.txt") # 50, 100, 200, 300 dim are available

*** Reading data from ./embeddings/glove/glove.6B/glove.6B.300d.txt
(400000, 300)
400000 words of dimension 300 : the, ,, ., of, ..., kronik, rolonda, zsombor, sandberger
400000 words of dimension 300 : the, ,, ., of, ..., kronik, rolonda, zsombor, sandberger


In [11]:
# load (modified) professions file
with open("./debiaswe/data/professions_parties.json", 'r') as f:
    professions = json.load(f)

# Only the first field of each record (the profession word) is kept.
profession_words = {p[0] for p in professions}

# show party bias in word vectors associated with professions
# sort by projection score along the political (liberal vs. conservative) direction
# equivalent to Fig 1
# sp = sorted([(E.v(w).dot(lean_direction), w) for w in profession_words])

# Combined term list: the entries of lib_terms followed by those of
# conservative_terms, so it contains repeats (e.g. "progressive", "liberalism",
# "centrist"). The direction computation later in this cell reads only
# lib_terms and conservative_terms, not this list.
political_terms = ["progressive","neo","left","reformist","free","generous","leftist",
"liberalism","tolerant","loose","broad","handsome",
"big","giving","socialized",
"conservative",
"adult","politics","libertarian",
"democratic","centrist","ideology","socialist","populist","evangelical",
"liberty","freely","freedom",
"independent","welfare","government","equality",
"enlightened","lenient","freedoms",
"caucus","lax","liberalization","liberalized",
"reactionary","right","moderate",
"cautious","bourgeois",
"conventional","liberal",
"liberalism",
"conservatism","moderates","centrist","populist","libertarian",
"secular","progressive","radical","hardline","evangelical","reformist",
"politics","conservatives","nationalism",
"tradition","culture","civilization","restoration",
"orthodox","minimalist"]
# Terms summed to form the "liberal" end of the axis.
# NOTE(review): this list includes "conservative", and it shares "progressive",
# "reformist", "liberalism", "politics", "libertarian", "centrist", "populist"
# and "evangelical" with conservative_terms; shared words are added to both
# sums. Confirm that this overlap is intended.
lib_terms = ["progressive","neo","left","reformist","free","generous","leftist",
"liberalism","tolerant","loose","broad","handsome",
"big","giving","socialized",
"conservative",
"adult","politics","libertarian",
"democratic","centrist","ideology","socialist","populist","evangelical",
"liberty","freely","freedom",
"independent","welfare","government","equality",
"enlightened","lenient","freedoms",
"caucus","lax","liberalization","liberalized"]
# Terms summed to form the "conservative" end of the axis.
# NOTE(review): this list includes "liberal", "liberalism" and "progressive".
conservative_terms = ["reactionary","right","moderate",
"cautious","bourgeois",
"conventional","liberal",
"liberalism","conservatism","moderates","centrist","populist","libertarian",
"secular","progressive","radical","hardline","evangelical","reformist",
"politics","conservatives","nationalism",
"tradition","culture","civilization","restoration",
"orthodox","minimalist"]

# political direction
# Sum the embeddings of each term list (conservative first, liberal second),
# scale both sums to unit length, and use their normalised difference
# (liberal minus conservative) as the axis.
vs = [sum(E.v(term) for term in group) for group in (conservative_terms, lib_terms)]
vs = [vec / np.linalg.norm(vec) for vec in vs]

v_political = vs[1] - vs[0]
v_political = v_political / np.linalg.norm(v_political)

# Project every profession word onto the axis; low scores sit at the
# conservative end and high scores at the liberal end.
sp = [(E.v(w).dot(v_political), w) for w in profession_words]
sp.sort()

print('Extreme democrat occupations')
print(sp[-20:])

print('\n\nExtreme republican occupations')
print(sp[:20])

# This looks suspicious - liberal/dem professions are at end of alphabet, conservative/rep professions are at the beginning
# Every score in the recorded output is 0.0, so the (score, word) tuples are
# ordered by their second element, the word, which gives the alphabetical split.
# NOTE(review): the all-zero projections suggest the state of `E` when this
# cell ran was not what was intended; confirm after a clean Restart & Run All.

Extreme democrat occupations
[(0.0, 'treasurer'), (0.0, 'trooper'), (0.0, 'trucker'), (0.0, 'trumpeter'), (0.0, 'tutor'), (0.0, 'tycoon'), (0.0, 'undersecretary'), (0.0, 'understudy'), (0.0, 'valedictorian'), (0.0, 'vice-chancellor'), (0.0, 'violinist'), (0.0, 'vocalist'), (0.0, 'waiter'), (0.0, 'waitress'), (0.0, 'warden'), (0.0, 'warrior'), (0.0, 'welder'), (0.0, 'worker'), (0.0, 'wrestler'), (0.0, 'writer')]


Extreme republican occupations
[(0.0, 'accountant'), (0.0, 'acquaintance'), (0.0, 'actor'), (0.0, 'actress'), (0.0, 'addict'), (0.0, 'administrator'), (0.0, 'adventurer'), (0.0, 'advocate'), (0.0, 'aide'), (0.0, 'alderman'), (0.0, 'alter-ego'), (0.0, 'ambassador'), (0.0, 'analyst'), (0.0, 'anthropologist'), (0.0, 'archaeologist'), (0.0, 'archbishop'), (0.0, 'architect'), (0.0, 'artist'), (0.0, 'artiste'), (0.0, 'assassin')]


  ret = sqrt(sqnorm)


In [10]:
# NOTE(review): `E1` is not assigned anywhere in the visible part of this
# notebook; the cell relies on state defined elsewhere (or left over in the kernel).
lean_direction = E1.diff("progressives", "conservatives")
# apparently this works now?!

In [42]:
# E1.neighbors(E1.v("republican") - E1.v("democrat") + E1.v("liberal"))
# Only the last expression of a cell is echoed, so display this result explicitly.
display(E1.neighbors("progressives"))
# find_nn_cos (defined in the helper cell near the top; re-run that cell after a
# kernel restart) multiplies its second argument as a 2-D matrix, so pass the
# embedding matrix rather than the WordEmbedding wrapper itself.
# NOTE(review): assumes the matrix is exposed as `E1.vecs`, as in upstream
# debiaswe — confirm against the local copy.
find_nn_cos(E1.v("democrat"), E1.vecs)

NameError: name 'find_nn_cos' is not defined

#### Word2Vec - Google News
