In [1]:
# Setup: silence all warnings and switch into the exercise folder
import os
import warnings

warnings.simplefilter('ignore')

# Set your working directory (every relative path below is resolved against it)
WORKING_DIR = 'C:/Users/HuyenNguyen/Dropbox (Erasmus Universiteit Rotterdam)/Hamburg/TEACHING_UHH/WiSo21-22/Text Analysis for Social Sciences in Python/Exercises/W9'
os.chdir(WORKING_DIR)

# Word Embeddings

In [11]:
#Import relevant packages
%matplotlib inline
import pandas as pd
import numpy as np
import pickle
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import re
from string import punctuation

In [13]:
import keras
import pydot as pyd
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

ModuleNotFoundError: No module named 'pydot'

In [3]:
# Load the cases from CSV, then keep only the rows that have an author
df1 = pd.read_csv('death-penalty-cases.csv')
df1 = df1.loc[df1['author_id'].notnull()]

In [6]:
# Check the data: preview the first rows of the author-filtered frame
df1.head()

Unnamed: 0,court_id,author_id,state,year,dateFiled,citeCount,snippet
1,fla,4019.0,FL,1973,1973-07-26T00:00:00Z,552,whether the death penalty per unconstitutional...
2,texcrimapp,5765.0,TX,1975,1975-04-16T00:00:00Z,143,contention that the assessment the death penal...
4,texcrimapp,5758.0,TX,1944,1944-12-20T00:00:00Z,56,assume the district attorney orally waived the...
5,azd,550.0,AZ,2003,2003-05-19T00:00:00Z,0,against death penalty stop prisoner rape citiz...
9,texcrimapp,5765.0,TX,1964,1964-10-14T00:00:00Z,80,this court received the record death penalty c...


In [7]:
# Check each column's dtype and non-null count, plus the frame's memory usage
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18215 entries, 1 to 32566
Data columns (total 7 columns):
court_id     18215 non-null object
author_id    18215 non-null float64
state        18215 non-null object
year         18215 non-null int64
dateFiled    18215 non-null object
citeCount    18215 non-null int64
snippet      18215 non-null object
dtypes: float64(1), int64(2), object(4)
memory usage: 1.1+ MB


In [4]:
# Read the pickled object stored in vocab.pkl
# NOTE(review): unpickling can execute arbitrary code - only load .pkl files
# that come from a trusted source.
vocab = pd.read_pickle('vocab.pkl')

For more information on .pkl (pickle) files in Python, see: https://pythonnumericalmethods.berkeley.edu/notebooks/chapter11.03-Pickle-Files.html

In [5]:
# Clean the text data
# Translation table that deletes every character listed in string.punctuation
translator = str.maketrans('', '', punctuation)


def fix_snippet(txt):
    """Normalise one raw snippet into lower-case, punctuation-free text.

    Steps, in order:
      1. drop every non-ASCII character;
      2. replace each one-letter word, together with the non-word character
         on either side of it, by a single space, then lower-case the text;
      3. do the same for two-letter words;
      4. replace the HTML entity ``&quot;`` and newlines by spaces;
      5. delete all punctuation (so "death-penalty" becomes "deathpenalty");
      6. blank out the token "deathpenalty";
      7. collapse runs of whitespace and strip both ends.

    Regex matches do not overlap and each match consumes the separator on
    both sides, so of two adjacent short words only the first is removed in
    a single call.

    Parameters
    ----------
    txt : str
        Raw snippet text.

    Returns
    -------
    str
        The cleaned snippet.
    """
    a = txt.encode("ascii", errors="ignore").decode()
    # Raw strings: '\W' / '\w' are invalid escape sequences in a normal literal
    a = re.sub(r'\W\w\W', ' ', a).lower()
    a = re.sub(r'\W\w\w\W', ' ', a)
    a = a.replace('&quot;', ' ').replace('\n', ' ')
    a = a.translate(translator)
    a = a.replace('deathpenalty', ' ')
    a = ' '.join(a.split())
    return a
# Run the cleaner twice over every snippet. A single pass can leave short
# words behind when they are adjacent, because each regex match in
# fix_snippet also consumes the separators around it (presumably the reason
# for the second pass).
df1['snippet'] = df1['snippet'].apply(fix_snippet).apply(fix_snippet)
# Write the cleaned cases to disk
df1.to_csv('cases-processed.csv')

### Entity Embeddings

In [8]:
# Encode every judge (author_id, cast to text) as an integer label
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
J = encoder.fit_transform(df1['author_id'].astype(str))
num_judges = J.max() + 1  # largest label + 1

# Targets: was the case cited at all, and log(1 + number of citations)
Y = df1['citeCount'].gt(0)
Y2 = np.log(df1['citeCount'] + 1)

In [14]:
# Set up Deep Neural Networks (DNN)
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten

# Judge label -> 2-d embedding -> Dense(2) -> sigmoid probability
model = Sequential()
model.add(Embedding(input_dim=num_judges,  # number of categories
                    output_dim=2,          # dimensions of embedding
                    input_length=1))
model.add(Flatten())  # needed after Embedding
model.add(Dense(units=2))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Draw the architecture as an inline SVG (requires pydot)
dot = model_to_dot(model, show_shapes=True, show_layer_names=False)
SVG(dot.create(prog='dot', format='svg'))

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [15]:
# Print each layer's output shape and parameter count
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 2)              5964      
_________________________________________________________________
flatten_2 (Flatten)          (None, 2)                 0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 6         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 3         
Total params: 5,973
Trainable params: 5,973
Non-trainable params: 0
_________________________________________________________________


To visualize the vectors, first install ggplot from a command prompt/terminal: https://pypi.org/project/ggplot/

In [16]:
# Visualize the Judge Vectors
import ggplot as gg

ModuleNotFoundError: No module named 'ggplot'

In [None]:
# Mean of Y (share of cited cases) per judge label, used to colour the points
judge_cites = dict(Y.groupby(J).mean())

# One row per judge label, in ascending label order so that row k of the
# embedding matrix lines up with row k of df2
df2 = (pd.DataFrame({'judge': J})
       .drop_duplicates()
       .sort_values('judge'))
df2['cites'] = df2['judge'].map(judge_cites)

for epoch in range(5):
    # The first pass plots the untrained vectors; later passes train one epoch first
    if epoch:
        model.fit(J, Y, epochs=1, validation_split=.2)

    judge_vectors = model.layers[0].get_weights()[0]
    df2['x'], df2['y'] = judge_vectors[:, 0], judge_vectors[:, 1]
    chart = (gg.ggplot(df2, gg.aes(x='x', y='y', color='cites'))
             + gg.geom_point(size=10, alpha=.8))
    chart.show()

In [None]:
# convert documents to sequences of word indexes
from keras.preprocessing.text import Tokenizer
num_words = 200  # vocabulary cap given to the Tokenizer; also used below as the Embedding input size
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(df1['snippet'])  # build the word index from the cleaned snippets
sequences = tokenizer.texts_to_sequences(df1['snippet'])  # one sequence of word indexes per snippet

In [None]:
# represent data as numrows x maxlen matrix
from keras.preprocessing.sequence import pad_sequences

# Length of the longest index sequence
maxlen = max(map(len, sequences))
maxlen

In [None]:
# Pad every index sequence to maxlen so the data forms one 2-D matrix
X = pad_sequences(sequences, maxlen=maxlen)

# Model setup: word index -> 2-d embedding -> Dense(2) -> sigmoid probability
model = Sequential()
model.add(Embedding(num_words,
                    2,
                    input_length=maxlen)) # sequence length
model.add(Flatten()) # maxlen*2 dims
model.add(Dense(2))
# binary_crossentropy (default from_logits=False) expects a probability in
# [0, 1], so the output unit needs a sigmoid, as in the judge model above.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy')
# Draw the architecture as an inline SVG (requires pydot)
dot = model_to_dot(model, show_shapes=True, show_layer_names=False)
SVG(dot.create(prog='dot', format='svg'))

In [None]:
# Print the layers of the word-embedding model
model.summary()

In [None]:
# Show the vectors
# The Keras Tokenizer numbers words from 1 (index 0 is the padding value) and,
# with num_words set, only emits indexes below num_words. Row k of the
# embedding matrix therefore belongs to the word whose word_index is k, so
# keep only those words and look their rows up by index rather than by
# position (positional assignment would shift every label by one row).
df3 = pd.DataFrame(list(tokenizer.word_index.items()),
                  columns=['word', 'word_index']).sort_values('word_index')
df3 = df3[df3['word_index'] < num_words].copy()

for i in range(5):
    # The first pass plots the untrained vectors; later passes train one epoch first
    if i > 0:
        model.fit(X,Y,epochs=1, validation_split=.2)

    word_vectors = model.layers[0].get_weights()[0]
    df3['x'] = word_vectors[df3['word_index'].values, 0]
    df3['y'] = word_vectors[df3['word_index'].values, 1]
    chart = gg.ggplot( df3, gg.aes(x='x', y='y', label='word') ) \
                      + gg.geom_text(size=10, alpha=.8, label='word') 
    chart.show()

In [None]:
# Word Similarity
from scipy.spatial.distance import cosine

# Row k of the embedding matrix is the vector of the word whose tokenizer
# index is k (row 0 is the padding index), so index the matrix directly.
# Subtracting 1 would read the vector of the next-more-frequent word.
vec_defendants = word_vectors[tokenizer.word_index['defendants']]
vec_convicted = word_vectors[tokenizer.word_index['convicted']]
vec_against = word_vectors[tokenizer.word_index['against']]

# cosine() returns a distance, so 1 - distance is the cosine similarity
print(1-cosine(vec_defendants, vec_convicted))

In [None]:
# Cosine similarity (1 - cosine distance) between 'defendants' and 'against'
print(1-cosine(vec_defendants, vec_against))

### Word2vec in Gensim

In [None]:
# word2vec requires sentences as input
from utils import get_sentences
from random import shuffle

# Flatten the sentences of every snippet into one list
sentences = [sent for doc in df1['snippet'] for sent in get_sentences(doc)]
shuffle(sentences) # stream in sentences in random order

In [None]:
# train the model
from gensim.models import Word2Vec
w2v = Word2Vec(sentences,  # list of tokenized sentences
               workers = 8, # Number of threads to run in parallel
               size=300,  # Word vector dimensionality
               min_count =  25, # Minimum word count
               window = 5, # Context window size
               sample = 1e-3, # Downsample setting for frequent words
               )

# done training
# NOTE(review): the earlier comment said this deletes context vectors; gensim
# 3.x documents init_sims(replace=True) as keeping only the L2-normalised word
# vectors - confirm against the installed gensim version.
w2v.init_sims(replace=True)

# Save the trained model to disk
w2v.save('w2v-vectors.pkl')

w2v.wv['judg'] # vector for the token 'judg' (presumably the stem of "judge")

In [None]:
# Check the similarity between the tokens 'judg' and 'juri'
w2v.wv.similarity('judg','juri') 

In [None]:
# Find the tokens whose vectors are most similar to that of 'judg'
w2v.wv.most_similar('judg') 

In [None]:
# Analogies: the query below is judg + man - woman,
# i.e. "woman is to judge as man is to __".
# NOTE(review): the earlier comment stated the reverse analogy ("judge is to
# man as __ is to woman"), which would be positive=['judg','woman'],
# negative=['man'] - confirm which direction is intended.
w2v.wv.most_similar(positive=['judg','man'],
                 negative=['woman'])

### Word2Vec: K-Means Clusters

In [None]:
from sklearn.cluster import KMeans

# Cluster the word vectors into 50 groups
kmw = KMeans(n_clusters=50)
kmw.fit(w2v.wv.vectors)

# Print every word that falls in the same cluster as 'judg'
judge_clust = kmw.labels_[w2v.wv.vocab['judg'].index]
for word, label in zip(w2v.wv.index2word, kmw.labels_):
    if label == judge_clust:
        print(word)

In [None]:
###
# Pre-trained vectors
###

import spacy
en = spacy.load('en_core_web_lg')  # load the 'en_core_web_lg' spaCy model
apple = en('apple')  # process the single word 'apple'
apple.vector # vector for 'apple'

In [None]:
# Sanity check: similarity of 'apple' with itself
apple.similarity(apple)

In [None]:
# Similarity between 'apple' and 'orange' under the same English model
orange = en('orange')
apple.similarity(orange)

In [None]:
# Same similarity check with a second model, loaded under the name 'it'
# NOTE(review): presumably an Italian spaCy model installed/linked as 'it' - confirm.
import spacy
it = spacy.load('it')
mela = it('mela')
arancia = it('arancia')
mela.similarity(arancia)

In [None]:
# Initializing an embedding layer with pre-trained vectors
# Row k of the matrix must hold the vector of the word whose tokenizer index
# is k: the Embedding layer looks rows up by the index found in the input
# sequences, and index 0 is the padding value (its row stays all zeros).
embed_dims = len(apple.vector)
embedding_matrix = np.zeros([num_words, embed_dims])
for word, i in tokenizer.word_index.items():
    # With num_words set, the Tokenizer only emits indexes below num_words;
    # word_index is ordered by rank, so we can stop at the first one beyond.
    if i >= num_words:
        break
    embedding_vector = en(word).vector
    embedding_matrix[i] = embedding_vector

In [None]:
# Classifier on top of the frozen, pre-trained word embeddings
model = Sequential()
model.add(Embedding(num_words,
                    embed_dims,
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False)) # frozen layer
model.add(Flatten()) # maxlen*embed_dims dims
model.add(Dense(64,activation='relu'))
# binary_crossentropy (default from_logits=False) expects a probability in
# [0, 1], so the output unit needs a sigmoid activation.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Print the layers of the pre-trained-embedding model
model.summary()

In [None]:
# show the vectors
from sklearn.manifold import TSNE
# Project the embedding rows down to 2-d for plotting
tsne = TSNE(n_components=2, verbose=1, perplexity=50, n_iter=300)

# Row k of the embedding matrix belongs to the word whose word_index is k
# (row 0 is the padding index), so keep only the words that have a row and
# look the rows up by index; assigning positionally would shift every label
# by one row.
df3 = pd.DataFrame(list(tokenizer.word_index.items()),
                  columns=['word', 'word_index']).sort_values('word_index')
df3 = df3[df3['word_index'] < num_words].copy()

for i in range(3):
    # The first pass plots before any training; later passes train one epoch first
    if i > 0:
        model.fit(X,Y,epochs=1, validation_split=.2)
    
    word_vectors = model.layers[0].get_weights()[0]
    wv_tsne = tsne.fit_transform(word_vectors)

    df3['x'] = wv_tsne[df3['word_index'].values, 0]
    df3['y'] = wv_tsne[df3['word_index'].values, 1]
    chart = gg.ggplot( df3, gg.aes(x='x', y='y', label='word') ) \
                      + gg.geom_text(size=10, alpha=.8, label='word') 
    chart.show()

In [None]:
# Word Mover Distance
###

import spacy
import wmd
# Load the 'en' model with the pipeline factory supplied by the wmd package
# NOTE(review): presumably this makes Doc.similarity use Word Mover's Distance
# instead of spaCy's default - confirm against the wmd package docs.
nlp = spacy.load('en', 
                 create_pipeline=wmd.WMD.create_spacy_pipeline)
# Two sentences that share almost no words
doc1 = nlp("Politician speaks to the media in Illinois.")
doc2 = nlp("The president greets the press in Chicago.")
print(doc1.similarity(doc2))