# Analysis Notebook

Cleaning up data, creating model, etc.

In [1]:
import pandas as pd
import numpy as np
import unicodedata
import spacy
from spacy.symbols import nsubj, VERB
from collections import Counter
import re

### Cleaning Data

In [2]:
raw_data = pd.read_csv('articles.csv')
raw_data

Unnamed: 0,title,link,article_body
0,Restaurant video captured Missouri deputy shoo...,https://apnews.com//db6f22a0fac8c19421ca755c2a...,"SEDALIA, Mo. (AP) — A restaurant’s surveillanc..."
1,Delaware lawmakers release bill banning police...,https://apnews.com//3a4a9d79e2b7cce62ac10d774e...,"DOVER, Del. (AP) — A proposal by Democratic la..."
2,Country’s oldest city moves to relocate Confed...,https://apnews.com//8ab52f7318690a4708536e77c9...,"TALLAHASSEE, Fla. (AP) — Leaders of the countr..."
3,Mourners pay respects to Rayshard Brooks at Eb...,https://apnews.com//9de5db04c653cdde049245d7b3...,ATLANTA (AP) — Mourners filed through Atlanta’...
4,Las Vegas police pledge more communication at ...,https://apnews.com//19a753a5f9c43c6b2d61f55764...,LAS VEGAS (AP) — Las Vegas police say they wil...
...,...,...,...
585,Your coronavirus need-to-know: SAT drops at-ho...,https://www.usatoday.com/story/news/health/202...,Massive protests over death of George Floyd ha...
586,Did protests fuel COVID-19 cases? Are we alrea...,https://www.usatoday.com/story/news/health/202...,It’s been six months since doctors discovered ...
587,Flag Day 2020: Americans mark the holiday amid...,https://www.usatoday.com/story/news/2020/06/12...,Flag Day is June 14 and this year it comes at ...
588,Analysis: Most Florida officers disciplined fo...,https://www.usatoday.com/story/news/2020/06/08...,Most Florida law enforcement and corrections o...


In [3]:
# Removes unicode characters (e.g. \xa0) from article body in USA Today stories
for i in range(0, 590):
    string = raw_data.iloc[i]['article_body']
    raw_data.iloc[i]['article_body'] = unicodedata.normalize('NFKD', string)
    raw_data.iloc[i]['article_body'] = raw_data.iloc[i]['article_body'].lower()

## Analysis

In [4]:
# Loads SpaCy model & processes the article bodies in batches of 25
nlp = spacy.load("en_core_web_md")
unprocessed = list(raw_data['article_body'])
articles = list(nlp.pipe(unprocessed, batch_size=25))

### Top 50 Most Common Words

In [5]:
# Getting top 50 most used words in the corpus
nested_words = []
for doc in articles:
    words = [token.text.lower() for token in doc if token.is_stop != True and token.is_punct != True]
    nested_words.append(words)
    
corpus = [item for sublist in nested_words for item in sublist]
print("len of corpus:", len(corpus))
word_freq = Counter(corpus)
print("top 50 common words (excluding stop & punctuation):", word_freq.most_common(50))

len of corpus: 677931
top 50 common words (excluding stop & punctuation): [('said', 12936), ('police', 10419), (' ', 7672), ('floyd', 4636), ('people', 4602), ('officers', 3728), ('black', 3670), ('protests', 3657), ('protesters', 3627), ('trump', 3314), ('city', 3041), ('george', 2854), ('white', 2558), ('president', 2503), ('death', 2377), ('new', 2349), ('officer', 2145), ('department', 1935), ('cnn', 1872), ('minneapolis', 1832), ('state', 1786), ('law', 1760), ('according', 1732), ('house', 1713), ('told', 1682), ('national', 1674), ('protest', 1639), ('country', 1596), ('time', 1546), ('�', 1546), ('night', 1512), ('monday', 1492), ('man', 1436), ('mayor', 1415), ('statement', 1355), ('like', 1348), ('justice', 1314), ('week', 1300), ('video', 1297), ('coronavirus', 1297), ('public', 1285), ('lives', 1262), ('enforcement', 1257), ('day', 1252), ('peaceful', 1187), ('cases', 1182), ('racism', 1138), ('violence', 1125), ('force', 1124), ('matter', 1100)]


### Most Similar Words

In [6]:
def most_similar(word):
    by_similarity = sorted(word.vocab, key=lambda w: word.similarity(w), reverse=True)
    return [w.orth_ for w in by_similarity[:10]]

print("top 10 similarities for words of interest:")
print('similar words to POLICE:', most_similar(nlp('police')), "\n")
print('similar words to PROTESTS:', most_similar(nlp('protests')), "\n")
print('similar words to PROTESTERS:', most_similar(nlp('protesters')), "\n")
print('similar words to PROTEST:', most_similar(nlp('protest')), "\n")
print('similar words to PROTESTER:', most_similar(nlp('protester')), "\n")
print('similar words to RIOT:', most_similar(nlp('riot')), "\n")
print('similar words to BLACK:', most_similar(nlp('black')), "\n")
print('similar words to WHITE:', most_similar(nlp('white')), "\n")
print('similar words to VIOLENT:', most_similar(nlp('violent')), "\n")
print('similar words to VIOLENCE:', most_similar(nlp('violence')), "\n")
print('similar words to PEACEFUL:', most_similar(nlp('peaceful')), "\n")

top 10 similarities for words of interest:


  


similar words to POLICE: ['police', 'cops', 'officers', 'marshals', 'uniformed', 'deputies', 'arrested', 'arrest', 'authorities', 'sergeant'] 

similar words to PROTESTS: ['protests', 'protest', 'protesters', 'protester', 'demonstrators', 'clashed', 'marchers', 'rioters', 'protestors', 'protested'] 

similar words to PROTESTERS: ['protester', 'demonstrators', 'clashed', 'protesters', 'marchers', 'rioters', 'protestors', 'protests', 'protest', 'protested'] 

similar words to PROTEST: ['protest', 'protests', 'protested', 'protesting', 'protesters', 'protester', 'demonstrators', 'clashed', 'marchers', 'rioters'] 

similar words to PROTESTER: ['protesters', 'demonstrators', 'clashed', 'protester', 'marchers', 'rioters', 'protestors', 'protests', 'protest', 'protested'] 

similar words to RIOT: ['riot', 'riots', 'unrest', 'rioting', 'protesters', 'protester', 'demonstrators', 'clashed', 'marchers', 'rioters'] 

similar words to BLACK: ['black', 'white', 'red', 'brown', 'blue', 'gray', 'dark

In [7]:
print("similarities between words of interest:")
print("similarity between POLICE and VIOLENCE:", nlp('police').similarity(nlp('violence')))
print("similarity between POLICE and VIOLENT:", nlp('police').similarity(nlp('violent')))
print("similarity between POLICE and WHITE:", nlp('police').similarity(nlp('white')))
print("similarity between POLICE and BLACK:", nlp('police').similarity(nlp('black')))
print("similarity between POLICE and PROTESTS:", nlp('police').similarity(nlp('protests')))
print("similarity between POLICE and PEACEFUL:", nlp('police').similarity(nlp('peaceful')))
print("similarity between POLICE and RIOT:", nlp('police').similarity(nlp('riot')))
print("\n")
print("similarity between PROTESTS and VIOLENCE:", nlp('protests').similarity(nlp('violence')))
print("similarity between PROTESTS and VIOLENT:", nlp('protests').similarity(nlp('violent')))
print("similarity between PROTESTS and BLACK:", nlp('protests').similarity(nlp('black')))
print("similarity between PROTESTS and WHITE:", nlp('protests').similarity(nlp('white')))
print("similarity between PROTESTS and PEACEFUL:", nlp('protests').similarity(nlp('peaceful')))
print("\n")
print("similarity between PROTESTERS and VIOLENCE:", nlp('protesters').similarity(nlp('violence')))
print("similarity between PROTESTERS and VIOLENT:", nlp('protesters').similarity(nlp('violent')))
print("similarity between PROTESTERS and BLACK:", nlp('protesters').similarity(nlp('black')))
print("similarity between PROTESTERS and WHITE:", nlp('protesters').similarity(nlp('white')))
print("similarity between PROTESTERS and PEACEFUL:", nlp('protesters').similarity(nlp('peaceful')))
print("\n")
print("similarity between RIOT and VIOLENCE:", nlp('riot').similarity(nlp('violence')))
print("similarity between RIOT and VIOLENT:", nlp('riot').similarity(nlp('violent')))
print("similarity between RIOT and WHITE:", nlp('riot').similarity(nlp('white')))
print("similarity between RIOT and BLACK:", nlp('riot').similarity(nlp('black')))
print("similarity between RIOT and PROTESTS:", nlp('riot').similarity(nlp('protests')))
print("similarity between RIOT and PEACEFUL:", nlp('riot').similarity(nlp('peaceful')))

similarities between words of interest:
similarity between POLICE and VIOLENCE: 0.5241225354674653
similarity between POLICE and VIOLENT: 0.4823528274605144
similarity between POLICE and WHITE: 0.21429311719809296
similarity between POLICE and BLACK: 0.2553994971490491
similarity between POLICE and PROTESTS: 0.46271436290191487
similarity between POLICE and PEACEFUL: 0.274091132376229
similarity between POLICE and RIOT: 0.5751548483597106


similarity between PROTESTS and VIOLENCE: 0.5586057544737452
similarity between PROTESTS and VIOLENT: 0.525904275709895
similarity between PROTESTS and BLACK: 0.1462950580648276
similarity between PROTESTS and WHITE: 0.11793965752726573
similarity between PROTESTS and PEACEFUL: 0.44062706988634903


similarity between PROTESTERS and VIOLENCE: 0.47343999882244225
similarity between PROTESTERS and VIOLENT: 0.46465988407431974
similarity between PROTESTERS and BLACK: 0.1700303625398976
similarity between PROTESTERS and WHITE: 0.15161536152331864
simila

### Connotation Frames

In [8]:
police_verbs = []
protester_verbs = []

In [9]:
def get_verbs(subj, lst):
    for doc in articles:
        for possible_subject in doc:
            if possible_subject.text == subj and possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
                lst.append(possible_subject.head)

In [10]:
get_verbs("enforcement", police_verbs)
print(police_verbs)

[cleared, investigating, initiates, noted, respond, put, inflaming, doing, cleared, said, found, firing, use, target, began, expanded, began, cleared, expanded, go, dispersed, use, use, dispersed, forced, began, firing, arrive, made, cleared, realize, began, made, dominate, began, dominate, respond, deploying, said, took, say, sees, joined, needs, doing, began, needs, made, uses, put, found, slashed, deployed, donned, cleared, pushed, stationed, used, clashed, provide, set, identify, gathering, shoots, aimed, come, stood, decided, cleared, worked, opened, cleared, disappear, stopped, shift, descended, looking, cleared, approached, blocked, appeared, investigate, speaks, realized, seemed, received, kept]


In [11]:
get_verbs("police", police_verbs)
print(police_verbs)

[cleared, investigating, initiates, noted, respond, put, inflaming, doing, cleared, said, found, firing, use, target, began, expanded, began, cleared, expanded, go, dispersed, use, use, dispersed, forced, began, firing, arrive, made, cleared, realize, began, made, dominate, began, dominate, respond, deploying, said, took, say, sees, joined, needs, doing, began, needs, made, uses, put, found, slashed, deployed, donned, cleared, pushed, stationed, used, clashed, provide, set, identify, gathering, shoots, aimed, come, stood, decided, cleared, worked, opened, cleared, disappear, stopped, shift, descended, looking, cleared, approached, blocked, appeared, investigate, speaks, realized, seemed, received, kept, say, communicate, said, say, pursued, asked, retreated, said, say, said, doing, arrested, said, left, say, declared, said, retreated, had, arrested, said, declared, said, said, arrested, said, said, moved, said, retake, lives, came, told, abandoned, attempted, investigating, made, arres

In [12]:
get_verbs("protesters", protester_verbs)
print(protester_verbs)

[swarmed, want, calling, showed, toppled, set, read, returned, marched, held, pulled, called, marched, enacted, cordoned, sued, cried, left, brought, march, try, try, trying, surround, occupied, marched, lie, stretch, walk, pack, take, gather, use, gather, hold, pack, kneel, walk, lie, ride, gather, raise, pass, drawn, clear, dismantled, vandalized, tear, took, tore, feel, taken, marched, attempted, tried, attempted, trying, clear, attempted, face, tried, clear, trying, taken, attempted, face, tried, told, grab, clear, trying, put, established, attempted, toppled, take, attempted, toppled, wearing, chanting, gathered, confronting, yell, hit, expressed, take, gather, kept, understand, walked, topple, toppled, appeared, demanding, pulled, tied, marched, put, dragged, took, kept, trying, planning, understand, pulled, tied, marched, put, dragged, pulled, pulled, awakened, protecting, choose, speak, taken, planned, clashed, taking, shut, dispersing, engaging, began, comply, spent, testify, 

In [13]:
lemm_police_verbs = []
lemm_protester_verbs = []

def lemma(lst, new_lst):
    for word in lst:
        new_lst.append(word.lemma_)

In [14]:
lemma(police_verbs, lemm_police_verbs)
print(lemm_police_verbs)

['clear', 'investigate', 'initiate', 'note', 'respond', 'put', 'inflame', 'do', 'clear', 'say', 'find', 'fire', 'use', 'target', 'begin', 'expand', 'begin', 'clear', 'expand', 'go', 'disperse', 'use', 'use', 'disperse', 'force', 'begin', 'fire', 'arrive', 'make', 'clear', 'realize', 'begin', 'make', 'dominate', 'begin', 'dominate', 'respond', 'deploy', 'say', 'take', 'say', 'see', 'join', 'need', 'do', 'begin', 'need', 'make', 'use', 'put', 'find', 'slash', 'deploy', 'don', 'clear', 'push', 'station', 'use', 'clash', 'provide', 'set', 'identify', 'gather', 'shoot', 'aim', 'come', 'stand', 'decide', 'clear', 'work', 'open', 'clear', 'disappear', 'stop', 'shift', 'descend', 'look', 'clear', 'approach', 'block', 'appear', 'investigate', 'speak', 'realize', 'seem', 'receive', 'keep', 'say', 'communicate', 'say', 'say', 'pursue', 'ask', 'retreat', 'say', 'say', 'say', 'do', 'arrest', 'say', 'leave', 'say', 'declare', 'say', 'retreat', 'have', 'arrest', 'say', 'declare', 'say', 'say', 'arres

In [15]:
lemma(protester_verbs, lemm_protester_verbs)
print(lemm_protester_verbs)

['swarm', 'want', 'call', 'show', 'topple', 'set', 'read', 'return', 'march', 'hold', 'pull', 'call', 'march', 'enact', 'cordone', 'sue', 'cry', 'leave', 'bring', 'march', 'try', 'try', 'try', 'surround', 'occupy', 'march', 'lie', 'stretch', 'walk', 'pack', 'take', 'gather', 'use', 'gather', 'hold', 'pack', 'kneel', 'walk', 'lie', 'ride', 'gather', 'raise', 'pass', 'draw', 'clear', 'dismantle', 'vandalize', 'tear', 'take', 'tear', 'feel', 'take', 'march', 'attempt', 'try', 'attempt', 'try', 'clear', 'attempt', 'face', 'try', 'clear', 'try', 'take', 'attempt', 'face', 'try', 'tell', 'grab', 'clear', 'try', 'put', 'establish', 'attempt', 'topple', 'take', 'attempt', 'topple', 'wear', 'chant', 'gather', 'confront', 'yell', 'hit', 'express', 'take', 'gather', 'keep', 'understand', 'walk', 'topple', 'topple', 'appear', 'demand', 'pull', 'tie', 'march', 'put', 'drag', 'take', 'keep', 'try', 'plan', 'understand', 'pull', 'tie', 'march', 'put', 'drag', 'pull', 'pull', 'awaken', 'protect', 'cho

In [16]:
conn_frames = pd.read_csv('agency_power.csv')
conn_frames

Unnamed: 0,verb,agency,power
0,abandons,agency_pos,power_agent
1,abolishes,agency_pos,power_agent
2,absorbs,agency_pos,power_agent
3,abuses,agency_pos,power_agent
4,accelerates,agency_pos,power_agent
...,...,...,...
2150,yelps,agency_pos,
2151,yields,agency_equal,power_agent
2152,zaps,agency_pos,power_agent
2153,zips,agency_pos,


In [17]:
conn_verb_list = conn_frames['verb'].to_list()
verb_doc = spacy.tokens.Doc(nlp.vocab, words=conn_verb_list)
lemm_verbs = []
lemma(verb_doc, lemm_verbs)
conn_frames['lemm_verb'] = lemm_verbs
conn_frames

Unnamed: 0,verb,agency,power,lemm_verb
0,abandons,agency_pos,power_agent,abandon
1,abolishes,agency_pos,power_agent,abolish
2,absorbs,agency_pos,power_agent,absorb
3,abuses,agency_pos,power_agent,abuse
4,accelerates,agency_pos,power_agent,accelerate
...,...,...,...,...
2150,yelps,agency_pos,,yelp
2151,yields,agency_equal,power_agent,yield
2152,zaps,agency_pos,power_agent,zap
2153,zips,agency_pos,,zip


In [18]:
agency_arr = conn_frames['agency'].to_numpy()
a_conditions = [(pd.isnull(agency_arr)), (agency_arr == 'agency_pos'), (agency_arr == 'agency_equal'), (agency_arr == 'agency_neg')]
a_values = [np.nan, 1.0, 0.0, -1.0]

conn_frames['agency_val'] = np.select(a_conditions, a_values)

power_arr = conn_frames['power'].to_numpy()
p_conditions = [(pd.isnull(power_arr)), (power_arr == 'power_agent'), (power_arr == 'power_equal'), (power_arr == 'power_theme')]
p_values = [np.nan, 1.0, 0.0, -1.0]

conn_frames['power_val'] = np.select(p_conditions, p_values)

conn_frames

Unnamed: 0,verb,agency,power,lemm_verb,agency_val,power_val
0,abandons,agency_pos,power_agent,abandon,1.0,1.0
1,abolishes,agency_pos,power_agent,abolish,1.0,1.0
2,absorbs,agency_pos,power_agent,absorb,1.0,1.0
3,abuses,agency_pos,power_agent,abuse,1.0,1.0
4,accelerates,agency_pos,power_agent,accelerate,1.0,1.0
...,...,...,...,...,...,...
2150,yelps,agency_pos,,yelp,1.0,
2151,yields,agency_equal,power_agent,yield,0.0,1.0
2152,zaps,agency_pos,power_agent,zap,1.0,1.0
2153,zips,agency_pos,,zip,1.0,


In [19]:
pol_agency = []
pol_power = []

pro_agency = []
pro_power = []

In [20]:
def agency(verb_lst, df, final_lst):
    for verb in verb_lst:
        if (df['lemm_verb'] == verb).bool:
            df2 = df.loc[df['lemm_verb'] == verb, 'agency_val']
            if len(df2) > 0:
                final_lst.append(df2.iloc[0])
            else:
                final_lst.append(np.nan)

def power(verb_lst, df, final_lst):
    for verb in verb_lst:
        if (df['lemm_verb'] == verb).bool:
            df2 = df.loc[df['lemm_verb'] == verb, 'power_val']
            if len(df2) > 0:
                final_lst.append(df2.iloc[0])
            else:
                final_lst.append(np.nan)

In [21]:
agency(lemm_police_verbs, conn_frames, pol_agency)
pol_agency = np.array(pol_agency)
pol_agency

array([ 1.,  1.,  1., ..., -1.,  1.,  1.])

In [22]:
agency(lemm_protester_verbs, conn_frames, pro_agency)
pro_agency = np.array(pro_agency)
pro_agency

array([ 1., -1.,  1., ..., nan,  1.,  1.])

In [23]:
power(lemm_police_verbs, conn_frames, pol_power)
pol_power = np.array(pol_power)
pol_power

array([ 1.,  1.,  1., ...,  0., -1.,  1.])

In [24]:
power(lemm_protester_verbs, conn_frames, pro_power)
pro_power = np.array(pro_power)
pro_power

array([nan, -1., -1., ..., nan, nan, nan])

In [31]:
from scipy import stats
# NULL HYPOTHESIS: pol_agency <= pro_agency, alpha = 0.05
agency_ttest = stats.ttest_ind(pol_agency, pro_agency, nan_policy='omit')
one_sided_pvalue1 = agency_ttest.pvalue / 2
print(f"statistic: {agency_ttest.statistic}; p-value: {one_sided_pvalue1}")

statistic: 2.57260937935974; p-value: 0.005074398037629736


In [32]:
# NULL HYPOTHESIS: pol_power >= pro_power, alpha = 0.05
power_ttest = stats.ttest_ind(pol_power, pro_power, nan_policy='omit')
one_sided_pvalue2 = power_ttest.pvalue / 2
print(f"statistic: {power_ttest.statistic}; p-value: {one_sided_pvalue2}")

statistic: -2.1142335288229344; p-value: 0.01730157173526271


### CODE DUMP

In [50]:
len(pol_agency)

1528

In [51]:
len(pro_agency)

1245

In [39]:
np.nanmean(pol_agency)

0.8388203017832647

In [46]:
np.isnan(pol_agency).sum()

70

In [40]:
np.nanmean(pro_agency)

0.7869142351900973

In [47]:
np.isnan(pro_agency).sum()

114

In [41]:
np.nanmean(pol_power)

0.45194424064563465

In [48]:
np.isnan(pol_power).sum()

165

In [42]:
np.nanmean(pro_power)

0.5123789020452099

In [49]:
np.isnan(pro_power).sum()

316