# State of the Union

## Topic Modeling With *udpipe*

### Created by Lars Bernhardsson

I used the `jupyter/datascience-notebook` Docker image (https://hub.docker.com/r/jupyter/datascience-notebook) to create this notebook.

## R session prep

In [1]:
# If you are using the jupyter/datascience-notebook docker image to run the notebooks in this repository,
# open a terminal window and run this first:
#
#   R -e 'install.packages("udpipe", repos="https://cran.r-project.org")'

library(tidyverse)
library(udpipe)

# Keep character columns as plain strings instead of factors when data frames are built.
# NOTE(review): this is already the default from R 4.0 onward, so the line presumably
# only matters on older R versions — confirm before removing.
options(stringsAsFactors = FALSE)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.2.1     ✔ purrr   0.3.2
✔ tibble  2.1.3     ✔ dplyr   0.8.3
✔ tidyr   0.8.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


## Setting up udpipe

In [2]:
# Fetch the English UDPipe model (into the default model directory).
# overwrite = FALSE reuses a model file that is already on disk, so re-running the
# notebook does not download the model again and does not need network access.
en <- udpipe_download_model("english", overwrite = FALSE)

Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.4/master/inst/udpipe-ud-2.4-190531/english-ewt-ud-2.4-190531.udpipe to /home/jovyan/work/SOTU/english-ewt-ud-2.4-190531.udpipe
Visit https://github.com/jwijffels/udpipe.models.ud.2.4 for model license details


In [3]:
# Load the downloaded model file so it can be used for annotation.
model <- udpipe_load_model(file = en$file_model)

## Data

In [4]:
# The speech texts come from the data notebook; stop early when its output file is absent.
if (!file.exists("SOTUtexts.RData")) {
    stop("Run the data notebook first")
}
load("SOTUtexts.RData")

## Processing

In [5]:
# Annotate every speech text with the loaded model, using the speech name as document id.
docs <- udpipe_annotate(
    object = model,
    x = d$Text,
    doc_id = d$Speech
)

In [6]:
# Convert the annotation result into a data frame.
docs_df <- docs %>%
    as.data.frame()

In [7]:
# Preview the first rows of the annotated tokens.
head(docs_df)

doc_id,paragraph_id,sentence_id,sentence,token_id,token,lemma,upos,xpos,feats,head_token_id,dep_rel,deps,misc
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1981 Reagan,1,1,"Mr. Speaker, Mr. President, distinguished Members of Congress, honored guests, and fellow citizens:",1,Mr.,Mr.,PROPN,NNP,Number=Sing,0,root,,SpacesBefore=\n\s\s\s\s
1981 Reagan,1,1,"Mr. Speaker, Mr. President, distinguished Members of Congress, honored guests, and fellow citizens:",2,Speaker,speaker,PROPN,NNP,Number=Sing,1,flat,,SpaceAfter=No
1981 Reagan,1,1,"Mr. Speaker, Mr. President, distinguished Members of Congress, honored guests, and fellow citizens:",3,",",",",PUNCT,",",,1,punct,,
1981 Reagan,1,1,"Mr. Speaker, Mr. President, distinguished Members of Congress, honored guests, and fellow citizens:",4,Mr.,Mr.,PROPN,NNP,Number=Sing,1,flat,,
1981 Reagan,1,1,"Mr. Speaker, Mr. President, distinguished Members of Congress, honored guests, and fellow citizens:",5,President,President,PROPN,NNP,Number=Sing,1,flat,,SpaceAfter=No
1981 Reagan,1,1,"Mr. Speaker, Mr. President, distinguished Members of Congress, honored guests, and fellow citizens:",6,",",",",PUNCT,",",,1,punct,,


### Extracting Top Keywords

In [8]:
# Run RAKE keyword extraction separately for each speech.
# Only adjectives, nouns and verbs are allowed to form keywords.
speech_topics <- docs_df %>%
    split(.$doc_id) %>%
    lapply(function(speech_tokens) {
        is_content_word <- speech_tokens$upos %in% c("ADJ", "NOUN", "VERB")
        keywords_rake(
            x = speech_tokens,
            term = "lemma",
            group = "doc_id",
            relevant = is_content_word
        )
    })

In [9]:
# Preview the keywords extracted for the first speech.
speech_topics[[1]] %>%
    head()

keyword,ngram,freq,rake
<chr>,<int>,<int>,<dbl>
tax rate,2,3,2.810345
Federal Government,2,2,2.75
unearned income,2,2,2.666667
interest rate,2,4,2.666667
local government,2,2,2.614907
block grant,2,2,2.571429


### Top Keywords by Speech

In [10]:
# Top five keywords per speech, one row per speech.
# vapply() guarantees a 5-row character matrix. Indexing with seq_len(5) pads a speech
# that has fewer than five keywords with NA, whereas sapply() would fall back to a
# ragged list that t() cannot turn into a table.
t(vapply(
    speech_topics,
    function(x) as.character(x$keyword)[seq_len(5)],
    character(5)
))

0,1,2,3,4,5
1981 Reagan,tax rate,Federal Government,unearned income,interest rate,local government
1982 Reagan,federal government,grass root,social service,military force,local community
1983 Reagan,foreign policy,international trade,national debt,interest rate,majority leader
1984 Reagan,increase tax,economic recovery,bipartisan cooperation,down payment,underground economy
1985 Reagan,federal government,tax simplification,american people,give,fellow citizen
1986 Reagan,true reform,federal budget,american people,national security,human spirit
1987 Reagan,national interest,welfare reform,arm reduction,new job,third century
1988 Reagan,Federal Government,budget process,national security,balanced budget,arm reduction
1989 BushSr,new program,nuclear weapon,best judgment,growth,child care
1990 BushSr,record high,new initiative,american worker,new world,american people


In [11]:
# Stack the per-speech keyword tables into one data frame; the list names go into "Speech".
topics_by_speech <- speech_topics %>%
    bind_rows(.id = "Speech")

## Specific Keywords

In [12]:
## Find the top mentions of keywords
#
# Keeps the rows whose `keyword` contains `x` as a literal substring (no regex
# interpretation), sorts them by descending `rake` score and returns the first `n`.
#
# Arguments:
#   x           Single non-empty string to look for inside `keyword`.
#   n           Maximum number of rows to return; 6 matches the head() default
#               that was used before this argument existed.
#   topics      Data frame with `keyword` and `rake` columns. Defaults to the
#               notebook-level `topics_by_speech`, looked up when the function runs.
#   ignore_case TRUE matches regardless of case; useful because lemmas are not
#               case-normalised. Defaults to FALSE (exact-case matching).
#
# Returns the first `n` matching rows of `topics`, highest `rake` first.

find_topic <- function(x, n = 6L, topics = topics_by_speech, ignore_case = FALSE) {
    # Fail early with a clear message instead of an obscure matching warning.
    stopifnot(is.character(x), length(x) == 1L, !is.na(x), nzchar(x))

    topics %>%
        filter(str_detect(keyword, fixed(x, ignore_case = ignore_case))) %>%
        arrange(desc(rake)) %>%
        head(n = n)
}

In [13]:
# Highest-scoring keywords containing "health".
find_topic(x = "health")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
2004 GWBush,health care,2,3,4.125
2007 GWBush,health insurance,2,4,4.033613
2009 Obama,health care,2,5,3.631579
2004 GWBush,health insurance,2,2,3.625
1994 Clinton,health care,2,7,3.536585
2001 GWBush,health care,2,3,3.383333


In [14]:
# Highest-scoring keywords containing "immigration".
find_topic(x = "immigration")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
2019 Trump,immigration system,2,2,2.8666667
2019 Trump,illegal immigration,2,2,2.8
2008 GWBush,immigration,1,2,0.3333333
2015 Obama,immigration,1,2,0.0


In [15]:
# Highest-scoring keywords containing "gun".
find_topic(x = "gun")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
2000 Clinton,keep gun,2,2,2.06
1994 Clinton,own gun,2,2,1.9821429
2013 Obama,gun violence,2,2,1.6666667
2000 Clinton,gun,1,4,1.56
1994 Clinton,gun,1,2,0.8571429
2013 Obama,gun,1,2,0.8333333


In [16]:
# Highest-scoring keywords containing "tax".
find_topic(x = "tax")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
1998 Clinton,tax credit,2,2,4.0
1998 Clinton,tax cut,2,2,3.7
2000 Clinton,tax credit,2,4,3.692308
2013 Obama,tax reform,2,2,3.371429
2005 GWBush,tax code,2,2,3.366667
2010 Obama,tax credit,2,2,3.02381


In [17]:
# Highest-scoring keywords containing "bank" (substring match, so "bankruptcy" also qualifies).
find_topic(x = "bank")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
2009 Obama,bank,1,4,1.0
2010 Obama,bank,1,4,1.0
2012 Obama,bank,1,3,0.25
1994 Clinton,bankruptcy,1,2,0.0


In [18]:
# Highest-scoring keywords containing "security".
find_topic(x = "security")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
2008 GWBush,energy security,2,2,2.466667
1988 Reagan,national security,2,2,2.45
2010 Obama,national security,2,3,2.333333
1986 Reagan,national security,2,2,1.880952
2013 Obama,security,1,2,1.7
2007 GWBush,own security,2,2,1.443182


In [19]:
# Highest-scoring keywords containing "terror" (substring match, so "terrorist" also qualifies).
find_topic(x = "terror")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
2002 GWBush,terrorist camp,2,2,2.375
1988 Reagan,nuclear terror,2,2,2.142857
2006 GWBush,terror network,2,2,1.833333
2006 GWBush,terrorist,1,2,1.454545
2013 Obama,counterterrorism effort,2,2,1.4
2005 GWBush,harbor terrorist,2,2,1.25


In [20]:
# Highest-scoring keywords containing "future".
find_topic(x = "future")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
1982 Reagan,near future,2,2,2.111111
1993 Clinton,economic future,2,2,1.867133
2013 Obama,future generation,2,2,1.5
1989 BushSr,better future,2,2,1.466667
1994 Clinton,own future,2,2,1.458333
2016 Obama,better future,2,2,1.351648


In [21]:
# Highest-scoring keywords containing "illegal".
find_topic(x = "illegal")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
1995 Clinton,illegal alien,2,2,3.0
2019 Trump,illegal immigration,2,2,2.8


In [22]:
# Highest-scoring keywords containing "prescription".
find_topic(x = "prescription")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
2000 Clinton,prescription drug,2,2,3.166667
2019 Trump,prescription drug,2,2,2.5


In [23]:
# Highest-scoring keywords containing "insurance".
find_topic(x = "insurance")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
2007 GWBush,health insurance,2,4,4.033613
2004 GWBush,health insurance,2,2,3.625
1994 Clinton,health insurance,2,4,3.253049
1999 Clinton,health insurance,2,2,3.15
2014 Obama,unemployment insurance,2,2,3.071429
1998 Clinton,health insurance,2,2,2.933333


In [24]:
# Highest-scoring keywords containing "employ" (substring match, so "unemployment" also qualifies).
find_topic(x = "employ")

Speech,keyword,ngram,freq,rake
<chr>,<chr>,<int>,<int>,<dbl>
2014 Obama,unemployment insurance,2,2,3.0714286
1983 Reagan,unemployment,1,2,2.0
1983 Reagan,term unemployed,2,2,1.6666667
1995 Clinton,employ people,2,2,1.6027397
1982 Reagan,unemployment,1,3,0.7142857
2018 Trump,employee,1,2,0.6666667
