Skip to content
Permalink
Browse files

Initial commit

  • Loading branch information...
Jonas Kristoffer Lindeløv
Jonas Kristoffer Lindeløv committed Mar 13, 2019
1 parent e037f4e commit 8f443134eba01f197b483e1b5d6097547b1f883b
Showing with 450 additions and 0 deletions.
  1. +3 −0 .gitignore
  2. +79 −0 Google Scholar Scraper.R
  3. +13 −0 SPSS is dying.Rproj
  4. BIN SPSS is dying.docx
  5. +79 −0 citations.csv
  6. +92 −0 graphs.R
  7. +184 −0 trends.csv
@@ -0,0 +1,3 @@
.Rproj.user
.RData
.Rhistory
@@ -0,0 +1,79 @@
library(rvest)
library(httr)
library(tidyverse)

# Settings

# Publication years to query; one Scholar search is made per (software, year).
years = 2010:2018

# Google Scholar query string for each software package. The list names are
# the software labels used in the output table.
# (The original list ended with a trailing comma after the last entry, which
# makes list() fail with "argument 7 is empty"; it has been removed.)
searches = list(
  R = '"the R software" OR "the R project" OR "r-project.org" OR "R development core" OR "bioconductor" OR "lme4" OR "nlme" OR "lmeR function" OR "ggplot2" OR "Hmisc" OR "r function" OR "r package" OR "mass package" OR "plyr package" OR "mvtnorm"',
  SPSS = 'SPSS -"SPSS Modeler" -"Amos"',
  SAS = '"SAS Institute" -JMP -"Enterprise Miner"',
  STATA = '("stata" "college station") OR "StataCorp" OR "Stata Corp" OR "Stata Journal" OR "Stata Press" OR "stata command" OR "stata module"',
  Prism = 'GraphPad Prism',
  # NOTE(review): the first "(" in the JASP query is never closed. Left
  # untouched because editing the query would change the hit counts - confirm
  # the intended grouping before fixing.
  JASP = '("jasp" (bayesian OR bayes OR wagenmakers) OR ("jasp package" OR "jasp software" OR "jasp team" OR "jasp-stats") -"jasper" -"joint attention symbolic" -EURASIP -"Journal of Applied School Psychology" -"Journal of Applied social psychology"'
)

# c(min, max) in seconds: the pause before each search is drawn uniformly from
# this interval to prevent Scholar from rejecting searches.
sleep_interval = c(1, 10)

# URL template. The placeholder year 9999 (used for both as_ylo and as_yhi) is
# replaced by the actual year, and the escaped query is appended after "q=".
scholar_prefix = 'https://scholar.google.dk/scholar?hl=en&as_sdt=0%2C5&as_ylo=9999&as_yhi=9999&q='


###################
# HANDY FUNCTIONS #
###################

# Compose the Scholar search URL for one software/year combination.
#   software: name (or index) of an entry in `searches`
#   year:     year that replaces the 9999 placeholder in `scholar_prefix`
# Returns the complete URL as a string.
get_url = function(software, year) {
  # Escape the query: double quotes become %22, spaces become "+"
  query <- gsub('\"', '%22', searches[[software]])
  query <- gsub(' ', '+', query)

  # Fill in the year and append the escaped query
  paste0(gsub('9999', as.character(year), scholar_prefix), query)
}

# Download the page at `url` and return it as a parsed document
get_html = function(url) {
  # Alternative that was tried: content(GET(url))
  read_html(url)
}

# Extract the number of hits from a parsed Scholar results page.
#   html: parsed document, as returned by get_html()
# Returns the hit count as a numeric scalar, or NA_real_ when the page has no
# usable count.
extract_citations = function(html) {
  # Text of every element with the class that carries the hit count
  nodes <- html_nodes(html, css = '.gs_ab_mdw')
  hits_strings <- html_text(nodes)

  # The count is read from the second matching element. If it is missing,
  # report NA explicitly.
  # NOTE(review): a missing element may mean a block/captcha page - confirm.
  if (length(hits_strings) < 2 || is.na(hits_strings[2])) {
    return(NA_real_)
  }
  hits_string <- hits_strings[2]

  # Take the first run of digits and thousands-separating commas instead of
  # blindly taking the second "word": this parses "About 15,500 results" as
  # well as a string without a leading word such as "3 results".
  hit <- regmatches(hits_string, regexpr('[0-9][0-9,]*', hits_string))
  if (length(hit) == 0) {
    return(NA_real_)
  }

  # As numeric, not string
  as.numeric(gsub(',', '', hit, fixed = TRUE))
}

# Look up the Scholar hit count for one software in one year.
#   software: name (or index) of an entry in `searches`
#   year:     year to search
# Prints a status line and returns the count from extract_citations().
get_citations = function(software, year) {
  # Pause for a random duration within sleep_interval to prevent HTTP error 503
  Sys.sleep(runif(1, sleep_interval[1], sleep_interval[2]))

  # Build the URL, fetch the page and pull out the count
  citations <- extract_citations(get_html(get_url(software, year)))

  # Report progress, then hand back the count
  print(sprintf('Got %i scholar citations in %i for %s', citations, year, software))
  citations
}


#################
# DO THE SEARCH #
#################
# One row for every combination of year and software
citation_history = setNames(
  expand.grid(years, names(searches)),
  c('year', 'software')
)

# Run one Scholar search per remaining row.
# NOTE(review): only the STATA rows are kept here, so the file written below
# contains STATA only - confirm this filter is intended.
citation_history = citation_history %>%
  filter(software == 'STATA') %>%
  rowwise() %>%
  mutate(citations = get_citations(software, year))

# Store the result on disk so the searches need not be repeated in case
# Scholar locks you out
write.csv(citation_history, file = 'citations.csv', row.names = FALSE)
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
BIN +43.1 KB SPSS is dying.docx
Binary file not shown.
@@ -0,0 +1,79 @@
year,software,citations
2010,SAS,88900
2011,SAS,88000
2012,SAS,92900
2013,SAS,85400
2014,SAS,79300
2015,SAS,70800
2016,SAS,61600
2017,SAS,52100
2018,SAS,35600
2010,Prism,32400
2011,Prism,44200
2012,Prism,53800
2013,Prism,60700
2014,Prism,61200
2015,Prism,57800
2016,Prism,50400
2017,Prism,42200
2018,Prism,33500
2016,JASP,218
2017,JASP,511
2018,JASP,1110
1995,R,1110
1996,R,1060
1997,R,1070
1998,R,1150
1999,R,1110
2000,R,1370
2001,R,1640
2002,R,1980
2003,R,2580
2004,R,3800
2005,R,5990
2006,R,8420
2007,R,11900
2008,R,17000
2009,R,18600
2010,R,21700
2011,R,24400
2012,R,27000
2013,R,29100
2014,R,32000
2015,R,36800
2016,R,45700
2017,R,57500
2018,R,49200
1995,SPSS,10600
1996,SPSS,12700
1997,SPSS,17300
1998,SPSS,24500
1999,SPSS,38200
2000,SPSS,51300
2001,SPSS,87800
2002,SPSS,123000
2003,SPSS,174000
2004,SPSS,208000
2005,SPSS,290000
2006,SPSS,310000
2007,SPSS,353000
2008,SPSS,368000
2009,SPSS,367000
2010,SPSS,394000
2011,SPSS,371000
2012,SPSS,335000
2013,SPSS,323000
2014,SPSS,283000
2015,SPSS,229000
2016,SPSS,171000
2017,SPSS,124000
2018,SPSS,85700
2010,STATA,15500
2011,STATA,17600
2012,STATA,19700
2013,STATA,21900
2014,STATA,26700
2015,STATA,30800
2016,STATA,35300
2017,STATA,36900
2018,STATA,33400
@@ -0,0 +1,92 @@
library(tidyverse)
# Software levels to put first, in this order; needs to be common in both datasets
factor_order <- c('SPSS', 'R', 'SAS', 'STATA')
# Colour values handed to scale_colour_manual() in both plots
colors <- c('red', 'darkgreen', 'blue', 'magenta', 'black', 'orange')


##########################
# VISUALIZE SCHOLAR DATA #
##########################

# Read the Scholar counts (C for "citations"), keep the years from 2010 on,
# and move the factor_order levels of `software` to the front
C = mutate(
  filter(read.csv('citations.csv'), year >= 2010),
  software = fct_relevel(software, factor_order)
)

# Line-and-point chart of citations per year, one colour per software
plot_citations = ggplot(C, aes(x = year, y = citations, color = software)) +
  geom_line() +
  geom_point() +
  # Titles and axis labels
  labs(title = 'Scholar Citations', x = '', y = 'Citations') +
  # Axis ticks: every second year; every 50,000 citations with comma labels
  scale_x_continuous(breaks = seq(1996, 2030, by = 2)) +
  scale_y_continuous(breaks = seq(0, 4e5, 5e4), labels = scales::comma) +
  scale_colour_manual(values = colors) +
  # Base theme at size 13, with the year labels rotated to vertical
  theme_gray(13) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
#plot_citations



###########################
# VISUALIZE GOOGLE TRENDS #
###########################

# Read the Google Trends data (P for "popularity") and reduce it to one mean
# popularity value per year and software, from 2010 on
P = read.csv('trends.csv') %>%
  # Split Month at the dash into year and month, then reshape so that every
  # remaining column becomes (software, popularity) rows
  separate(Month, into = c('year', 'month'), sep = '-') %>%
  gather(key = 'software', value = 'popularity', -year, -month) %>%
  filter(year >= 2010) %>%
  # Yearly mean for each software
  group_by(year, software) %>%
  summarise(popularity = mean(popularity)) %>%
  ungroup() %>%
  # Tidy up: numeric year, factor_order levels first, and popularity divided
  # by 100 (it is plotted on a percent axis)
  mutate(
    year = as.numeric(as.character(year)),
    software = fct_relevel(software, factor_order),
    popularity = popularity / 100
  )


# Line-and-point chart of yearly mean popularity, one colour per software
plot_trends = ggplot(P, aes(x = year, y = popularity, color = software)) +
  geom_line() +
  geom_point() +
  # Titles and axis labels
  labs(title = 'Google Trends', x = '', y = 'Relative search proportion') +
  # Axis ticks: every second year; 0-100% in steps of 20%
  scale_x_continuous(breaks = seq(2010, 2030, by = 2)) +
  scale_y_continuous(breaks = seq(0, 1, 0.2), labels = scales::percent_format(1)) +
  scale_colour_manual(values = colors) +
  # Base theme at size 13, no legend, year labels rotated to vertical
  theme_gray(13) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "none"
  )
#plot_trends


####################
# All together now #
####################
library(patchwork)
# Trends on the left, citations on the right, with a narrow spacer in between
plot_trends + plot_spacer() + plot_citations +
  plot_layout(widths = c(0.48, 0.03, 0.51))
Oops, something went wrong.

0 comments on commit 8f44313

Please sign in to comment.
You can’t perform that action at this time.