## Return on self-promotion

In [1]:
# Directory that holds the input CSV. The trailing slash is required because
# the file name is appended to this string by plain concatenation.
data_root <- "~/Data/Promotion/"

In [2]:
library("lme4")
library("margins")
library("stargazer")
library("emmeans")
library("ggeffects")
library("broom")
library("broom.mixed")
library("MASS")
library("pscl")

Loading required package: Matrix

Please cite as: 

 Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.2. https://CRAN.R-project.org/package=stargazer 

Classes and Methods for R developed in the
Political Science Computational Laboratory
Department of Political Science
Stanford University
Simon Jackman
hurdle and zeroinfl functions by Achim Zeileis


In [3]:
# Raise the notebook's table display limits to 500 rows and 200 columns.
options(
  repr.matrix.max.rows = 500,
  repr.matrix.max.cols = 200
)

In [4]:
# Load the regression data set. String columns are read as factors explicitly:
# the relevel() calls later in this notebook only accept factors, and since
# R 4.0 read.csv() no longer converts strings to factors by default.
mydata <- read.csv(
  paste0(data_root, "reg_data_drop_missing.csv"),
  header = TRUE,
  stringsAsFactors = TRUE
)

In [5]:
# Row count of the data as loaded, before any filtering.
nrow(mydata)

In [6]:
# Column count of the loaded data.
ncol(mydata)

In [8]:
# Keep only the rows whose gender is recorded as "Male" or "Female".
mydata <- mydata[is.element(mydata$gender, c("Male", "Female")), ]

In [9]:
# Row count after restricting gender to Male/Female.
nrow(mydata)

In [10]:
# Drop the rows whose affiliation category is "unknown".
mydata <- mydata[!(mydata$affiliation_cate == "unknown"), ]

In [11]:
# Row count after removing the "unknown" affiliation category.
nrow(mydata)

In [12]:
# Set the reference (baseline) level of each categorical predictor, so model
# coefficients are contrasts against Male, last_position and domestic.
# relevel() stops with "'relevel' only for (unordered) factors" when a column
# is a character vector (what read.csv() returns by default since R 4.0).
# Wrapping in factor() makes the cell work in both cases; on an existing
# factor it only drops levels that no longer occur in the data.
mydata$gender <- relevel(factor(mydata$gender), ref = "Male")
mydata$authorship_pos <- relevel(factor(mydata$authorship_pos), ref = "last_position")
# mydata <- within(mydata, is_corresponding <- relevel(is_corresponding, ref = 'yes'))
mydata$affiliation_cate <- relevel(factor(mydata$affiliation_cate), ref = "domestic")

In [13]:
# Right-hand-side formula fragment listing the subject-area covariates
# (presumably one indicator column per research field -- confirm against the
# CSV). It starts with " + " so it can be appended to a base formula string
# with paste() before conversion via as.formula().
keywords <- " + Social_Sciences + Materials_Science + Engineering + Chemistry + \
        Biochemistry__Genetics_and_Molecular_Biology + Medicine + Nursing + Agricultural_and_Biological_Sciences + \
        Pharmacology__Toxicology_and_Pharmaceutics + Neuroscience + Business__Management_and_Accounting + \
        Economics__Econometrics_and_Finance + Chemical_Engineering + Physics_and_Astronomy + Computer_Science + \
        Decision_Sciences + Health_Professions + Psychology + Immunology_and_Microbiology + Dentistry + \
        Earth_and_Planetary_Sciences + Environmental_Science + Mathematics + Arts_and_Humanities + Energy + \
        Veterinary + General"

In [14]:
# Base-2 log of author_citation, shifted by 1 so that a value of 0 maps to 0
# instead of -Inf.
mydata$author_citation_log <- log2(mydata$author_citation + 1)

In [15]:
# Base-2 log of matched_tid_follower_cn, shifted by 1 so that a value of 0
# maps to 0 instead of -Inf.
mydata$matched_tid_follower_cn_log <- log2(mydata$matched_tid_follower_cn + 1)

## Number of scientists

### All observations

In [16]:
# Smallest observed value of the outcome `researcher`, which the models below
# treat as a count.
min(mydata$researcher)

In [17]:
# Mean of `researcher` over the rows with gender "Female".
mean(mydata$researcher[mydata$gender == "Female"])

In [18]:
# Mean of `researcher` over the rows with gender "Male".
mean(mydata$researcher[mydata$gender == "Male"])

Negative Binomial Regression

In [19]:
# Baseline negative binomial model: `researcher` regressed on gender alone.
base_str <- "researcher ~ 1 + gender"
m_binary_bar <- glm.nb(as.formula(base_str), data = mydata)

In [20]:
# Coefficient table (estimate, std.error, statistic, p.value) of the
# gender-only model.
tidy(m_binary_bar)

term,estimate,std.error,statistic,p.value
(Intercept),1.39870988,0.001806485,774.27151,0.0
genderFemale,-0.08939577,0.003008843,-29.71102,5.533677e-194


In [21]:
# Baseline model extended with journal_impact as the only control; this
# overwrites the previous m_binary_bar.
base_str <- "researcher ~ 1 + gender + journal_impact"
m_binary_bar <- glm.nb(as.formula(base_str), data = mydata)

In [22]:
# Coefficient table of the gender + journal_impact model.
tidy(m_binary_bar)

term,estimate,std.error,statistic,p.value
(Intercept),0.30161853,0.0019578123,154.058966,0.0
genderFemale,-0.01915782,0.0026913464,-7.118302,1.092645e-12
journal_impact,0.11681828,0.0001763518,662.416118,0.0


In [19]:
# Full specification on all observations: gender x self_promotion interaction
# plus the author, affiliation and journal controls. The subject-area terms in
# `keywords` are appended before the string is converted to a formula.
base_str <- "researcher ~ 1 + gender * self_promotion + authorship_pos + \
        author_pub_count_cate + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log"
equation <- as.formula(paste(base_str, keywords))
m_binary <- glm.nb(formula = equation, data = mydata)

In [20]:
# Number of observations used in the fitted model.
nobs(m_binary)

In [21]:
# Coefficient table (estimate, std.error, statistic, p.value) of the full model.
tidy(m_binary)

term,estimate,std.error,statistic,p.value
(Intercept),0.1953920879,0.005250717,37.212458,4.2914009999999995e-303
genderFemale,0.0173111281,0.002665194,6.495261,8.288983e-11
self_promotionTrue,1.594786471,0.005204375,306.431879,0.0
authorship_posfirst_position,0.0110787551,0.004522136,2.449894,0.01428981
authorship_posmiddle_position,0.3000863731,0.003506016,85.591851,0.0
authorship_possolo_author,-0.2065105354,0.0112929,-18.286752,1.055174e-74
author_pub_count_cate,-0.1243237396,0.00123141,-100.960517,0.0
affiliation_rank_cate,-0.0412932605,0.000453446,-91.065448,0.0
affiliation_cateinternational,-0.0974749361,0.002665421,-36.570185,8.520293e-293
num_authors,0.0001109678,3.812567e-06,29.105793,3.032279e-186


In [22]:
# Deviance-based pseudo R-squared: 1 - (residual deviance / null deviance),
# i.e. the share of the intercept-only model's deviance that the fitted model
# accounts for.
1 - m_binary$deviance / m_binary$null.deviance


In [23]:
# Export the model as a LaTeX table: coefficients with significance stars
# (cut-offs 0.05 / 0.01 / 0.001) and p-values.
# Post-processing tip: in Sublime Text, replace " \\\\ \n  &" with " &" so each
# p-value moves onto the same table row as its coefficient.
stargazer(
  m_binary,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vc*p",
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Tue, May 10, 2022 - 04:03:40 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & researcher \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.017$^{***}$ \\ 
  & p = 0.000 \\ 
  self\_promotionTrue & 1.595$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.011$^{*}$ \\ 
  & p = 0.015 \\ 
  authorship\_posmiddle\_position & 0.300$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.207$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.124$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.041$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & $-$0.097$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & 0.0001$^{***}$ \\ 
  & p = 0.000 \\ 
  

### Self-promoted subset

Negative Binomial Regression

In [24]:
# Same outcome, restricted to rows with self_promotion == "True". The
# self_promotion term is dropped and the log follower count
# (matched_tid_follower_cn_log) is added as a control.
base_str <- "researcher ~ 1 + gender + authorship_pos + matched_tid_follower_cn_log + \
        author_pub_count_cate + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log"
equation <- as.formula(paste(base_str, keywords))
m_yes <- glm.nb(formula = equation, data = mydata[mydata$self_promotion == "True", ])

In [25]:
# Row count of the self-promoted subset passed to glm.nb() above.
# NOTE(review): the fit may drop rows with missing model variables, so
# nobs(m_yes) could be smaller than this count -- confirm.
nrow(mydata[mydata$self_promotion == "True", ])

In [26]:
# Coefficient table of the model fitted on the self-promoted subset.
tidy(m_yes)

term,estimate,std.error,statistic,p.value
(Intercept),0.9608714085,0.01703314,56.4118928,0.0
genderFemale,0.0910016147,0.006362132,14.3036355,2.07671e-46
authorship_posfirst_position,0.0790049051,0.008982112,8.7958046,1.4202660000000001e-18
authorship_posmiddle_position,0.3653361067,0.007591775,48.1226225,0.0
authorship_possolo_author,-0.2962323933,0.01742159,-17.0037493,7.703169e-65
matched_tid_follower_cn_log,0.1170529651,0.001404737,83.3272934,0.0
author_pub_count_cate,-0.134224989,0.003077311,-43.617624,0.0
affiliation_rank_cate,-0.0246866275,0.001066255,-23.1526453,1.367238e-118
affiliation_cateinternational,-0.0821448799,0.006280782,-13.0787653,4.354561e-39
num_authors,0.0007912782,2.771271e-05,28.5528929,2.586001e-179


In [27]:
# Export the self-promoted-subset model as a LaTeX table: coefficients with
# significance stars (cut-offs 0.05 / 0.01 / 0.001) and p-values.
# Post-processing tip: in Sublime Text, replace " \\\\ \n  &" with " &" so each
# p-value moves onto the same table row as its coefficient.
stargazer(
  m_yes,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vc*p",
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Tue, May 10, 2022 - 04:03:58 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & researcher \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.091$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.079$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & 0.365$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.296$^{***}$ \\ 
  & p = 0.000 \\ 
  matched\_tid\_follower\_cn\_log & 0.117$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.134$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.025$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & $-$0.082$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & 0.001$^{***}$ \\ 
  & p =

## Number of non-scientists

### All observations

In [32]:
# Smallest observed value of the outcome `num_non_scientists`, which the
# models below treat as a count.
min(mydata$num_non_scientists)

In [33]:
# Mean of `num_non_scientists` over the rows with gender "Female".
mean(mydata$num_non_scientists[mydata$gender == "Female"])

In [34]:
# Mean of `num_non_scientists` over the rows with gender "Male".
mean(mydata$num_non_scientists[mydata$gender == "Male"])

Negative Binomial Regression

In [35]:
# Baseline negative binomial model: `num_non_scientists` regressed on gender
# alone.
base_str <- "num_non_scientists ~ 1 + gender"
m_binary_bar <- glm.nb(as.formula(base_str), data = mydata)

In [36]:
# Coefficient table (estimate, std.error, statistic, p.value) of the
# gender-only model.
tidy(m_binary_bar)

term,estimate,std.error,statistic,p.value
(Intercept),2.53448163,0.001413511,1793.0397,0.0
genderFemale,-0.06839597,0.002352205,-29.07738,6.936884e-186


In [37]:
# Baseline model extended with journal_impact as the only control; this
# overwrites the previous m_binary_bar.
base_str <- "num_non_scientists ~ 1 + gender + journal_impact"
m_binary_bar <- glm.nb(as.formula(base_str), data = mydata)

In [38]:
# Coefficient table of the gender + journal_impact model.
tidy(m_binary_bar)

term,estimate,std.error,statistic,p.value
(Intercept),1.5544031,0.001542573,1007.669318,0.0
genderFemale,0.0191186,0.002116514,9.033057,1.6694229999999998e-19
journal_impact,0.1028258,0.000142137,723.42698,0.0


In [28]:
# Full specification on all observations: gender x self_promotion interaction
# plus the author, affiliation and journal controls. The subject-area terms in
# `keywords` are appended before the string is converted to a formula.
base_str <- "num_non_scientists ~ 1 + gender * self_promotion + authorship_pos + \
        author_pub_count_cate + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log"
equation <- as.formula(paste(base_str, keywords))
m_binary <- glm.nb(formula = equation, data = mydata)

In [29]:
# Number of observations used in the fitted model.
nobs(m_binary)

In [30]:
# Coefficient table (estimate, std.error, statistic, p.value) of the full model.
tidy(m_binary)

term,estimate,std.error,statistic,p.value
(Intercept),1.362463,0.004140654,329.0454007,0.0
genderFemale,0.01617873,0.002095327,7.7213374,1.151154e-14
self_promotionTrue,1.501359,0.004276208,351.0959666,0.0
authorship_posfirst_position,0.01035959,0.003531658,2.9333513,0.003353243
authorship_posmiddle_position,0.2391506,0.002744304,87.1443497,0.0
authorship_possolo_author,-0.1865916,0.008844143,-21.0977593,8.339538000000001e-99
author_pub_count_cate,-0.05987906,0.0009735538,-61.5056525,0.0
affiliation_rank_cate,-0.02045037,0.0003576485,-57.1800894,0.0
affiliation_cateinternational,-0.1092388,0.002112426,-51.7125,0.0
num_authors,8.395813e-05,3.066321e-06,27.3807354,4.651749e-165


In [31]:
# Deviance-based pseudo R-squared: 1 - (residual deviance / null deviance),
# i.e. the share of the intercept-only model's deviance that the fitted model
# accounts for.
1 - m_binary$deviance / m_binary$null.deviance


In [32]:
# Export the model as a LaTeX table: coefficients with significance stars
# (cut-offs 0.05 / 0.01 / 0.001) and p-values.
# Post-processing tip: in Sublime Text, replace " \\\\ \n  &" with " &" so each
# p-value moves onto the same table row as its coefficient.
stargazer(
  m_binary,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vc*p",
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Tue, May 10, 2022 - 04:09:46 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & num\_non\_scientists \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.016$^{***}$ \\ 
  & p = 0.000 \\ 
  self\_promotionTrue & 1.501$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.010$^{**}$ \\ 
  & p = 0.004 \\ 
  authorship\_posmiddle\_position & 0.239$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.187$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.060$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.020$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & $-$0.109$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & 0.0001$^{***}$ \\ 
  & p = 0

### Self-promoted subset

Negative Binomial Regression

In [33]:
# Same outcome, restricted to rows with self_promotion == "True". The
# self_promotion term is dropped and the log follower count
# (matched_tid_follower_cn_log) is added as a control.
base_str <- "num_non_scientists ~ 1 + gender + authorship_pos + matched_tid_follower_cn_log + \
        author_pub_count_cate + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log"
equation <- as.formula(paste(base_str, keywords))
m_yes <- glm.nb(formula = equation, data = mydata[mydata$self_promotion == "True", ])

In [34]:
# Row count of the self-promoted subset passed to glm.nb() above.
# NOTE(review): the fit may drop rows with missing model variables, so
# nobs(m_yes) could be smaller than this count -- confirm.
nrow(mydata[mydata$self_promotion == "True", ])

In [35]:
# Coefficient table of the model fitted on the self-promoted subset.
tidy(m_yes)

term,estimate,std.error,statistic,p.value
(Intercept),2.208960469,0.01845744,119.6786105,0.0
genderFemale,0.057845611,0.006923082,8.3554713,6.517196000000001e-17
authorship_posfirst_position,0.053602959,0.009731413,5.5082399,3.624392e-08
authorship_posmiddle_position,0.425991281,0.008258613,51.5814563,0.0
authorship_possolo_author,-0.212382646,0.01864659,-11.3898902,4.695812e-30
matched_tid_follower_cn_log,0.096266663,0.001521672,63.2637252,0.0
author_pub_count_cate,-0.046685868,0.003347964,-13.9445549,3.3955959999999995e-44
affiliation_rank_cate,-0.005416488,0.001159423,-4.6717102,2.987021e-06
affiliation_cateinternational,-0.176312153,0.006848992,-25.7427885,3.8821669999999998e-146
num_authors,0.001157518,3.071838e-05,37.6816095,0.0


In [36]:
# Export the self-promoted-subset model as a LaTeX table: coefficients with
# significance stars (cut-offs 0.05 / 0.01 / 0.001) and p-values.
# Post-processing tip: in Sublime Text, replace " \\\\ \n  &" with " &" so each
# p-value moves onto the same table row as its coefficient.
stargazer(
  m_yes,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vc*p",
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Tue, May 10, 2022 - 04:10:07 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & num\_non\_scientists \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.058$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.054$^{***}$ \\ 
  & p = 0.00000 \\ 
  authorship\_posmiddle\_position & 0.426$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.212$^{***}$ \\ 
  & p = 0.000 \\ 
  matched\_tid\_follower\_cn\_log & 0.096$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.047$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.005$^{***}$ \\ 
  & p = 0.00001 \\ 
  affiliation\_cateinternational & $-$0.176$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & 0.001$^{***