## Return on self-promotion

In [1]:
# Root directory for the input CSV and the exported results. The trailing
# slash is required: paths are built from it by plain string concatenation.
data_root <- "~/Data/Promotion/"

In [2]:
library("lme4")
library("margins")
library("stargazer")
library("emmeans")
library("ggeffects")
library("broom")
library("broom.mixed")
library("MASS")
library("pscl")

Loading required package: Matrix

Please cite as: 

 Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.2. https://CRAN.R-project.org/package=stargazer 

Classes and Methods for R developed in the
Political Science Computational Laboratory
Department of Political Science
Stanford University
Simon Jackman
hurdle and zeroinfl functions by Achim Zeileis


In [3]:
# Raise the notebook's table display limits (repr options) to 500 rows and
# 200 columns.
options(
  repr.matrix.max.cols = 200,
  repr.matrix.max.rows = 500
)

In [4]:
# Load the regression table; the first row of the CSV holds the column names.
input_csv <- paste0(data_root, "reg_data_drop_missing.csv")
mydata <- read.csv(input_csv, header = TRUE)

In [5]:
nrow(mydata)

In [6]:
ncol(mydata)

In [7]:
# Keep only rows whose gender is recorded as "Male" or "Female".
is_binary_gender <- mydata$gender %in% c("Male", "Female")
mydata <- mydata[is_binary_gender, ]

In [8]:
nrow(mydata)

In [9]:
# Drop rows whose affiliation category is "unknown".
has_known_affiliation <- mydata$affiliation_cate != "unknown"
mydata <- mydata[has_known_affiliation, ]

In [10]:
nrow(mydata)

In [11]:
# Set the reference (baseline) level of each categorical predictor so model
# coefficients are contrasts against Male / last_position / domestic.
# factor() is applied first because read.csv() only returns factors when
# stringsAsFactors = TRUE (the default before R 4.0.0), and relevel() stops
# with an error on a plain character column. On a column that is already a
# factor, factor() only drops levels left empty by the row filters above.
mydata$gender <- relevel(factor(mydata$gender), ref = "Male")
mydata$authorship_pos <- relevel(
  factor(mydata$authorship_pos),
  ref = "last_position"
)
# mydata$is_corresponding <- relevel(factor(mydata$is_corresponding), ref = "yes")
mydata$affiliation_cate <- relevel(
  factor(mydata$affiliation_cate),
  ref = "domestic"
)

In [12]:
# Subject-area columns added as controls to every multivariate model
# (presumably 0/1 indicators -- confirm against the CSV).
# `keywords` is a formula fragment with a leading " + " so it can be pasted
# straight after a base formula string. It is assembled from a vector rather
# than a single string literal with backslash-newline continuations, which are
# not among the escape sequences documented in ?Quotes.
subject_areas <- c(
  "Social_Sciences",
  "Materials_Science",
  "Engineering",
  "Chemistry",
  "Biochemistry__Genetics_and_Molecular_Biology",
  "Medicine",
  "Nursing",
  "Agricultural_and_Biological_Sciences",
  "Pharmacology__Toxicology_and_Pharmaceutics",
  "Neuroscience",
  "Business__Management_and_Accounting",
  "Economics__Econometrics_and_Finance",
  "Chemical_Engineering",
  "Physics_and_Astronomy",
  "Computer_Science",
  "Decision_Sciences",
  "Health_Professions",
  "Psychology",
  "Immunology_and_Microbiology",
  "Dentistry",
  "Earth_and_Planetary_Sciences",
  "Environmental_Science",
  "Mathematics",
  "Arts_and_Humanities",
  "Energy",
  "Veterinary",
  "General"
)
keywords <- paste0(" + ", paste(subject_areas, collapse = " + "))

In [13]:
# log2(1 + x) transform of author_citation; the +1 keeps zero counts finite.
mydata$author_citation_log <- log2(mydata$author_citation + 1)

In [14]:
# log2(1 + x) transform of total_num_tweets; the +1 keeps zero counts finite.
mydata$total_num_tweets_log <- log2(mydata$total_num_tweets + 1)

In [15]:
# log2(1 + x) transform of matched_tid_follower_cn; the +1 keeps zeros finite.
mydata$matched_tid_follower_cn_log <- log2(mydata$matched_tid_follower_cn + 1)

### All observations

In [16]:
# Mean raw tweet count over the rows with gender == "Female".
female_rows <- mydata$gender == "Female"
mean(mydata[female_rows, "total_num_tweets"])

In [17]:
# Mean raw tweet count over the rows with gender == "Male".
male_rows <- mydata$gender == "Male"
mean(mydata[male_rows, "total_num_tweets"])

OLS regression

In [18]:
# Baseline OLS: log2 tweet count regressed on gender alone.
# The string is converted with as.formula() at the call site so the fitted
# model carries a formula bound to the workspace environment. A bare string is
# only coerced inside model.frame(), which binds it to that internal frame.
# This also matches how the multivariate models in this notebook are built.
base_str <- "total_num_tweets_log ~ 1 + gender"
m_binary <- lm(formula = as.formula(base_str), data = mydata)

In [19]:
tidy(m_binary)

term,estimate,std.error,statistic,p.value
(Intercept),2.37416637,0.001532009,1549.708054,0.0
genderFemale,-0.01366503,0.002547889,-5.363275,8.173406e-08


In [20]:
# OLS: log2 tweet count regressed on gender, controlling for journal_impact.
# The string is converted with as.formula() at the call site so the fitted
# model carries a formula bound to the workspace environment. A bare string is
# only coerced inside model.frame(), which binds it to that internal frame.
base_str <- "total_num_tweets_log ~ 1 + gender + journal_impact"
m_binary <- lm(formula = as.formula(base_str), data = mydata)

In [21]:
tidy(m_binary)

term,estimate,std.error,statistic,p.value
(Intercept),1.78807912,0.0017341452,1031.10116,0.0
genderFemale,0.02696282,0.0023771862,11.34233,8.110523e-30
journal_impact,0.09719969,0.0001629883,596.35974,0.0


Negative Binomial Regression

In [22]:
# Baseline negative binomial model: raw tweet count regressed on gender alone.
# The string is converted with as.formula() at the call site so the fitted
# model carries a formula bound to the workspace environment. A bare string is
# only coerced inside model.frame(), which binds it to that internal frame.
base_str <- "total_num_tweets ~ 1 + gender"
m_binary_bar <- glm.nb(formula = as.formula(base_str), data = mydata)

In [23]:
tidy(m_binary_bar)

term,estimate,std.error,statistic,p.value
(Intercept),2.89910601,0.001417326,2045.47583,0.0
genderFemale,-0.07520588,0.002358222,-31.89093,3.567338e-223


In [61]:
# Negative binomial model: raw tweet count regressed on gender, controlling
# for journal_impact.
# The string is converted with as.formula() at the call site so the fitted
# model carries a formula bound to the workspace environment. A bare string is
# only coerced inside model.frame(), which binds it to that internal frame.
base_str <- "total_num_tweets ~ 1 + gender + journal_impact"
m_binary_bar <- glm.nb(formula = as.formula(base_str), data = mydata)

In [62]:
tidy(m_binary_bar)

term,estimate,std.error,statistic,p.value
(Intercept),1.88464247,0.001547564,1217.811955,0.0
genderFemale,0.01107064,0.002122904,5.214855,1.839607e-07
journal_impact,0.10676763,0.000143436,744.357297,0.0


In [16]:
# Full negative binomial model on all observations: tweet count on the
# gender x self_promotion interaction plus author, affiliation and journal
# controls, followed by the subject-area terms held in `keywords`.
# The formula text is assembled with paste() instead of a string literal with
# backslash-newline continuations (not a documented escape in ?Quotes); the
# terms and their order are unchanged.
base_str <- paste(
  "total_num_tweets ~ 1 + gender * self_promotion + authorship_pos",
  "author_pub_count_cate + affiliation_rank_cate + affiliation_cate",
  "num_authors + journal_impact + author_citation_log",
  sep = " + "
)
equation <- as.formula(paste(base_str, keywords, sep = " "))
m_binary <- glm.nb(equation, data = mydata)

In [17]:
nobs(m_binary)

In [18]:
tidy(m_binary)

term,estimate,std.error,statistic,p.value
(Intercept),1.675146941,0.004147716,403.87217178,0.0
genderFemale,0.0148448947,0.002097625,7.07700164,1.473068e-12
self_promotionTrue,1.5694501068,0.004322618,363.07860576,0.0
authorship_posfirst_position,0.0087304068,0.003532068,2.47175519,0.01344516
authorship_posmiddle_position,0.2702541719,0.002745871,98.42201424,0.0
authorship_possolo_author,-0.1957947851,0.008846846,-22.13159181,1.569384e-108
author_pub_count_cate,-0.076826068,0.0009755406,-78.75229864,0.0
affiliation_rank_cate,-0.0243307339,0.0003583196,-67.90231828,0.0
affiliation_cateinternational,-0.1016629059,0.002118694,-47.98375491,0.0
num_authors,0.000104221,3.045684e-06,34.21923118,1.251648e-256


In [19]:
# Deviance-based pseudo R-squared: the model's residual deviance divided by
# the null (intercept-only) deviance, subtracted from 1. The smaller that
# ratio, the more of the null deviance the model accounts for.
fit_summary <- summary(m_binary)
1 - fit_summary$deviance / fit_summary$null.deviance


In [20]:
# Print the model as a LaTeX table: variable names, coefficients with
# significance stars, and p-values; stars at p < 0.05, 0.01 and 0.001.
# To pull each p-value up onto its coefficient row afterwards, search for
# " \\\\ \n  &" and replace it with " &" (regex find/replace, e.g. in Sublime).
stargazer(
  m_binary,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vc*p",
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Tue, May 10, 2022 - 03:45:09 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & total\_num\_tweets \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.015$^{***}$ \\ 
  & p = 0.000 \\ 
  self\_promotionTrue & 1.569$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.009$^{*}$ \\ 
  & p = 0.014 \\ 
  authorship\_posmiddle\_position & 0.270$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.196$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.077$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.024$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & $-$0.102$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & 0.0001$^{***}$ \\ 
  & p = 0.00

In [21]:
# show confidence intervals for presenting the result as a fig in the paper
# stargazer(m_binary, type = "latex", single.row=TRUE, ci = TRUE, report = ('vcs'), star.cutoffs = c(0.05, 0.01, 0.001))

In [None]:
# # random effects take too long to estimate (there are 539K papers)

# base_str <- "total_num_tweets ~ 1 + gender * self_promotion + authorship_pos + \
#         author_pub_count_cate + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact"
# equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
# m_binary_bar <- glmer.nb(equation, data = mydata)

Adjusted predictions

In [22]:
# Adjusted predictions for every self_promotion x gender combination.
# Author's note on the remaining covariates: weighted average for factor
# variables, median for non-factor variables.
MEs <- ggemmeans(
  m_binary,
  terms = c("self_promotion", "gender"),
  typical = "median"
)

In [23]:
MEs

x,predicted,std.error,conf.low,conf.high,group
False,7.728609,0.002847794,7.685591,7.771867,Male
False,7.844195,0.003080608,7.796975,7.8917,Female
True,37.12828,0.004754039,36.783935,37.475849,Male
True,39.131917,0.006483966,38.637762,39.632392,Female


In [24]:
# Save the adjusted predictions under data_root, without row names.
out_csv <- paste0(
  data_root,
  "reg_results_F_M/gender_return_on_self_promotion.csv"
)
write.csv(MEs, out_csv, row.names = FALSE)

### Self-promoted subset

Negative Binomial Regression

In [25]:
# Negative binomial model restricted to rows with self_promotion == "True":
# tweet count on gender, authorship position and log2 follower count, plus the
# author, affiliation and journal controls and the subject-area terms in
# `keywords`.
# The formula text is assembled with paste() instead of a string literal with
# backslash-newline continuations (not a documented escape in ?Quotes); the
# terms and their order are unchanged.
base_str <- paste(
  "total_num_tweets ~ 1 + gender + authorship_pos + matched_tid_follower_cn_log",
  "author_pub_count_cate + affiliation_rank_cate + affiliation_cate",
  "num_authors + journal_impact + author_citation_log",
  sep = " + "
)
equation <- as.formula(paste(base_str, keywords, sep = " "))
m_yes <- glm.nb(equation, data = mydata[mydata$self_promotion == "True", ])

In [26]:
nrow(mydata[mydata$self_promotion == "True", ])

In [27]:
tidy(m_yes)

term,estimate,std.error,statistic,p.value
(Intercept),2.481728991,0.01715861,144.6346042,0.0
genderFemale,0.06549068,0.006436649,10.1746544,2.573119e-24
authorship_posfirst_position,0.067410588,0.009041009,7.4560912,8.912709e-14
authorship_posmiddle_position,0.415193962,0.00767619,54.0885493,0.0
authorship_possolo_author,-0.231577909,0.01731439,-13.3748774,8.479336999999999e-41
matched_tid_follower_cn_log,0.108054921,0.001414454,76.393391,0.0
author_pub_count_cate,-0.066423486,0.003113162,-21.3363412,5.2225170000000005e-101
affiliation_rank_cate,-0.009001096,0.001077954,-8.3501648,6.816781e-17
affiliation_cateinternational,-0.117310555,0.006370177,-18.4155866,9.85162e-76
num_authors,0.0010601,2.860591e-05,37.0587846,1.296603e-300


In [28]:
# Print the self-promoted-subset model as a LaTeX table: variable names,
# coefficients with significance stars, and p-values; stars at p < 0.05, 0.01
# and 0.001. To pull each p-value up onto its coefficient row afterwards,
# search for " \\\\ \n  &" and replace it with " &" (regex find/replace, e.g.
# in Sublime).
stargazer(
  m_yes,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vc*p",
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Tue, May 10, 2022 - 03:50:11 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & total\_num\_tweets \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.065$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.067$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & 0.415$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.232$^{***}$ \\ 
  & p = 0.000 \\ 
  matched\_tid\_follower\_cn\_log & 0.108$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.066$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.009$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & $-$0.117$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & 0.001$^{***}$ \\ 

## Robustness check 1 (not due to coauthors' self-promotion)

### Solo-author papers (all obs)

In [29]:
# Robustness subset: rows whose authorship position is "solo_author".
is_solo_author <- mydata$authorship_pos == "solo_author"
subdata <- mydata[is_solo_author, ]

In [30]:
nrow(subdata)

Negative Binomial Regression

In [31]:
# Negative binomial model on `subdata` (the solo-author rows): tweet count on
# the gender x self_promotion interaction plus author, affiliation and journal
# controls and the subject-area terms in `keywords`. authorship_pos and
# num_authors are not included in this specification.
# The formula text is assembled with paste() instead of a string literal with
# backslash-newline continuations (not a documented escape in ?Quotes); the
# terms and their order are unchanged.
base_str <- paste(
  "total_num_tweets ~ 1 + gender * self_promotion",
  "author_pub_count_cate + affiliation_rank_cate + affiliation_cate",
  "journal_impact + author_citation_log",
  sep = " + "
)
equation <- as.formula(paste(base_str, keywords, sep = " "))
m_binary <- glm.nb(equation, data = subdata)

In [32]:
nobs(m_binary)

In [33]:
tidy(m_binary)

term,estimate,std.error,statistic,p.value
(Intercept),1.5493991,0.027965805,55.4033429,0.0
genderFemale,0.08645353,0.019931895,4.3374467,1.441474e-05
self_promotionTrue,1.60119501,0.024184618,66.2071655,0.0
author_pub_count_cate,-0.05469446,0.00868526,-6.2973887,3.027015e-10
affiliation_rank_cate,-0.01491372,0.002787646,-5.3499337,8.798647e-08
affiliation_cateinternational,-0.17118824,0.016916071,-10.1198582,4.510686e-24
journal_impact,0.04664741,0.001033368,45.1411539,0.0
author_citation_log,0.06454285,0.004854749,13.294786,2.481822e-40
Social_Sciences,0.09998604,0.022271913,4.4893333,7.144643e-06
Materials_Science,-0.64638074,0.087989449,-7.3461165,2.040486e-13


In [34]:
# Print the solo-author model as a LaTeX table: variable names, coefficients
# with significance stars, and p-values; stars at p < 0.05, 0.01 and 0.001.
# To pull each p-value up onto its coefficient row afterwards, search for
# " \\\\ \n  &" and replace it with " &" (regex find/replace, e.g. in Sublime).
stargazer(
  m_binary,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vc*p",
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Tue, May 10, 2022 - 03:50:14 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & total\_num\_tweets \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.086$^{***}$ \\ 
  & p = 0.00002 \\ 
  self\_promotionTrue & 1.601$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.055$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.015$^{***}$ \\ 
  & p = 0.00000 \\ 
  affiliation\_cateinternational & $-$0.171$^{***}$ \\ 
  & p = 0.000 \\ 
  journal\_impact & 0.047$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_citation\_log & 0.065$^{***}$ \\ 
  & p = 0.000 \\ 
  Social\_Sciences & 0.100$^{***}$ \\ 
  & p = 0.00001 \\ 
  Materials\_Science & $-$0.646$^{***}$ \\ 
  & p = 0.000 \\ 
  Engineering & $

### Solo-author papers (self-promoted subset)

Negative Binomial Regression

In [35]:
# Negative binomial model on the rows of `subdata` with
# self_promotion == "True": tweet count on gender and log2 follower count plus
# author, affiliation and journal controls and the subject-area terms in
# `keywords`.
# The formula text is assembled with paste() instead of a string literal with
# backslash-newline continuations (not a documented escape in ?Quotes); the
# terms and their order are unchanged.
base_str <- paste(
  "total_num_tweets ~ 1 + gender + matched_tid_follower_cn_log",
  "author_pub_count_cate + affiliation_rank_cate + affiliation_cate",
  "journal_impact + author_citation_log",
  sep = " + "
)
equation <- as.formula(paste(base_str, keywords, sep = " "))
m_yes <- glm.nb(equation, data = subdata[subdata$self_promotion == "True", ])

In [36]:
tidy(m_yes)

term,estimate,std.error,statistic,p.value
(Intercept),1.503837118,0.084655707,17.76415526,1.3394139999999999e-70
genderFemale,0.135924011,0.032061779,4.23944071,2.240774e-05
matched_tid_follower_cn_log,0.186100731,0.00731519,25.4403147,9.035819e-143
author_pub_count_cate,0.037073369,0.016520045,2.24414453,0.0248231
affiliation_rank_cate,-0.026840393,0.005290654,-5.07317074,3.912412e-07
affiliation_cateinternational,-0.290087666,0.032897415,-8.817947,1.165774e-18
journal_impact,0.037192622,0.001739265,21.38409819,1.8788360000000002e-101
author_citation_log,0.003663737,0.008973017,0.40830603,0.683049
Social_Sciences,-0.109730959,0.038868933,-2.8231019,0.004756145
Materials_Science,-0.777899999,0.21886284,-3.55428084,0.0003790142


### First authors (all obs)

In [37]:
# Robustness subset: rows whose authorship position is "first_position".
is_first_author <- mydata$authorship_pos == "first_position"
subdata <- mydata[is_first_author, ]

In [38]:
nrow(subdata)

Negative Binomial Regression

In [39]:
# Negative binomial model on `subdata` (the first-author rows): tweet count on
# the gender x self_promotion interaction plus author, affiliation and journal
# controls and the subject-area terms in `keywords`.
# The formula text is assembled with paste() instead of a string literal with
# backslash-newline continuations (not a documented escape in ?Quotes); the
# terms and their order are unchanged.
base_str <- paste(
  "total_num_tweets ~ 1 + gender * self_promotion",
  "author_pub_count_cate + affiliation_rank_cate + affiliation_cate",
  "num_authors + journal_impact + author_citation_log",
  sep = " + "
)
equation <- as.formula(paste(base_str, keywords, sep = " "))
# With the default control (maxit = 25) this fit stopped with the warning
# "alternation limit reached", i.e. glm.nb hit the iteration cap before the
# estimates settled. Allow more iterations; if the warning still appears,
# treat the estimates with caution.
m_binary <- glm.nb(
  equation,
  data = subdata,
  control = glm.control(maxit = 100)
)

“alternation limit reached”

In [40]:
nobs(m_binary)

In [41]:
tidy(m_binary)

term,estimate,std.error,statistic,p.value
(Intercept),1.656174387,0.007702563,215.0160044,0.0
genderFemale,0.015009012,0.005238779,2.8649828,0.004170318
self_promotionTrue,1.530059206,0.009072256,168.6525666,0.0
author_pub_count_cate,-0.042688072,0.002506704,-17.0295596,4.95759e-65
affiliation_rank_cate,-0.032851148,0.0008897864,-36.9202621,2.1862700000000003e-298
affiliation_cateinternational,-0.148184764,0.005353435,-27.6803134,1.205154e-168
num_authors,0.002820522,7.969039e-05,35.3935057,2.149155e-274
journal_impact,0.083997503,0.0004093922,205.1760941,0.0
author_citation_log,0.050234872,0.001363355,36.8465136,3.325771e-297
Social_Sciences,0.223488313,0.01190916,18.7660789,1.4306309999999998e-78


### First authors (self-promoted subset)

Negative Binomial Regression

In [42]:
# Negative binomial model on the rows of `subdata` with
# self_promotion == "True": tweet count on gender and log2 follower count plus
# author, affiliation and journal controls and the subject-area terms in
# `keywords`.
# The formula text is assembled with paste() instead of a string literal with
# backslash-newline continuations (not a documented escape in ?Quotes); the
# terms and their order are unchanged.
base_str <- paste(
  "total_num_tweets ~ 1 + gender + matched_tid_follower_cn_log",
  "author_pub_count_cate + affiliation_rank_cate + affiliation_cate",
  "num_authors + journal_impact + author_citation_log",
  sep = " + "
)
equation <- as.formula(paste(base_str, keywords, sep = " "))
m_yes <- glm.nb(equation, data = subdata[subdata$self_promotion == "True", ])

In [43]:
tidy(m_yes)

term,estimate,std.error,statistic,p.value
(Intercept),2.041705277,0.0274915805,74.2665659,0.0
genderFemale,0.03786,0.011653767,3.2487349,0.001159194
matched_tid_follower_cn_log,0.142092717,0.0026646201,53.3256945,0.0
author_pub_count_cate,-0.055115969,0.0058452502,-9.429189,4.132721e-21
affiliation_rank_cate,-0.017714008,0.0020559698,-8.615889,6.940094e-18
affiliation_cateinternational,-0.07123425,0.0121631289,-5.8565728,4.725165e-09
num_authors,0.013133448,0.0004589738,28.6148083,4.396273e-180
journal_impact,0.054153098,0.0007186901,75.3497217,0.0
author_citation_log,0.048143692,0.0031282358,15.3900457,1.9089730000000002e-53
Social_Sciences,-0.192472017,0.0212992042,-9.0365825,1.616463e-19


## Robustness check 2 (different definition of self-promotion)

### All observations

Negative Binomial Regression

In [44]:
# Robustness check with the alternative self-promotion column: same
# specification as the main all-observations model, but with
# self_promotion_def in place of self_promotion.
# The formula text is assembled with paste() instead of a string literal with
# backslash-newline continuations (not a documented escape in ?Quotes); the
# terms and their order are unchanged.
base_str <- paste(
  "total_num_tweets ~ 1 + gender * self_promotion_def + authorship_pos",
  "author_pub_count_cate + affiliation_rank_cate + affiliation_cate",
  "num_authors + journal_impact + author_citation_log",
  sep = " + "
)
equation <- as.formula(paste(base_str, keywords, sep = " "))
m_binary <- glm.nb(equation, data = mydata)

In [45]:
nobs(m_binary)

In [46]:
tidy(m_binary)

term,estimate,std.error,statistic,p.value
(Intercept),1.821917,0.004222783,431.4493835,0.0
genderFemale,0.001183581,0.00211467,0.5597002,0.5756839
self_promotion_defTrue,1.431342,0.005263336,271.9457934,0.0
authorship_posfirst_position,0.02071637,0.003598478,5.7569801,8.563199e-09
authorship_posmiddle_position,0.2162283,0.002797186,77.302104,0.0
authorship_possolo_author,-0.09759314,0.008991933,-10.8534107,1.921198e-27
author_pub_count_cate,-0.07102727,0.0009956269,-71.3392365,0.0
affiliation_rank_cate,-0.02615917,0.0003655818,-71.554909,0.0
affiliation_cateinternational,-0.120148,0.002162364,-55.563268,0.0
num_authors,8.305554e-05,3.108989e-06,26.7146492,3.181073e-157


### Self-promoted subset

Negative Binomial Regression

In [47]:
# Negative binomial model restricted to rows with self_promotion_def == "True":
# tweet count on gender, authorship position and log2 follower count, plus the
# author, affiliation and journal controls and the subject-area terms in
# `keywords`.
# The formula text is assembled with paste() instead of a string literal with
# backslash-newline continuations (not a documented escape in ?Quotes); the
# terms and their order are unchanged.
base_str <- paste(
  "total_num_tweets ~ 1 + gender + authorship_pos + matched_tid_follower_cn_log",
  "author_pub_count_cate + affiliation_rank_cate + affiliation_cate",
  "num_authors + journal_impact + author_citation_log",
  sep = " + "
)
equation <- as.formula(paste(base_str, keywords, sep = " "))
m_yes <- glm.nb(equation, data = mydata[mydata$self_promotion_def == "True", ])

In [48]:
nrow(mydata[mydata$self_promotion_def == "True", ])

In [49]:
tidy(m_yes)

term,estimate,std.error,statistic,p.value
(Intercept),2.342400693,0.02085152,112.3371794,0.0
genderFemale,0.077366244,0.007656466,10.1046933,5.266011e-24
authorship_posfirst_position,0.153622129,0.01068192,14.3815047,6.760849e-47
authorship_posmiddle_position,0.444241011,0.009061605,49.0245414,0.0
authorship_possolo_author,-0.135880279,0.02111675,-6.4347148,1.237054e-10
matched_tid_follower_cn_log,0.114384242,0.00172772,66.2053207,0.0
author_pub_count_cate,-0.056063202,0.003725276,-15.0494086,3.483343e-51
affiliation_rank_cate,-0.011263936,0.001280601,-8.7958179,1.420097e-18
affiliation_cateinternational,-0.05945776,0.007598502,-7.8249316,5.079332e-15
num_authors,0.001172718,3.288313e-05,35.6632239,1.469895e-278


In [50]:
# Print the self_promotion_def-subset model as a LaTeX table: variable names,
# coefficients with significance stars, and p-values; stars at p < 0.05, 0.01
# and 0.001. To pull each p-value up onto its coefficient row afterwards,
# search for " \\\\ \n  &" and replace it with " &" (regex find/replace, e.g.
# in Sublime).
stargazer(
  m_yes,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vc*p",
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Tue, May 10, 2022 - 03:58:48 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & total\_num\_tweets \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.077$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.154$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & 0.444$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.136$^{***}$ \\ 
  & p = 0.000 \\ 
  matched\_tid\_follower\_cn\_log & 0.114$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.056$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.011$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & $-$0.059$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & 0.001$^{***}$ \\ 