In [1]:
Data_Root <- "/Data/Promotion/revision/"
data_root <- "/Data/Promotion/revision/reg_results_F_M/"

In [71]:
library("lme4")
library("margins")
library("stargazer")
library("emmeans")
library("ggeffects")
library("broom")
library("broom.mixed")
library("dplyr")
library("MuMIn")

In [4]:
# https://www.rdocumentation.org/packages/ggeffects/versions/1.1.1
# https://www.rdocumentation.org/packages/ggeffects/versions/1.1.1/topics/ggeffect
packageVersion("ggeffects")

[1] ‘1.1.4’

In [6]:
# library("sjmisc")
# ?sjmisc::typical_value

In [5]:
options(repr.matrix.max.rows=500, repr.matrix.max.cols=200)

In [6]:
options(dplyr.print_max = 100)

In [59]:
mydata <- read.csv(paste(Data_Root, "reg_data_drop_missing.csv", sep = ''), header = TRUE, stringsAsFactors = TRUE)

In [60]:
# this is after dropping missing data
nrow(mydata)

In [61]:
ncol(mydata)

In [62]:
mydata = mydata[mydata$affiliation_cate != 'unknown', ]

In [63]:
mydata = mydata[mydata$gender != 'Unknown', ]

In [64]:
table(mydata$gender)


 Female    Male  Unisex Unknown 
4025650 7371102  808320       0 

In [65]:
nrow(mydata)

In [115]:
mydata$pub_year <- as.factor(mydata$pub_year)

In [67]:
mydata <- within(mydata, gender <- relevel(gender, ref = 'Male'))
mydata <- within(mydata, authorship_pos <- relevel(authorship_pos, ref = 'last_position'))
mydata <- within(mydata, affiliation_cate <- relevel(affiliation_cate, ref = 'domestic'))

In [68]:
keywords <- " + Social_Sciences + Materials_Science + Engineering + Chemistry + \
        Biochemistry__Genetics_and_Molecular_Biology + Medicine + Nursing + Agricultural_and_Biological_Sciences + \
        Pharmacology__Toxicology_and_Pharmaceutics + Neuroscience + Business__Management_and_Accounting + \
        Economics__Econometrics_and_Finance + Chemical_Engineering + Physics_and_Astronomy + Computer_Science + \
        Decision_Sciences + Health_Professions + Psychology + Immunology_and_Microbiology + Dentistry + \
        Earth_and_Planetary_Sciences + Environmental_Science + Mathematics + Arts_and_Humanities + Energy + \
        Veterinary + General"

In [69]:
mydata$author_citation_log <- log2(1 + mydata$author_citation)

### SI - include unisex

In [24]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_all_tw_0 <- glmer(equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


In [25]:
tidy(m_all_tw_0)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.441800012,0.01138513,-390.140571,0.0
fixed,,genderFemale,-0.347689815,0.00380382,-91.405438,0.0
fixed,,genderUnisex,-0.753432056,0.009625669,-78.273218,0.0
fixed,,authorship_posfirst_position,0.363608947,0.004899606,74.211879,0.0
fixed,,authorship_posmiddle_position,-0.639838508,0.00434085,-147.399363,0.0
fixed,,authorship_possolo_author,0.843901177,0.009889549,85.332626,0.0
fixed,,author_pub_count_cate,0.286180719,0.003121574,91.678324,0.0
fixed,,I(author_pub_count_cate^2),-0.02056712,0.0002573086,-79.931714,0.0
fixed,,affiliation_rank_cate,-0.047655775,0.0006779367,-70.295322,0.0
fixed,,affiliation_cateinternational,0.048979383,0.00420009,11.661508,2.0046000000000002e-31


In [29]:
# use this search string in Sublime: " \\\\ \n  &" with " &"
stargazer(m_all_tw_0, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))



% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Nov 23, 2022 - 08:34:59 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.348$^{***}$ \\ 
  & p = 0.000 \\ 
  genderUnisex & $-$0.753$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.364$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.640$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.844$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & 0.286$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.021$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.048$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational 

In [72]:
# Pseudo-R-squared for Generalized Mixed-Effect models
# https://www.rdocumentation.org/packages/MuMIn/versions/1.47.1/topics/r.squaredGLMM

# The marginal R squared values are those associated with your fixed effects, the conditional ones are those of your
# fixed effects plus the random effects. Usually we will be interested in the marginal effects.
r.squaredGLMM(m_all_tw_0)

“'r.squaredGLMM' now calculates a revised statistic. See the help page.”
“the null model is correct only if all variables used by the original model remain unchanged.”


Unnamed: 0,R2m,R2c
theoretical,0.19235363,0.5197861
delta,0.07611118,0.2056708


### Focus on Male and Female (drop unisex)

In [78]:
mydata = mydata[mydata$gender %in% c("Male", "Female"), ]

In [79]:
nrow(mydata)

In [80]:
n_distinct(mydata$author_id)

### SI - A series of regressions

In [65]:
m1 <- glm(formula = 'self_promotion ~ 1 + gender', data = mydata, family = "binomial")

In [66]:
tidy(m1)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-3.0267476,0.001754046,-1725.58066,0
genderFemale,-0.2978328,0.003238054,-91.97894,0


In [214]:
m2 <- glm(formula = 'self_promotion ~ 1 + gender + pub_year + authorship_pos + num_authors', data = mydata, family = "binomial")


In [217]:
m3 <- glm(formula = 'self_promotion ~ 1 + gender + pub_year + authorship_pos + num_authors + \
        journal_impact + affiliation_cate + affiliation_rank_cate + author_pub_count_cate + \
        I(author_pub_count_cate^2) + author_citation_log', data = mydata, family = "binomial")

In [220]:
base_str <- "self_promotion ~ 1 + gender + pub_year + authorship_pos + num_authors + journal_impact + \
        affiliation_cate + affiliation_rank_cate + author_pub_count_cate + I(author_pub_count_cate^2) + \
        author_citation_log"
equation <- as.formula(paste(base_str, keywords, sep = " "))
m4 <- glm(formula = equation, data = mydata, family = "binomial")

In [70]:
base_str <- "self_promotion ~ 1 + gender + pub_year + authorship_pos + num_authors + journal_impact + \
        affiliation_cate + affiliation_rank_cate + author_pub_count_cate + I(author_pub_count_cate^2) + \
        author_citation_log"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m5 <- glmer(equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)

In [80]:
r.squaredGLMM(m5)

“the null model is correct only if all variables used by the original model remain unchanged.”


Unnamed: 0,R2m,R2c
theoretical,0.18254972,0.5105761
delta,0.07211197,0.2016911


In [71]:
stargazer(m1, m2, m3, m4, m5, type = "latex", dep.var.labels = "Self-promotion = True", 
          single.row=TRUE, ci = FALSE, star.cutoffs = c(0.05, 0.01, 0.001), report = "vc*")


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Nov 23, 2022 - 09:41:07 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lccccc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{5}{c}{\textit{Dependent variable:}} \\ 
\cline{2-6} 
\\[-1.8ex] & \multicolumn{3}{c}{Self-promotion = True} & \multicolumn{2}{c}{NA} \\ 
\\[-1.8ex] & \multicolumn{3}{c}{\textit{logistic}} & \textit{logistic} & \textit{generalized linear} \\ 
 & \multicolumn{3}{c}{\textit{}} & \textit{} & \textit{mixed-effects} \\ 
\\[-1.8ex] & (1) & (2) & (3) & (4) & (5)\\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.298$^{***}$ & $-$0.318$^{***}$ & $-$0.284$^{***}$ & $-$0.311$^{***}$ & $-$0.348$^{***}$ \\ 
  pub\_year2014 &  & 0.319$^{***}$ & 0.318$^{***}$ & 0.320$^{***}$ & 0.305$^{***}$ \\ 
  pub\_year2015 &  & 0.635$^{***}$ & 0.653$^{***}$ & 0.656$^{***}$ & 0.607$^{***}$ \\ 
  pub\_year201

### SI - gender x location

In [62]:
base_str <- "self_promotion ~ 1 + gender * affiliation_cate + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_all_tw_loc <- glmer(equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


In [63]:
tidy(m_all_tw_loc)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.40664684,0.01159617,-380.008698,0.0
fixed,,genderFemale,-0.41226583,0.006236604,-66.104215,0.0
fixed,,affiliation_cateinternational,0.02253202,0.004784363,4.709514,2.483083e-06
fixed,,authorship_posfirst_position,0.3609027,0.004969984,72.616466,0.0
fixed,,authorship_posmiddle_position,-0.63794437,0.004395535,-145.134635,0.0
fixed,,authorship_possolo_author,0.82749351,0.009972566,82.976993,0.0
fixed,,author_pub_count_cate,0.28638009,0.003174041,90.225701,0.0
fixed,,I(author_pub_count_cate^2),-0.02064193,0.0002614424,-78.953996,0.0
fixed,,affiliation_rank_cate,-0.04757269,0.0006852733,-69.421483,0.0
fixed,,num_authors,-0.00223791,7.904005e-05,-28.313623,2.348938e-176


In [64]:
stargazer(m_all_tw_loc, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))



% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Nov 23, 2022 - 08:48:55 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.412$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.023$^{***}$ \\ 
  & p = 0.00001 \\ 
  authorship\_posfirst\_position & 0.361$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.638$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.827$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & 0.286$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.021$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.048$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors &

In [77]:
r.squaredGLMM(m_all_tw_loc)

“'r.squaredGLMM' now calculates a revised statistic. See the help page.”
“the null model is correct only if all variables used by the original model remain unchanged.”


Unnamed: 0,R2m,R2c
theoretical,0.18260111,0.510505
delta,0.07212594,0.2016453


### 1. Gender gap in overall self-promotion (without Tw control)

All tweets

In [129]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_all_tw <- glmer(equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


In [130]:
tidy(m_all_tw)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.425075143,0.01151645,-384.239575,0.0
fixed,,genderFemale,-0.348223403,0.003799982,-91.63817,0.0
fixed,,authorship_posfirst_position,0.361379997,0.004969704,72.71661,0.0
fixed,,authorship_posmiddle_position,-0.637938251,0.004395494,-145.134584,0.0
fixed,,authorship_possolo_author,0.827767711,0.009972733,83.003092,0.0
fixed,,author_pub_count_cate,0.286002683,0.003174216,90.101822,0.0
fixed,,I(author_pub_count_cate^2),-0.020620638,0.0002614406,-78.87312,0.0
fixed,,affiliation_rank_cate,-0.047512832,0.0006852199,-69.339537,0.0
fixed,,affiliation_cateinternational,0.051606361,0.004239058,12.174016,4.275201e-34
fixed,,num_authors,-0.002237233,7.905595e-05,-28.299358,3.519338e-176


In [131]:
MEs = ggemmeans(m_all_tw, terms=c('gender', 'pub_year'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using `terms="gender [all]"` to get smooth plots. See also package-vignette 'Marginal Effects at Specific Values'.



In [132]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.02462171,0.008716538,0.02421474,0.02503534,2013
Male,0.03310707,0.007680173,0.03262858,0.03359233,2014
Male,0.04424832,0.006907478,0.04367929,0.04482441,2015
Male,0.05556626,0.006459304,0.05490561,0.05623439,2016
Male,0.07575889,0.006069873,0.07493008,0.07659611,2017
Male,0.10550519,0.005705154,0.10445456,0.10656514,2018
Female,0.01750824,0.009086367,0.01720451,0.01781723,2013
Female,0.02360141,0.008079836,0.02323922,0.02396911,2014
Female,0.03164851,0.007327173,0.03121134,0.0320916,2015
Female,0.03987811,0.006887576,0.03936444,0.04039819,2016


In [133]:
fname = paste(data_root, 'pred_all_tweet.csv', sep = '')
write.csv(MEs, fname, row.names = FALSE)

In [134]:
# use this search string in Sublime: " \\\\ \n  &" with " &"
stargazer(m_all_tw, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Thu, Oct 06, 2022 - 12:36:59 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.348$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.361$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.638$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.828$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & 0.286$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.021$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.048$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.052$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & $-$0.0

Original tweets

In [135]:
base_str <- "self_promotion_original ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
            affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi)", keywords, sep = " "))
m_org <- glmer(formula = equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


In [136]:
tidy(m_org)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.495998397,0.01214685,-370.137083,0.0
fixed,,genderFemale,-0.375214975,0.004257129,-88.138041,0.0
fixed,,authorship_posfirst_position,0.389046857,0.005361844,72.558407,0.0
fixed,,authorship_posmiddle_position,-0.791566219,0.004897553,-161.624835,0.0
fixed,,authorship_possolo_author,0.851814398,0.009881296,86.204725,0.0
fixed,,author_pub_count_cate,0.312382801,0.003561664,87.706974,0.0
fixed,,I(author_pub_count_cate^2),-0.02383027,0.0002964468,-80.38634,0.0
fixed,,affiliation_rank_cate,-0.048430451,0.0007357,-65.829073,0.0
fixed,,affiliation_cateinternational,0.090099054,0.004421905,20.375619,2.752124e-92
fixed,,num_authors,-0.002013666,6.760717e-05,-29.784806,6.146184e-195


In [137]:
# weighted average for factor variable, median for non-factor variables
MEs = ggemmeans(m_org, terms=c('gender', 'pub_year'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using `terms="gender [all]"` to get smooth plots. See also package-vignette 'Marginal Effects at Specific Values'.



In [138]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.02408541,0.008785418,0.02368397,0.02449348,2013
Male,0.03113302,0.007771325,0.03067685,0.03159575,2014
Male,0.04002107,0.006999128,0.03949734,0.04055145,2015
Male,0.04950268,0.006519188,0.04890492,0.05010735,2016
Male,0.06644085,0.00608092,0.06570541,0.06718394,2017
Male,0.09041826,0.005669784,0.08950848,0.09133636,2018
Female,0.01667574,0.009262735,0.01638064,0.01697606,2013
Female,0.02160321,0.008288187,0.02126251,0.02194925,2014
Female,0.02784883,0.007544628,0.02745128,0.02825198,2015
Female,0.03455044,0.007079812,0.03409055,0.03501631,2016


In [139]:
fname = paste(data_root, 'pred_original.csv', sep = '')
write.csv(MEs, fname, row.names = FALSE)

Retweets

In [140]:
base_str <- "self_promotion_retweet ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
            affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"

In [141]:
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_retweet <- glmer(formula = equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


In [142]:
tidy(m_retweet)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-5.650606787,0.0181768047,-310.8690931,0.0
fixed,,genderFemale,-0.280162154,0.0053392034,-52.4726508,0.0
fixed,,authorship_posfirst_position,0.321284964,0.0073228872,43.8740832,0.0
fixed,,authorship_posmiddle_position,-0.476481053,0.0063446287,-75.0999114,0.0
fixed,,authorship_possolo_author,0.638180184,0.0154091946,41.4155444,0.0
fixed,,author_pub_count_cate,0.237947701,0.004511105,52.7470991,0.0
fixed,,I(author_pub_count_cate^2),-0.016752953,0.0003697853,-45.3045366,0.0
fixed,,affiliation_rank_cate,-0.050509395,0.0009756264,-51.7712489,0.0
fixed,,affiliation_cateinternational,0.042039049,0.0060792932,6.9151213,4.674633e-12
fixed,,num_authors,-0.001843363,0.0001028801,-17.917591,8.597127e-72


In [143]:
# weighted average for factor variable, median for non-factor variables
MEs = ggemmeans(m_retweet, terms=c('gender', 'pub_year'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using `terms="gender [all]"` to get smooth plots. See also package-vignette 'Marginal Effects at Specific Values'.



In [144]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.006564572,0.014724795,0.006379017,0.006755487,2013
Male,0.010451411,0.01206005,0.010209758,0.010698722,2014
Male,0.015727463,0.01038455,0.015415477,0.01604566,2015
Male,0.020540447,0.009567684,0.02016655,0.020921127,2016
Male,0.029869255,0.008823401,0.029372192,0.030374466,2017
Male,0.042429855,0.008235942,0.041778829,0.043090569,2018
Female,0.004968562,0.015139531,0.004823996,0.005117437,2013
Female,0.007917946,0.012541558,0.007727173,0.00811339,2014
Female,0.01193048,0.010913058,0.011680956,0.01218527,2015
Female,0.015599912,0.010116342,0.015298334,0.015907339,2016


In [145]:
fname = paste(data_root, 'pred_retweet.csv', sep = '')
write.csv(MEs, fname, row.names = FALSE)

### 2. Gender gap in who is active on Twitter (their matching)

In [203]:
base_str <- "is_active_on_twitter ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_ontw <- glmer(equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


In [204]:
tidy(m_ontw)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-5.842051612,0.01166827,-500.6786298,0.0
fixed,,genderFemale,-0.133872233,0.003205392,-41.7647001,0.0
fixed,,authorship_posfirst_position,0.295021124,0.004571097,64.5405538,0.0
fixed,,authorship_posmiddle_position,-0.240523455,0.003582538,-67.1377286,0.0
fixed,,authorship_possolo_author,0.64960193,0.00918388,70.7328407,0.0
fixed,,author_pub_count_cate,0.669910077,0.003355295,199.6575736,0.0
fixed,,I(author_pub_count_cate^2),-0.039330098,0.0002635286,-149.244108,0.0
fixed,,affiliation_rank_cate,-0.070999826,0.0005827622,-121.8332716,0.0
fixed,,affiliation_cateinternational,0.117428093,0.003480081,33.7429182,1.357961e-249
fixed,,num_authors,-0.001351512,3.937613e-05,-34.3231345,3.5456319999999996e-258


In [119]:
r.squaredGLMM(m_ontw)

“the null model is correct only if all variables used by the original model remain unchanged.”


Unnamed: 0,R2m,R2c
theoretical,0.23629542,0.4199421
delta,0.08458409,0.1503221


In [148]:
MEs = ggemmeans(m_ontw, terms=c('gender', 'pub_year'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using `terms="gender [all]"` to get smooth plots. See also package-vignette 'Marginal Effects at Specific Values'.



In [149]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.02311103,0.007862319,0.02276566,0.0234615,2013
Male,0.03747638,0.006491341,0.03702014,0.03793803,2014
Male,0.05450538,0.005713752,0.05393113,0.05508539,2015
Male,0.07265465,0.005295617,0.07195843,0.07335707,2016
Male,0.0968422,0.005007077,0.09598725,0.09770394,2017
Male,0.12876453,0.004740085,0.12772589,0.12981037,2018
Female,0.02027397,0.008108874,0.01996068,0.02059207,2013
Female,0.03293532,0.006772964,0.03251512,0.03336076,2014
Female,0.0480038,0.006013468,0.04746804,0.0485453,2015
Female,0.064135,0.005600334,0.06347932,0.06479699,2016


In [150]:
fname = paste(data_root, 'pred_gender_on_tw.csv', sep = '')
write.csv(MEs, fname, row.names = FALSE)

In [205]:
# use this search string in Sublime: " \\\\ \n  &" with " &"
stargazer(m_ontw, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Tue, Nov 29, 2022 - 12:43:10 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & is\_active\_on\_twitter \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.134$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.295$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.241$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.650$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & 0.670$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.039$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.071$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.117$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_aut

### 3. Gap among subset of observations where the author is active on Twitter (their matching)

In [151]:
nrow(mydata[mydata$is_active_on_twitter == 'True', ])

In [120]:
# remove quadratic term for pub cate
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + \
        affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_on_tw <- glmer(formula = equation, data = mydata[mydata$is_active_on_twitter == 'True', ], family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


In [121]:
tidy(m_on_tw)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),0.311811655,0.0214723587,14.5215371,8.850311999999999e-48
fixed,,genderFemale,-0.18853359,0.0068740473,-27.4268682,1.311736e-165
fixed,,authorship_posfirst_position,0.332898629,0.0096221909,34.5969678,2.806152e-262
fixed,,authorship_posmiddle_position,-0.741090466,0.0075423733,-98.2569327,0.0
fixed,,authorship_possolo_author,0.375352091,0.0183198179,20.4888549,2.706849e-93
fixed,,author_pub_count_cate,-0.109233173,0.0032394346,-33.719827,2.961115e-249
fixed,,affiliation_rank_cate,0.005031903,0.0011773508,4.2739202,1.920658e-05
fixed,,affiliation_cateinternational,0.243482105,0.0069903289,34.8312803,8.178120999999999e-266
fixed,,num_authors,-0.002438394,0.0001561743,-15.6132822,5.9115970000000005e-55
fixed,,journal_impact,0.022156645,0.0004889344,45.3161965,0.0


In [122]:
r.squaredGLMM(m_on_tw)

“the null model is correct only if all variables used by the original model remain unchanged.”


Unnamed: 0,R2m,R2c
theoretical,0.1391187,0.3180333
delta,0.1196608,0.2735515


In [123]:
MEs = ggemmeans(m_on_tw, terms=c('gender', 'pub_year'), typical='median')

In [124]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.4108908,0.017164471,0.4027726,0.4190578,2013
Male,0.4304405,0.013401594,0.4240131,0.4368915,2014
Male,0.4528152,0.011292428,0.4473373,0.4583046,2015
Male,0.4664147,0.010174301,0.4614554,0.4713807,2016
Male,0.5195424,0.009353608,0.5149647,0.5241168,2017
Male,0.5720358,0.008731704,0.5678411,0.5762202,2018
Female,0.3661387,0.017902777,0.3580342,0.3743198,2013
Female,0.3849506,0.014266244,0.378352,0.3915917,2014
Female,0.4066489,0.012224163,0.4008812,0.4124426,2015
Female,0.419926,0.011116018,0.4146283,0.4252421,2016


In [125]:
fname = paste(data_root, 'pred_on_tweet_subset.csv', sep = '')
write.csv(MEs, fname, row.names=FALSE)

In [126]:
# use this search string in Sublime: " \\\\ \n  &" with " &"
stargazer(m_on_tw, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Thu, Nov 24, 2022 - 12:26:36 AM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.189$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.333$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.741$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.375$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.109$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & 0.005$^{***}$ \\ 
  & p = 0.00002 \\ 
  affiliation\_cateinternational & 0.243$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & $-$0.002$^{***}$ \\ 
  & p = 0.000 \\ 
  journal\_impact & 0.022$^{***}$ \\ 
  & p = 0.0

### Gender x Journal Impact (all tweets)

1. Gender gap among all obs, without on_tw control

In [45]:
base_str <- "self_promotion ~ 1 + gender * journal_impact + authorship_pos + author_pub_count_cate + \
            I(author_pub_count_cate^2) + affiliation_rank_cate + affiliation_cate + num_authors + \
            author_citation_log + pub_year"

In [46]:
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_x_jif_1 <- glmer(formula = equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


In [47]:
tidy(m_x_jif_1)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.446969704,0.01154638,-385.139642,0.0
fixed,,genderFemale,-0.274572366,0.004604832,-59.627016,0.0
fixed,,journal_impact,0.036234859,0.0003049132,118.836619,0.0
fixed,,authorship_posfirst_position,0.359101808,0.004970378,72.248397,0.0
fixed,,authorship_posmiddle_position,-0.63821703,0.004395717,-145.190651,0.0
fixed,,authorship_possolo_author,0.826967311,0.00997212,82.927932,0.0
fixed,,author_pub_count_cate,0.287843159,0.003175099,90.656438,0.0
fixed,,I(author_pub_count_cate^2),-0.020794985,0.0002615752,-79.499062,0.0
fixed,,affiliation_rank_cate,-0.047520374,0.0006852678,-69.345702,0.0
fixed,,affiliation_cateinternational,0.050943702,0.004238926,12.018068,2.855704e-33


In [48]:
MEs = ggemmeans(m_x_jif_1, terms=c('journal_impact [0:40 by=5]', 'gender'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using `terms="journal_impact [all]"` to get smooth plots. See also package-vignette 'Marginal Effects at Specific Values'.



In [49]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
0,0.04425078,0.005161337,0.04382492,0.0446806,Male
0,0.03398714,0.005950657,0.0336063,0.03437215,Female
5,0.05257793,0.004830762,0.05210829,0.05305158,Male
5,0.03844141,0.005414385,0.03805107,0.0388356,Female
10,0.06236983,0.004968072,0.06180282,0.0629417,Male
10,0.04345318,0.005612115,0.04299827,0.04391268,Female
15,0.0738432,0.005538575,0.07310422,0.07458905,Male
15,0.04908501,0.006476967,0.04849586,0.04968094,Female
20,0.08723082,0.006427947,0.08623291,0.08823917,Male
20,0.05540447,0.007789826,0.05461084,0.05620895,Female


In [50]:
write.csv(MEs, paste(data_root, "gender_jif_1.csv", sep = ''), row.names=FALSE)

### Gender x Affiliation rank (all tweets)

1. Gender gap among all obs, without on_tw control

In [62]:
base_str <- "self_promotion ~ 1 + gender * affiliation_rank_cate + authorship_pos + author_pub_count_cate + \
            I(author_pub_count_cate^2) + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"

In [63]:
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_x_affi_1 <- glmer(formula = equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


In [64]:
tidy(m_x_affi_1)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.422724602,0.0115978429,-381.340276,0.0
fixed,,genderFemale,-0.356963538,0.0063639173,-56.091794,0.0
fixed,,affiliation_rank_cate,-0.048167163,0.000784605,-61.390334,0.0
fixed,,authorship_posfirst_position,0.361373332,0.0049697211,72.715013,0.0
fixed,,authorship_posmiddle_position,-0.637952072,0.0043955279,-145.136623,0.0
fixed,,authorship_possolo_author,0.827763583,0.0099727492,83.002547,0.0
fixed,,author_pub_count_cate,0.286187861,0.0031760304,90.108667,0.0
fixed,,I(author_pub_count_cate^2),-0.020637995,0.0002616367,-78.880357,0.0
fixed,,affiliation_cateinternational,0.051566808,0.0042391094,12.164538,4.801538e-34
fixed,,num_authors,-0.002237177,7.90557e-05,-28.298742,3.5812699999999995e-176


In [65]:
MEs = ggemmeans(m_x_affi_1, terms=c('affiliation_rank_cate', 'gender'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using `terms="affiliation_rank_cate [all]"` to get smooth plots. See also package-vignette 'Marginal Effects at Specific Values'.



In [66]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
0,0.06063785,0.005771201,0.05999674,0.06128537,Male
0,0.04322103,0.006969604,0.04265965,0.04378945,Female
1,0.05795157,0.005389575,0.05737757,0.05853096,Male
1,0.04135986,0.006319545,0.04087154,0.04185376,Female
2,0.05537729,0.00510093,0.05485662,0.0559026,Male
2,0.03957553,0.005820682,0.03914418,0.04001144,Female
3,0.05291093,0.004921652,0.05242962,0.0533964,Male
3,0.03786513,0.005514206,0.03747336,0.03826084,Female
4,0.05054854,0.004863851,0.05009297,0.05100802,Male
4,0.03622587,0.005432773,0.03585594,0.03659947,Female


In [67]:
write.csv(MEs, paste(data_root, "gender_affi_1.csv", sep = ''), row.names=FALSE)

### Gender x Prior pubs (all tweets)

1. Gender gap among all obs, without on_tw control

In [79]:
base_str <- "self_promotion ~ 1 + gender * author_pub_count_cate + gender * I(author_pub_count_cate^2) + \
            authorship_pos + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + \
            author_citation_log + pub_year"

In [80]:
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_x_pub_1 <- glmer(formula = equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


In [81]:
tidy(m_x_pub_1)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.398753137,0.01237284,-355.516985,0.0
fixed,,genderFemale,-0.410416988,0.01216888,-33.726756,2.343594e-249
fixed,,author_pub_count_cate,0.276654425,0.003698525,74.801281,0.0
fixed,,I(author_pub_count_cate^2),-0.019937989,0.0003134588,-63.606418,0.0
fixed,,authorship_posfirst_position,0.361585737,0.004970781,72.742238,0.0
fixed,,authorship_posmiddle_position,-0.637747907,0.004395533,-145.090004,0.0
fixed,,authorship_possolo_author,0.827844306,0.009972872,83.009622,0.0
fixed,,affiliation_rank_cate,-0.047484897,0.0006852405,-69.29669,0.0
fixed,,affiliation_cateinternational,0.051698895,0.004239094,12.195742,3.27506e-34
fixed,,num_authors,-0.002237775,7.906016e-05,-28.30471,3.0240619999999997e-176


In [82]:
MEs = ggemmeans(m_x_pub_1, terms=c('author_pub_count_cate', 'gender'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using `terms="author_pub_count_cate [all]"` to get smooth plots. See also package-vignette 'Marginal Effects at Specific Values'.



In [83]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
0,0.02143957,0.012278217,0.02094046,0.02195031,Male
0,0.01432584,0.013591256,0.01395451,0.01470689,Female
1,0.02754165,0.009568524,0.02704379,0.02804842,Male
1,0.01879816,0.010199951,0.01843295,0.01917047,Female
2,0.03398411,0.007546232,0.03350188,0.03447302,Male
2,0.02363259,0.007987264,0.02327405,0.02399652,Female
3,0.04029793,0.006177675,0.03983226,0.04076881,Male
3,0.028475,0.006783712,0.02810948,0.02884513,Female
4,0.04594684,0.0053676,0.04548787,0.04641021,Male
4,0.03289762,0.006208042,0.0325127,0.03328694,Female


In [84]:
write.csv(MEs, paste(data_root, "gender_pub_1.csv", sep = ''), row.names=FALSE)

### Disciplines

In [23]:
nrow(mydata[mydata$General == 1, ])

Life Sciences

In [24]:
nrow(mydata[mydata$Life_Sciences == 1, ])

1. Gender gap among all obs, without on_tw control

In [36]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_all_life <- glmer(equation, data = mydata[(mydata$Life_Sciences == 1), ], family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


fixed-effect model matrix is rank deficient so dropping 1 column / coefficient



In [37]:
tidy(m_all_life)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.813465949,0.0220898968,-217.9035052,0.0
fixed,,genderFemale,-0.466440791,0.0068443824,-68.149435,0.0
fixed,,authorship_posfirst_position,0.333586352,0.0090991841,36.6611281,3.042284e-294
fixed,,authorship_posmiddle_position,-0.699493571,0.0078527233,-89.0765594,0.0
fixed,,authorship_possolo_author,0.675606457,0.0245341379,27.5374036,6.263644e-167
fixed,,author_pub_count_cate,0.267688879,0.0059048075,45.3340567,0.0
fixed,,I(author_pub_count_cate^2),-0.023081625,0.0004762754,-48.4627718,0.0
fixed,,affiliation_rank_cate,-0.066842773,0.0012426536,-53.7903502,0.0
fixed,,affiliation_cateinternational,0.064354916,0.0075346643,8.5411789,1.3285930000000002e-17
fixed,,num_authors,-0.005235464,0.0003729941,-14.0363191,9.344145e-45


In [57]:
stargazer(m_all_life, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Nov 23, 2022 - 08:17:13 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.466$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.334$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.699$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.676$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & 0.268$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.023$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.067$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.064$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & $

In [38]:
# weighted average for factor variable, median for non-factor variables
MEs = ggemmeans(m_all_life, terms=c('gender'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using
  `terms="gender [all]"` to get smooth plots. See also package-vignette
  'Marginal Effects at Specific Values'.



In [39]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.0336318,0.01098575,0.03293899,0.03433867,1
Female,0.02136272,0.01190659,0.02088025,0.02185609,1


In [40]:
write.csv(MEs, paste(data_root, "pred_life_1.csv", sep = ''), row.names=FALSE)

3. gender gap in self-promotion among active subset

In [209]:
nrow(mydata[(mydata$Life_Sciences == 1) & (mydata$is_active_on_twitter == 'True'), ])

In [180]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + \
            affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi)", keywords, sep = " "))
m_life <- glmer(formula = equation, data = mydata[(mydata$Life_Sciences == 1) & (mydata$is_active_on_twitter == 'True'), ], 
                     family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


fixed-effect model matrix is rank deficient so dropping 1 column / coefficient



In [181]:
tidy(m_life)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),0.1206971,0.0392268322,3.07690162,0.002091643
fixed,,genderFemale,-0.2183207,0.0125685128,-17.37045065,1.381368e-67
fixed,,authorship_posfirst_position,0.2453335,0.0179843905,13.64147019,2.269388e-42
fixed,,authorship_posmiddle_position,-0.8476353,0.0135153406,-62.71653156,0.0
fixed,,authorship_possolo_author,0.1916216,0.0441673282,4.33853645,1.434347e-05
fixed,,author_pub_count_cate,-0.1453093,0.0058457544,-24.85723986,2.159407e-136
fixed,,affiliation_rank_cate,-0.007905535,0.0021254844,-3.71940404,0.0001996934
fixed,,affiliation_cateinternational,0.271454,0.0123784798,21.92951139,1.358786e-106
fixed,,num_authors,-0.002468949,0.0003607207,-6.84448979,7.674891e-12
fixed,,journal_impact,0.01770711,0.0010340683,17.12372975,9.874123e-66


In [182]:
# weighted average for factor variable, median for non-factor variables
MEs = ggemmeans(m_life, terms=c('gender'), typical='median')

In [183]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.4167002,0.01778392,0.4082535,0.4251961,1
Female,0.3647851,0.02002754,0.3557386,0.3739281,1


In [184]:
write.csv(MEs, paste(data_root, "pred_life.csv", sep = ''), row.names=FALSE)

Social Sciences

In [41]:
nrow(mydata[mydata$Social_Sciences == 1, ])

1. Gender gap among all obs, without on_tw control

In [42]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_all_social <- glmer(equation, data = mydata[(mydata$Social_Sciences == 1), ], family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


fixed-effect model matrix is rank deficient so dropping 1 column / coefficient



In [43]:
tidy(m_all_social)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-3.659023198,0.030518178,-119.8965153,0.0
fixed,,genderFemale,-0.194535354,0.0100548715,-19.3473735,2.144978e-83
fixed,,authorship_posfirst_position,0.441591263,0.0122354734,36.0910649,3.131978e-285
fixed,,authorship_posmiddle_position,-0.227395932,0.0133592909,-17.021557,5.683889e-65
fixed,,authorship_possolo_author,0.694386698,0.0172369362,40.2848098,0.0
fixed,,author_pub_count_cate,0.359038145,0.0084934797,42.2722086,0.0
fixed,,I(author_pub_count_cate^2),-0.020269595,0.0007677849,-26.4000953,1.366722e-153
fixed,,affiliation_rank_cate,-0.02705076,0.0018249422,-14.8228035,1.043374e-49
fixed,,affiliation_cateinternational,0.263132215,0.0113704843,23.1416894,1.762724e-118
fixed,,num_authors,-0.06639444,0.0024237917,-27.3927997,3.341445e-165


In [58]:
stargazer(m_all_social, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Nov 23, 2022 - 08:17:39 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.195$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.442$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.227$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.694$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & 0.359$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.020$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.027$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.263$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & $

In [44]:
# weighted average for factor variable, median for non-factor variables
MEs = ggemmeans(m_all_social, terms=c('gender'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using
  `terms="gender [all]"` to get smooth plots. See also package-vignette
  'Marginal Effects at Specific Values'.



In [45]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.11474562,0.0115537,0.11246537,0.117066,1
Female,0.09641637,0.01233801,0.09433008,0.09854377,1


In [46]:
write.csv(MEs, paste(data_root, "pred_social_1.csv", sep = ''), row.names=FALSE)

3. gender gap in self-promotion among active subset

In [208]:
nrow(mydata[(mydata$Social_Sciences == 1) & (mydata$is_active_on_twitter == 'True'), ])

In [185]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + \
            affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi)", keywords, sep = " "))
m_social <- glmer(formula = equation, data = mydata[(mydata$Social_Sciences == 1) & (mydata$is_active_on_twitter == 'True'), ], 
                     family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)

fixed-effect model matrix is rank deficient so dropping 1 column / coefficient



In [186]:
tidy(m_social)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),0.21836386,0.055402366,3.9414177,8.100144e-05
fixed,,genderFemale,-0.14331643,0.017696638,-8.0985118,5.563557e-16
fixed,,authorship_posfirst_position,0.49101458,0.022678256,21.6513381,5.904102e-104
fixed,,authorship_posmiddle_position,-0.30396662,0.022030072,-13.7978044,2.627333e-43
fixed,,authorship_possolo_author,0.4927516,0.031404351,15.6905518,1.755231e-55
fixed,,author_pub_count_cate,-0.02841247,0.009025351,-3.1480732,0.001643505
fixed,,affiliation_rank_cate,0.01812928,0.003160481,5.7362419,9.680038e-09
fixed,,affiliation_cateinternational,0.26544087,0.019428605,13.6623745,1.7033749999999998e-42
fixed,,num_authors,-0.04429829,0.003120429,-14.1962164,9.669391e-46
fixed,,journal_impact,0.0742762,0.005308885,13.9909232,1.77098e-44


In [187]:
# weighted average for factor variable, median for non-factor variables
MEs = ggemmeans(m_social, terms=c('gender'), typical='median')

In [188]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.5537216,0.01795944,0.5450077,0.5624027,1
Female,0.5180928,0.02041974,0.5080945,0.5280766,1


In [189]:
write.csv(MEs, paste(data_root, "pred_social.csv", sep = ''), row.names=FALSE)

Physical Sciences

In [200]:
nrow(mydata[mydata$Physical_Sciences == 1, ])

1. Gender gap among all obs, without on_tw control

In [47]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_all_phy <- glmer(equation, data = mydata[(mydata$Physical_Sciences == 1), ], family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


fixed-effect model matrix is rank deficient so dropping 1 column / coefficient



In [48]:
tidy(m_all_phy)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-5.0482005523,0.02847375,-177.293135,0.0
fixed,,genderFemale,-0.2843431306,0.009337878,-30.4505092,1.179452e-203
fixed,,authorship_posfirst_position,0.3618488393,0.01118594,32.3485266,1.4551169999999999e-229
fixed,,authorship_posmiddle_position,-0.55598608,0.009907004,-56.1205075,0.0
fixed,,authorship_possolo_author,0.8438211181,0.02446021,34.4977054,8.682828000000001e-261
fixed,,author_pub_count_cate,0.2665663776,0.007714589,34.5535441,1.260951e-261
fixed,,I(author_pub_count_cate^2),-0.0228229415,0.00062927,-36.2689167,5.0019839999999996e-288
fixed,,affiliation_rank_cate,-0.0426525804,0.001566954,-27.2200566,3.760451e-163
fixed,,affiliation_cateinternational,0.0228971403,0.009821648,2.331293,0.01973791
fixed,,num_authors,-0.0015649006,7.034405e-05,-22.2463807,1.222708e-109


In [59]:
stargazer(m_all_phy, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Nov 23, 2022 - 08:18:01 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.284$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.362$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.556$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.844$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & 0.267$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.023$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.043$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.023$^{*}$ \\ 
  & p = 0.020 \\ 
  num\_authors & $-$

In [49]:
# weighted average for factor variable, median for non-factor variables
MEs = ggemmeans(m_all_phy, terms=c('gender'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using
  `terms="gender [all]"` to get smooth plots. See also package-vignette
  'Marginal Effects at Specific Values'.



In [50]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.03170544,0.0150575,0.03081183,0.0326241,1
Female,0.02404731,0.01668219,0.02329178,0.02482672,1


In [51]:
write.csv(MEs, paste(data_root, "pred_phy_1.csv", sep = ''), row.names=FALSE)

3. gender gap in self-promotion among active subset

In [207]:
nrow(mydata[(mydata$Physical_Sciences == 1) & (mydata$is_active_on_twitter == 'True'), ])

In [190]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + \
            affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi)", keywords, sep = " "))
m_phy <- glmer(formula = equation, data = mydata[(mydata$Physical_Sciences == 1) & (mydata$is_active_on_twitter == 'True'), ], 
                     family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)

fixed-effect model matrix is rank deficient so dropping 1 column / coefficient



In [191]:
tidy(m_phy)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),0.052969313,0.0516660867,1.02522401,0.3052575
fixed,,genderFemale,-0.169027063,0.0169382918,-9.97899109,1.883724e-23
fixed,,authorship_posfirst_position,0.387579056,0.0223299764,17.35689496,1.749319e-67
fixed,,authorship_posmiddle_position,-0.61366822,0.0168777184,-36.35966702,1.848533e-289
fixed,,authorship_possolo_author,0.561928675,0.0477144479,11.77690825,5.134192000000001e-32
fixed,,author_pub_count_cate,-0.085244367,0.0080291413,-10.61687218,2.487569e-26
fixed,,affiliation_rank_cate,0.024360717,0.0027060208,9.00241326,2.208096e-19
fixed,,affiliation_cateinternational,0.138415808,0.0165279133,8.37466934,5.5379140000000007e-17
fixed,,num_authors,-0.001579563,0.0001177879,-13.41022299,5.268018999999999e-41
fixed,,journal_impact,0.028256275,0.0017796554,15.87738525,9.088897999999999e-57


In [192]:
# weighted average for factor variable, median for non-factor variables
MEs = ggemmeans(m_phy, terms=c('gender'), typical='median')

In [193]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.4625152,0.02583562,0.4499536,0.4751245,1
Female,0.420859,0.02900532,0.4070684,0.4347741,1


In [194]:
write.csv(MEs, paste(data_root, "pred_phy.csv", sep = ''), row.names=FALSE)

Health Sciences

In [201]:
nrow(mydata[mydata$Health_Sciences == 1, ])

1. Gender gap among all obs, without on_tw control

In [52]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_all_health <- glmer(equation, data = mydata[(mydata$Health_Sciences == 1), ], family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


fixed-effect model matrix is rank deficient so dropping 3 columns / coefficients



In [53]:
tidy(m_all_health)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.91825743,0.0298975804,-164.5035273,0.0
fixed,,genderFemale,-0.33985683,0.0062684152,-54.2173448,0.0
fixed,,authorship_posfirst_position,0.43833257,0.0084674007,51.7670756,0.0
fixed,,authorship_posmiddle_position,-0.62766951,0.0074498236,-84.2529364,0.0
fixed,,authorship_possolo_author,0.74342094,0.0214074723,34.7271703,3.065299e-264
fixed,,author_pub_count_cate,0.29702684,0.005140709,57.7793529,0.0
fixed,,I(author_pub_count_cate^2),-0.0170261,0.0004280812,-39.7730702,0.0
fixed,,affiliation_rank_cate,-0.04171189,0.0011608284,-35.932864,9.374645e-283
fixed,,affiliation_cateinternational,0.12746164,0.00720709,17.6855897,5.415156e-70
fixed,,num_authors,-0.01043556,0.0004148589,-25.1544911,1.26202e-139


In [60]:
stargazer(m_all_health, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Nov 23, 2022 - 08:18:19 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.340$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.438$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.628$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.743$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & 0.297$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.017$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.042$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.127$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & $

In [54]:
# weighted average for factor variable, median for non-factor variables
MEs = ggemmeans(m_all_health, terms=c('gender'), typical='median')

Model contains polynomial or cubic / quadratic terms. Consider using
  `terms="gender [all]"` to get smooth plots. See also package-vignette
  'Marginal Effects at Specific Values'.



In [55]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.03458188,0.00811881,0.03405454,0.03511709,1
Female,0.02486564,0.008833066,0.02444929,0.02528889,1


In [56]:
write.csv(MEs, paste(data_root, "pred_health_1.csv", sep = ''), row.names=FALSE)

3. gender gap in self-promotion among active subset

In [206]:
nrow(mydata[(mydata$Health_Sciences == 1) & (mydata$is_active_on_twitter == 'True'), ])

In [195]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + \
            affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi)", keywords, sep = " "))
m_health <- glmer(formula = equation, data = mydata[(mydata$Health_Sciences == 1) & (mydata$is_active_on_twitter == 'True'), ], 
                     family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)

fixed-effect model matrix is rank deficient so dropping 3 columns / coefficients



In [196]:
tidy(m_health)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-0.3683193159,0.0483770154,-7.613519,2.667325e-14
fixed,,genderFemale,-0.1666184025,0.0109133899,-15.26734,1.262233e-52
fixed,,authorship_posfirst_position,0.3749206066,0.0155373213,24.13032,1.2015820000000002e-128
fixed,,authorship_posmiddle_position,-0.7507860693,0.0123206274,-60.93732,0.0
fixed,,authorship_possolo_author,0.4120372069,0.0372066725,11.07428,1.672139e-28
fixed,,author_pub_count_cate,-0.0928089394,0.0052541051,-17.66408,7.929313e-70
fixed,,affiliation_rank_cate,0.0054113966,0.0019241138,2.81241,0.00491718
fixed,,affiliation_cateinternational,0.3284011212,0.0114546029,28.6698,9.083624e-181
fixed,,num_authors,-0.0053155447,0.0004638389,-11.45989,2.097731e-30
fixed,,journal_impact,0.017671917,0.0006418312,27.53359,6.958389e-167


In [197]:
# weighted average for factor variable, median for non-factor variables
MEs = ggemmeans(m_health, terms=c('gender'), typical='median')

In [198]:
MEs

x,predicted,std.error,conf.low,conf.high,group
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Male,0.3479061,0.0122477,0.3424802,0.3533718,1
Female,0.311123,0.01413553,0.3052164,0.3170918,1


In [199]:
write.csv(MEs, paste(data_root, "pred_health.csv", sep = ''), row.names=FALSE)

## Robustness check

### Mixed gender team

Predicting if the author is the first one to self-promote (all obs)

In [30]:
base_str <- "self_promotion_first ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_all_first <- glmer(formula = equation, data = mydata, family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


boundary (singular) fit: see help('isSingular')



In [31]:
tidy(m_all_first)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.33611,0.011606033,-373.608267,0.0
fixed,,genderFemale,-0.3226285,0.0041278044,-78.159844,0.0
fixed,,authorship_posfirst_position,0.3518395,0.0052627051,66.855249,0.0
fixed,,authorship_posmiddle_position,-0.7574488,0.0048960448,-154.706264,0.0
fixed,,authorship_possolo_author,0.9300299,0.0087673379,106.078933,0.0
fixed,,author_pub_count_cate,0.2987045,0.0034901788,85.584284,0.0
fixed,,I(author_pub_count_cate^2),-0.02316048,0.0002910875,-79.565384,0.0
fixed,,affiliation_rank_cate,-0.04904548,0.0006967838,-70.388365,0.0
fixed,,affiliation_cateinternational,0.05324689,0.0040649763,13.098942,3.338784e-39
fixed,,num_authors,-0.01547413,0.0002785563,-55.551167,0.0


In [224]:
r.squaredGLMM(m_all_first)

“the null model is correct only if all variables used by the original model remain unchanged.”


Unnamed: 0,R2m,R2c
theoretical,0.8628439,0.8628439
delta,0.3788889,0.3788889


In [32]:
# use this search string in Sublime: " \\\\ \n  &" with " &"
stargazer(m_all_first, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Nov 23, 2022 - 08:59:59 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion\_first \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.323$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.352$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.757$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.930$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & 0.299$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.023$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.049$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.053$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_auth

Solo-author (all obs)

In [36]:
base_str <- "self_promotion ~ 1 + gender + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + affiliation_cate + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_all_solo <- glmer(formula = equation, data = mydata[mydata$authorship_pos == 'solo_author', ], family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)


boundary (singular) fit: see help('isSingular')



In [37]:
tidy(m_all_solo)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-2.95835749,0.039818491,-74.2960715,0.0
fixed,,genderFemale,-0.09188927,0.016667739,-5.5130016,3.52765e-08
fixed,,author_pub_count_cate,0.26707775,0.012167606,21.9499009,8.679043e-107
fixed,,I(author_pub_count_cate^2),-0.02042258,0.001147705,-17.794275,7.827379e-71
fixed,,affiliation_rank_cate,-0.02518316,0.00266963,-9.4332026,3.977549e-21
fixed,,affiliation_cateinternational,0.1693036,0.016455161,10.2887841,7.9169730000000005e-25
fixed,,journal_impact,0.02538979,0.00106106,23.9287078,1.539696e-126
fixed,,author_citation_log,-0.09829392,0.004342323,-22.636251,1.9057970000000002e-113
fixed,,pub_year2014,0.41723423,0.033940111,12.2932489,9.846752e-35
fixed,,pub_year2015,0.69680351,0.03229042,21.5792644,2.813076e-103


In [226]:
r.squaredGLMM(m_all_solo)

“the null model is correct only if all variables used by the original model remain unchanged.”


Unnamed: 0,R2m,R2c
theoretical,0.1496402,0.1496402
delta,0.0576489,0.0576489


In [38]:
# use this search string in Sublime: " \\\\ \n  &" with " &"
stargazer(m_all_solo, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Nov 23, 2022 - 09:02:38 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.092$^{***}$ \\ 
  & p = 0.00000 \\ 
  author\_pub\_count\_cate & 0.267$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.020$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.025$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.169$^{***}$ \\ 
  & p = 0.000 \\ 
  journal\_impact & 0.025$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_citation\_log & $-$0.098$^{***}$ \\ 
  & p = 0.000 \\ 
  pub\_year2014 & 0.417$^{***}$ \\ 
  & p = 0.000 \\ 
  pub\_year2015 & 0.697$^{***}$ \\ 
  & p = 0.000 \\ 
 

Subset of authors of non-East Asian ethnicity

In [228]:
nrow(mydata)

In [229]:
table(mydata$author_eth_ethnea_broad)


                African               CARIBBEAN                 Chinese 
                  72522                     787                  957964 
              EastAsian         EasternEuropean                 English 
                 606158                  564347                 3495178 
                 Indian           MiddleEastern                     org 
                 442445                  683016                     162 
             POLYNESIAN        SouthernEuropean                 unknown 
                     16                 1807743                   39553 
WesternNorthernEuropean 
                2726861 

In [230]:
# num of observations with East Asian names (Chinese + non-Chinese East Asian).
nrow(mydata[mydata$author_eth_ethnea_broad %in% c('Chinese', 'EastAsian', 'unknown'), ])

In [231]:
nrow(mydata[!mydata$author_eth_ethnea_broad %in% c('Chinese', 'EastAsian', 'unknown'), ])

In [233]:
base_str <- "self_promotion ~ 1 + gender + authorship_pos + author_pub_count_cate + I(author_pub_count_cate^2) + \
        affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + author_citation_log + pub_year"
equation <- as.formula(paste(base_str, " + (1|doi) + ", keywords, sep = " "))
m_all_exc <- glmer(formula = equation, data = mydata[!mydata$author_eth_ethnea_broad %in% c('Chinese', 'EastAsian', 'unknown'), ], family = "binomial", control = glmerControl(optimizer = "nloptwrap"), nAGQ = 0)

In [234]:
tidy(m_all_exc)

effect,group,term,estimate,std.error,statistic,p.value
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
fixed,,(Intercept),-4.298877979,0.01175699,-365.644331,0.0
fixed,,genderFemale,-0.331991022,0.003902762,-85.065659,0.0
fixed,,authorship_posfirst_position,0.361318772,0.005070041,71.265451,0.0
fixed,,authorship_posmiddle_position,-0.623033593,0.004474048,-139.255023,0.0
fixed,,authorship_possolo_author,0.765935885,0.0100681,76.075512,0.0
fixed,,author_pub_count_cate,0.279594819,0.003259163,85.787315,0.0
fixed,,I(author_pub_count_cate^2),-0.019727072,0.0002680684,-73.589701,0.0
fixed,,affiliation_rank_cate,-0.043569604,0.0006972592,-62.486957,0.0
fixed,,affiliation_cateinternational,0.096181908,0.004297686,22.379931,6.173952e-111
fixed,,num_authors,-0.002130662,7.628723e-05,-27.929474,1.170639e-171


In [235]:
# use this search string in Sublime: " \\\\ \n  &" with " &"
stargazer(m_all_exc, type = "latex", single.row=TRUE, ci = FALSE, report = ('vc*p'), star.cutoffs = c(0.05, 0.01, 0.001))



% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Dec 07, 2022 - 03:40:42 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & self\_promotion \\ 
\hline \\[-1.8ex] 
 genderFemale & $-$0.332$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.361$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & $-$0.623$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & 0.766$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & 0.280$^{***}$ \\ 
  & p = 0.000 \\ 
  I(author\_pub\_count\_cate$\hat{\mkern6mu}$2) & $-$0.020$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.044$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.096$^{***}$ \\ 
  & p = 0.000 \\ 
  num\_authors & $