In [10]:
library("lme4")
library("margins")
library("stargazer")
library("emmeans")
library("ggeffects")
library("broom")
library("broom.mixed")
library("MASS")
library("pscl")

In [4]:
options(repr.matrix.max.rows=500, repr.matrix.max.cols=200)

In [5]:
options(dplyr.print_max = 100)

In [6]:
# Directory roots used by the rest of the notebook.
# NOTE(review): the two names differ only in letter case and are easy to mix up:
#   Data_Root - prefix for the input CSV that is read into `mydata`
#   data_root - prefix for the result CSVs written by write.csv()
Data_Root <- "/Data/Promotion/revision/"
data_root <- "/Data/Promotion/revision/reg_results_F_M/"

In [7]:
# Load the regression dataset (CSV with a header row) from the input root.
# String columns are read as factors.
mydata <- read.csv(
  file = paste0(Data_Root, "reg_data_drop_missing.csv"),
  header = TRUE,
  stringsAsFactors = TRUE
)

In [8]:
nrow(mydata)

In [9]:
ncol(mydata)

In [11]:
# Keep only the rows whose gender is "Male" or "Female".
mydata <- mydata[is.element(mydata$gender, c("Male", "Female")), ]

In [12]:
# Drop rows whose affiliation category is the placeholder "unknown".
# which() selects only the positions where the comparison is TRUE, so an NA in
# affiliation_cate cannot become an all-NA phantom row (plain logical indexing
# with `!=` would insert one and inflate later nrow() counts).
mydata <- mydata[which(mydata$affiliation_cate != "unknown"), ]

In [13]:
nrow(mydata)

In [14]:
# Treat publication year as a categorical variable in the models.
mydata[["pub_year"]] <- as.factor(mydata[["pub_year"]])

In [15]:
# Choose the reference (baseline) level of each categorical predictor, so model
# coefficients are reported relative to Male / last_position / domestic.
mydata$gender <- relevel(mydata$gender, ref = "Male")
mydata$authorship_pos <- relevel(mydata$authorship_pos, ref = "last_position")
mydata$affiliation_cate <- relevel(mydata$affiliation_cate, ref = "domestic")

In [16]:
# Formula fragment listing additional predictor terms. It begins with " + " and
# is appended to each model's base formula string via paste() before the result
# is passed to as.formula().
# NOTE(review): the names look like per-paper subject-area indicator columns of
# `mydata` - confirm against the dataset.
keywords <- " + Social_Sciences + Materials_Science + Engineering + Chemistry + \
        Biochemistry__Genetics_and_Molecular_Biology + Medicine + Nursing + Agricultural_and_Biological_Sciences + \
        Pharmacology__Toxicology_and_Pharmaceutics + Neuroscience + Business__Management_and_Accounting + \
        Economics__Econometrics_and_Finance + Chemical_Engineering + Physics_and_Astronomy + Computer_Science + \
        Decision_Sciences + Health_Professions + Psychology + Immunology_and_Microbiology + Dentistry + \
        Earth_and_Planetary_Sciences + Environmental_Science + Mathematics + Arts_and_Humanities + Energy + \
        Veterinary + General"

In [17]:
# Base-2 log of (author citation count + 1); the +1 keeps zero counts finite.
mydata[["author_citation_log"]] <- log2(mydata[["author_citation"]] + 1)

In [18]:
# Base-2 log of (tweet count + 1); the +1 keeps zero counts finite.
mydata[["total_num_tweets_log"]] <- log2(mydata[["total_num_tweets"]] + 1)

In [19]:
# Base-2 log of (follower count + 1) for each of the three follower-count
# columns; the +1 keeps zero counts finite.
mydata[["follower_cn_snapshot_log"]] <- log2(mydata[["follower_cn_snapshot"]] + 1)
mydata[["follower_cn_snapshot_ours_log"]] <- log2(mydata[["follower_cn_snapshot_ours"]] + 1)
mydata[["follower_cn_snapshot_combine_log"]] <- log2(mydata[["follower_cn_snapshot_combine"]] + 1)

## Subset of observations where the author is active on Twitter

### Their matching

1. Remove on_tw control
2. Still include gender and self_promotion interaction
3. Add follower control

In [42]:
nrow(mydata[mydata$is_active_on_twitter == "True", ])

Fit a model for all years

In [107]:
# Pooled (all-years) negative binomial regression of total_num_tweets, fitted
# with glm.nb() on the rows where is_active_on_twitter == "True".
# Predictors: gender x self_promotion (main effects plus interaction),
# authorship position, follower_cn_snapshot_log as the follower control, the
# author/affiliation/journal controls, pub_year, and the terms in `keywords`.
base_str <- "total_num_tweets ~ 1 + gender * self_promotion + authorship_pos + follower_cn_snapshot_log + \
        author_pub_count_cate + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + \
        author_citation_log + pub_year"
# `keywords` already starts with " + ", so pasting it on yields one formula string.
equation <- as.formula(paste(base_str, keywords, sep = " "))
m_yes <- glm.nb(equation, data = mydata[mydata$is_active_on_twitter == "True", ])

In [108]:
tidy(m_yes)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.363815394,0.01493831,24.3545247,5.191504e-131
genderFemale,0.148483596,0.005123532,28.9807099,1.1517409999999999e-184
self_promotionTrue,1.235314458,0.0044045,280.4664247,0.0
authorship_posfirst_position,0.109717974,0.005723529,19.1696384,6.636843e-82
authorship_posmiddle_position,0.290013813,0.004421204,65.5961202,0.0
authorship_possolo_author,-0.206360353,0.01066301,-19.3529261,1.925913e-83
follower_cn_snapshot_log,0.094976732,0.000939197,101.1254683,0.0
author_pub_count_cate,-0.102244042,0.001854771,-55.1248905,0.0
affiliation_rank_cate,-0.008475538,0.0006624966,-12.7933298,1.786522e-37
affiliation_cateinternational,0.069202019,0.003874366,17.8615099,2.351899e-71


In [109]:
# LaTeX regression table for m_yes: variable names, coefficients with
# significance stars, and p-values (report = "vc*p"); no confidence intervals;
# stars at p < 0.05 / 0.01 / 0.001.
# Post-processing tip carried over from the original note: in Sublime Text,
# search for " \\\\ \n  &" and replace it with " &".
stargazer(
  m_yes,
  type = "latex",
  report = "vc*p",
  ci = FALSE,
  single.row = TRUE,
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Thu, Oct 06, 2022 - 12:07:58 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & total\_num\_tweets \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.148$^{***}$ \\ 
  & p = 0.000 \\ 
  self\_promotionTrue & 1.235$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.110$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & 0.290$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.206$^{***}$ \\ 
  & p = 0.000 \\ 
  follower\_cn\_snapshot\_log & 0.095$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.102$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.008$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.069$^{***}$ \\ 

In [110]:
# Predicted tweet counts from m_yes for every combination of self_promotion,
# gender and pub_year. Per the original note, typical = "median" holds
# non-factor covariates at their median while factor covariates are averaged
# with weights.
MEs <- ggemmeans(
  m_yes,
  terms = c("self_promotion", "gender", "pub_year"),
  typical = "median"
)

In [111]:
MEs

x,predicted,std.error,conf.low,conf.high,group,facet
<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>,<fct>
False,4.31586,0.009845945,4.233373,4.399955,Male,2013
False,5.478465,0.007727278,5.396118,5.562069,Male,2014
False,6.433645,0.006592291,6.351053,6.517312,Male,2015
False,7.339296,0.005999963,7.253494,7.426114,Male,2016
False,9.577959,0.005632767,9.472799,9.684285,Male,2017
False,10.873623,0.005405193,10.759036,10.989431,Male,2018
False,5.006716,0.010460461,4.905113,5.110424,Female,2013
False,6.355423,0.008447499,6.251064,6.461525,Female,2014
False,7.463503,0.007373746,7.356414,7.572151,Female,2015
False,8.514125,0.006797265,8.401449,8.628313,Female,2016


In [112]:
# Save the predictions in MEs to the results directory (no row-name column).
write.csv(
  MEs,
  file = paste0(data_root, "gender_return_on_self_promotion_active_sub.csv"),
  row.names = FALSE
)

Fit a model for each year

In [38]:
nrow(mydata[(mydata$is_active_on_twitter == "True") & (mydata$pub_year == '2018'), ])

In [21]:
nrow(mydata[(mydata$is_active_on_twitter == "True") & (mydata$pub_year == '2017'), ])

In [22]:
nrow(mydata[(mydata$is_active_on_twitter == "True") & (mydata$pub_year == '2016'), ])

In [39]:
nrow(mydata[(mydata$is_active_on_twitter == "True") & (mydata$pub_year == '2015'), ])

In [23]:
nrow(mydata[(mydata$is_active_on_twitter == "True") & (mydata$pub_year == '2014'), ])

In [40]:
nrow(mydata[(mydata$is_active_on_twitter == "True") & (mydata$pub_year == '2013'), ])

In [78]:
# Per-year negative binomial fits on the Twitter-active rows.
# The formula has no pub_year term because each fit uses a single year's rows;
# it does not depend on the loop variable, so it is built once up front.
base_str <- "total_num_tweets ~ 1 + gender * self_promotion + authorship_pos + follower_cn_snapshot_log + \
        author_pub_count_cate + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + \
        author_citation_log"
equation <- as.formula(paste(base_str, keywords))

for (year in as.character(2013:2018)) {
    # Fit on this year's active-author rows and print the coefficient table.
    m_yes <- glm.nb(equation, data = mydata[(mydata$is_active_on_twitter == "True") & (mydata$pub_year == year), ])
    print(tidy(m_yes))

    # Predictions for each self_promotion x gender cell, written to one CSV per year.
    MEs <- ggemmeans(m_yes, terms = c("self_promotion", "gender"), typical = "median")
    write.csv(
        MEs,
        paste0(data_root, "gender_return_on_self_promotion_", year, "_active_sub.csv"),
        row.names = FALSE
    )
}

# A tibble: 41 x 5
   term                                   estimate std.error statistic   p.value
   <chr>                                     <dbl>     <dbl>     <dbl>     <dbl>
 1 (Intercept)                             0.0163   0.0669       0.243 8.08e-  1
 2 genderFemale                            0.272    0.0284       9.59  8.96e- 22
 3 self_promotionTrue                      1.44     0.0234      61.9   0
 4 authorship_posfirst_position            0.0440   0.0300       1.47  1.42e-  1
 5 authorship_posmiddle_position           0.0873   0.0239       3.65  2.60e-  4
 6 authorship_possolo_author              -0.246    0.050

## Robustness check 1 (not due to coauthors' self-promotion)

### Solo-author papers (active subset)

In [28]:
# Robustness subset: only the rows whose authorship position is "solo_author".
subdata <- mydata[mydata$authorship_pos == "solo_author", ]

In [29]:
nrow(subdata)

In [41]:
nrow(subdata[subdata$is_active_on_twitter == "True", ])

One big model

In [30]:
# Pooled negative binomial model on solo-author rows (`subdata`) where
# is_active_on_twitter == "True".
# This formula has no authorship_pos or num_authors term, and the follower
# control is follower_cn_snapshot_combine_log.
base_str <- "total_num_tweets ~ 1 + gender * self_promotion + follower_cn_snapshot_combine_log + \
    author_pub_count_cate + affiliation_rank_cate + affiliation_cate + journal_impact + author_citation_log + pub_year"
# `keywords` already starts with " + ", so pasting it on yields one formula string.
equation <- as.formula(paste(base_str, keywords, sep = " "))
m_yes <- glm.nb(equation, data = subdata[subdata$is_active_on_twitter == "True", ])


In [31]:
tidy(m_yes)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-0.842860669,0.073260625,-11.5049615,1.24545e-30
genderFemale,0.143961977,0.032576421,4.4192079,9.906331e-06
self_promotionTrue,1.450545095,0.023872792,60.7614352,0.0
follower_cn_snapshot_combine_log,0.154236002,0.004749847,32.4717836,2.668945e-231
author_pub_count_cate,-0.077241269,0.00903548,-8.5486624,1.245232e-17
affiliation_rank_cate,-0.013902025,0.003198486,-4.346439,1.383655e-05
affiliation_cateinternational,0.004329114,0.01976642,0.2190136,0.8266395
journal_impact,0.035265962,0.001275587,27.646857,3.0445039999999996e-168
author_citation_log,0.064260873,0.005172701,12.42308,1.958727e-35
pub_year2014,0.211180372,0.047724704,4.4249697,9.645581e-06


In [32]:
# LaTeX regression table for m_yes: variable names, coefficients with
# significance stars, and p-values (report = "vc*p"); no confidence intervals;
# stars at p < 0.05 / 0.01 / 0.001.
# Post-processing tip carried over from the original note: in Sublime Text,
# search for " \\\\ \n  &" and replace it with " &".
stargazer(
  m_yes,
  type = "latex",
  report = "vc*p",
  ci = FALSE,
  single.row = TRUE,
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Tue, Nov 29, 2022 - 04:35:30 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & total\_num\_tweets \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.144$^{***}$ \\ 
  & p = 0.00001 \\ 
  self\_promotionTrue & 1.451$^{***}$ \\ 
  & p = 0.000 \\ 
  follower\_cn\_snapshot\_combine\_log & 0.154$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.077$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.014$^{***}$ \\ 
  & p = 0.00002 \\ 
  affiliation\_cateinternational & 0.004 \\ 
  & p = 0.827 \\ 
  journal\_impact & 0.035$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_citation\_log & 0.064$^{***}$ \\ 
  & p = 0.000 \\ 
  pub\_year2014 & 0.211$^{***}$ \\ 
  & p = 0.00001 \\ 
  pub\_year

## Robustness check 2 (different definition of self-promotion)

### Active subset

Their matching

In [37]:
# Robustness check with the alternative self-promotion indicator
# (self_promotion_def in place of self_promotion), fitted on the rows where
# is_active_on_twitter == "True".
# NOTE(review): the section heading says "Their matching", but the follower
# control here is follower_cn_snapshot_combine_log, whereas the main
# "Theirs matching" model uses follower_cn_snapshot_log - confirm which
# follower variable is intended.
base_str <- "total_num_tweets ~ 1 + gender * self_promotion_def + authorship_pos + follower_cn_snapshot_combine_log + \
        author_pub_count_cate + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + \
        author_citation_log + pub_year"
# `keywords` already starts with " + ", so pasting it on yields one formula string.
equation <- as.formula(paste(base_str, keywords, sep = " "))
m_yes <- glm.nb(equation, data = mydata[mydata$is_active_on_twitter == "True", ])


In [38]:
tidy(m_yes)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.599030622,0.01542346,38.8389345,0.0
genderFemale,0.093046541,0.00479943,19.3869971,9.937172e-84
self_promotion_defTrue,0.744199211,0.004879306,152.5215365,0.0
authorship_posfirst_position,0.179590733,0.005958296,30.1412905,1.39508e-199
authorship_posmiddle_position,0.171725548,0.004583121,37.4691292,2.932282e-307
authorship_possolo_author,-0.099390986,0.01108423,-8.9668857,3.0501639999999997e-19
follower_cn_snapshot_combine_log,0.134726568,0.0009485293,142.0373237,0.0
author_pub_count_cate,-0.109338019,0.001930348,-56.6416235,0.0
affiliation_rank_cate,-0.006754288,0.0006900955,-9.7874679,1.274475e-22
affiliation_cateinternational,0.096719994,0.004031231,23.9926704,3.316538e-127


In [39]:
# LaTeX regression table for m_yes: variable names, coefficients with
# significance stars, and p-values (report = "vc*p"); no confidence intervals;
# stars at p < 0.05 / 0.01 / 0.001.
# Post-processing tip carried over from the original note: in Sublime Text,
# search for " \\\\ \n  &" and replace it with " &".
stargazer(
  m_yes,
  type = "latex",
  report = "vc*p",
  ci = FALSE,
  single.row = TRUE,
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Tue, Nov 29, 2022 - 04:40:09 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & total\_num\_tweets \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.093$^{***}$ \\ 
  & p = 0.000 \\ 
  self\_promotion\_defTrue & 0.744$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.180$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & 0.172$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.099$^{***}$ \\ 
  & p = 0.000 \\ 
  follower\_cn\_snapshot\_combine\_log & 0.135$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.109$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.007$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational 