In [34]:
library("lme4")
library("margins")
library("stargazer")
library("emmeans")
library("ggeffects")
library("broom")
library("broom.mixed")
library("MASS")
library("pscl")

In [2]:
# Let the notebook display up to 500 rows and 200 columns of a table.
options(
  repr.matrix.max.rows = 500,
  repr.matrix.max.cols = 200
)

In [3]:
# Raise the dplyr.print_max option to 100.
options("dplyr.print_max" = 100)

In [4]:
# Directory that holds the input data for this revision.
Data_Root <- "/Data/Promotion/revision/"
# Sub-directory of Data_Root for the female/male regression results.
# NOTE: differs from Data_Root only by letter case.
data_root <- paste0(Data_Root, "reg_results_F_M/")

In [5]:
# Read the regression dataset; character columns are loaded as factors so the
# later relevel() calls can set reference categories.
mydata <- read.csv(
  paste0(Data_Root, "reg_data_drop_missing.csv"),
  header = TRUE,
  stringsAsFactors = TRUE
)

In [6]:
# Row count of the data as loaded.
dim(mydata)[1L]

In [7]:
# Column count of the data as loaded.
dim(mydata)[2L]

In [8]:
# Keep only the rows whose gender is recorded as Male or Female.
mydata <- mydata[
  mydata$gender %in% c("Male", "Female"),
]

In [9]:
# Drop the rows whose affiliation category is "unknown".
# `%in%` never returns NA, so a row with a missing affiliation_cate is kept as
# it is. Indexing with `affiliation_cate != "unknown"` would instead produce an
# NA index for such a row and replace it with a row of NAs in every column.
mydata <- mydata[!(mydata$affiliation_cate %in% "unknown"), ]

In [10]:
# Row count after the gender and affiliation filters.
dim(mydata)[1L]

In [25]:
# Treat publication year as a categorical variable, so a model formula that
# includes pub_year gets one term per year rather than a single slope.
mydata[["pub_year"]] <- as.factor(mydata[["pub_year"]])

In [11]:
# Choose the reference (baseline) level of each categorical predictor, so the
# model coefficients are contrasts against Male, last_position and domestic.
mydata$gender <- relevel(mydata$gender, ref = "Male")
mydata$authorship_pos <- relevel(mydata$authorship_pos, ref = "last_position")
mydata$affiliation_cate <- relevel(mydata$affiliation_cate, ref = "domestic")

In [12]:
# Formula fragment listing the subject-area covariates, one term per field.
# It starts with " + " so it can be appended directly to a model's base formula
# string; every continued line ends in "+" so the pieces parse as a single sum.
keywords <- " + Social_Sciences + Materials_Science + Engineering + Chemistry + \
        Biochemistry__Genetics_and_Molecular_Biology + Medicine + Nursing + Agricultural_and_Biological_Sciences + \
        Pharmacology__Toxicology_and_Pharmaceutics + Neuroscience + Business__Management_and_Accounting + \
        Economics__Econometrics_and_Finance + Chemical_Engineering + Physics_and_Astronomy + Computer_Science + \
        Decision_Sciences + Health_Professions + Psychology + Immunology_and_Microbiology + Dentistry + \
        Earth_and_Planetary_Sciences + Environmental_Science + Mathematics + Arts_and_Humanities + Energy + \
        Veterinary + General"

In [13]:
# log2(1 + x) transform of the author citation count; the +1 keeps zero
# counts finite.
mydata[["author_citation_log"]] <- log2(mydata[["author_citation"]] + 1)

In [14]:
# log2(1 + x) transform of the total tweet count; the +1 keeps zero counts
# finite.
mydata[["total_num_tweets_log"]] <- log2(mydata[["total_num_tweets"]] + 1)

In [15]:
# log2(1 + x) versions of the three follower-count snapshot columns; each
# result is stored next to its source under the same name plus "_log".
mydata[["follower_cn_snapshot_log"]] <-
  log2(mydata[["follower_cn_snapshot"]] + 1)
mydata[["follower_cn_snapshot_ours_log"]] <-
  log2(mydata[["follower_cn_snapshot_ours"]] + 1)
mydata[["follower_cn_snapshot_combine_log"]] <-
  log2(mydata[["follower_cn_snapshot_combine"]] + 1)

## Num of scientists

### Active subset

In [16]:
# Number of rows flagged as active on Twitter. Counting matches with `%in%`
# keeps rows with a missing is_active_on_twitter out of the total; indexing the
# data frame with `== "True"` would add one all-NA row per missing flag and
# inflate nrow().
sum(mydata$is_active_on_twitter %in% "True")

Negative Binomial Regression

In [26]:
# Negative binomial regression with `researcher` as the response, fitted on the
# rows where is_active_on_twitter == "True". The right-hand side is the
# gender x self_promotion interaction plus the author, affiliation and paper
# controls below, followed by the subject-area terms held in `keywords`.
base_str <- "researcher ~ 1 + gender * self_promotion + authorship_pos + follower_cn_snapshot_log + \
        author_pub_count_cate + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + \
        author_citation_log + pub_year"
# paste() joins with a single space by default; `keywords` supplies its own
# leading " + ".
equation <- as.formula(paste(base_str, keywords))
m_yes <- glm.nb(
  formula = equation,
  data = mydata[mydata$is_active_on_twitter == "True", ]
)

In [27]:
# Coefficient table (estimate, std. error, statistic, p-value) of the fit.
broom::tidy(m_yes)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-1.0863132302,0.0169117219,-64.234336,0.0
genderFemale,0.1357222726,0.0058153946,23.338446,1.80572e-120
self_promotionTrue,1.2716122186,0.004834114,263.049695,0.0
authorship_posfirst_position,0.0618635652,0.0063306594,9.772057,1.4840760000000001e-22
authorship_posmiddle_position,0.3039535696,0.0049154764,61.836034,0.0
authorship_possolo_author,-0.2620250172,0.0119059613,-22.007884,2.420265e-107
follower_cn_snapshot_log,0.0959089444,0.0010402839,92.194974,0.0
author_pub_count_cate,-0.1887934776,0.002047297,-92.215968,0.0
affiliation_rank_cate,-0.0198679448,0.0007318678,-27.146905,2.754286e-162
affiliation_cateinternational,0.0445921754,0.004271397,10.439717,1.6329680000000002e-25


In [28]:
# Deviance-based pseudo R-squared of the fitted model: one minus the ratio of
# the residual deviance to the null (intercept-only) deviance, i.e. the share
# of the null deviance that the covariates account for.
1 - m_yes[["deviance"]] / m_yes[["null.deviance"]]


In [29]:
# Export the fitted model as a LaTeX table showing each coefficient with its
# significance stars and p-value; stars mark p < 0.05, 0.01 and 0.001.
# Post-processing tip: in Sublime, search for " \\\\ \n  &" and replace it with
# " &", which joins each p-value row onto its coefficient row.
stargazer(
  m_yes,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vc*p",
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Wed, Nov 23, 2022 - 11:59:06 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & researcher \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.136$^{***}$ \\ 
  & p = 0.000 \\ 
  self\_promotionTrue & 1.272$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.062$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & 0.304$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.262$^{***}$ \\ 
  & p = 0.000 \\ 
  follower\_cn\_snapshot\_log & 0.096$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.189$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.020$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.045$^{***}$ \\ 
  

## Num of non-scientists

### Active subset

Negative Binomial Regression

In [30]:
# Negative binomial regression with `num_non_scientists` as the response,
# fitted on the rows where is_active_on_twitter == "True". The right-hand side
# is the gender x self_promotion interaction plus the author, affiliation and
# paper controls below, followed by the subject-area terms held in `keywords`.
# NOTE: this reuses the names base_str, equation and m_yes, so any earlier
# model stored in m_yes is replaced.
base_str <- "num_non_scientists ~ 1 + gender * self_promotion + authorship_pos + follower_cn_snapshot_log + \
        author_pub_count_cate + affiliation_rank_cate + affiliation_cate + num_authors + journal_impact + \
        author_citation_log + pub_year"
# paste() joins with a single space by default; `keywords` supplies its own
# leading " + ".
equation <- as.formula(paste(base_str, keywords))
m_yes <- glm.nb(
  formula = equation,
  data = mydata[mydata$is_active_on_twitter == "True", ]
)

In [31]:
# Coefficient table (estimate, std. error, statistic, p-value) of the fit.
broom::tidy(m_yes)

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.035674698,0.01520275,2.3465953,0.01894582
genderFemale,0.145484229,0.005210922,27.9190929,1.564857e-171
self_promotionTrue,1.120696627,0.004471013,250.6583263,0.0
authorship_posfirst_position,0.126166871,0.005810908,21.7120738,1.57776e-104
authorship_posmiddle_position,0.277811146,0.004492993,61.832086,0.0
authorship_possolo_author,-0.181863672,0.01084153,-16.7747257,3.7358e-63
follower_cn_snapshot_log,0.092311363,0.0009540098,96.7614379,0.0
author_pub_count_cate,-0.078904653,0.001882757,-41.9090985,0.0
affiliation_rank_cate,-0.009664552,0.0006725818,-14.3693335,8.060322e-47
affiliation_cateinternational,0.06579122,0.003932545,16.7299352,7.932457e-63


In [32]:
# Deviance-based pseudo R-squared of the fitted model: one minus the ratio of
# the residual deviance to the null (intercept-only) deviance, i.e. the share
# of the null deviance that the covariates account for.
1 - m_yes[["deviance"]] / m_yes[["null.deviance"]]


In [33]:
# Export the fitted model as a LaTeX table showing each coefficient with its
# significance stars and p-value; stars mark p < 0.05, 0.01 and 0.001.
# Post-processing tip: in Sublime, search for " \\\\ \n  &" and replace it with
# " &", which joins each p-value row onto its coefficient row.
stargazer(
  m_yes,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vc*p",
  star.cutoffs = c(0.05, 0.01, 0.001)
)


% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
% Date and time: Thu, Nov 24, 2022 - 12:00:36 AM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & num\_non\_scientists \\ 
\hline \\[-1.8ex] 
 genderFemale & 0.145$^{***}$ \\ 
  & p = 0.000 \\ 
  self\_promotionTrue & 1.121$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posfirst\_position & 0.126$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_posmiddle\_position & 0.278$^{***}$ \\ 
  & p = 0.000 \\ 
  authorship\_possolo\_author & $-$0.182$^{***}$ \\ 
  & p = 0.000 \\ 
  follower\_cn\_snapshot\_log & 0.092$^{***}$ \\ 
  & p = 0.000 \\ 
  author\_pub\_count\_cate & $-$0.079$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_rank\_cate & $-$0.010$^{***}$ \\ 
  & p = 0.000 \\ 
  affiliation\_cateinternational & 0.066$^{**