In [27]:
# Root directory of the project data. Keep the trailing slash: the input path
# is built by concatenating this prefix with a file name.
data_root <- "/shared/0/projects/news-quotes/"

In [28]:
library("lme4")
library("margins")
library("stargazer")
library("ggeffects")
library("broom")

In [29]:
# Load the regression table. `stringsAsFactors = TRUE` is set explicitly
# because the `relevel()` calls further down require factor columns, and since
# R 4.0.0 `read.csv()` no longer converts strings to factors by default.
mydata <- read.csv(
  paste0(data_root, "reg_data.csv"),
  header = TRUE,
  stringsAsFactors = TRUE
)

In [58]:
# mydata <- within(mydata, rm("Body_weight", "Demographics", "Clinical_Practice"))

In [30]:
# Number of rows in the full data set.
nrow(mydata)

In [31]:
# Number of columns in the full data set.
ncol(mydata)

In [32]:
# Coarsen the reporter ethnicity labels into broader groups (European, Asian,
# OtherUnknown). Labels that are not in the lookup (such as the "English"
# reference level used below) and NAs are left unchanged.
mydata$reporter_eth_ethnea_broad <- local({
  broad_group <- c(
    EasternEuropean = "European",
    WesternNorthernEuropean = "European",
    SouthernEuropean = "European",
    Chinese = "Asian",
    Indian = "Asian",
    EastAsian = "Asian",
    MiddleEastern = "OtherUnknown",
    African = "OtherUnknown",
    unknown = "OtherUnknown"
  )
  eth <- as.character(mydata$reporter_eth_ethnea_broad)
  needs_recode <- eth %in% names(broad_group)
  eth[needs_recode] <- unname(broad_group[eth[needs_recode]])
  as.factor(eth)
})

In [33]:
# Make "English" the baseline ethnicity and "M" the baseline gender for both
# authors and reporters.
mydata <- within(mydata, {
  author_eth_ethnea_broad <- relevel(author_eth_ethnea_broad, ref = "English")
  reporter_eth_ethnea_broad <- relevel(reporter_eth_ethnea_broad, ref = "English")
  author_gender_ethnea <- relevel(author_gender_ethnea, ref = "M")
  reporter_gender_ethnea <- relevel(reporter_gender_ethnea, ref = "M")
})

In [34]:
# Choose the baseline level of the remaining categorical author covariates.
mydata <- within(mydata, {
  author_pos_cate <- relevel(author_pos_cate, ref = "last_position")
  is_top_author <- relevel(is_top_author, ref = "yes")
  is_corresponding <- relevel(is_corresponding, ref = "yes")
  affiliation_cate <- relevel(affiliation_cate, ref = "domestic")
})

In [35]:
# Fixed-effects part of the model, with `is_author_mentioned` as the response.
# It is kept as a string so that the keyword terms and the random-effects
# terms can be appended before the whole text is converted to a formula.
base_str <- "is_author_mentioned ~ 1 + author_eth_ethnea_broad + author_gender_ethnea + reporter_eth_ethnea_broad + reporter_gender_ethnea + \
          last_name_length + last_name_prob + author_pos_cate + author_rank + is_top_author + is_corresponding + \
          affiliation_rank + affiliation_cate + num_authors + mention_year_center + gap_in_years + \
          num_words + num_mentioned_papers + FleschReadingEase + sentences_per_paragraph + type_token_ratio"

# Keyword columns, written as extra `+ term` entries that are appended to
# `base_str` when the formula is built.
# 199 keywords (count taken from the original note — TODO confirm).
# The literal spans two source lines; the embedded line break follows a `+`
# and is treated as whitespace when the text is parsed as a formula.
keywords <- " + Cell_biology + Genetics + Biology + Body_mass_index + Health_care + Disease + Gerontology + Population + Public_health + Medicine + Materials_science + Composite_material + Nanotechnology + Cohort_study + Social_psychology + Cohort + Psychological_intervention + Young_adult + Family_medicine + Cancer + Surgery + Randomized_controlled_trial + Placebo + Clinical_trial + Nursing + Applied_psychology + Human_factors_and_ergonomics + Injury_prevention + Suicide_prevention + Psychiatry + Occupational_safety_and_health + Intensive_care_medicine + Pediatrics + Hazard_ratio + Confidence_interval + Retrospective_cohort_study + Vaccination + Psychology + Perception + Cognition + Environmental_health + Obesity + Risk_factor + Quality_of_life + Physical_therapy + Weight_loss + Anatomy + Mental_health + Psychosocial + Anxiety + Distress + Business + Public_relations + Marketing + Immunology + Global_warming + Economics + Climatology + Climate_change + General_surgery + Endocrinology + Internal_medicine + Receptor + Inflammation + Stimulus__physiology_ + Immune_system + Meta_analysis + Sociology + Gene + Cancer_research + Breast_cancer + Cell + Diabetes_mellitus + Blood_pressure + Oncology + Gynecology + Communication + Cognitive_psychology + Adverse_effect + Clinical_endpoint + Pharmacology + Virology + Risk_assessment + Transcription_factor + Political_science + Ecology + Geography + Cross_sectional_study + Odds_ratio + Comorbidity + Environmental_engineering + Chemistry + Medical_emergency + Physics + Social_science + Ethnic_group + Labour_economics + Antibody + Geomorphology + Geophysics + Geology + Ranging + Stroke + Environmental_resource_management + Type_2_diabetes + Cardiology + Molecular_biology + Developmental_psychology + Agriculture + Signal_transduction + Optoelectronics + Psychotherapist + Affect__psychology_ + Clinical_psychology + Anesthesia + Atmospheric_sciences + In_vivo + Biochemistry + Analytical_chemistry + Neuroscience + Botany + 
Gene_expression + Politics + Demography + Socioeconomic_status + Mortality_rate + Virus + Optics + Condensed_matter_physics + Bioinformatics + Law + Physical_medicine_and_rehabilitation + Stem_cell + Biodiversity + Astrophysics + Astronomy + Radiology + Pathology + Proportional_hazards_model + Chemotherapy + Predation + Food_science + Artificial_intelligence + Overweight + Antibiotics + Microbiology + Zoology + Paleontology + Habitat + Public_administration + Ecosystem + Economic_growth + Organic_chemistry + Government + Autism + Transplantation + Gastroenterology + Insulin + Engineering + Computer_science + Observational_study + Heart_disease + Epidemiology + Obstetrics + Pregnancy + Fishery + Alternative_medicine + Logistic_regression + Offspring + Mood + Bacteria + Prostate_cancer + Evolutionary_biology + Phenomenon + Longitudinal_study + Genome + Mutation + Pedagogy + Dementia + Relative_risk + Microeconomics + Odds + Feeling + Oceanography + Emergency_medicine + Personality + Prospective_cohort_study + Hippocampus + Greenhouse_gas + Biomarker__medicine_ + Myocardial_infarction + Socioeconomics + Drug + Environmental_science + Epigenetics + Inorganic_chemistry + Emergency_department + Medical_prescription + Phenotype"

In [36]:
# Assemble the full model formula: the fixed effects from `base_str` and
# `keywords`, plus random intercepts for `journal_title` and `outlet`.
equation_bar <- as.formula(
  paste(base_str, keywords, " + (1|journal_title) + (1|outlet)")
)

### Press Release

In [37]:
# Keep only the rows whose `category` is "PressRelease". `which()` drops rows
# with a missing `category`; a bare logical index would turn each of them into
# an all-NA row and inflate the row count.
subdata <- mydata[which(mydata$category == "PressRelease"), ]

In [38]:
# Number of rows in the current subset.
nrow(subdata)

In [39]:
# Number of columns in the current subset.
ncol(subdata)

In [40]:
# Mixed-effects logistic regression fitted on the press-release subset.
# `nAGQ = 0` selects lme4's faster, less exact estimation mode (see the
# `glmer` documentation).
m_pr <- glmer(
  formula = equation_bar,
  data = subdata,
  family = "binomial",
  control = glmerControl(optimizer = "nloptwrap"),
  nAGQ = 0
)

“Some predictor variables are on very different scales: consider rescaling”

In [41]:
# Coefficient table of the press-release model.
# NOTE(review): newer broom releases no longer ship tidiers for lme4 models
# (they moved to broom.mixed) — confirm the installed broom version supports
# this call.
tidy(m_pr)

term,estimate,std.error,statistic,p.value,group
(Intercept),2.328820e+00,2.004092e-01,11.6203253,3.248967e-31,fixed
author_eth_ethnea_broadAfrican,-2.047917e-01,8.609104e-02,-2.3787808,1.737000e-02,fixed
author_eth_ethnea_broadChinese,-2.963318e-01,2.533346e-02,-11.6972476,1.316547e-31,fixed
author_eth_ethnea_broadEastAsian,-2.495585e-01,3.031060e-02,-8.2333745,1.820120e-16,fixed
author_eth_ethnea_broadEasternEuropean,-6.614502e-02,3.441791e-02,-1.9218194,5.462849e-02,fixed
author_eth_ethnea_broadIndian,5.544404e-02,3.146798e-02,1.7619190,7.808298e-02,fixed
author_eth_ethnea_broadMiddleEastern,1.494418e-02,2.967976e-02,0.5035144,6.146027e-01,fixed
author_eth_ethnea_broadSouthernEuropean,-1.424230e-01,2.244180e-02,-6.3463248,2.205195e-10,fixed
author_eth_ethnea_broadunknown,-4.797335e-01,8.792778e-02,-5.4559947,4.869944e-08,fixed
author_eth_ethnea_broadWesternNorthernEuropean,-3.565291e-02,1.809565e-02,-1.9702479,4.880996e-02,fixed


In [16]:
# marg_eff <- marginal_effects(m, data = subdata, variables = c("eth_first_author", "gender_first_author"))

In [18]:
# write.csv(marg_eff, "/shared/0/projects/news-quotes/reg_results/news_type/General.csv", row.names = FALSE)

In [42]:
# Marginal effects of author ethnicity and author gender for the
# press-release model, evaluated on the same subset the model was fitted on.
margins_eff <- margins(
  m_pr,
  data = subdata,
  variables = c("author_eth_ethnea_broad", "author_gender_ethnea")
)

In [43]:
# Tabulate the average marginal effects (AME) with standard errors, z
# statistics, p-values and confidence bounds.
summary(margins_eff)

factor,AME,SE,z,p,lower,upper
author_eth_ethnea_broadAfrican,-0.035291863,0.015165312,-2.3271438,0.01995761,-0.0650153283,-0.005568397
author_eth_ethnea_broadChinese,-0.051519768,0.00469775,-10.966904,5.512781000000001e-28,-0.0607271877,-0.04231235
author_eth_ethnea_broadEastAsian,-0.043196291,0.005478611,-7.8845337,3.157123e-15,-0.0539341706,-0.03245841
author_eth_ethnea_broadEasternEuropean,-0.011232918,0.005889638,-1.907234,0.0564903,-0.0227763967,0.0003105608
author_eth_ethnea_broadIndian,0.009283741,0.005252286,1.7675619,0.07713417,-0.0010105509,0.01957803
author_eth_ethnea_broadMiddleEastern,0.002514415,0.004988169,0.5040757,0.6142082,-0.0072622169,0.01229105
author_eth_ethnea_broadSouthernEuropean,-0.024387148,0.003943864,-6.1835677,6.266885e-10,-0.0321169785,-0.01665732
author_eth_ethnea_broadunknown,-0.084717328,0.016214578,-5.2247629,1.743784e-07,-0.116497318,-0.05293734
author_eth_ethnea_broadWesternNorthernEuropean,-0.006033941,0.00307263,-1.9637707,0.04955669,-0.0120561857,-1.169664e-05
author_gender_ethneaF,0.006053793,0.002625263,2.3059757,0.02111199,0.0009083718,0.01119921


In [44]:
# Save the marginal-effects summary for the press-release model. The output
# path is derived from `data_root` instead of repeating the project directory.
write.csv(
  summary(margins_eff),
  paste0(data_root, "reg_results/news_type/PressRelease_margins.csv"),
  row.names = FALSE
)


### Sci Tech

In [9]:
# Keep only the rows whose `category` is "SciTech". `which()` drops rows with a
# missing `category`; a bare logical index would turn each of them into an
# all-NA row and inflate the row count.
subdata <- mydata[which(mydata$category == "SciTech"), ]

In [10]:
# Number of rows in the current subset.
nrow(subdata)

In [11]:
# Number of columns in the current subset.
ncol(subdata)

In [15]:
# Mixed-effects logistic regression fitted on the SciTech subset.
# `nAGQ = 0` selects lme4's faster, less exact estimation mode (see the
# `glmer` documentation).
m_sci <- glmer(
  formula = equation_bar,
  data = subdata,
  family = "binomial",
  control = glmerControl(optimizer = "nloptwrap"),
  nAGQ = 0
)

“Some predictor variables are on very different scales: consider rescaling”

In [45]:
# tidy(m_sci)

In [16]:
# marg_eff <- marginal_effects(m, data = subdata, variables = c("eth_first_author", "gender_first_author"))

In [18]:
# write.csv(marg_eff, "/shared/0/projects/news-quotes/reg_results/news_type/General.csv", row.names = FALSE)

In [None]:
# Marginal effects of author ethnicity and author gender for the SciTech
# model, evaluated on the same subset the model was fitted on.
margins_eff <- margins(
  m_sci,
  data = subdata,
  variables = c("author_eth_ethnea_broad", "author_gender_ethnea")
)

In [None]:
# Tabulate the marginal effects computed for the SciTech model.
summary(margins_eff)

In [None]:
# Save the marginal-effects summary for the SciTech model. The output path is
# derived from `data_root` instead of repeating the project directory.
write.csv(
  summary(margins_eff),
  paste0(data_root, "reg_results/news_type/SciTech_margins.csv"),
  row.names = FALSE
)


### General News

In [13]:
# Keep only the rows whose `category` is "General". `which()` drops rows with a
# missing `category`; a bare logical index would turn each of them into an
# all-NA row and inflate the row count.
subdata <- mydata[which(mydata$category == "General"), ]

In [14]:
# Number of rows in the current subset.
nrow(subdata)

In [15]:
# Number of columns in the current subset.
ncol(subdata)

In [19]:
# Mixed-effects logistic regression fitted on the General-news subset.
# `nAGQ = 0` selects lme4's faster, less exact estimation mode (see the
# `glmer` documentation).
m_gn <- glmer(
  formula = equation_bar,
  data = subdata,
  family = "binomial",
  control = glmerControl(optimizer = "nloptwrap"),
  nAGQ = 0
)

“Some predictor variables are on very different scales: consider rescaling”

In [46]:
# tidy(m_gn)

In [16]:
# marg_eff <- marginal_effects(m, data = subdata, variables = c("eth_first_author", "gender_first_author"))

In [18]:
# write.csv(marg_eff, "/shared/0/projects/news-quotes/reg_results/news_type/General.csv", row.names = FALSE)

In [None]:
# Marginal effects of author ethnicity and author gender for the General-news
# model, evaluated on the same subset the model was fitted on.
margins_eff <- margins(
  m_gn,
  data = subdata,
  variables = c("author_eth_ethnea_broad", "author_gender_ethnea")
)

In [None]:
# Tabulate the marginal effects computed for the General-news model.
summary(margins_eff)

In [None]:
# Save the marginal-effects summary for the General-news model. The output
# path is derived from `data_root` instead of repeating the project directory.
write.csv(
  summary(margins_eff),
  paste0(data_root, "reg_results/news_type/General_margins.csv"),
  row.names = FALSE
)
