In [1]:
# Root directory of the project data. The trailing slash matters: file paths
# are built from it by plain string concatenation.
data_root <- "/shared/0/projects/news-quotes/"

In [69]:
library("lme4")
library("margins")
library("stargazer")
library("ggeffects")
library("broom")

In [4]:
# Raise the notebook display limits for tabular output to 500 rows and
# 200 columns so wide/long results are not truncated.
options(repr.matrix.max.rows=500, repr.matrix.max.cols=200)

In [4]:
# help(glmer)

In [53]:
# Load the regression dataset.
# stringsAsFactors = TRUE is set explicitly: the relevel() calls further down
# only work on factor columns, and since R 4.0.0 read.csv() no longer converts
# strings to factors by default.
mydata <- read.csv(
  paste0(data_root, "reg_data.csv"),
  header = TRUE,
  stringsAsFactors = TRUE
)

In [54]:
nrow(mydata)

In [55]:
ncol(mydata)

In [56]:
# Coarsen the reporter ethnicity labels: the nine fine-grained labels listed
# below collapse into 'European', 'Asian' and 'OtherUnknown'; any label that
# is not listed (e.g. 'English') is left unchanged. The column ends up as a
# factor again.
mydata$reporter_eth_ethnea_broad <- local({
  coarse_group <- c(
    EasternEuropean = "European",
    WesternNorthernEuropean = "European",
    SouthernEuropean = "European",
    Chinese = "Asian",
    Indian = "Asian",
    EastAsian = "Asian",
    MiddleEastern = "OtherUnknown",
    African = "OtherUnknown",
    unknown = "OtherUnknown"
  )
  labels <- as.character(mydata$reporter_eth_ethnea_broad)
  # %in% is FALSE for NA, so missing labels stay missing.
  needs_recode <- labels %in% names(coarse_group)
  labels[needs_recode] <- coarse_group[labels[needs_recode]]
  as.factor(labels)
})

In [58]:
# Choose the baseline categories for the ethnicity and gender factors:
# 'English' for both ethnicity columns and 'M' for both gender columns.
mydata$author_eth_ethnea_broad <- relevel(mydata$author_eth_ethnea_broad, ref = "English")
mydata$reporter_eth_ethnea_broad <- relevel(mydata$reporter_eth_ethnea_broad, ref = "English")
mydata$author_gender_ethnea <- relevel(mydata$author_gender_ethnea, ref = "M")
mydata$reporter_gender_ethnea <- relevel(mydata$reporter_gender_ethnea, ref = "M")

In [57]:
# Choose the baseline categories for the author/affiliation factors:
# last author position, top author, corresponding author, domestic affiliation.
mydata$author_pos_cate <- relevel(mydata$author_pos_cate, ref = "last_position")
mydata$is_top_author <- relevel(mydata$is_top_author, ref = "yes")
mydata$is_corresponding <- relevel(mydata$is_corresponding, ref = "yes")
mydata$affiliation_cate <- relevel(mydata$affiliation_cate, ref = "domestic")

In [None]:
# mydata[mydata$author_pos_cate == 'first_position', ]

In [145]:
# Model 1: logistic regression on author ethnicity + gender only.
# The formula is given as a formula object rather than a character string so
# that the stored model call carries a real formula for tools that inspect it.
# NOTE(review): the combined stargazer table labels m1/m2 separately from
# m3-m5 (the second dependent-variable header prints "NA"), presumably because
# of the string formulas -- confirm after re-running.
m1 <- glm(
  formula = is_author_mentioned ~ 1 + author_eth_ethnea_broad + author_gender_ethnea,
  data = mydata,
  family = "binomial"
)

In [146]:
tidy(m1)

term,estimate,std.error,statistic,p.value
(Intercept),-0.307519033,0.004759213,-64.6155251,0.0
author_eth_ethnea_broadAfrican,-0.456738592,0.041827318,-10.9196241,9.287941e-28
author_eth_ethnea_broadChinese,0.131705138,0.011801209,11.1603087,6.376975e-29
author_eth_ethnea_broadEastAsian,0.015380591,0.015506443,0.9918839,0.3212542
author_eth_ethnea_broadEasternEuropean,0.211414999,0.015866362,13.3247303,1.662325e-40
author_eth_ethnea_broadIndian,0.138169624,0.014564941,9.4864525,2.390307e-21
author_eth_ethnea_broadMiddleEastern,0.099526128,0.013229736,7.5229109,5.356994e-14
author_eth_ethnea_broadSouthernEuropean,-0.003009557,0.009948822,-0.3025039,0.762268
author_eth_ethnea_broadunknown,-0.209724852,0.041948476,-4.9995821,5.74547e-07
author_eth_ethnea_broadWesternNorthernEuropean,-0.001658936,0.007531613,-0.220263,0.8256663


In [147]:
# Model 2: model 1 plus author-level features (name, position, rank,
# corresponding/top-author flags, affiliation, number of authors).
# The formula is given as a formula object rather than a character string so
# that the stored model call carries a real formula for tools that inspect it.
# NOTE(review): the combined stargazer table labels m1/m2 separately from
# m3-m5 (the second dependent-variable header prints "NA"), presumably because
# of the string formulas -- confirm after re-running.
m2 <- glm(
  formula = is_author_mentioned ~ 1 + author_eth_ethnea_broad + author_gender_ethnea +
    last_name_length + last_name_prob + author_pos_cate + author_rank +
    is_top_author + is_corresponding +
    affiliation_rank + affiliation_cate + num_authors,
  data = mydata,
  family = "binomial"
)

In [148]:
tidy(m2)

term,estimate,std.error,statistic,p.value
(Intercept),0.4482464,0.03472697,12.907731,4.0712179999999995e-38
author_eth_ethnea_broadAfrican,-0.3944433,0.04375802,-9.014193,1.9832429999999998e-19
author_eth_ethnea_broadChinese,0.09927798,0.0130985,7.579341,3.473158e-14
author_eth_ethnea_broadEastAsian,0.1229969,0.01643778,7.482572,7.288173e-14
author_eth_ethnea_broadEasternEuropean,0.2534993,0.01695727,14.949299,1.5739839999999999e-50
author_eth_ethnea_broadIndian,0.1537399,0.01549344,9.922905,3.3098040000000006e-23
author_eth_ethnea_broadMiddleEastern,0.1342654,0.01423403,9.432708,3.996368e-21
author_eth_ethnea_broadSouthernEuropean,0.04072755,0.01078916,3.774857,0.0001600996
author_eth_ethnea_broadunknown,-0.274819,0.04367402,-6.292506,3.123808e-10
author_eth_ethnea_broadWesternNorthernEuropean,0.07046948,0.008446224,8.343311,7.223863e-17


In [11]:
# Base model specification shared by models 3-5, kept as a string so that
# further terms (keyword controls, random effects) can be pasted onto it
# before conversion with as.formula(). It extends model 2 with reporter
# ethnicity/gender, mention timing, and article-text features.
base_str <- "is_author_mentioned ~ 1 + author_eth_ethnea_broad + author_gender_ethnea + reporter_eth_ethnea_broad + reporter_gender_ethnea + \
          last_name_length + last_name_prob + author_pos_cate + author_rank + is_top_author + is_corresponding + \
          affiliation_rank + affiliation_cate + num_authors + mention_year_center + gap_in_years + \
          num_words + num_mentioned_papers + FleschReadingEase + sentences_per_paragraph + type_token_ratio"

In [150]:
# Model 3: logistic regression on the full base specification in base_str
# (no keyword controls, no random effects).
m3 <- glm(formula = base_str, data = mydata, family = "binomial")

In [151]:
tidy(m3)

term,estimate,std.error,statistic,p.value
(Intercept),0.4634964,0.0527769,8.7821841,1.603304e-18
author_eth_ethnea_broadAfrican,-0.3876414,0.04646538,-8.342586,7.268326000000001e-17
author_eth_ethnea_broadChinese,-0.05384181,0.01397807,-3.8518777,0.0001172156
author_eth_ethnea_broadEastAsian,0.03692691,0.01763357,2.094125,0.03624885
author_eth_ethnea_broadEasternEuropean,0.1487907,0.01825293,8.151607,3.591197e-16
author_eth_ethnea_broadIndian,0.04782908,0.0165843,2.8839978,0.003926615
author_eth_ethnea_broadMiddleEastern,0.0832559,0.01532954,5.4310757,5.601538e-08
author_eth_ethnea_broadSouthernEuropean,-0.01703226,0.01162895,-1.464643,0.1430184
author_eth_ethnea_broadunknown,-0.3536939,0.04667558,-7.5777073,3.517149e-14
author_eth_ethnea_broadWesternNorthernEuropean,0.02675263,0.009173428,2.9163173,0.003541901


In [78]:
# 199 keywords, written as a formula fragment: each column name is preceded by
# " + " so the whole string can be pasted directly after base_str.
keywords <- " + Cell_biology + Genetics + Biology + Body_mass_index + Health_care + Disease + Gerontology + Population + Public_health + Medicine + Materials_science + Composite_material + Nanotechnology + Cohort_study + Social_psychology + Cohort + Psychological_intervention + Young_adult + Family_medicine + Cancer + Surgery + Randomized_controlled_trial + Placebo + Clinical_trial + Nursing + Applied_psychology + Human_factors_and_ergonomics + Injury_prevention + Suicide_prevention + Psychiatry + Occupational_safety_and_health + Intensive_care_medicine + Pediatrics + Hazard_ratio + Confidence_interval + Retrospective_cohort_study + Vaccination + Psychology + Perception + Cognition + Environmental_health + Obesity + Risk_factor + Quality_of_life + Physical_therapy + Weight_loss + Anatomy + Mental_health + Psychosocial + Anxiety + Distress + Business + Public_relations + Marketing + Immunology + Global_warming + Economics + Climatology + Climate_change + General_surgery + Endocrinology + Internal_medicine + Receptor + Inflammation + Stimulus__physiology_ + Immune_system + Meta_analysis + Sociology + Gene + Cancer_research + Breast_cancer + Cell + Diabetes_mellitus + Blood_pressure + Oncology + Gynecology + Communication + Cognitive_psychology + Adverse_effect + Clinical_endpoint + Pharmacology + Virology + Risk_assessment + Transcription_factor + Political_science + Ecology + Geography + Cross_sectional_study + Odds_ratio + Comorbidity + Environmental_engineering + Chemistry + Medical_emergency + Physics + Social_science + Ethnic_group + Labour_economics + Antibody + Geomorphology + Geophysics + Geology + Ranging + Stroke + Environmental_resource_management + Type_2_diabetes + Cardiology + Molecular_biology + Developmental_psychology + Agriculture + Signal_transduction + Optoelectronics + Psychotherapist + Affect__psychology_ + Clinical_psychology + Anesthesia + Atmospheric_sciences + In_vivo + Biochemistry + Analytical_chemistry + Neuroscience + Botany + 
Gene_expression + Politics + Demography + Socioeconomic_status + Mortality_rate + Virus + Optics + Condensed_matter_physics + Bioinformatics + Law + Physical_medicine_and_rehabilitation + Stem_cell + Biodiversity + Astrophysics + Astronomy + Radiology + Pathology + Proportional_hazards_model + Chemotherapy + Predation + Food_science + Artificial_intelligence + Overweight + Antibiotics + Microbiology + Zoology + Paleontology + Habitat + Public_administration + Ecosystem + Economic_growth + Organic_chemistry + Government + Autism + Transplantation + Gastroenterology + Insulin + Engineering + Computer_science + Observational_study + Heart_disease + Epidemiology + Obstetrics + Pregnancy + Fishery + Alternative_medicine + Logistic_regression + Offspring + Mood + Bacteria + Prostate_cancer + Evolutionary_biology + Phenomenon + Longitudinal_study + Genome + Mutation + Pedagogy + Dementia + Relative_risk + Microeconomics + Odds + Feeling + Oceanography + Emergency_medicine + Personality + Prospective_cohort_study + Hippocampus + Greenhouse_gas + Biomarker__medicine_ + Myocardial_infarction + Socioeconomics + Drug + Environmental_science + Epigenetics + Inorganic_chemistry + Emergency_department + Medical_prescription + Phenotype"

In [153]:
# Model 4: the base specification plus one control term per keyword.
# paste() joins the two formula fragments with its default single space.
equation <- as.formula(paste(base_str, keywords))
m4 <- glm(
  formula = equation,
  data = mydata,
  family = "binomial"
)

In [154]:
tidy(m4)

term,estimate,std.error,statistic,p.value
(Intercept),1.318766,0.05865851,22.48209542,6.213621e-112
author_eth_ethnea_broadAfrican,-0.3710606,0.04720065,-7.86134452,3.800319e-15
author_eth_ethnea_broadChinese,-0.2538687,0.01459414,-17.39524759,8.963857e-68
author_eth_ethnea_broadEastAsian,-0.1793023,0.01819414,-9.854952,6.524787000000001e-23
author_eth_ethnea_broadEasternEuropean,-0.02139949,0.01871939,-1.14317206,0.2529672
author_eth_ethnea_broadIndian,-0.01983846,0.01715031,-1.15674046,0.2473784
author_eth_ethnea_broadMiddleEastern,0.01426086,0.01574671,0.90564073,0.365126
author_eth_ethnea_broadSouthernEuropean,-0.1141493,0.01193668,-9.56289917,1.145033e-21
author_eth_ethnea_broadunknown,-0.3795141,0.04760964,-7.97137111,1.569235e-15
author_eth_ethnea_broadWesternNorthernEuropean,-0.04748734,0.009397026,-5.05344332,4.339151e-07


In [155]:
# Model 5: model 4 plus random intercepts for journal_title and outlet,
# fitted as a binomial mixed-effects model with lme4::glmer().
equation_bar <- as.formula(
  paste(base_str, keywords, " + (1|journal_title) + (1|outlet)")
)
m5 <- glmer(
  formula = equation_bar,
  data = mydata,
  family = "binomial",
  control = glmerControl(optimizer = "nloptwrap"),
  # nAGQ = 0 is kept from the original fit; see ?glmer for the speed/accuracy
  # trade-off of this setting.
  nAGQ = 0
)


In [156]:
summary(m5)


Correlation matrix not shown by default, as p = 236 > 12.
Use print(obj, correlation=TRUE)  or
    vcov(obj)        if you need it



Generalized linear mixed model fit by maximum likelihood (Adaptive
  Gauss-Hermite Quadrature, nAGQ = 0) [glmerMod]
 Family: binomial  ( logit )
Formula: 
is_author_mentioned ~ 1 + author_eth_ethnea_broad + author_gender_ethnea +  
    reporter_eth_ethnea_broad + reporter_gender_ethnea + last_name_length +  
    last_name_prob + author_pos_cate + author_rank + is_top_author +  
    is_corresponding + affiliation_rank + affiliation_cate +  
    num_authors + mention_year_center + gap_in_years + num_words +  
    num_mentioned_papers + FleschReadingEase + sentences_per_paragraph +  
    type_token_ratio + Cell_biology + Genetics + Biology + Body_mass_index +  
    Health_care + Disease + Gerontology + Population + Public_health +  
    Medicine + Materials_science + Composite_material + Nanotechnology +  
    Cohort_study + Social_psychology + Cohort + Psychological_intervention +  
    Young_adult + Family_medicine + Cancer + Surgery + Randomized_controlled_trial +  
    Placebo + Clini

In [157]:
tidy(m5)

term,estimate,std.error,statistic,p.value,group
(Intercept),0.9681466,0.08516415,11.3680066,6.0350109999999995e-30,fixed
author_eth_ethnea_broadAfrican,-0.3660354,0.05216946,-7.0162765,2.278589e-12,fixed
author_eth_ethnea_broadChinese,-0.3762245,0.01604767,-23.44417613,1.5156950000000002e-121,fixed
author_eth_ethnea_broadEastAsian,-0.272095,0.01998486,-13.6150563,3.2589449999999996e-42,fixed
author_eth_ethnea_broadEasternEuropean,-0.009208412,0.0204694,-0.4498624,0.6528097,fixed
author_eth_ethnea_broadIndian,-0.01114116,0.01909284,-0.58352537,0.5595397,fixed
author_eth_ethnea_broadMiddleEastern,0.01559511,0.01724273,0.90444553,0.3657592,fixed
author_eth_ethnea_broadSouthernEuropean,-0.1377195,0.0130801,-10.52892798,6.355411999999999e-26,fixed
author_eth_ethnea_broadunknown,-0.2270461,0.05298014,-4.28549376,1.823338e-05,fixed
author_eth_ethnea_broadWesternNorthernEuropean,-0.0722751,0.01026736,-7.03930682,1.931985e-12,fixed


In [171]:
# library(plyr)
 
# count(mydata, vars=c("eth_first_author", "eth_reporter"))

In [97]:
# Marginal effects of author ethnicity and author gender under the
# mixed-effects model m5, evaluated on the full dataset.
# NOTE(review): presumably one row per observation -- confirm with nrow().
marg_eff <- marginal_effects(m5, data = mydata, variables = c("author_eth_ethnea_broad", "author_gender_ethnea"))

In [98]:
nrow(marg_eff)

In [158]:
# Column means of the marginal-effects table, i.e. the effect of each
# ethnicity/gender level averaged over all rows of marg_eff.
print(colMeans(marg_eff))

Generate the LaTeX regression tables

In [90]:
# Names of the 199 keyword control variables to hide from the regression
# tables. They are derived from `keywords` (the " + "-separated formula
# fragment defined earlier) instead of being listed a second time, so the two
# lists cannot drift apart.
drop_vars <- trimws(strsplit(keywords, "+", fixed = TRUE)[[1]])
# The fragment starts with " + ", which yields one empty leading piece.
drop_vars <- drop_vars[nzchar(drop_vars)]


In [160]:
# Combined LaTeX table for models 1-5. Terms matching drop_vars (the keyword
# controls) are omitted; coefficients are shown with significance stars at
# the 0.05 / 0.01 / 0.001 levels.
# Cheat sheet: https://www.jakeruss.com/cheatsheets/stargazer/
# covariate.labels = var_rename
stargazer(
  m1, m2, m3, m4, m5,
  type = "latex",
  omit = drop_vars,
  dep.var.labels = "is author mentioned",
  single.row = TRUE,
  ci = FALSE,
  star.cutoffs = c(0.05, 0.01, 0.001),
  report = "vc*"
)
          
          


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Wed, Dec 16, 2020 - 02:29:20 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lccccc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{5}{c}{\textit{Dependent variable:}} \\ 
\cline{2-6} 
\\[-1.8ex] & \multicolumn{2}{c}{is author mentioned} & \multicolumn{3}{c}{NA} \\ 
\\[-1.8ex] & \multicolumn{2}{c}{\textit{logistic}} & \multicolumn{2}{c}{\textit{logistic}} & \textit{generalized linear} \\ 
 & \multicolumn{2}{c}{\textit{}} & \multicolumn{2}{c}{\textit{}} & \textit{mixed-effects} \\ 
\\[-1.8ex] & (1) & (2) & (3) & (4) & (5)\\ 
\hline \\[-1.8ex] 
 author\_eth\_ethnea\_broadAfrican & $-$0.457$^{***}$ & $-$0.394$^{***}$ & $-$0.388$^{***}$ & $-$0.371$^{***}$ & $-$0.366$^{***}$ \\ 
  author\_eth\_ethnea\_broadChinese & 0.132$^{***}$ & 0.099$^{***}$ & $-$0.054$^{***}$ & $-$0.254$^{***}$ & $-$0.376$^{***}$ \\ 
  aut

In [161]:
# Full LaTeX table for the mixed-effects model alone (keyword controls
# included), reporting each coefficient with its p-value.
# To move each p-value onto its coefficient's row afterwards, use this
# search-and-replace in Sublime: " \\\\ \n  &" with ' &'
# stargazer(m5, type = "latex", omit = drop_vars, single.row=TRUE, ci = TRUE, report = ('vcsp'))
stargazer(
  m5,
  type = "latex",
  single.row = TRUE,
  ci = FALSE,
  report = "vcp"
)


% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
% Date and time: Wed, Dec 16, 2020 - 02:29:29 PM
\begin{table}[!htbp] \centering 
  \caption{} 
  \label{} 
\begin{tabular}{@{\extracolsep{5pt}}lc} 
\\[-1.8ex]\hline 
\hline \\[-1.8ex] 
 & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
\cline{2-2} 
\\[-1.8ex] & is\_author\_mentioned \\ 
\hline \\[-1.8ex] 
 author\_eth\_ethnea\_broadAfrican & $-$0.366 \\ 
  & p = 0.000 \\ 
  author\_eth\_ethnea\_broadChinese & $-$0.376 \\ 
  & p = 0.000 \\ 
  author\_eth\_ethnea\_broadEastAsian & $-$0.272 \\ 
  & p = 0.000 \\ 
  author\_eth\_ethnea\_broadEasternEuropean & $-$0.009 \\ 
  & p = 0.653 \\ 
  author\_eth\_ethnea\_broadIndian & $-$0.011 \\ 
  & p = 0.560 \\ 
  author\_eth\_ethnea\_broadMiddleEastern & 0.016 \\ 
  & p = 0.366 \\ 
  author\_eth\_ethnea\_broadSouthernEuropean & $-$0.138 \\ 
  & p = 0.000 \\ 
  author\_eth\_ethnea\_broadunknown & $-$0.227 \\ 
  & p = 0.00002 \\ 
  au