In [22]:
# Project root directory. The trailing slash is significant: the data path
# below is built by concatenating this string with a file name, with no
# separator inserted.
data_root <- "/shared/0/projects/news-quotes/"

In [23]:
library("lme4")
library("margins")
library("stargazer")
library("ggeffects")
library("broom")

In [24]:
# Load the regression table from the project root.
# Character columns are read as factors explicitly: since R 4.0.0 `read.csv()`
# defaults to stringsAsFactors = FALSE, and the `relevel()` calls in the
# following cells only accept factors.
mydata <- read.csv(
  paste0(data_root, "reg_data.csv"),
  header = TRUE,
  stringsAsFactors = TRUE
)

In [25]:
# Sanity check: print the number of rows in the loaded table.
nrow(mydata)

In [26]:
# Sanity check: print the number of columns in the loaded table.
ncol(mydata)

In [27]:
# Set the reference (baseline) level of each categorical predictor shared by
# both models. Columns are coerced with `as.factor()` first because `relevel()`
# rejects plain character vectors (what `read.csv()` yields by default on
# R >= 4.0); for columns that are already factors the coercion is a no-op.
# `local()` keeps the helper variables out of the global environment.
mydata <- local({
  reference_levels <- c(
    author_gender_ethnea = "M",
    reporter_gender_ethnea = "M",
    author_pos_cate = "last_position",
    is_top_author = "yes",
    is_corresponding = "yes",
    affiliation_cate = "domestic"
  )
  for (column in names(reference_levels)) {
    mydata[[column]] <- relevel(
      as.factor(mydata[[column]]),
      ref = reference_levels[[column]]
    )
  }
  mydata
})

In [28]:
# Right-hand-side formula fragment listing 199 keyword covariates. Every term
# is prefixed with " + " so the string can be appended directly to a base
# formula string before conversion with as.formula(). The literal spans two
# source lines.
# NOTE(review): the names look like sanitized topic labels (e.g.
# "Stimulus__physiology_"); presumably each matches a column of reg_data.csv --
# confirm against the data file.
keywords <- " + Cell_biology + Genetics + Biology + Body_mass_index + Health_care + Disease + Gerontology + Population + Public_health + Medicine + Materials_science + Composite_material + Nanotechnology + Cohort_study + Social_psychology + Cohort + Psychological_intervention + Young_adult + Family_medicine + Cancer + Surgery + Randomized_controlled_trial + Placebo + Clinical_trial + Nursing + Applied_psychology + Human_factors_and_ergonomics + Injury_prevention + Suicide_prevention + Psychiatry + Occupational_safety_and_health + Intensive_care_medicine + Pediatrics + Hazard_ratio + Confidence_interval + Retrospective_cohort_study + Vaccination + Psychology + Perception + Cognition + Environmental_health + Obesity + Risk_factor + Quality_of_life + Physical_therapy + Weight_loss + Anatomy + Mental_health + Psychosocial + Anxiety + Distress + Business + Public_relations + Marketing + Immunology + Global_warming + Economics + Climatology + Climate_change + General_surgery + Endocrinology + Internal_medicine + Receptor + Inflammation + Stimulus__physiology_ + Immune_system + Meta_analysis + Sociology + Gene + Cancer_research + Breast_cancer + Cell + Diabetes_mellitus + Blood_pressure + Oncology + Gynecology + Communication + Cognitive_psychology + Adverse_effect + Clinical_endpoint + Pharmacology + Virology + Risk_assessment + Transcription_factor + Political_science + Ecology + Geography + Cross_sectional_study + Odds_ratio + Comorbidity + Environmental_engineering + Chemistry + Medical_emergency + Physics + Social_science + Ethnic_group + Labour_economics + Antibody + Geomorphology + Geophysics + Geology + Ranging + Stroke + Environmental_resource_management + Type_2_diabetes + Cardiology + Molecular_biology + Developmental_psychology + Agriculture + Signal_transduction + Optoelectronics + Psychotherapist + Affect__psychology_ + Clinical_psychology + Anesthesia + Atmospheric_sciences + In_vivo + Biochemistry + Analytical_chemistry + Neuroscience + Botany + 
Gene_expression + Politics + Demography + Socioeconomic_status + Mortality_rate + Virus + Optics + Condensed_matter_physics + Bioinformatics + Law + Physical_medicine_and_rehabilitation + Stem_cell + Biodiversity + Astrophysics + Astronomy + Radiology + Pathology + Proportional_hazards_model + Chemotherapy + Predation + Food_science + Artificial_intelligence + Overweight + Antibiotics + Microbiology + Zoology + Paleontology + Habitat + Public_administration + Ecosystem + Economic_growth + Organic_chemistry + Government + Autism + Transplantation + Gastroenterology + Insulin + Engineering + Computer_science + Observational_study + Heart_disease + Epidemiology + Obstetrics + Pregnancy + Fishery + Alternative_medicine + Logistic_regression + Offspring + Mood + Bacteria + Prostate_cancer + Evolutionary_biology + Phenomenon + Longitudinal_study + Genome + Mutation + Pedagogy + Dementia + Relative_risk + Microeconomics + Odds + Feeling + Oceanography + Emergency_medicine + Personality + Prospective_cohort_study + Hippocampus + Greenhouse_gas + Biomarker__medicine_ + Myocardial_infarction + Socioeconomics + Drug + Environmental_science + Epigenetics + Inorganic_chemistry + Emergency_department + Medical_prescription + Phenotype"

### Wikipedia

In [29]:
# Collapse the fine-grained reporter ethnicity labels into broader groups
# (European, Asian, OtherUnknown). Labels that do not appear in the lookup
# table, and missing values, are left exactly as they are. The result is stored
# back as a factor.
mydata$reporter_eth_wiki_broad <- local({
  coarse_group <- c(
    EasternEuropean = "European",
    WesternNorthernEuropean = "European",
    SouthernEuropean = "European",
    Indian = "Asian",
    EastAsian = "Asian",
    MiddleEastern = "OtherUnknown",
    African = "OtherUnknown",
    unknown = "OtherUnknown"
  )
  labels <- as.character(mydata$reporter_eth_wiki_broad)
  needs_recode <- labels %in% names(coarse_group)
  labels[needs_recode] <- unname(coarse_group[labels[needs_recode]])
  as.factor(labels)
})

In [30]:
# Use 'English' as the reference level for both Wikipedia-based ethnicity
# predictors. `as.factor()` guards against character columns, which
# `relevel()` rejects; it leaves existing factors unchanged.
mydata[["author_eth_wiki_broad"]] <- relevel(
  as.factor(mydata[["author_eth_wiki_broad"]]),
  ref = "English"
)
mydata[["reporter_eth_wiki_broad"]] <- relevel(
  as.factor(mydata[["reporter_eth_wiki_broad"]]),
  ref = "English"
)

In [31]:
# Base part of the Wikipedia-ethnicity model formula, kept as a string so the
# keyword terms and the random-intercept terms can be appended before it is
# converted with as.formula(). The response is is_author_mentioned. The string
# literal spans four source lines; the trailing backslashes sit inside the
# literal.
base_str_wiki <- "is_author_mentioned ~ 1 + author_eth_wiki_broad + author_gender_ethnea + reporter_eth_wiki_broad + reporter_gender_ethnea + \
          last_name_length + last_name_prob + author_pos_cate + author_rank + is_top_author + is_corresponding + \
          affiliation_rank + affiliation_cate + num_authors + mention_year_center + gap_in_years + \
          num_words + num_mentioned_papers + FleschReadingEase + sentences_per_paragraph + type_token_ratio"

In [32]:
# Assemble the full model formula (base predictors, keyword covariates, and
# random intercepts for journal_title and outlet), then fit a binomial
# mixed-effects model with glmer().
equation_bar_wiki <- as.formula(
  paste(
    base_str_wiki,
    keywords,
    " + (1|journal_title) + (1|outlet)"
  )
)
m5_wiki <- glmer(
  formula = equation_bar_wiki,
  data = mydata,
  family = "binomial",
  control = glmerControl(optimizer = "nloptwrap"),
  nAGQ = 0
)

“Some predictor variables are on very different scales: consider rescaling”

In [33]:
# Tabulate the fitted model's terms (term, estimate, std.error, statistic,
# p.value, group).
# NOTE(review): tidiers for lme4 fits appear to have moved to the broom.mixed
# package in newer broom releases -- confirm the installed broom version still
# handles this model class.
tidy(m5_wiki)

term,estimate,std.error,statistic,p.value,group
(Intercept),9.682594e-01,8.497282e-02,11.3949300,4.431858e-30,fixed
author_eth_wiki_broadAfrican,-4.536061e-02,3.074138e-02,-1.4755554,1.400633e-01,fixed
author_eth_wiki_broadEastAsian,-3.084802e-01,1.327876e-02,-23.2311016,2.208820e-119,fixed
author_eth_wiki_broadEasternEuropean,4.966769e-03,2.035708e-02,0.2439824,8.072445e-01,fixed
author_eth_wiki_broadIndian,-1.545044e-02,1.614240e-02,-0.9571342,3.384995e-01,fixed
author_eth_wiki_broadMiddleEastern,4.198259e-02,1.142600e-02,3.6743043,2.384984e-04,fixed
author_eth_wiki_broadSouthernEuropean,-1.063779e-01,1.317774e-02,-8.0725470,6.884663e-16,fixed
author_eth_wiki_broadWesternNorthernEuropean,-5.355332e-02,1.283424e-02,-4.1726907,3.010234e-05,fixed
author_gender_ethneaF,2.309109e-03,8.751403e-03,0.2638559,7.918910e-01,fixed
author_gender_ethneaunknown,-1.400566e-01,1.121207e-02,-12.4915841,8.298471e-36,fixed


In [None]:
# Marginal effects from the Wikipedia-ethnicity model, restricted to the
# author ethnicity and author gender predictors.
margins_eff <- margins(
  m5_wiki,
  data = mydata,
  variables = c("author_eth_wiki_broad", "author_gender_ethnea")
)

In [None]:
# Display the summary table of the marginal effects computed above.
summary(margins_eff)

In [None]:
# Save the marginal-effects summary as CSV (without row names) under the
# project root. The path is built from `data_root` rather than repeating the
# root directory, so the output location follows the root if it is changed.
write.csv(
  summary(margins_eff),
  file = paste0(data_root, "reg_results/validation/wiki_margins.csv"),
  row.names = FALSE
)


### US Census 2010

In [None]:
# Use 'white' as the reference level for both census-based ethnicity
# predictors. `as.factor()` guards against character columns, which
# `relevel()` rejects; it leaves existing factors unchanged.
mydata[["author_eth_census"]] <- relevel(
  as.factor(mydata[["author_eth_census"]]),
  ref = "white"
)
mydata[["reporter_eth_census"]] <- relevel(
  as.factor(mydata[["reporter_eth_census"]]),
  ref = "white"
)

In [14]:
# Base part of the census-ethnicity model formula, kept as a string so the
# keyword terms and the random-intercept terms can be appended before it is
# converted with as.formula(). It matches the Wikipedia variant except that the
# two ethnicity predictors are author_eth_census and reporter_eth_census. The
# string literal spans four source lines; the trailing backslashes sit inside
# the literal.
base_str_census <- "is_author_mentioned ~ 1 + author_eth_census + author_gender_ethnea + reporter_eth_census + reporter_gender_ethnea + \
          last_name_length + last_name_prob + author_pos_cate + author_rank + is_top_author + is_corresponding + \
          affiliation_rank + affiliation_cate + num_authors + mention_year_center + gap_in_years + \
          num_words + num_mentioned_papers + FleschReadingEase + sentences_per_paragraph + type_token_ratio"

In [23]:
# Assemble the full census-ethnicity formula (base predictors, keyword
# covariates, and random intercepts for journal_title and outlet), then fit a
# binomial mixed-effects model with glmer().
equation_bar_census <- as.formula(
  paste(
    base_str_census,
    keywords,
    " + (1|journal_title) + (1|outlet)"
  )
)
m5_census <- glmer(
  formula = equation_bar_census,
  data = mydata,
  family = "binomial",
  control = glmerControl(optimizer = "nloptwrap"),
  nAGQ = 0
)

fixed-effect model matrix is rank deficient so dropping 6 columns / coefficients
“Some predictor variables are on very different scales: consider rescaling”

In [12]:
# Tabulate the fitted census model's terms (term, estimate, std.error,
# statistic, p.value, group).
# NOTE(review): tidiers for lme4 fits appear to have moved to the broom.mixed
# package in newer broom releases -- confirm the installed broom version still
# handles this model class.
tidy(m5_census)

term,estimate,std.error,statistic,p.value,group
(Intercept),9.335202e-01,8.501630e-02,10.9804849,4.743705e-28,fixed
author_eth_censusapi,-2.093205e-01,1.126669e-02,-18.5787133,4.778251e-77,fixed
author_eth_censusblack,1.099640e-01,4.693285e-02,2.3430064,1.912905e-02,fixed
author_eth_censushispanic,-1.062438e-01,1.716071e-02,-6.1911080,5.974275e-10,fixed
author_gender_ethneaF,9.404157e-04,8.743661e-03,0.1075540,9.143495e-01,fixed
author_gender_ethneaunknown,-1.568502e-01,1.098327e-02,-14.2808288,2.881538e-46,fixed
reporter_eth_censusapi,3.350081e-02,3.230262e-02,1.0370927,2.996927e-01,fixed
reporter_eth_censusblack,5.371195e-01,9.394898e-02,5.7171403,1.083317e-08,fixed
reporter_eth_censushispanic,-6.670898e-02,4.082057e-02,-1.6342000,1.022169e-01,fixed
reporter_eth_censusunknown,1.873102e-01,3.359961e-02,5.5747726,2.478534e-08,fixed


In [None]:
# Marginal effects from the census-ethnicity model, restricted to the author
# ethnicity and author gender predictors. This reuses (overwrites) the
# `margins_eff` name.
margins_eff <- margins(
  m5_census,
  data = mydata,
  variables = c("author_eth_census", "author_gender_ethnea")
)

In [None]:
# Display the summary table of the marginal effects computed above.
summary(margins_eff)

In [None]:
# Save the marginal-effects summary as CSV (without row names) under the
# project root. The path is built from `data_root` rather than repeating the
# root directory, so the output location follows the root if it is changed.
write.csv(
  summary(margins_eff),
  file = paste0(data_root, "reg_results/validation/census_margins.csv"),
  row.names = FALSE
)
