In [1]:
suppressWarnings(suppressPackageStartupMessages({
    library(tidyverse)
    library(data.table)
}))


In [2]:
# input: phenotype table (read with fread further down)
phe_f <- '/oak/stanford/groups/mrivas/projects/biobank-methods-dev/snpnet-elastic-net/phenotype.phe'

# output: directory that receives the per-phenotype beta and score tables
out_d <- '/oak/stanford/groups/mrivas/projects/biobank-methods-dev/snpnet-PRScs/covar_betas_train_val'

# parameters: covariate columns used as predictors (age, sex, PC1..PC10)
covariates <- c('age', 'sex', sprintf('PC%d', 1:10))


In [3]:
# Fit a covariate-only GLM (intercept + covariates) for a single phenotype.
#
# Args:
#   df:         data frame holding the phenotype column and every covariate column.
#   phe:        phenotype column name (a single string). After stripping digits, the
#               prefixes 'INI' and 'QT_FC' select a gaussian model; any other name
#               is fit with the binomial family.
#   covariates: character vector of covariate column names used as predictors.
#
# Returns:
#   The fitted glm object.
fit_covar_model <- function(df, phe, covariates){
    stopifnot(is.character(phe), length(phe) == 1L, length(covariates) > 0L)

    # fail early with a readable message instead of an obscure error from inside glm()
    missing_cols <- setdiff(c(phe, covariates), colnames(df))
    if(length(missing_cols) > 0L){
        stop(
            'columns not found in df: ', paste(missing_cols, collapse=', '),
            call.=FALSE
        )
    }

    # phe is a scalar, so a plain if/else is used rather than the vectorised ifelse()
    phe_prefix <- gsub('[0-9]', '', phe)
    family <- if(phe_prefix %in% c('INI', 'QT_FC')) 'gaussian' else 'binomial'

    if(family == 'binomial'){
        # the response is shifted down by one before fitting
        # NOTE(review): presumably the input uses a 1/2 (control/case) coding that
        # becomes 0/1 here -- confirm against the phenotype file
        df[[phe]] <- df[[phe]] - 1
    }

    model_formula <- stats::as.formula(sprintf(
        '%s ~ 1 + %s', phe, paste(covariates, collapse =' + ')
    ))

    # last expression is the call itself so the fit is returned visibly
    stats::glm(model_formula, family=family, data=df)
}

In [4]:
# Convert the coefficient table of a fitted model into a data frame.
#
# Args:
#   fit: fitted model object whose summary() has a coefficient matrix (e.g. a glm fit).
#   phe: optional phenotype label stored in the 'phenotype' column. When NULL
#        (default) the response name is taken from the model formula, so the
#        function no longer depends on a global variable called `phe`.
#
# Returns:
#   A data frame with one row per coefficient and the columns
#   variable, Estimate, Std. Error, z_or_t_value, P, phenotype.
fit_to_df <- function(fit, phe = NULL){
    if(is.null(phe)){
        # left-hand side of the model formula, i.e. the response variable name
        phe <- deparse(stats::formula(fit)[[2]])
    }

    # coef() on the summary avoids relying on `$coeff` partial matching
    coef_mat <- stats::coef(summary(fit))

    fit_df <- data.frame(
        variable = rownames(coef_mat),
        coef_mat,
        check.names = FALSE,   # keep names such as 'Std. Error' untouched
        row.names = NULL,
        stringsAsFactors = FALSE
    )
    fit_df$phenotype <- rep(phe, nrow(fit_df))

    # the test statistic is 'z value' or 't value' depending on the family;
    # give both it and the p-value column family-independent names
    colnames(fit_df)[4] <- 'z_or_t_value'
    colnames(fit_df)[5] <- 'P'

    fit_df
}


In [5]:
# Compute the covariate-only linear score (sum of covariate * beta) per individual.
# The intercept is not part of the score.
#
# Args:
#   phe_df:     data frame with FID, IID and all covariate columns.
#   fit_df:     coefficient table with at least the columns 'variable' and 'Estimate'
#               (as produced by fit_to_df()).
#   covariates: character vector of covariate column names to include in the score.
#
# Returns:
#   A data frame with the columns FID, IID (both character) and Estimate (the score),
#   one row per row of phe_df.
compute_covar_score <- function(phe_df, fit_df, covariates){
    missing_cols <- setdiff(c('FID', 'IID', covariates), colnames(phe_df))
    if(length(missing_cols) > 0L){
        stop(
            'columns not found in phe_df: ', paste(missing_cols, collapse=', '),
            call.=FALSE
        )
    }

    # look betas up by covariate name so they always line up with the matrix
    # columns, regardless of the row order of fit_df
    beta_idx <- match(covariates, fit_df[['variable']])
    if(anyNA(beta_idx)){
        stop(
            'no coefficient in fit_df for: ',
            paste(covariates[is.na(beta_idx)], collapse=', '),
            call.=FALSE
        )
    }
    betas <- fit_df[['Estimate']][beta_idx]

    # as.data.frame() first so that column selection behaves the same for
    # data.table, tibble and plain data.frame inputs
    covar_mat <- as.matrix(as.data.frame(phe_df)[, covariates, drop = FALSE])

    # FID/IID are carried through directly instead of being pasted into one ID
    # and split again, which broke IDs containing '-', '.' or '_'
    data.frame(
        FID = as.character(phe_df[['FID']]),
        IID = as.character(phe_df[['IID']]),
        Estimate = as.vector(covar_mat %*% betas),
        stringsAsFactors = FALSE
    )
}


In [6]:
# load the phenotype table; FID and IID are forced to character
id_col_classes <- c('FID'='character', 'IID'='character')
phe_df <- fread(phe_f, colClasses=id_col_classes)


In [7]:
# number of rows in each value of the `split` column
count(phe_df, split)

split,n
<chr>,<int>
test,67272
train,202276
val,67651


In [8]:
# list the columns available in the phenotype table
colnames(phe_df)

In [9]:
# per-phenotype results, keyed by phenotype name
fit_dfs <- list()
covar_dfs <- list()

# the models are fit on train + val only; the subset does not depend on the
# phenotype, so it is computed once
train_val_df <- phe_df %>%
    filter(split %in% c('train', 'val'))

for(phe in c('HC269', 'HC382', 'INI50', 'INI21001')){
    # covariate-only model -> coefficient table
    covar_fit <- fit_covar_model(train_val_df, phe, covariates)
    fit_dfs[[phe]] <- fit_to_df(covar_fit)

    # scores are computed for every row of phe_df, not only train + val
    covar_dfs[[phe]] <- compute_covar_score(phe_df, fit_dfs[[phe]], covariates)
}


In [10]:
# Write one coefficient table and one score table per fitted phenotype.
# Iterating over names(fit_dfs) keeps this cell in sync with whatever was
# fit, instead of repeating the phenotype list by hand.
for(phe in names(fit_dfs)){
    # leading '#' on the first column name marks the header line
    fit_dfs[[phe]] %>%
    rename('#variable'='variable') %>%
    fwrite(file.path(out_d, sprintf('%s.covar.betas.tsv', phe)), sep='\t', na = "NA", quote=FALSE)

    covar_dfs[[phe]] %>%
    rename('#FID'='FID') %>%
    fwrite(file.path(out_d, sprintf('%s.covar.scores.tsv', phe)), sep='\t', na = "NA", quote=FALSE)
}
