# load packages

In [None]:
library(data.table)

In [None]:
library(dplyr)

# read in files

In [None]:
# Load the comma-separated ADSP integrated phenotype file and preview it.
raw <- fread(
  "adsp_filt_phenos/ADSPIntegratedPhenotypes_DS_2023.08.08.csv",
  sep = ","
)
head(raw)

In [None]:
# Load the tab-separated filtered sample list, then preview it and report
# how many rows it has.
filt_original <- fread(
  "adsp_filt_phenos/ADSP_sample_list_filtered_anni.txt",
  sep = "\t"
)
head(filt_original)
nrow(filt_original)

In [None]:
# Load the list of ADNI subject IDs shared between ADSP and ADGC and preview
# it. Later cells read its `In_ADSP` column.
# NOTE(review): the path ends in .csv but the separator is a tab -- confirm
# the file really is tab-delimited (check that head() shows separate columns).
igap_overlap <- fread(
  "adsp_filt_phenos/Common_ADNI_SbjID_between_ADSP_and_ADGC.csv",
  sep = "\t"
)
head(igap_overlap)

In [None]:
# Load the comma-separated ADNI ID mapping file and preview it. Later cells
# use its `Genotype_IID` and `PTID` columns.
map <- fread(
  "adsp_filt_phenos/mapping_ADNIMERGE_ADNI-ADSP-FU.csv",
  sep = ","
)
head(map)

In [None]:
# Load the "keep_cohorts" sample list and preview it.
# header = FALSE makes fread treat the first line as data and auto-name the
# columns V1, V2, ...; the filtering step later matches against keep$V1.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
keep <- fread(
  "adsp_filt_phenos/ADSPIntegratedPhenotypes_DS_2023.08.08.keep_cohorts_samples.csv",
  header = FALSE
)
head(keep)

In [None]:
# Load the "keep_comb_cohorts" sample list and preview it.
# header = FALSE makes fread treat the first line as data and auto-name the
# columns V1, V2, ...; the filtering step later matches against keep_comb$V1.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
keep_comb <- fread(
  "adsp_filt_phenos/ADSPIntegratedPhenotypes_DS_2023.08.08.keep_comb_cohorts_samples.csv",
  header = FALSE
)
head(keep_comb)

In [None]:
# Load the "keep_quest_cohorts" sample list and preview it.
# header = FALSE makes fread treat the first line as data and auto-name the
# columns V1, V2, ...; the filtering step later matches against keep_quest$V1.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
keep_quest <- fread(
  "adsp_filt_phenos/ADSPIntegratedPhenotypes_DS_2023.08.08.keep_quest_cohorts_samples.csv",
  header = FALSE
)
head(keep_quest)

In [None]:
# Load the "keep_quest_comb_cohorts" sample list and preview it.
# header = FALSE makes fread treat the first line as data and auto-name the
# columns V1, V2, ...; the filtering step later matches against
# keep_quest_comb$V1.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas FALSE is a reserved word.
keep_quest_comb <- fread(
  "adsp_filt_phenos/ADSPIntegratedPhenotypes_DS_2023.08.08.keep_quest_comb_cohorts_samples.csv",
  header = FALSE
)
head(keep_quest_comb)

In [None]:
# Load the gene-score sample list and preview it. No `sep` is passed, so
# fread detects the delimiter itself. The `ID` column is used below to decide
# which phenotype rows to keep.
gene_score <- fread(
  "adsp_filt_phenos/IGAP_stage1_ADSP_pval_threshold_0.05_gene_average_score_sample_list.txt"
)
head(gene_score)

In [None]:
# Load the tab-separated phenotype/covariate table used for the regression,
# then report its row count and preview it. Samples are keyed by `IID`.
saige <- fread(
  "adsp_filt_phenos/ADSPphenotype_forAnalysis.txt",
  sep = "\t"
)
nrow(saige)
head(saige)

# filter phenotype file to only have individuals with WGS data

In [None]:
# Keep only phenotype rows whose SampleID appears in the gene-score sample
# list, then show row counts for the result and for both inputs.
pheno_geno <- raw[raw$SampleID %in% gene_score$ID, ]
nrow(pheno_geno)  # rows retained
nrow(raw)         # rows in the full phenotype file
nrow(gene_score)  # rows in the gene-score sample list

# remove igap overlap

## get unique Genotype_IID / PTID pairs from map

In [None]:
# Reduce the mapping table to its distinct (Genotype_IID, PTID) pairs, then
# report how many pairs there are and preview them.
map_sub <- map %>%
  select(Genotype_IID, PTID) %>%
  distinct()
nrow(map_sub)
head(map_sub)

## remove IDs from map

In [None]:
# Drop mapping rows whose PTID is listed in igap_overlap$In_ADSP, then print
# counts to sanity-check how many rows were removed.
map_no_overlap <- map_sub[!(map_sub$PTID %in% igap_overlap$In_ADSP), ]
nrow(map_no_overlap)                  # rows remaining
nrow(map_sub)                         # rows before removal
nrow(map_sub) - nrow(map_no_overlap)  # rows removed
nrow(igap_overlap)                    # rows in the overlap list, for comparison
head(map_no_overlap)

## look at IDs that didn't match

In [None]:
# Complement of the removal above: mapping rows whose PTID IS listed in
# igap_overlap$In_ADSP. Report how many there are.
map_overlap <- map_sub[map_sub$PTID %in% igap_overlap$In_ADSP, ]
nrow(map_overlap)

In [None]:
# Overlap-list rows whose In_ADSP value was NOT found among the matched
# mapping PTIDs, converted to a plain data.frame for inspection.
igap_nope <- data.frame(
  igap_overlap[!(igap_overlap$In_ADSP %in% map_overlap$PTID), ]
)
nrow(igap_nope)
head(igap_nope)

## remove IDs from phenotype file

In [None]:
# Phenotype rows whose SampleID contains the text "ADNI"; show the count and
# a preview.
pheno_adni <- filter(pheno_geno, grepl("ADNI", SampleID))
nrow(pheno_adni)
head(pheno_adni)

In [None]:
# Phenotype rows whose SampleID does NOT contain the text "ADNI"; show the
# count and a preview.
pheno_no_adni <- filter(pheno_geno, !grepl("ADNI", SampleID))
nrow(pheno_no_adni)
head(pheno_no_adni)

In [None]:
# ADNI phenotype rows whose SampleID matches a Genotype_IID that survived the
# overlap removal, followed by counts for comparison.
pheno_adni_no_overlap <- pheno_adni[
  pheno_adni$SampleID %in% map_no_overlap$Genotype_IID,
]
nrow(pheno_adni_no_overlap)  # ADNI rows kept
nrow(pheno_adni)             # all ADNI rows
nrow(map_no_overlap)         # non-overlapping mapping rows

In [None]:
# ADNI phenotype rows whose SampleID matches a Genotype_IID flagged as
# overlapping; report how many there are.
pheno_adni_overlap <- pheno_adni[
  pheno_adni$SampleID %in% map_overlap$Genotype_IID,
]
nrow(pheno_adni_overlap)

In [None]:
# Same overlap lookup, but against the unfiltered phenotype table `raw`
# rather than the already-filtered ADNI subset.
pheno_raw_adni_overlap <- raw[raw$SampleID %in% map_overlap$Genotype_IID, ]
nrow(pheno_raw_adni_overlap)

In [None]:
# Stack the non-ADNI rows on top of the ADNI rows that passed the overlap
# filter, then show the combined row count and a preview.
pheno_no_overlap <- rbind(
  pheno_no_adni,
  pheno_adni_no_overlap
)
nrow(pheno_no_overlap)
head(pheno_no_overlap)

# filter pheno covar file for regression

In [None]:
# Restrict the phenotype/covariate table to IIDs present in the
# overlap-filtered phenotype set, then print counts for comparison.
saige_no_overlap <- saige[saige$IID %in% pheno_no_overlap$SampleID, ]
nrow(saige_no_overlap)  # rows kept
nrow(pheno_no_overlap)  # rows in the filtered phenotype set
nrow(saige)             # rows in the full covariate table

# keep only samples present in anni's original filtered sample list

In [None]:
# Keep only rows whose IID appears in filt_original$SampleID, then print
# counts for comparison.
saige_filt_original <- saige_no_overlap[
  saige_no_overlap$IID %in% filt_original$SampleID,
]
nrow(saige_filt_original)  # rows kept
nrow(saige_no_overlap)     # rows before this filter
nrow(filt_original)        # rows in the original filtered sample list

# filter to different dataset combos (new filtering)

In [None]:
# Subset to the IIDs listed in the first column (V1) of `keep`; show the
# resulting row count next to the size of the keep list.
saige_keep <- saige_filt_original[saige_filt_original$IID %in% keep$V1, ]
nrow(saige_keep)
nrow(keep)

In [None]:
# Subset to the IIDs listed in the first column (V1) of `keep_quest`; show
# the resulting row count next to the size of that list.
saige_keep_quest <- saige_filt_original[
  saige_filt_original$IID %in% keep_quest$V1,
]
nrow(saige_keep_quest)
nrow(keep_quest)

In [None]:
# Subset to the IIDs listed in the first column (V1) of `keep_comb`; show
# the resulting row count next to the size of that list.
saige_keep_comb <- saige_filt_original[
  saige_filt_original$IID %in% keep_comb$V1,
]
nrow(saige_keep_comb)
nrow(keep_comb)

In [None]:
# Subset to the IIDs listed in the first column (V1) of `keep_quest_comb`;
# show the resulting row count, the size of that list, and a preview.
saige_keep_quest_comb <- saige_filt_original[
  saige_filt_original$IID %in% keep_quest_comb$V1,
]
nrow(saige_keep_quest_comb)
nrow(keep_quest_comb)
head(saige_keep_quest_comb)

# export files

In [None]:
# Write the `keep` subset as a tab-delimited text file with a header row,
# no row names, and no quoting of character fields.
# TRUE/FALSE are spelled out because `T` and `F` are ordinary variables that
# can be reassigned, which would silently change the output format.
write.table(
  saige_keep,
  file = "/project/ritchie02/projects/AD_KMI/common_var_gene_score/adsp_filt_phenos/ADSP.filt_pheno_covar.no_igap.keep.txt",
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)

In [None]:
# Write the `keep_quest` subset as a tab-delimited text file with a header
# row, no row names, and no quoting of character fields.
# TRUE/FALSE are spelled out because `T` and `F` are ordinary variables that
# can be reassigned, which would silently change the output format.
write.table(
  saige_keep_quest,
  file = "/project/ritchie02/projects/AD_KMI/common_var_gene_score/adsp_filt_phenos/ADSP.filt_pheno_covar.no_igap.keep_quest.txt",
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)

In [None]:
# Write the `keep_comb` subset as a tab-delimited text file with a header
# row, no row names, and no quoting of character fields.
# TRUE/FALSE are spelled out because `T` and `F` are ordinary variables that
# can be reassigned, which would silently change the output format.
write.table(
  saige_keep_comb,
  file = "/project/ritchie02/projects/AD_KMI/common_var_gene_score/adsp_filt_phenos/ADSP.filt_pheno_covar.no_igap.keep_comb.txt",
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)

In [None]:
# Write the `keep_quest_comb` subset as a tab-delimited text file with a
# header row, no row names, and no quoting of character fields.
# TRUE/FALSE are spelled out because `T` and `F` are ordinary variables that
# can be reassigned, which would silently change the output format.
write.table(
  saige_keep_quest_comb,
  file = "/project/ritchie02/projects/AD_KMI/common_var_gene_score/adsp_filt_phenos/ADSP.filt_pheno_covar.no_igap.keep_quest_comb.txt",
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
)