### Identifying independent signals not replicated due to GC-correction in meta-analysis

#### MHC region on Human chromosome 6 (GRCh37): chr6:28,477,797-33,448,354

In [1]:
options(scipen=999)
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(tidyr))

“package ‘data.table’ was built under R version 4.2.2”
“package ‘dplyr’ was built under R version 4.2.3”
“package ‘ggplot2’ was built under R version 4.2.3”
“package ‘purrr’ was built under R version 4.2.2”
“package ‘tidyr’ was built under R version 4.2.2”


In [28]:
#LD-clump files using 1000 genomes reference data
diamante18_clump_1kg <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/LD-clumpping/DIAMANTE-18/1kG_LD_clumpping/LD_threshold/plink.clumped", header = T)
diamante22_clump <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/LD-clumpping/DIAMANTE-22/1kG_LD_clumpping/LD_threshold/plink.clumped", header = T)
#reading independent signals files
ind_sig_diam18 <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-18/diamante18_independent_signals_with_nearest_gene_ST2.txt")
model_diam18 <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-18/bmi_model_info.txt", header = TRUE)
signal_type_diam18 <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-18/primary_secondary_signal.txt", header = TRUE)
ind_sig_diam22 <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-22/independent_loci_genomic_positions_ST5_10-8GWS_cutoff.txt", header = TRUE)
ind_sig_diam22_index_snp_pos <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-22/independent_loci_genomic_positions_ST5_index_SNPs.txt", sep = "\t")
#reading unreplicated signals file
unrep_diam18_diam22 <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-18/new_analysis_unreplicated_diam18_diam22.txt", header = TRUE)
unrep_diam18_t2dggi <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-18/new_analysis_unreplicated_diam18_t2dggi.txt", header = TRUE)
unrep_diam22_t2dggi <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-22/new_analysis_unreplicated_diam22_t2dggi.txt", header = TRUE)
#combining the files into one DF
ind_signals_diam18 <- cbind(ind_sig_diam18, model_diam18, signal_type_diam18)
#processing the data
#Diamante-18 independent signals
ind_signals_diam18$position <- gsub(".","", ind_signals_diam18$position, fixed = TRUE)
ind_signals_diam18 <- subset(ind_signals_diam18, ind_signals_diam18$BMI.model == "unadjusted")
ind_signals_diam18 <- subset(ind_signals_diam18, ind_signals_diam18$signal_type == "Primary")
ind_signals_diam18$id <- paste0(ind_signals_diam18$chr, ":", ind_signals_diam18$position)
#Diamante-22 independent signals
ind_sig_diam22$start <- substr(ind_sig_diam22$genomic_boundaries, 1, regexpr("-", ind_sig_diam22$genomic_boundaries) - 1)
ind_sig_diam22$end <- substring(ind_sig_diam22$genomic_boundaries, regexpr("-", ind_sig_diam22$genomic_boundaries) + 1)
ind_sig_diam22$start <- gsub(",","", ind_sig_diam22$start, fixed = TRUE)
ind_sig_diam22$end <- gsub(",","", ind_sig_diam22$end, fixed = TRUE)
ind_sig_diam22 <- cbind(ind_sig_diam22, ind_sig_diam22_index_snp_pos)
ind_sig_diam22$position <- gsub(",","", ind_sig_diam22$index_snp_position, fixed = TRUE)

#### Function to create boundaries for the LD-Clumps

In [3]:
#creating boundaries from the LD-clump dataframe
# Function to process the data
process_clumps <- function(df) {
  # Initialize new columns
  df$start_position <- NA
  df$end_position <- NA
  
  # Process each row
  for (i in 1:nrow(df)) {
    # Get the BP value
    bp <- df$BP[i]
    
    # Check if SP2 is "NONE"
    if (df$SP2[i] == "NONE") {
      # Set start and end position to BP
      df$start_position[i] <- bp
      df$end_position[i] <- bp
    } else {
      # Extract positions from SP2 column
      sp2_positions <- unlist(lapply(strsplit(df$SP2[i], ",")[[1]], function(x) {
        as.numeric(gsub("\\(.*\\)", "", unlist(strsplit(x, ":"))[2]))
      }))
      # Calculate start and end positions
      df$start_position[i] <- min(c(bp, sp2_positions))
      df$end_position[i] <- max(c(bp, sp2_positions))
    }
  }
  
  # Add clump column
  df$clump <- paste("clump", seq_len(nrow(df)), sep = "_")
  
  return(df)
}


In [4]:
# Process the data
df <- diamante18_clump_1kg
diamante18_clump_processed <- process_clumps(df)
diamante18_clump_processed <- diamante18_clump_processed %>% select(CHR, SNP, BP, start_position, end_position, clump)
df <- diamante22_clump
diamante22_clump_processed <- process_clumps(df)
diamante22_clump_processed <- diamante22_clump_processed %>% select(CHR, SNP, BP, start_position, end_position, clump)

# SNPs lost due to GC correction

### Find which LD clumps the T2D index SNPs belong to: DIAMANTE-18

#### SNPs which do not find a match are assigned as single variant clumps, i.e. new_clump_{1...i}

In [5]:
assign_clumps <- function(df1, df2) {
  # Ensure column names are consistent
  colnames(df1) <- tolower(colnames(df1))
  colnames(df2) <- tolower(colnames(df2))
  
  # Add columns for clump details
  df1$clump <- NA
  df1$start_position <- NA
  df1$end_position <- NA
  
  # Assign clumps based on df2
  for (i in seq_len(nrow(df2))) {
    idx <- df1$chr == df2$chr[i] & df1$position >= df2$start_position[i] & df1$position <= df2$end_position[i]
    df1$clump[idx] <- df2$clump[i]
    df1$start_position[idx] <- df2$start_position[i]
    df1$end_position[idx] <- df2$end_position[i]
  }
  
  # Assign new clumps to unmatched SNPs
  unmatched <- is.na(df1$clump)
  if (any(unmatched)) {
    num_unmatched <- sum(unmatched)
    df1$clump[unmatched] <- paste0("new_clump_", seq_len(num_unmatched))
    df1$start_position[unmatched] <- df1$position[unmatched]
    df1$end_position[unmatched] <- df1$position[unmatched]
  }
  
  return(df1)
}


In [6]:
diam18_signals_loci <- assign_clumps(ind_signals_diam18, diamante18_clump_processed)
head(diam18_signals_loci)

nearest_gene,index_variant,chr,position,bmi.model,signal_type,id,clump,start_position,end_position
<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
MACF1,rs3768321,1,40035928,unadjusted,Primary,1:40035928,clump_341,39573219,40037726
FAF1,rs58432198,1,51256091,unadjusted,Primary,1:51256091,clump_304,50891117,51532590
PATJ,rs12140153,1,62579891,unadjusted,Primary,1:62579891,clump_464,62579891,62579891
DENND2C,rs184660829,1,115144899,unadjusted,Primary,1:115144899,new_clump_1,115144899,115144899
PTGFRN,rs1127215,1,117532790,unadjusted,Primary,1:117532790,clump_179,117491123,117568947
NOTCH2,rs1493694,1,120526982,unadjusted,Primary,1:120526982,clump_112,120436751,120581285


#### Find if DIAMANTE-18 unreplicated SNPs are "tagged" by T2D index SNPs, i.e. both belong to the same LD-clump

In [7]:
find_snps_in_loci <- function(unrep_diam18_diam22, diamante_18_loci) {
  # Ensure data frames have required columns
  if (!all(c("Chr", "Pos") %in% colnames(unrep_diam18_diam22))) {
    stop("df1 must have 'Chr' and 'Pos' columns")
  }
  if (!all(c("chr", "start_position", "end_position", "clump") %in% colnames(diamante_18_loci))) {
    stop("df2 must have 'chr', 'start_position', 'end_position', and 'clump' columns")
  }
  
  # Initialize a list to store matching results
  results <- list()
  
  # Loop through each row of df2
  for (i in 1:nrow(diamante_18_loci)) {
    loci_chr <- diamante_18_loci$chr[i]
    loci_start <- diamante_18_loci$start_position[i]
    loci_end <- diamante_18_loci$end_position[i]
    clump <- diamante_18_loci$clump[i]
    
    # Filter df1 for SNPs in the same chromosome and position within range
    matching_snps <- unrep_diam18_diam22[unrep_diam18_diam22$Chr == loci_chr & 
                                           unrep_diam18_diam22$Pos >= loci_start & 
                                           unrep_diam18_diam22$Pos <= loci_end, ]
    
    # Add indexSNP and clump_number to the matching results
    matching_snps$id <- diamante_18_loci$id[i]
    matching_snps$clump <- clump
    
    # Append to results
    results[[i]] <- matching_snps
  }
  
  # Combine all matching SNPs into a single data frame
  final_results <- do.call(rbind, results)
  return(final_results)
}


In [8]:
matching_snps_1 <- find_snps_in_loci(unrep_diam18_diam22, diam18_signals_loci)
untag_diam18_diam22 <- anti_join(unrep_diam18_diam22, matching_snps_1, by = "SNP")
cat("No. of SNPs lost due to correction in the DIAMANTE-18 and DIAMANTE-22 comparison:", length(unique(unrep_diam18_diam22$SNP)))
cat("\n")
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and DIAMANTE-22 comparison:", length(unique(untag_diam18_diam22$SNP)))
matching_snps_2 <- find_snps_in_loci(unrep_diam18_t2dggi, diam18_signals_loci)
untag_diam18_t2dggi <- anti_join(unrep_diam18_t2dggi, matching_snps_2, by = "SNP")
cat("\n")
cat("No. of SNPs lost due to correction in the DIAMANTE-18 and T2DGGI comparison:", length(unique(unrep_diam18_t2dggi$SNP)))
cat("\n")
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and T2DGGI comparison:", length(unique(untag_diam18_t2dggi$SNP)))

No. of SNPs lost due to correction in the DIAMANTE-18 and DIAMANTE-22 comparison: 8359
No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and DIAMANTE-22 comparison: 2285
No. of SNPs lost due to correction in the DIAMANTE-18 and T2DGGI comparison: 8937
No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and T2DGGI comparison: 2444

#### Removing LD-clumps where T2D loci are located

In [9]:
diamante18_clump_processed_2 <- anti_join(diamante18_clump_processed, diam18_signals_loci, by = "clump")
cat("LD-clumps remaining:", length(unique(diamante18_clump_processed_2$SNP)))

LD-clumps remaining: 370

#### Identifying number of independent loci within the SNPs that are lost due to correction and not tagged by T2D loci

In [10]:
#renaming certain columns for the function
untag_diam18_diam22 <- untag_diam18_diam22 %>% rename(chr = Chr)
untag_diam18_diam22 <- untag_diam18_diam22 %>% rename(position = Pos)
untag_diam18_t2dggi <- untag_diam18_t2dggi %>% rename(chr = Chr)
untag_diam18_t2dggi <- untag_diam18_t2dggi %>% rename(position = Pos)

In [15]:
# Remove SNPs on chromosome 6 within the range of the MHC complex
untag_diam18_diam22 <- untag_diam18_diam22 %>%
  filter(!(chr == 6 & position >= 28477797 & position <= 33448354))
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and DIAMANTE-22 comparison:", length(unique(untag_diam18_diam22$SNP)))
untag_diam18_t2dggi <- untag_diam18_t2dggi %>%
  filter(!(chr == 6 & position >= 28477797 & position <= 33448354))
cat("\n")
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and T2DGGI comparison:", length(unique(untag_diam18_t2dggi$SNP)))

No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and DIAMANTE-22 comparison: 1609
No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and T2DGGI comparison: 1799

In [14]:
#calling the function
untag_diam18_diam22_processed <- assign_clumps(untag_diam18_diam22, diamante18_clump_processed_2)
untag_diam18_t2dggi_processed <- assign_clumps(untag_diam18_t2dggi, diamante18_clump_processed_2)
cat("Total independent clumps found:", length(unique(untag_diam18_diam22_processed$clump)))
cat("\n")
cat("Total independent clumps found:", length(unique(untag_diam18_t2dggi_processed$clump)))

Total independent clumps found: 174
Total independent clumps found: 181

In [20]:
#how many untagged SNPs are single variant clumps?
cat("In DIAMANTE-18 and DIAMANTE-22 comparison:", sum(grepl("new_clump", untag_diam18_diam22_processed$clump)))
cat("\n")
cat("In DIAMANTE-18 and T2DGGI comparison:",sum(grepl("new_clump", untag_diam18_t2dggi_processed$clump)))

In DIAMANTE-18 and DIAMANTE-22 comparison: 25
In DIAMANTE-18 and T2DGGI comparison: 30

### Find which LD clumps the T2D index SNPs belong to: DIAMANTE-22

In [29]:
diam22_signals_loci <- assign_clumps(ind_sig_diam22, diamante22_clump_processed)
head(diam22_signals_loci)

chr,genomic_boundaries,start,end,index_snp_position,position,clump,start_position,end_position
<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,"20,229,451-21,229,451",20229451,21229451,20729451,20729451,new_clump_1,20729451,20729451
1,"39,370,793-40,370,793",39370793,40370793,39870793,39870793,clump_367,39573219,40037726
1,"45,858,862-46,858,862",45858862,46858862,46358862,46358862,new_clump_2,46358862,46358862
1,"50,719,188-51,719,188",50719188,51719188,51219188,51219188,clump_306,50891117,51532590
1,"62,079,891-63,079,891",62079891,63079891,62579891,62579891,clump_429,62579891,62579891
1,"63,614,429-64,614,429",63614429,64614429,64114429,64114429,new_clump_3,64114429,64114429


#### Find if DIAMANTE-22 unreplicated SNPs are "tagged" by T2D index SNPs, i.e. both belong to the same LD-clump

In [30]:
#renaming certain columns for the function
unrep_diam22_t2dggi <- unrep_diam22_t2dggi %>% rename(Chr = `chromosome(b37)`)
unrep_diam22_t2dggi <- unrep_diam22_t2dggi %>% rename(Pos = `position(b37)`)
matching_snps <- find_snps_in_loci(unrep_diam22_t2dggi, diam22_signals_loci)
untag_diam22_t2dggi <- anti_join(unrep_diam22_t2dggi, matching_snps, by = "rsID")
cat("No. of SNPs lost due to correction in the DIAMANTE-22 and T2DGGI comparison:", length(unique(unrep_diam22_t2dggi$rsID)))
cat("\n")
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-22 and T2DGGI comparison:", length(unique(untag_diam22_t2dggi$rsID)))

No. of SNPs lost due to correction in the DIAMANTE-22 and T2DGGI comparison: 10014
No. of SNPs not tagged by T2D loci in the DIAMANTE-22 and T2DGGI comparison: 3334

#### Removing LD-clumps where T2D loci are located

In [31]:
diamante22_clump_processed_2 <- anti_join(diamante22_clump_processed, diam22_signals_loci, by = "clump")
cat("LD-clumps remaining:", length(unique(diamante22_clump_processed_2$SNP)))

LD-clumps remaining: 366

#### Identifying number of independent loci within the SNPs that are lost due to correction and not tagged by T2D loci

In [32]:
#renaming certain columns for the function
untag_diam22_t2dggi <- untag_diam22_t2dggi %>% rename(chr = Chr)
untag_diam22_t2dggi <- untag_diam22_t2dggi %>% rename(position = Pos)

In [34]:
# Remove SNPs on chromosome 6 within the range of the MHC complex
untag_diam22_t2dggi <- untag_diam22_t2dggi %>%
  filter(!(chr == 6 & position >= 28477797 & position <= 33448354))
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-22 and T2DGGI comparison:", length(unique(untag_diam22_t2dggi$rsID)))

No. of SNPs not tagged by T2D loci in the DIAMANTE-22 and T2DGGI comparison: 2593

In [35]:
untag_diam22_t2dggi_processed <- assign_clumps(untag_diam22_t2dggi, diamante22_clump_processed_2)
cat("Total independent clumps found:", length(unique(untag_diam22_t2dggi_processed$clump)))

Total independent clumps found: 447

In [36]:
#how many untagged SNPs are single variant clumps?
cat("In DIAMANTE-22 and T2DGGI comparison:",sum(grepl("new_clump", untag_diam22_t2dggi_processed$clump)))

In DIAMANTE-22 and T2DGGI comparison: 277

# SNPs lost due to LDSR intercept correction

In [37]:
#reading unreplicated files
unrep_diam18_diam22 <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-18/new_analysis_unreplicated_ldsr_intercept_diam18_diam22.txt", header = TRUE)
unrep_diam18_t2dggi <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-18/new_analysis_unreplicated_ldsr_intercept_diam18_t2dggi.txt", header = TRUE)
unrep_diam22_t2dggi <- fread("/lustre/groups/itg/teams/zeggini/users/archit.singh/lambda/independent_signals_analysis/DIAMANTE-22/new_analysis_unreplicated_ldsr_intercept_diam22_t2dggi.txt", header = TRUE)

#### Find if DIAMANTE-18 unreplicated SNPs are "tagged" by T2D index SNPs, i.e. both belong to the same LD-clump

In [38]:
matching_snps_1 <- find_snps_in_loci(unrep_diam18_diam22, diam18_signals_loci)
untag_diam18_diam22 <- anti_join(unrep_diam18_diam22, matching_snps_1, by = "SNP")
cat("No. of SNPs lost due to correction in the DIAMANTE-18 and DIAMANTE-22 comparison:", length(unique(unrep_diam18_diam22$SNP)))
cat("\n")
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and DIAMANTE-22 comparison:", length(unique(untag_diam18_diam22$SNP)))
matching_snps_2 <- find_snps_in_loci(unrep_diam18_t2dggi, diam18_signals_loci)
untag_diam18_t2dggi <- anti_join(unrep_diam18_t2dggi, matching_snps_2, by = "SNP")
cat("\n")
cat("No. of SNPs lost due to correction in the DIAMANTE-18 and T2DGGI comparison:", length(unique(unrep_diam18_t2dggi$SNP)))
cat("\n")
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and T2DGGI comparison:", length(unique(untag_diam18_t2dggi$SNP)))

No. of SNPs lost due to correction in the DIAMANTE-18 and DIAMANTE-22 comparison: 2509
No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and DIAMANTE-22 comparison: 697
No. of SNPs lost due to correction in the DIAMANTE-18 and T2DGGI comparison: 3117
No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and T2DGGI comparison: 838

#### Identifying number of independent loci within the SNPs that are lost due to correction and not tagged by T2D loci

In [39]:
#renaming certain columns for the function
untag_diam18_diam22 <- untag_diam18_diam22 %>% rename(chr = Chr)
untag_diam18_diam22 <- untag_diam18_diam22 %>% rename(position = Pos)
untag_diam18_t2dggi <- untag_diam18_t2dggi %>% rename(chr = Chr)
untag_diam18_t2dggi <- untag_diam18_t2dggi %>% rename(position = Pos)

In [40]:
# Remove SNPs on chromosome 6 within the range of the MHC complex
untag_diam18_diam22 <- untag_diam18_diam22 %>%
  filter(!(chr == 6 & position >= 28477797 & position <= 33448354))
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and DIAMANTE-22 comparison:", length(unique(untag_diam18_diam22$SNP)))
untag_diam18_t2dggi <- untag_diam18_t2dggi %>%
  filter(!(chr == 6 & position >= 28477797 & position <= 33448354))
cat("\n")
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and T2DGGI comparison:", length(unique(untag_diam18_t2dggi$SNP)))

No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and DIAMANTE-22 comparison: 490
No. of SNPs not tagged by T2D loci in the DIAMANTE-18 and T2DGGI comparison: 656

In [41]:
#calling the function
untag_diam18_diam22_processed <- assign_clumps(untag_diam18_diam22, diamante18_clump_processed_2)
untag_diam18_t2dggi_processed <- assign_clumps(untag_diam18_t2dggi, diamante18_clump_processed_2)
cat("Total independent clumps found:", length(unique(untag_diam18_diam22_processed$clump)))
cat("\n")
cat("Total independent clumps found:", length(unique(untag_diam18_t2dggi_processed$clump)))

Total independent clumps found: 102
Total independent clumps found: 113

In [42]:
#how many untagged SNPs are single variant clumps?
cat("In DIAMANTE-18 and DIAMANTE-22 comparison:", sum(grepl("new_clump", untag_diam18_diam22_processed$clump)))
cat("\n")
cat("In DIAMANTE-18 and T2DGGI comparison:",sum(grepl("new_clump", untag_diam18_t2dggi_processed$clump)))

In DIAMANTE-18 and DIAMANTE-22 comparison: 12
In DIAMANTE-18 and T2DGGI comparison: 17

#### Find if DIAMANTE-22 unreplicated SNPs are "tagged" by T2D index SNPs, i.e. both belong to the same LD-clump

In [43]:
#renaming certain columns for the function
unrep_diam22_t2dggi <- unrep_diam22_t2dggi %>% rename(Chr = `chromosome(b37)`)
unrep_diam22_t2dggi <- unrep_diam22_t2dggi %>% rename(Pos = `position(b37)`)
matching_snps <- find_snps_in_loci(unrep_diam22_t2dggi, diam22_signals_loci)
untag_diam22_t2dggi <- anti_join(unrep_diam22_t2dggi, matching_snps, by = "rsID")
cat("No. of SNPs lost due to correction in the DIAMANTE-22 and T2DGGI comparison:", length(unique(unrep_diam22_t2dggi$rsID)))
cat("\n")
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-22 and T2DGGI comparison:", length(unique(untag_diam22_t2dggi$rsID)))

No. of SNPs lost due to correction in the DIAMANTE-22 and T2DGGI comparison: 3041
No. of SNPs not tagged by T2D loci in the DIAMANTE-22 and T2DGGI comparison: 1179

#### Identifying number of independent loci within the SNPs that are lost due to correction and not tagged by T2D loci

In [44]:
#renaming certain columns for the function
untag_diam22_t2dggi <- untag_diam22_t2dggi %>% rename(chr = Chr)
untag_diam22_t2dggi <- untag_diam22_t2dggi %>% rename(position = Pos)

In [45]:
# Remove SNPs on chromosome 6 within the range of the MHC complex
untag_diam22_t2dggi <- untag_diam22_t2dggi %>%
  filter(!(chr == 6 & position >= 28477797 & position <= 33448354))
cat("No. of SNPs not tagged by T2D loci in the DIAMANTE-22 and T2DGGI comparison:", length(unique(untag_diam22_t2dggi$rsID)))

No. of SNPs not tagged by T2D loci in the DIAMANTE-22 and T2DGGI comparison: 1068

In [46]:
untag_diam22_t2dggi_processed <- assign_clumps(untag_diam22_t2dggi, diamante22_clump_processed_2)
cat("Total independent clumps found:", length(unique(untag_diam22_t2dggi_processed$clump)))

Total independent clumps found: 329

In [47]:
#how many untagged SNPs are single variant clumps?
cat("In DIAMANTE-22 and T2DGGI comparison:",sum(grepl("new_clump", untag_diam22_t2dggi_processed$clump)))

In DIAMANTE-22 and T2DGGI comparison: 227