# Prepare pipeline input - SNP matrix

### Loading required packages

In [1]:
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))

### Load data

In [2]:
# Read the Theta tables for the two cohorts: cases (GSE69664 / GPL20166)
# and controls (GSE74100).
case <- data.table::fread("data/gse69664_gpl20166_theta.txt")
control <- data.table::fread("data/gse74100_theta.txt")

### Merge data

In [3]:
# Join cases and controls on the shared "Name" key after dropping columns
# 2 and 3 from each table; only rows whose Name occurs in both tables remain.
merged_data <- merge(
  x = case[, -(2:3)],
  y = control[, -(2:3)],
  by = "Name"
)

### Update SNP rsIDs

In [4]:
# Read the SNP ID annotation table (SPOT_ID / SNP_ID columns are used below).
# Source: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL19864
omni_zhonghua_id_table <- data.table::fread(
  "../snp_array/data/SNPID_HumanOmniZhonghua.txt"
)

In [5]:
# Build the Name -> New_name lookup table: use the rsID (SNP_ID) when one is
# present, otherwise fall back to the array identifier (SPOT_ID).
# Both "" and NA count as missing: a bare `SNP_ID == ""` test evaluates to NA
# for NA input, which would leak NA into New_name (and later into the
# column names of the SNP matrix).
new_id <- omni_zhonghua_id_table %>%
  select(Name = SPOT_ID, New_name = SNP_ID) %>%
  mutate(
    New_name = ifelse(is.na(New_name) | New_name == "", Name, New_name)
  )

# Attach the new names to the merged data. When several rows share the same
# New_name only the first is kept, so New_name can be used as unique column
# labels after transposing.
merged_data_nameupdated <- merge(merged_data, new_id, by = "Name") %>%
  distinct(New_name, .keep_all = TRUE)

In [6]:
# Remove the two identifier columns so only sample columns are left, then
# transpose so that samples become rows and SNPs become columns.
merged_data_nameupdated_pre_t <- select(
  merged_data_nameupdated,
  -Name,
  -New_name
)
merged_data_nameupdated_t <- t(merged_data_nameupdated_pre_t)

In [7]:
# Label the columns of the transposed matrix with the updated SNP names,
# then turn the matrix into a data frame.
colnames(merged_data_nameupdated_t) <- merged_data_nameupdated[["New_name"]]
merged_data_nameupdated_t_df <- merged_data_nameupdated_t %>%
  as.data.frame()

### Add phenotype

In [8]:
# Add the phenotype as the "default" (outcome) column.
# Rows are samples in merge order: all case columns come before all control
# columns, so the first n_case rows are cases ("Yes") and the rest are
# controls ("No").
# The counts are derived from the input tables rather than hard-coded: each
# table contributes every column except Name and the two dropped columns
# (2 and 3). For gse69664_gpl20166 / gse74100 this was recorded as
# 446 cases and 240 controls.
n_case <- ncol(case) - 3L
n_control <- ncol(control) - 3L

# Guard against a label vector that does not line up with the sample rows;
# `$<-` on a data frame would otherwise recycle a shorter vector silently
# whenever its length divides the row count.
stopifnot(nrow(merged_data_nameupdated_t_df) == n_case + n_control)

merged_data_nameupdated_t_df$default <- rep(
  c("Yes", "No"),
  times = c(n_case, n_control)
)

In [9]:
# Move the outcome column to the front; the remaining columns keep their
# relative order.
machine_learning_input <- relocate(merged_data_nameupdated_t_df, default)

In [10]:
# Store the outcome as a factor (levels are the sorted unique labels).
machine_learning_input$default <- factor(machine_learning_input$default)

### View table

In [11]:
# Preview the first and last three samples, limited to the first 20 columns.
head(machine_learning_input[, seq_len(20)], n = 3)
tail(machine_learning_input[, seq_len(20)], n = 3)

Unnamed: 0_level_0,default,rs28619217,200610-10,rs367572771,rs144402189,rs375896687,200610-107,200610-108,200610-109,rs199838004,rs374875201,200610-112,rs377546596,200610-114,200610-115,200610-116,200610-117,200610-118,200610-12,200610-120
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
9323111011_R01C01.Theta,Yes,0.05048173,0.03499985,0.826782,0.07534563,0.02177286,0.02820558,0.04560787,0.01812311,0.03443542,0.02780845,0.03446267,0.0211692,0.02990028,0.06190423,0.04566385,0.05648696,0.01994715,0.9549576,0.8102618
9323111011_R02C01.Theta,Yes,0.04993737,0.03245308,0.8402165,0.05894551,0.0189961,0.02572266,0.03968269,0.01453403,0.03323122,0.01688597,0.031837,0.02603045,0.03385911,0.05766838,0.04933597,0.04873141,0.01553728,0.9269152,0.1162478
9323111011_R03C01.Theta,Yes,0.05869108,0.03736536,0.8192294,0.05012877,0.02201855,0.02720243,0.04521938,0.01640155,0.03577631,0.02005851,0.03068221,0.03114031,0.03123071,0.06928208,0.9000522,0.05716186,0.01727312,0.9517787,0.1337005


Unnamed: 0_level_0,default,rs28619217,200610-10,rs367572771,rs144402189,rs375896687,200610-107,200610-108,200610-109,rs199838004,rs374875201,200610-112,rs377546596,200610-114,200610-115,200610-116,200610-117,200610-118,200610-12,200610-120
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
6935561049_R06C01.Theta,No,0.05365109,0.04163464,0.8788909,0.05105701,0.02649923,0.02608219,0.05347582,0.02212922,0.04643466,0.0195864,0.04018126,0.040204,0.0352942,0.07331213,0.04371836,0.05557908,0.02113909,0.948177,0.1373871
6935561049_R07C01.Theta,No,0.05172075,0.04166435,0.8512335,0.04520546,0.02920886,0.04163542,0.05255305,0.02218693,0.05050221,0.02614834,0.03996056,0.0368418,0.03735679,0.07095054,0.05054793,0.06356196,0.01504504,0.9537578,0.132513
6935561049_R08C01.Theta,No,0.04086164,0.03338223,0.8867131,0.05380736,0.03776116,0.9675664,0.04026658,0.0212703,0.0475564,0.02043399,0.03441095,0.04352947,0.03269886,0.0634698,0.0584623,0.05729748,0.01805419,0.9447215,0.1372951


### Save processed SNP matrix

In [12]:
# Persist the processed SNP matrix. The output directory is created first
# because saveRDS() cannot open a file inside a directory that does not exist;
# dir.create() is a no-op (warning suppressed) when it already exists.
dir.create("outputs", showWarnings = FALSE, recursive = TRUE)
saveRDS(machine_learning_input, file = "outputs/snp_matrix.rds")