## Pre-process of omics trait data for downstream analysis：

This notebook explains the process for obtaining and pre-processing omics data of HS Rats used for downstream association studies.

In [11]:
using CSV, DataFrames, DelimitedFiles # packages for manipulating data
using GeneNetworkAPI # package for accessing data from GeneNetwork via API

In [12]:
check_gn()

LoadError: HTTP.Exceptions.RequestError(HTTP.Messages.Request:
"""
GET /api/v_pre1/ HTTP/1.1
Host: gn2.genenetwork.org
Accept: */*
User-Agent: HTTP.jl/1.7.3
Content-Length: 0
Accept-Encoding: gzip

""", EOFError())

In [2]:
gn_url()

"http://gn2.genenetwork.org/api/v_pre1/"

First, let's find the name of the HS rats dataset we would like to obtain from GN.

In [6]:
# Rats_groups = list_groups("rat");
# HS_group = Rats_groups[4, :Name];

HS_group = "HSNIH-Palmer"

"HSNIH-Palmer"

In [7]:
HS_datasets = list_datasets(HS_group);

LoadError: HTTP.Exceptions.RequestError(HTTP.Messages.Request:
"""
GET /api/v_pre1/datasets/HSNIH-Palmer HTTP/1.1
Host: gn2.genenetwork.org
Accept: */*
User-Agent: HTTP.jl/1.7.3
Content-Length: 0
Accept-Encoding: gzip

""", EOFError())

In [None]:
# HS_datasets

In [None]:
# HS_dataset_to_test = HS_datasets[4, :]

#### Get raw omics traits data:

In [None]:
# @time omic_pheno = get_omics(HS_dataset_to_test[:Short_Abbreviation]);

Use `wget` with this url: http://gn2.genenetwork.org/api/v_pre1/sample_data/HSNIH-Rat-PL-RSeq-0818.csv

In [10]:
omic_pheno = CSV.read("/home/zyu20/shareddata/HSNIH-Palmer/HSNIH-Rat-PL-RSeq-0818_sample_data.csv", DataFrame)

Row,id,ENSRNOG00000000001,ENSRNOG00000000007,ENSRNOG00000000008,ENSRNOG00000000009,ENSRNOG00000000010,ENSRNOG00000000012,ENSRNOG00000000017,ENSRNOG00000000021,ENSRNOG00000000024,ENSRNOG00000000033,ENSRNOG00000000034,ENSRNOG00000000036,ENSRNOG00000000040,ENSRNOG00000000041,ENSRNOG00000000042,ENSRNOG00000000043,ENSRNOG00000000044,ENSRNOG00000000047,ENSRNOG00000000048,ENSRNOG00000000053,ENSRNOG00000000054,ENSRNOG00000000060,ENSRNOG00000000062,ENSRNOG00000000064,ENSRNOG00000000065,ENSRNOG00000000066,ENSRNOG00000000068,ENSRNOG00000000070,ENSRNOG00000000073,ENSRNOG00000000075,ENSRNOG00000000081,ENSRNOG00000000082,ENSRNOG00000000091,ENSRNOG00000000095,ENSRNOG00000000096,ENSRNOG00000000098,ENSRNOG00000000104,ENSRNOG00000000105,ENSRNOG00000000108,ENSRNOG00000000111,ENSRNOG00000000112,ENSRNOG00000000113,ENSRNOG00000000121,ENSRNOG00000000122,ENSRNOG00000000123,ENSRNOG00000000127,ENSRNOG00000000129,ENSRNOG00000000130,ENSRNOG00000000133,ENSRNOG00000000137,ENSRNOG00000000138,ENSRNOG00000000142,ENSRNOG00000000145,ENSRNOG00000000150,ENSRNOG00000000151,ENSRNOG00000000155,ENSRNOG00000000156,ENSRNOG00000000157,ENSRNOG00000000158,ENSRNOG00000000161,ENSRNOG00000000164,ENSRNOG00000000165,ENSRNOG00000000166,ENSRNOG00000000167,ENSRNOG00000000168,ENSRNOG00000000169,ENSRNOG00000000170,ENSRNOG00000000172,ENSRNOG00000000175,ENSRNOG00000000177,ENSRNOG00000000184,ENSRNOG00000000185,ENSRNOG00000000186,ENSRNOG00000000187,ENSRNOG00000000190,ENSRNOG00000000195,ENSRNOG00000000196,ENSRNOG00000000201,ENSRNOG00000000204,ENSRNOG00000000219,ENSRNOG00000000221,ENSRNOG00000000230,ENSRNOG00000000231,ENSRNOG00000000233,ENSRNOG00000000236,ENSRNOG00000000237,ENSRNOG00000000239,ENSRNOG00000000244,ENSRNOG00000000245,ENSRNOG00000000246,ENSRNOG00000000247,ENSRNOG00000000248,ENSRNOG00000000249,ENSRNOG00000000250,ENSRNOG00000000251,ENSRNOG00000000257,ENSRNOG00000000258,ENSRNOG00000000262,ENSRNOG00000000264,⋯
Unnamed: 0_level_1,String15,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,String7,⋯
1,00071F4FAF,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,⋯
2,00071F6771,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,⋯
3,00071F768E,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,⋯
4,00071F95F9,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,⋯
5,00071FB160,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,⋯
6,00071FB747,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,⋯
7,00072069AD,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,⋯
8,0007207A73,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,⋯
9,0007207BE7,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,⋯
10,00072126F3,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,⋯


In [None]:
HS_dataset_to_test[:Short_Abbreviation]

#### Remove missing values - get all samples that have no missings

We would like to see the proportion of missing values from the structured omics data:

In [None]:
count = 0;
for i in 1:size(omic_pheno, 1)
    for j in 1:size(omic_pheno, 2)
        if ismissing(omic_pheno[i, j])
            count = count +1;
        end
    end
end

In [None]:
count / (size(omic_pheno, 1) * size(omic_pheno, 2))

We would like to get the omic traits observations for samples that have no missing values...

In [None]:
is_nonmissing = map(x -> !ismissing(x), omic_pheno[:, 2]);
true_samples = omic_pheno[is_nonmissing, 1];
omic_pheno_true = omic_pheno[is_nonmissing, :];

In [None]:
count = 0;
for i in 1:size(omic_pheno_true, 1)
    for j in 1:size(omic_pheno_true, 2)
        if ismissing(omic_pheno_true[i, j])
            count = count +1;
        end
    end
end

In [None]:
to_test = dropmissing(omic_pheno);

In [None]:
sum(to_test[:, 1] .== true_samples)

In [None]:
count = 0

In [None]:
count / (size(omic_pheno_true, 1) * size(omic_pheno_true, 2))

#### Finally, write to file the omics traits with no missing values

In [5]:
# CSV.write("HSNIH-Rat-PL-RSeq-0818_nomissing.csv", omic_pheno_true)

## Summary of issues for acquiring data from GN2: