# Genomic Prediction
### Kelly Swarts
## Use genomic information to predict phenotypes
## The K matrix shrinks information from all the markers to solve the NP problem - what is this?
## 

# 1.  Initial setup steps

## 1a. Prepare environment
Loading packages and functions into R

In [None]:
library(rTASSEL)
library(plot.matrix)
# Use a larger default figure size for the plots drawn below.
options(
  repr.plot.width = 12,
  repr.plot.height = 5
)

## 1b. Define input variables

In [None]:
# Snapshot of the current graphics parameters (presumably kept so they can be
# restored after plotting; it is not used in the cells shown here).
default.par <- par()

# Genotype data for maize and arabidopsis (in the "hdf5" format).
# atGKO is read from the "_KOfri" variant of the arabidopsis genotype file.
zmG <- readGenotypeTableFromPath("./data/282.poly_thinned30kbp.h5")
atG <- readGenotypeTableFromPath(
  "./data/1001genomes_snp-short-indel_only_ACGTN.subsamp170_poly_minCov50_thinned30kpb.h5"
)
atGKO <- readGenotypeTableFromPath(
  "./data/1001genomes_snp-short-indel_only_ACGTN.subsamp170_poly_minCov50_thinned30kpb_KOfri.h5"
)

# Phenotype data for maize and arabidopsis.
zmP <- readPhenotypeFromPath("./data/282_traits.txt")
atP <- readPhenotypeFromPath("./data/Arabidopsis_Phenotypes.trait")

# Site (SS) and taxa (TS) summary tables for maize and arabidopsis.
# These are tab-separated files with a header row; strings are kept as
# character. TRUE is spelled out because T is an ordinary, reassignable variable.
zmSS <- read.table(
  "./data/282.poly_thinned30kbp_SiteSummary.txt",
  header = TRUE, as.is = TRUE, sep = "\t"
)
atSS <- read.table(
  "./data/1001genomes_snp-short-indel_only_ACGTN.subsamp170_poly_minCov50_thinned30kpb_SiteSummary.txt",
  header = TRUE, as.is = TRUE, sep = "\t"
)
zmTS <- read.table(
  "./data/282.poly_thinned30kbp_TaxaSummary.txt",
  header = TRUE, as.is = TRUE, sep = "\t"
)
atTS <- read.table(
  "./data/1001genomes_snp-short-indel_only_ACGTN.subsamp170_poly_minCov50_thinned30kpb_TaxaSummary.txt",
  header = TRUE, as.is = TRUE, sep = "\t"
)

# 2.  Generate K (kinship/genetic similarity) matrices
### We will calculate these in two different ways, each with different assumptions regarding population expectations for inbreeding
### "Centered" assumes Hardy-Weinberg and is calculated after J. Yang, S. H. Lee, M. E. Goddard, P. M. Visscher, GCTA: a tool for genome-wide complex trait analysis. Am. J. Hum. Genet. 88, 76–82 (2011).
### "Normalized" allows for inbreeding and is calculated after J. B. Endelman, J.-L. Jannink, Shrinkage estimation of the realized relationship matrix. G3 2, 1405–1413 (2012).

In [None]:
# K matrices for each species, one per method ("Centered_IBS" and
# "Normalized_IBS").

# Maize
zm_cent <- kinshipMatrix(zmG, method = "Centered_IBS")
zm_norm <- kinshipMatrix(zmG, method = "Normalized_IBS")

# Arabidopsis
at_cent <- kinshipMatrix(atG, method = "Centered_IBS")
at_norm <- kinshipMatrix(atG, method = "Normalized_IBS")

# 3. Genomic prediction (GBLUP)
## These results are 5-fold cross-validated with 10 replicates. What does this mean?

In [None]:
# Box fill colours, in plotting order: normalized K first, centered K second.
colsK <- c("#c65999", "#7aa456")

# Arabidopsis: genomic prediction with 5-fold cross-validation and 10
# iterations, run once with each K matrix. TRUE is spelled out because T is an
# ordinary, reassignable variable.
atPredN <- genomicPrediction(
  tasPhenoObj = atP, kinship = at_norm,
  doCV = TRUE, kFolds = 5, nIter = 10
)
atPredC <- genomicPrediction(
  tasPhenoObj = atP, kinship = at_cent,
  doCV = TRUE, kFolds = 5, nIter = 10
)

# Flowering at 16 degrees: collect the "Accuracy" values of the "mean_ft16"
# rows from each prediction table.
ft16 <- list(
  "normalized_ft16" = atPredN[which(atPredN$Trait == "mean_ft16"), "Accuracy"],
  "centered_ft16" = atPredC[which(atPredC$Trait == "mean_ft16"), "Accuracy"]
)
# The y-axis label matches the other boxplots in this notebook.
boxplot(
  ft16,
  col = colsK,
  main = "16 degree flowering (A. thaliana)",
  las = 2,
  ylab = "Predictive ability"
)

## Does kinship matter?

In [None]:
# Flowering at 10 degrees: "Accuracy" values of the "mean_ft10" rows from the
# normalized-K and centered-K prediction tables.
ft10 <- list(
  normalized_ft10 = atPredN[which(atPredN$Trait == "mean_ft10"), "Accuracy"],
  centered_ft10 = atPredC[which(atPredC$Trait == "mean_ft10"), "Accuracy"]
)
boxplot(
  ft10,
  col = colsK,
  main = "10 degree flowering (A. thaliana)",
  las = 2,
  ylab = "Predictive ability"
)

# Both temperatures side by side. ft16 is not created in this cell, so the
# 16-degree cell must have been run first. The two colours in colsK are
# recycled across the four boxes.
boxplot(
  c(ft10, ft16),
  col = colsK,
  main = "Flowering (A. thaliana)",
  las = 2,
  ylab = "Predictive ability"
)

## What was the heritability? Are these results surprising?

In [None]:
# Maize: genomic prediction with 5-fold cross-validation and 10 iterations,
# run once with each K matrix. TRUE is spelled out because T is an ordinary,
# reassignable variable.
zmPredN <- genomicPrediction(
  tasPhenoObj = zmP, kinship = zm_norm,
  doCV = TRUE, kFolds = 5, nIter = 10
)
zmPredC <- genomicPrediction(
  tasPhenoObj = zmP, kinship = zm_cent,
  doCV = TRUE, kFolds = 5, nIter = 10
)

# Flowering (DTA): "Accuracy" values of the "DTA" rows from each prediction
# table.
dta <- list(
  "normalized_dta" = zmPredN[which(zmPredN$Trait == "DTA"), "Accuracy"],
  "centered_dta" = zmPredC[which(zmPredC$Trait == "DTA"), "Accuracy"]
)
boxplot(
  dta,
  col = colsK,
  main = "DTA (Z. mays)",
  las = 2,
  ylab = "Predictive ability"
)

## What is the heritability for flowering in maize? Why do you think this is different from Arabidopsis?

In [None]:
# Flowering (DTA): rebuild the list of "Accuracy" values for the "DTA" rows
# and draw the boxplot again.
dta <- list(
  normalized_dta = zmPredN[which(zmPredN$Trait == "DTA"), "Accuracy"],
  centered_dta = zmPredC[which(zmPredC$Trait == "DTA"), "Accuracy"]
)
boxplot(
  dta,
  col = colsK,
  main = "DTA (Z. mays)",
  las = 2,
  ylab = "Predictive ability"
)

## What was the heritability? Are these results surprising?

In [None]:
# Maize FER: "Accuracy" values of the "FER" rows from the normalized-K and
# centered-K prediction tables.
fer <- list(
  normalized_fer = zmPredN[which(zmPredN$Trait == "FER"), "Accuracy"],
  centered_fer = zmPredC[which(zmPredC$Trait == "FER"), "Accuracy"]
)
boxplot(
  fer,
  col = colsK,
  main = "FER (Z. mays)",
  las = 2,
  ylab = "Predictive ability"
)

