In [2]:
library(GWASTools)
library(GWASdata)
# Load the example SNP annotation (a plain data frame shipped with GWASdata)
data(illumina_snp_annot)
# Wrap the data frame in a SnpAnnotationDataFrame
snpAnnot <- SnpAnnotationDataFrame(illumina_snp_annot)
# Column names
varLabels(snpAnnot)
# First rows of the underlying data
head(pData(snpAnnot))
# Attach a human-readable description to every column.
# The two vectors below are matched by position: the i-th description
# belongs to the i-th column name, so keep them in the same order.
meta <- varMetadata(snpAnnot)
meta[c("snpID", "chromosome", "position", "rsID", "alleleA", "alleleB",
  "BeadSetID", "IntensityOnly", "tAA", "tAB", "tBB", "rAA", "rAB", "rBB"),
  "labelDescription"] <- c("unique integer ID for SNPs",
  paste("integer code for chromosome: 1:22=autosomes,",
   "23=X, 24=pseudoautosomal, 25=Y, 26=Mitochondrial, 27=Unknown"),
  "base pair position on chromosome (build 36)",
  "RS identifier",
  # was misspelled "alelleA", which ended up in the stored metadata
  "alleleA", "alleleB",
  "BeadSet ID from Illumina",
  "1=no genotypes were attempted for this assay",
  "mean theta for AA cluster",
  "mean theta for AB cluster",
  "mean theta for BB cluster",
  "mean R for AA cluster",
  "mean R for AB cluster",
  "mean R for BB cluster")
# Write the completed metadata back into the annotation object
varMetadata(snpAnnot) <- meta

Unnamed: 0_level_0,snpID,chromosome,position,rsID,alleleA,alleleB,BeadSetID,IntensityOnly,tAA,tAB,tBB,rAA,rAB,rBB
Unnamed: 0_level_1,<int>,<int>,<int>,<chr>,<chr>,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,999447,21,13733610,rs3132407,A,G,1185447327,1,0.01374357,0.3290431,0.9184624,1.562203,1.592753,1.614259
2,999465,21,13852569,rs2775671,T,C,1169708488,0,0.06325946,0.5440393,0.9796721,0.4431986,0.4431986,0.4431986
3,999493,21,14038583,rs2775018,T,C,1192445330,0,0.001315146,0.2623954,0.5362323,2.634981,2.314934,2.001276
4,999512,21,14136579,rs3115511,T,C,1149617207,0,0.01100482,0.5692499,0.9846884,0.8781826,0.9453412,0.8209958
5,999561,21,14396024,rs2822404,T,C,1149961944,0,0.04020681,0.5691788,0.9902423,1.09417,1.127079,0.9898759
6,999567,21,14404476,rs1556276,A,G,1149617207,0,0.03089573,0.6842008,0.9837771,0.5954081,0.7681253,0.790015


In [3]:
# Each pair below reads the same column twice, once with generic
# subsetting and once with the dedicated accessor, to show both forms.

# SNP IDs: `$` on the column name, then the getSnpID() accessor
snpID <- snpAnnot$snpID
snpID <- getSnpID(snpAnnot)
# Chromosome: `[[` on the column name, then the getChromosome() accessor
chrom <- snpAnnot[["chromosome"]]
chrom <- getChromosome(snpAnnot)
# Counts per chromosome using the integer codes (21, 22, 23, ...)
table(chrom)
# With char=TRUE the tabulated values are character labels
# (21, 22, M, X, XY, Y) instead of integer codes
chrom <- getChromosome(snpAnnot, char=TRUE)
table(chrom)
# Base-pair positions via the dedicated accessor
position <- getPosition(snpAnnot)
# Any other column can be fetched by name with getVariable()
rsID <- getVariable(snpAnnot, "rsID")

chrom
  21   22   23   24   25   26 
1000 1000 1000  100  100  100 

chrom
  21   22    M    X   XY    Y 
1000 1000  100 1000  100  100 

In [4]:
# Take a three-column copy of the SNP annotation to experiment on
tmp <- snpAnnot[, c("snpID", "chromosome", "position")]
# Extract the plain data frame, append a random logical column, and
# store the modified data frame back into the copy
snp <- getAnnotation(tmp)
snp[["flag"]] <- sample(c(TRUE, FALSE), size = nrow(snp), replace = TRUE)
pData(tmp) <- snp
# Give the new column a metadata description
meta <- getMetadata(tmp)
meta["flag", "labelDescription"] <- "flag"
varMetadata(tmp) <- meta
# Show the variable names, then rename the fourth one
getVariableNames(tmp)
varLabels(tmp)[4] <- "FLAG"
# Discard the scratch copy
rm(tmp)

In [5]:
# Read in the example scan annotation (an ordinary data frame)
data(illumina_scan_annot)
# Promote it to a ScanAnnotationDataFrame
scanAnnot <- ScanAnnotationDataFrame(illumina_scan_annot)
# Column names
varLabels(scanAnnot)
# First rows of the underlying data
head(pData(scanAnnot))
# Describe every column. Each description is written next to the column
# it documents; local() keeps the helper objects out of the workspace.
meta <- local({
  descriptions <- c(
    scanID = "unique ID for scans",
    subjectID = "subject identifier (may have multiple scans)",
    family = "family identifier",
    father = "father identifier as subjectID",
    mother = "mother identifier as subjectID",
    CoriellID = "Coriell subject identifier",
    race = "HapMap population group",
    sex = "sex coded as M=male and F=female",
    status = "simulated case/control status",
    genoRunID = "genotyping instance identifier",
    plate = "plate containing samples processed together for genotyping chemistry",
    batch = "simulated genotyping batch",
    file = "raw data file"
  )
  m <- varMetadata(scanAnnot)
  m[names(descriptions), "labelDescription"] <- unname(descriptions)
  m
})
# Write the completed metadata back into the annotation object
varMetadata(scanAnnot) <- meta

Unnamed: 0_level_0,scanID,subjectID,family,father,mother,CoriellID,race,sex,status,genoRunID,plate,batch,file
Unnamed: 0_level_1,<int>,<int>,<int>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>
1,280,200191449,1341,0,0,NA06985,CEU,F,1,WG1000993-DNAG10-CIDR_06985@1007850397,WG0052814-AMP2,A,GENEVA_1M_HapMap_37.csv
2,281,200191449,1341,0,0,NA06985,CEU,F,1,WG1000992-DNAF10-CIDR_06985@1007850586,WG0061258-AMP2,A,GENEVA_1M_HapMap_58.csv
3,282,200030290,1341,200099417,200191449,NA06991,CEU,F,0,WG1000970-DNAB11-CIDR_06991@1007850444,WG0061536-AMP2,B,GENEVA_1M_HapMap_5.csv
4,283,200030290,1341,200099417,200191449,NA06991,CEU,F,0,WG1000969-DNAA11-CIDR_06991@1007850587,WG0053489-AMP2,A,GENEVA_1M_HapMap_3.csv
5,284,200099417,1341,0,0,NA06993,CEU,M,1,WG1000972-DNAE10-CIDR_06993@1007850591,WG0060475-AMP2,C,GENEVA_1M_HapMap_10.csv
6,285,200099417,1341,0,0,NA06993,CEU,M,1,WG1000971-DNAD10-CIDR_06993@1007850421,WG0061540-AMP2,B,GENEVA_1M_HapMap_71.csv


In [6]:
# NOTE(review): this cell repeats the preceding scan-annotation cell
# verbatim. It rebuilds scanAnnot from illumina_scan_annot and applies
# the same descriptions again; it can probably be removed.

# Load the scan annotation (simple data frame)
data(illumina_scan_annot)
# Create a ScanAnnotationDataFrame
scanAnnot <- ScanAnnotationDataFrame(illumina_scan_annot)
# names of columns
varLabels(scanAnnot)
# data
head(pData(scanAnnot))
# Add metadata to describe the columns.
# The column names and the descriptions are matched by position, so the
# two vectors must list the columns in the same order.
meta <- varMetadata(scanAnnot)
meta[c("scanID", "subjectID", "family", "father", "mother",
  "CoriellID", "race", "sex", "status", "genoRunID", "plate",
  "batch", "file"), "labelDescription"] <-
   c("unique ID for scans",
  "subject identifier (may have multiple scans)",
  "family identifier",
  "father identifier as subjectID",
  "mother identifier as subjectID",
  "Coriell subject identifier",
  "HapMap population group",
  "sex coded as M=male and F=female",
  "simulated case/control status" ,
  "genotyping instance identifier",
  "plate containing samples processed together for genotyping chemistry",
  "simulated genotyping batch",
  "raw data file")
# Store the completed metadata back in the annotation object
varMetadata(scanAnnot) <- meta

Unnamed: 0_level_0,scanID,subjectID,family,father,mother,CoriellID,race,sex,status,genoRunID,plate,batch,file
Unnamed: 0_level_1,<int>,<int>,<int>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>
1,280,200191449,1341,0,0,NA06985,CEU,F,1,WG1000993-DNAG10-CIDR_06985@1007850397,WG0052814-AMP2,A,GENEVA_1M_HapMap_37.csv
2,281,200191449,1341,0,0,NA06985,CEU,F,1,WG1000992-DNAF10-CIDR_06985@1007850586,WG0061258-AMP2,A,GENEVA_1M_HapMap_58.csv
3,282,200030290,1341,200099417,200191449,NA06991,CEU,F,0,WG1000970-DNAB11-CIDR_06991@1007850444,WG0061536-AMP2,B,GENEVA_1M_HapMap_5.csv
4,283,200030290,1341,200099417,200191449,NA06991,CEU,F,0,WG1000969-DNAA11-CIDR_06991@1007850587,WG0053489-AMP2,A,GENEVA_1M_HapMap_3.csv
5,284,200099417,1341,0,0,NA06993,CEU,M,1,WG1000972-DNAE10-CIDR_06993@1007850591,WG0060475-AMP2,C,GENEVA_1M_HapMap_10.csv
6,285,200099417,1341,0,0,NA06993,CEU,M,1,WG1000971-DNAD10-CIDR_06993@1007850421,WG0061540-AMP2,B,GENEVA_1M_HapMap_71.csv


In [7]:
# Each pair below reads the same column twice, once with generic
# subsetting and once with the dedicated accessor, to show both forms.

# Scan IDs: `$` on the column name, then the getScanID() accessor
scanID <- scanAnnot$scanID
scanID <- getScanID(scanAnnot)
# Sex: `[[` on the column name, then the getSex() accessor
sex <- scanAnnot[["sex"]]
sex <- getSex(scanAnnot)
# Any other column can be fetched by name with getVariable()
subjectID <- getVariable(scanAnnot, "subjectID")

In [8]:
# Directory inside the GWASdata package that holds the raw data files
path <- system.file("extdata", "illumina_raw_data", package = "GWASdata")

# Name of the GDS file the genotypes are written to
geno.file <- "tmp.geno.gds"

# Scan annotation for the first 3 samples only; the genoRunID column is
# passed on under the name "scanName"
scan.annotation <- setNames(
  illumina_scan_annot[1:3, c("scanID", "genoRunID", "file")],
  c("scanID", "scanName", "file")
)

# SNP annotation; rsID is the column referenced in the data files, so it
# is passed on under the name "snpName"
snp.annotation <- setNames(
  illumina_snp_annot[, c("snpID", "rsID", "chromosome", "position")],
  c("snpID", "snpName", "chromosome", "position")
)

# Column numbers handed to createDataFile() for the SNP name, the sample
# name and the two alleles
col.nums <- c(snp = 1L, sample = 2L, a1 = 12L, a2 = 13L)

# File in which createDataFile() saves its diagnostics
diag.geno.file <- "diag.geno.RData"
diag.geno <- createDataFile(
  path = path,
  filename = geno.file,
  file.type = "gds",
  variables = "genotype",
  snp.annotation = snp.annotation,
  scan.annotation = scan.annotation,
  sep.type = ",",
  skip.num = 11,
  col.total = 21,
  col.nums = col.nums,
  scan.name.in.file = 1,
  diagnostics.filename = diag.geno.file,
  verbose = FALSE
)

# "diag.geno" holds all output from the function call; list its elements
names(diag.geno)
# read.file: 1 if a raw data file was read successfully, 0 otherwise
table(diag.geno$read.file)
# row.num: number of rows read from each file
table(diag.geno$row.num)
# sample.match: 1 if the sample name inside the raw text file matches
#   the one in the sample annotation data.frame, 0 otherwise
table(diag.geno$sample.match)
# snp.chk: 1 if the raw text file has the expected set of SNP names,
#   0 otherwise
table(diag.geno$snp.chk)
# chk: 1 if all previous checks were successful and the data were
#   written to the data file, 0 otherwise
table(diag.geno$chk)

adding variables: genotype




1 
3 


3300 
   3 


1 
3 


1 
3 


1 
3 