/
full_script.R
162 lines (112 loc) · 5.92 KB
/
full_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
###############################################################
# Full executable script for cNMTF
#
# Corresponding author:
# Luis Leal, Imperial College London, email: lgl15@imperial.ac.uk
###############################################################
#-------------------------------------------------
# 1. Installation
#-------------------------------------------------
#Clean your current workspace
rm(list=ls()) ; gc()
#Load libraries
library("devtools") #Development tools for packages in R
library("biomaRt") #Query ENSEMBL for SNV functional consequences
library("LDcorSV") #Linkage disequilibrium
library("doParallel") #Processes in parallel
library("igraph") #Network construction
library("ade4") #Principal components analysis
library("VennDiagram") #Venn diagrams
library("snpStats") #
#Installation of cnmtf package from github repository
devtools::install_github("lgl15/cnmtf")
#Create directory for your analysis
dir.create("./test/")
#-------------------------------------------------
# 2. Preprocessing of data
#-------------------------------------------------
#Find variants in high Linkage-disequilibrium
find.snps.ld( file.LD = "./test/fileLD.RData", type.ld = "gene", tmap = tmap, R = R)
#Create SNV-SNV network
create.network( #Parameters for the reference network
net.type = "ppi", #Type of reference network
dedges = dedges, #Define object with edges from reference network
#Parameters for Linkage Disequilibrium
remove.highLD = TRUE,
ld.tao = 0.8, #Treshold of LD
res.ld = "./test/fileLD.RData", #Table of LD
keep.with.LD = NULL, #List of SNPs to include even if they are in LD
#Parameters to construct Wu
R.snps = rownames(R), #List of SNPs in R
work.dat = ".", #Working directory
trait.project = "test", #Trait
n.cores = 3, #Number of cores
tmap = tmap, #Mapping of SNPs to genes
plot.file = "Gu_ppi_test_venn.pdf" ) #File to print Venn diagrams and node degree distribution
#-------------------------------------------------
# 3. Running cNMTF
#-------------------------------------------------
score.cnmtf( R = R, #Relationship matrix
out = out, #Categorical outcome variable
pop = pop, #Population variable
log.file = "logfile_my_experiment", #Log-file to track the performance
#Variables to Save/Load data and workspaces
name.exp = "my_experiment", #Name of experiment to save files
work.dat = "./test/", #Folder to save and load workspaces
#Number of clusters
define.k = "user", #Method to define k1
k1 = 20, #Number of clusters of SNVs
k2 = nlevels(out), #Number of clusters of patients
#Penalisation parameters
wparameters = list(gamma1 = 0.5, #Weight for the SNV network, Lu
gamma2 = 1, #Weight for the outcome matrix, Vo
gamma3 = 1), #Weight for the kernel matrix, A
save.parameters = TRUE, #Save parameters to file
run.t.par = 4, #Number of repetitions for parameters fitting
max.try0 = 4, #Maximum number of tries to fit the parameter
snps.known = NULL, #List of known SNV associated with the trait
#Variables to control performance of the algorithm
parallel.opt = T, #Run some instances of the algorithm in parallel
n.cores = 3, #Number of cores to use in the parallel processing
init = 0, #Type of seeding/initialisation of matrices in the algorithm
do.U = TRUE, #Perform clustering of SNPs
calcObj = 20, #Check convergency every 20 iterations
calcObj2 = 40, #Start checking convergency after first 40 iterations
iters = 300, #Number of iterations
run.t.exp = 10, #Number of repetitions for the experiment
display.iters = FALSE, #Display the iterations of function cnmtf
#Randomisations
score.pvalues = T, #Estimate p-values for the scores
random.parallel = T, #Run each randomisation in parallel
randomisations = 100, #Number of randomisations
#Construction of penalisation terms
file.Gu = "./test/Gu_ppi_test.RData" #Workspace with SNV-SNV network
)
#-------------------------------------------------
# 4. Prioritising SNVs
#-------------------------------------------------
#Calculate the delta score and create Manhattan plots
analyze.cnmtf (trait.project = "test", #Trait
name.exp = "my_experiment", #Name of experiment to analyse
work.dat = "./test/", #Working directory
alpha.cnmtf = 0.005, #Significance level for the delta SNV score
d.conf = NULL, #Optional. A dataframe of patients by confounder variables
snps.known = NULL, #Optional. List of known SNVs associated with the disease
snps.known2 = NULL, #Optional. A second list of SNVs to depict on the Manhattan plots
tmap = tmap
)
#Map variants to genes and add functional annotations
t.res = annotate.results( name.exp = "my_experiment", #Define experiment id
snps.known = NULL, #Optional. List of SNVs known to be associated with the disease
snps.known2 = NULL, #Optional. A second list of SNVs.
add.david.annotations = TRUE, #Use DAVID web service or export/import manually
email.david = email.david, #Email account registered in DAVID.
add.ensemble.conseq = FALSE, #Add SNV consequences from ENSEMBL
work.dat = "./test/", #Working directory
tmap = tmap, #Mapping of SNPs to genes, chr and genomic position
file.LD = "./test/fileLD.RData", #Workspace with a table of pairwise LD
ld.tao = 0.8 #Treshold of LD
)
#Extract table of prioritised variants
t.snvs = t.res[[1]]
t.snvs [1:8]