In [2]:
# Run everything from the publication scripts directory: the relative dataPath
# used by the following cells is resolved against this working directory.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
setwd("/data/bcu_projects/MelBrainSys_PostdocProject_Gruetzmann/publications/2022-my-MelBrainSys-paper/scripts-etc-for-publication/")

In [3]:
# Directory (relative to the working directory) holding the regNet input and
# output files. The trailing slash matters: file names are appended with paste0().
dataPath <- "regNet/Data/"

In [4]:
# Fraction of all samples assigned to the training set; the remaining samples
# form the test set.
trainingSetRatio <- 3 / 4

In [5]:
# Full gene expression data set (tab-separated despite the .csv extension).
# Columns: geneSymbol, chr (chromosome), pos (location), then one column per sample.
fullExprDataFile <- paste0(dataPath, "TCGA-expression.csv")
fullExprData <- read.delim(
    file = fullExprDataFile, header = TRUE, sep = "\t",
    # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
    stringsAsFactors = FALSE
)
# read.delim() sanitises the header (check.names = TRUE), which turns the "-"
# of the sample barcodes into "."; restore the dashes. fixed = TRUE makes the
# literal-dot replacement explicit instead of relying on a regex escape.
colnames(fullExprData) <- gsub(".", "-", colnames(fullExprData), fixed = TRUE)
head(fullExprData, 3)

Unnamed: 0_level_0,geneSymbol,chr,pos,TCGA-BF-A1PU-01,TCGA-BF-A1PV-01,TCGA-BF-A1PX-01,TCGA-BF-A1PZ-01,TCGA-BF-A3DJ-01,TCGA-BF-A3DM-01,TCGA-BF-A3DN-01,⋯,TCGA-GF-A2C7-01,TCGA-GF-A3OT-06,TCGA-GN-A262-06,TCGA-GN-A263-01,TCGA-GN-A265-06,TCGA-GN-A267-06,TCGA-GN-A268-06,TCGA-GN-A26A-06,TCGA-GN-A26D-06,TCGA-HR-A5NC-01
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,NOC2L,chr1,887127,6.702927,7.506216,7.3536418,7.5527593,6.7940935,7.8001608,6.438719,⋯,6.320593,7.774894,7.869414,5.85987063,8.0767768,7.698355,7.609606,8.2928188,6.689367,6.129037
2,KLHL17,chr1,898531,2.524458,3.581854,1.8863971,3.3867555,2.5493458,2.4142473,1.246689,⋯,2.304777,4.523496,3.997421,0.04818819,3.2792568,3.688836,4.444516,3.1149303,1.624958,2.909801
3,HES4,chr1,934947,1.030696,3.166422,0.3773996,0.9425393,-0.9829983,0.9366146,-2.111562,⋯,-1.218591,2.376759,1.699015,-0.06880224,0.1592957,2.768496,1.789248,-0.4265386,-1.908413,-2.078192


In [6]:
# Full methylation data set (tab-separated despite the .csv extension).
# Columns: geneSymbol, chr (chromosome), pos (location), then one column per sample.
fullMethDataFile <- paste0(dataPath, "TCGA-methylation.csv")
fullMethData <- read.delim(
    file = fullMethDataFile, header = TRUE, sep = "\t",
    # Set explicitly so this loader agrees with the expression loader and keeps
    # geneSymbol/chr as character columns on R versions before 4.0 as well.
    stringsAsFactors = FALSE
)
# read.delim() sanitises the header (check.names = TRUE), which turns the "-"
# of the sample barcodes into "."; restore the dashes. fixed = TRUE makes the
# literal-dot replacement explicit instead of relying on a regex escape.
colnames(fullMethData) <- gsub(".", "-", colnames(fullMethData), fixed = TRUE)
head(fullMethData, 3)

Unnamed: 0_level_0,geneSymbol,chr,pos,TCGA-BF-A1PU-01,TCGA-BF-A1PV-01,TCGA-BF-A1PX-01,TCGA-BF-A1PZ-01,TCGA-BF-A3DJ-01,TCGA-BF-A3DM-01,TCGA-BF-A3DN-01,⋯,TCGA-GF-A2C7-01,TCGA-GF-A3OT-06,TCGA-GN-A262-06,TCGA-GN-A263-01,TCGA-GN-A265-06,TCGA-GN-A267-06,TCGA-GN-A268-06,TCGA-GN-A26A-06,TCGA-GN-A26D-06,TCGA-HR-A5NC-01
Unnamed: 0_level_1,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,NOC2L,chr1,887127,-4.652966,-4.458765,-4.796641,-4.729707,-4.926345,-5.091165,-4.961007,⋯,-4.659859,-4.336808,-4.699023,-4.612412,-4.609387,-4.424475,-4.779394,-4.508756,-4.903969,-4.64449
2,KLHL17,chr1,898531,-4.652966,-4.458765,-4.796641,-4.729707,-4.926345,-5.091165,-4.961007,⋯,-4.659859,-4.336808,-4.699023,-4.612412,-4.609387,-4.424475,-4.779394,-4.508756,-4.903969,-4.64449
3,HES4,chr1,934947,-5.114427,-4.63996,-4.516409,-4.806967,-4.845529,-5.16041,-4.507094,⋯,-5.121116,-4.680573,-5.038763,-4.855266,-4.822812,-4.871468,-3.887074,-5.012732,-5.097629,-4.63437


In [7]:
# Column indices of the sample columns, i.e. everything after the three
# annotation columns (geneSymbol, chr, pos). Dropping the first three indices
# (instead of 4:ncol(...)) gives an empty vector rather than c(4, 3) when the
# table has no sample columns.
totalSamples <- seq_len(ncol(fullExprData))[-(1:3)]
nbSamplesTotal <- length(totalSamples)
# Training set size, rounded to a whole number of samples. The argument name is
# written in full (`digits`) rather than relying on partial matching of `dig`.
nbTrainSamples <- round(nbSamplesTotal * trainingSetRatio, digits = 0)
cat( nbSamplesTotal, "samples in total,", nbTrainSamples ,"for training,",
    nbSamplesTotal-nbTrainSamples,"for testing \n")

270 samples in total, 202 for training, 68 for testing 


In [8]:
# Draw one random training/test split of the sample columns and write the
# expression and methylation subsets of both parts to tab-separated files.
#
# Reads the globals dataPath, fullExprData, fullMethData, totalSamples and
# nbTrainSamples.
#
# Args:
#   dataSetNb: identifier of the split; printed to the console and used as the
#              suffix of the four output file names.
#
# Returns:
#   NULL, invisibly. The function is called for its side effects: console
#   output and four files written with the dataPath prefix.
createTraining_and_TestDataSets <- function( dataSetNb ) {
    cat("data set number",dataSetNb,"\n")

    # Sample positions and index into totalSamples. sample(x = totalSamples, ...)
    # would interpret a length-1 totalSamples as 1:x and could then return the
    # annotation columns 1..3. For longer vectors this draws exactly the same
    # values from the random number stream as sample() does.
    trainSamples <- totalSamples[ sample.int( length( totalSamples ), size = nbTrainSamples ) ]
    testSamples  <- setdiff( totalSamples, trainSamples )

    cat("training samples\n", trainSamples ,"\n")
    cat("test samples\n", testSamples  ,"\n")

    # Write the three annotation columns plus the chosen sample columns of one
    # data table to "<dataPath><prefix><dataSetNb>.txt" and report the file name.
    writeSubset <- function( fullData, prefix, sampleCols ) {
        outFile <- paste0( dataPath, prefix, dataSetNb, ".txt" )
        write.table( fullData[ , c( 1:3, sampleCols ) ], file = outFile,
                     row.names = FALSE, col.names = TRUE, quote = FALSE,
                     dec = ".", sep = "\t" )
        cat("saved to ",outFile,"\n")
    }

    # Expression data: training set, then test set.
    writeSubset( fullExprData, "TrainSet_ExpressionData_regNet_Run_", trainSamples )
    writeSubset( fullExprData, "TestSet_ExpressionData_regNet_Run_", testSamples )

    # Methylation data: the same column positions are reused.
    # NOTE(review): this assumes fullMethData has its sample columns in the same
    # order as fullExprData - confirm against the input files.
    writeSubset( fullMethData, "TrainSet_MethylationData_regNet_Run_", trainSamples )
    writeSubset( fullMethData, "TestSet_MethylationData_regNet_Run_", testSamples )

    invisible( NULL )
}

In [9]:
# Generate the independent random training/test splits, numbered 1..nbDataSets.
nbDataSets <- 10
for (i in seq_len(nbDataSets)) {
    createTraining_and_TestDataSets(dataSetNb = i)
}

data set number 1 
training samples
 65 174 7 179 239 121 83 153 14 22 82 173 227 8 183 256 268 36 138 54 70 96 148 167 169 223 51 186 92 131 159 20 180 218 251 97 19 116 21 266 202 38 219 63 238 4 113 123 42 127 216 172 181 185 258 229 189 145 114 164 134 235 222 190 252 101 171 9 104 16 135 112 141 220 208 62 56 77 10 115 237 106 246 43 46 28 163 270 271 188 87 109 199 110 272 13 23 247 129 90 245 249 264 32 191 49 168 241 161 233 232 260 166 149 44 192 85 31 198 84 57 132 107 143 221 255 80 225 37 154 99 40 15 81 152 72 130 74 175 207 133 151 205 5 128 170 248 226 55 25 67 91 209 273 242 122 52 140 146 86 60 18 93 139 102 71 157 47 35 41 267 39 210 156 29 100 78 34 66 117 224 200 269 6 213 262 195 176 79 155 147 17 24 59 236 212 228 124 30 214 206 150 
test samples
 11 12 26 27 33 45 48 50 53 58 61 64 68 69 73 75 76 88 89 94 95 98 103 105 108 111 118 119 120 125 126 136 137 142 144 158 160 162 165 177 178 182 184 187 193 194 196 197 201 203 204 211 215 217 230 231 234 240 243 244 25