# WGCNA
Weighted gene co-expression network analysis

# Phase 2 vs. Phase 2

## 0. load libraries

In [40]:
library(tidyverse)
library(WGCNA)
library(janitor) # for row_to_names()

## 1. read and format CSVs

In [36]:
# Load the DESeq2 results table (normalized counts were appended to every
# results file in this set) and keep only the gene IDs and per-sample counts.
deseq_stat_cols <- c(
  "X", "baseMean", "log2FoldChange", "lfcSE",
  "stat", "pvalue", "padj", "svalue"
)

counts <- read.csv('/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/analysis/diff_expression/phase2_v_phase2/deseq_res_files/all_genes/bb_cc.csv') %>%
  # drop the DESeq2 statistics so only `Gene` and the sample columns remain
  select(-all_of(deseq_stat_cols)) %>%
  # move the gene IDs out of the `Gene` column and into the row names
  column_to_rownames(var = "Gene")

head(counts)

Unnamed: 0_level_0,B1_B1_O01,B1_W5_O50,B2_B5_O51,B2_C4_O40,B3_B4_O41,B3_C3_O30,B3_C6_O66,B3_H4_O41,B3_W1_O06,B3_W4_O41,⋯,W4_W5_G56,W5_B2_G21,W5_C4_G45,W5_H4_G46,W5_W2_G22,W6_B3_G35,W6_B4_G48,W6_H6_G71,W6_W3_G36,W6_W4_G48
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
LOC111116054,1.866636,5.925652,9.397297,7.773785,0.0,11.67523,14.345577,9.244277,7.611847,3.933835,⋯,4.308183,2.436697,11.1084,1.994767,7.317628,5.245991,2.287545,20.92789,1.068844,0.0
LOC111126949,825.986574,471.08932,614.583255,506.159806,450.42615,416.80582,323.427565,554.656609,616.559623,439.606109,⋯,229.410737,540.946738,502.34665,738.063823,646.695362,425.974455,478.096882,467.02232,300.345215,632.2158
LOC111110729,59.732362,91.847603,59.202974,86.375394,87.32752,114.41729,153.236851,185.912678,96.7792,101.296262,⋯,128.16844,61.729658,86.39869,148.610148,110.679121,120.657789,144.115328,118.95851,135.743211,224.8113
LOC111112434,10.2665,1.975217,0.0,1.727508,13.78856,17.51285,6.520717,36.977107,1.087407,9.834589,⋯,6.462274,0.0,13.57694,5.984301,3.658814,0.0,2.287545,17.62348,8.550753,0.0
LOC111120752,335.994538,331.836502,400.324872,303.177631,271.17493,324.57148,496.226567,383.123917,201.170247,259.633138,⋯,418.970782,284.281319,443.10184,344.097323,400.640125,291.67709,328.262692,458.21058,355.925113,453.8444
LOC111128944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


For WGCNA, the input must have `rows = samples (treatments)` and `columns = gene IDs`, so the counts table is transposed below.

In [41]:
# Flip the table so that each row is a sample and each column is a gene.
# Transposing a data frame produces a matrix; the conversion is spelled out.
input_data <- t(as.matrix(counts))
head(input_data)

Unnamed: 0,LOC111116054,LOC111126949,LOC111110729,LOC111112434,LOC111120752,LOC111128944,LOC111128953,LOC111105691,LOC111105685,LOC111105702,⋯,CYTB,COX2,ATP6,ND2,ND4,ND5,ND6,ND3,ND1,ND4L
B1_B1_O01,1.866636,825.9866,59.73236,10.2665,335.9945,0,0.0,1.866636,0,0,⋯,502990.4,166591.7,107640.5,95330.98,172154.3,184813.8,39053.77,21292.72,138137.6,9067.186
B1_W5_O50,5.925652,471.0893,91.8476,1.975217,331.8365,0,0.0,7.900869,0,0,⋯,619291.8,197022.0,134921.2,94588.22,127725.4,160054.8,48850.09,31242.01,150746.6,10497.292
B2_B5_O51,9.397297,614.5833,59.20297,0.0,400.3249,0,0.0,102.430542,0,0,⋯,697225.0,250352.5,157051.4,153121.44,225691.1,282620.9,64168.51,50507.65,217653.6,13564.999
B2_C4_O40,7.773785,506.1598,86.37539,1.727508,303.1776,0,0.0,27.640126,0,0,⋯,473989.3,226000.4,156837.8,99498.41,148608.0,148361.8,35647.99,28617.03,130731.7,8412.1
B3_B4_O41,0.0,450.4262,87.32752,13.788556,271.1749,0,0.0,19.533787,0,0,⋯,854928.4,292750.6,190109.7,140856.99,182732.8,255927.1,52981.38,39407.69,213753.6,14533.138
B3_C3_O30,11.675233,416.8058,114.41729,17.51285,324.5715,0,2.335047,32.690653,0,0,⋯,628692.6,238688.5,203626.6,137297.24,159619.1,142621.1,48530.44,35505.55,143244.6,9569.021


## 2. run WGCNA

following this [tutorial](https://bioinformaticsworkbook.org/tutorials/wgcna.html#gsc.tab=0)

In [44]:
# Screen the expression matrix with WGCNA's built-in QC helper before building
# the network; `allOK` is the overall pass/fail flag it reports.
gsg <- goodSamplesGenes(datExpr = input_data, verbose = 3)
gsg[["allOK"]]

 Flagging genes and samples with too many missing values...
  ..step 1


In [43]:
# Drop the samples and genes that goodSamplesGenes() flagged, using the
# logical masks it returned in `gsg`.
#
# The masks must describe the current matrix. Notebook cells can be run out of
# order, and a stale `gsg` of the wrong length would either be silently
# recycled by the subsetting or fail with an unhelpful subscript error.
stopifnot(
  length(gsg$goodSamples) == nrow(input_data),
  length(gsg$goodGenes) == ncol(input_data)
)

# drop = FALSE keeps a matrix even if only one sample or one gene survives
input_data <- input_data[gsg$goodSamples, gsg$goodGenes, drop = FALSE]

# Re-run the check on the filtered matrix to confirm nothing is still flagged
gsg2 <- goodSamplesGenes(input_data, verbose = 3)
gsg2$allOK


 Flagging genes and samples with too many missing values...
  ..step 1


In [28]:
# Let WGCNA run its calculations on multiple threads. No thread count is
# passed, so the number of threads is left for WGCNA to decide.
allowWGCNAThreads() # allow multi-threading

Allowing multi-threading with up to 24 threads.


In [29]:
# Candidate soft-thresholding powers to evaluate: every integer from 1 to 10,
# then the even values from 12 to 20.
powers <- c(seq_len(10), seq(12, 20, by = 2))

In [None]:
# Network topology analysis: for each candidate power, WGCNA evaluates the
# scale-free topology fit and the connectivity of the resulting network.
# The results table is read back from `sft$fitIndices`.
sft <- pickSoftThreshold(
  data = input_data,
  powerVector = powers,
  # blockSize = 30,  # optional override, currently disabled
  verbose = 5
)

pickSoftThreshold: will use block size 1263.
 pickSoftThreshold: calculating connectivity for given powers...
   ..working on genes 1 through 1263 of 35412
   ..working on genes 1264 through 2526 of 35412
   ..working on genes 2527 through 3789 of 35412
   ..working on genes 3790 through 5052 of 35412
   ..working on genes 5053 through 6315 of 35412
   ..working on genes 6316 through 7578 of 35412
   ..working on genes 7579 through 8841 of 35412
   ..working on genes 8842 through 10104 of 35412
   ..working on genes 10105 through 11367 of 35412
   ..working on genes 11368 through 12630 of 35412
   ..working on genes 12631 through 13893 of 35412
   ..working on genes 13894 through 15156 of 35412
   ..working on genes 15157 through 16419 of 35412


In [None]:
# Diagnostic plots for choosing the soft-thresholding power, drawn side by
# side from the table returned by pickSoftThreshold() (`sft$fitIndices`).
# Columns are taken by position: 1 = power, 2 = R^2 of the scale-free fit,
# 3 = supplies the sign of the fit (the slope, per the WGCNA documentation),
# 5 = mean connectivity.
par(mfrow = c(1, 2))
cex1 <- 0.9

# Take the powers from the results table itself so the labels always match
# the rows being plotted, even if `powers` was changed after `sft` was made.
fit_powers <- sft$fitIndices[, 1]
signed_r2 <- -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2]
mean_connectivity <- sft$fitIndices[, 5]

# Panel 1: signed R^2 of the scale-free topology fit for each power.
# type = "n" sets up empty axes so that only the power labels are drawn,
# with no point symbols hidden underneath them (same as panel 2).
plot(fit_powers,
     signed_r2,
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit, signed R^2",
     type = "n",
     main = "Scale independence"
)
text(fit_powers,
     signed_r2,
     labels = fit_powers, cex = cex1, col = "red"
)
# reference line at R^2 = 0.90 for judging the fit
abline(h = 0.90, col = "red")

# Panel 2: mean connectivity of the network for each power.
plot(fit_powers,
     mean_connectivity,
     xlab = "Soft Threshold (power)",
     ylab = "Mean Connectivity",
     type = "n",
     main = "Mean connectivity"
)
text(fit_powers,
     mean_connectivity,
     labels = fit_powers,
     cex = cex1, col = "red")