# WGCNA
Weighted gene co-expression network analysis

following [this tutorial](https://bioinformaticsworkbook.org/dataAnalysis/RNA-Seq/RNA-SeqIntro/wgcna.html)

## 0. load libraries

In [23]:
library(tidyverse)
library(DESeq2)
library(WGCNA)

## 1. read CSVs
set-up is similar to DESeq - read in counts matrix and meta data, then run DESeq

In [13]:
# Load the featureCounts matrix, remove the gene-length column and sample
# B3_Nu_O24, and move the gene IDs into the row names so that the table is
# genes (rows) x samples (columns).
count.matrix <- read.csv(
  "/work/pi_sarah_gignouxwolfsohn_uml_edu/julia_mcdonough_student_uml_edu/ce24_rnaseq/featureCounts/featureCounts_matrix.csv"
) %>%
  select(-c(Length, B3_Nu_O24)) %>%
  column_to_rownames("Gene_ID")

# Quick look at the size of the matrix and the first few genes
dim(count.matrix)
head(count.matrix)

Unnamed: 0_level_0,B1_B1_O01,B1_Nu_O03,B1_W5_O50,B2_B5_O51,B2_C4_O40,B2_Nu_O12,B3_B4_O41,B3_C3_O30,B3_C6_O66,B3_H4_O41,⋯,W5_C4_G45,W5_H4_G46,W5_W2_G22,W6_B3_G35,W6_B4_G48,W6_H6_G71,W6_Nu_G41,W6_Nu_G45,W6_W3_G36,W6_W4_G48
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
LOC111116054,2,1,6,10,9,16,0,10,22,9,⋯,9,2,8,5,2,19,1,2,1,0
LOC111126949,885,652,477,654,586,523,392,357,496,540,⋯,407,740,707,406,418,424,492,330,281,599
LOC111110729,64,209,93,63,100,177,76,98,235,181,⋯,70,149,121,115,126,108,118,115,127,213
LOC111112434,11,7,2,0,2,2,12,15,10,36,⋯,11,6,4,0,2,16,22,0,8,0
LOC111120752,360,586,336,426,351,417,236,278,761,373,⋯,359,345,438,278,287,416,621,251,333,430
LOC111128944,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,1,0,0


### meta data
the meta data csv was already generated in [deseq_p1.v.p1](https://github.com/jgmcdonough/CE24_RNA-seq/blob/main/analysis/diff_expression/phase1_v_phase1/deseq_p1.v.p1.ipynb), so just reading in here

In [17]:
# Read the sample metadata, index it by sample name, and add a combined
# "<phase 1 treatment> <phase 2 treatment>" label. Samples with no Phase 2
# treatment end up with a literal "NA" in the second half of the label.
meta <- read.csv(
  "/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/sample_metaData.csv"
) %>%
  column_to_rownames("Sample") %>%
  mutate(
    complete_trtmt = paste(Phase1_treatment, Phase2_treatment, sep = " ")
  )
head(meta)

Unnamed: 0_level_0,Phase1_treatment,Phase1_temp,Phase1_DO,Phase1_TankRep,Phase2_treatment,Phase2_temp,Phase2_DO,Phase2_TankRep,complete_trtmt
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<int>,<chr>
B1_B1_O01,both,warm,hypoxic,1,both,warm,hypoxic,1.0,both both
B1_Nu_O03,both,warm,hypoxic,1,,,,,both NA
B1_W5_O50,both,warm,hypoxic,1,warm,warm,normoxic,5.0,both warm
B2_B5_O51,both,warm,hypoxic,2,both,warm,hypoxic,5.0,both both
B2_C4_O40,both,warm,hypoxic,2,control,ambient,normoxic,4.0,both control
B2_Nu_O12,both,warm,hypoxic,2,,,,,both NA


In [21]:
# All samples
# Every sample in the count matrix must have a metadata row: indexing a data
# frame by a row name that does not exist silently yields an all-NA row, so
# stop here instead of carrying NA metadata forward.
stopifnot(all(colnames(count.matrix) %in% rownames(meta)))

# Reorder the metadata rows to follow the column (sample) order of the counts.
meta <- meta[colnames(count.matrix), , drop = FALSE]

# Sanity check: should print TRUE. identical() compares the two name vectors
# as a whole, so it cannot pass through recycling or an empty comparison.
identical(rownames(meta), colnames(count.matrix))

In [33]:
# Phase 1 samples
# Keep the samples that have no Phase 2 treatment recorded.
meta.p1 <- meta %>%
  filter(is.na(Phase2_treatment))

# Take the matching count columns. drop = FALSE keeps the result a data frame
# even if only one sample is selected (otherwise it collapses to a bare vector
# and loses its column name).
counts.p1 <- count.matrix[, rownames(meta.p1), drop = FALSE]

# Sanity check: should print TRUE. identical() is used rather than all(a == b)
# because all() over an empty comparison is vacuously TRUE.
identical(rownames(meta.p1), colnames(counts.p1))

In [34]:
# Phase 2 samples
# Keep the samples that have a Phase 2 treatment recorded.
meta.p2 <- meta %>%
  filter(!is.na(Phase2_treatment))

# Take the matching count columns. drop = FALSE keeps the result a data frame
# even if only one sample is selected (otherwise it collapses to a bare vector
# and loses its column name).
counts.p2 <- count.matrix[, rownames(meta.p2), drop = FALSE]

# Sanity check: should print TRUE. identical() is used rather than all(a == b)
# because all() over an empty comparison is vacuously TRUE.
identical(rownames(meta.p2), colnames(counts.p2))

following [this code notebook](https://github.com/SamGurr/Pgenerosa_OA_TagSeq/blob/main/TagSeq/Analysis/Scripts/Day0_WGCNA_all.R) from Gurr et al., 2022

## 2. Phase 1 samples only

### A. create DESeq object

In [36]:
# Build a DESeq2 dataset for the Phase 1 samples. The intercept-only design is
# a placeholder: the object is only needed as input for the transformation.
dds.p1 <- DESeqDataSetFromMatrix(
  countData = counts.p1,
  colData = meta.p1,
  design = ~ 1
)

# Variance-stabilize the counts, extract the transformed matrix from the
# resulting object, and transpose it so that samples are rows and genes are
# columns.
dds.p1_vst <- t(assay(vst(dds.p1)))

dim(dds.p1_vst)

In [38]:
# Flag genes and samples with too many missing values or zero variance, then
# show the overall allOK flag from the result.
gsg <- goodSamplesGenes(dds.p1_vst, verbose = 3)
gsg[["allOK"]]

 Flagging genes and samples with too many missing values...
  ..step 1
  ..Excluding 3857 genes from the calculation due to too many missing samples or zero variance.
  ..step 2
