# Calculating FPKM and TPM

- FPKM = fragments per kilobase of transcript per million mapped fragments
- TPM = transcripts per million

In [2]:
# load libraries
library(tidyverse)

In [4]:
# Read the featureCounts output (one row per gene: Gene_ID, Length, then one
# count column per sample) and preview the first rows.
counts_matrix <- read.csv(
  file = '/work/pi_sarah_gignouxwolfsohn_uml_edu/julia_mcdonough_student_uml_edu/ce24_rnaseq/featureCounts/featureCounts_matrix.csv'
)
head(counts_matrix)

Unnamed: 0_level_0,Gene_ID,Length,B1_B1_O01,B1_Nu_O03,B1_W5_O50,B2_B5_O51,B2_C4_O40,B2_Nu_O12,B3_B4_O41,B3_C3_O30,⋯,W5_C4_G45,W5_H4_G46,W5_W2_G22,W6_B3_G35,W6_B4_G48,W6_H6_G71,W6_Nu_G41,W6_Nu_G45,W6_W3_G36,W6_W4_G48
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,LOC111116054,1017,2,1,6,10,9,16,0,10,⋯,9,2,8,5,2,19,1,2,1,0
2,LOC111126949,4364,885,652,477,654,586,523,392,357,⋯,407,740,707,406,418,424,492,330,281,599
3,LOC111110729,23787,64,209,93,63,100,177,76,98,⋯,70,149,121,115,126,108,118,115,127,213
4,LOC111112434,9649,11,7,2,0,2,2,12,15,⋯,11,6,4,0,2,16,22,0,8,0
5,LOC111120752,6621,360,586,336,426,351,417,236,278,⋯,359,345,438,278,287,416,621,251,333,430
6,LOC111128944,1773,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,1,0,0


## FPKM

FPKM = counts / (gene length in kb * library size in millions)


In [5]:
## calculating library size (total number of assigned reads per sample)

# exclude the Gene_ID and Length columns (columns 1 and 2);
# drop = FALSE keeps a data frame even when only one sample column remains,
# otherwise the subset collapses to a vector and colSums() errors
lib.size <- colSums(counts_matrix[, -c(1, 2), drop = FALSE])
head(lib.size)

In [8]:
# FPKM for every sample column:
#   counts / (gene length in kb * library size in millions)
# Gene_ID and Length are carried over unchanged from counts_matrix.
fpkm_matrix <- counts_matrix

# convert gene length to kilobases
gene_length_kb <- counts_matrix$Length / 1000

# Sample columns are everything except the two annotation columns
sample_cols <- setdiff(names(counts_matrix), c("Gene_ID", "Length"))

# Library sizes are looked up by sample name rather than by a positional
# offset, so a reordered or subset lib.size cannot be paired with the wrong
# column; fail loudly if any sample has no library size.
stopifnot(all(sample_cols %in% names(lib.size)))

# Normalise each sample column with its own library size
fpkm_matrix[sample_cols] <- Map(
  function(sample_counts, sample_lib_size) {
    sample_counts / (gene_length_kb * (sample_lib_size / 1e6))
  },
  counts_matrix[sample_cols],
  lib.size[sample_cols]
)

head(fpkm_matrix)

Unnamed: 0_level_0,Gene_ID,Length,B1_B1_O01,B1_Nu_O03,B1_W5_O50,B2_B5_O51,B2_C4_O40,B2_Nu_O12,B3_B4_O41,B3_C3_O30,⋯,W5_C4_G45,W5_H4_G46,W5_W2_G22,W6_B3_G35,W6_B4_G48,W6_H6_G71,W6_Nu_G41,W6_Nu_G45,W6_W3_G36,W6_W4_G48
Unnamed: 0_level_1,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,LOC111116054,1017,0.04514588,0.01608961,0.141489824,0.21138892,0.194334314,0.250570576,0.0,0.2530045,⋯,0.24951959,0.04772322,0.184455542,0.1251282,0.047510771,0.47684327,0.01785379,0.04641034,0.02626848,0.0
2,LOC111126949,4364,4.6555141,2.44472102,2.621371336,3.22178222,2.94876939,1.908745336,2.32656717,2.10490587,⋯,2.62961863,4.11497959,3.798895498,2.3678123,2.314059771,2.47984263,2.04706523,1.78457552,1.72019401,3.2980247
3,LOC111110729,23787,0.06176599,0.14377159,0.093764445,0.05693824,0.092318399,0.11851263,0.08275385,0.10600726,⋯,0.08297389,0.1520083,0.119280204,0.123045,0.12797169,0.11588496,0.0900728,0.11409429,0.14263296,0.2151552
4,LOC111112434,9649,0.02617095,0.01187086,0.004970987,0.0,0.004551721,0.003301252,0.03221167,0.03999983,⋯,0.03214352,0.01509001,0.009720763,0.0,0.005007613,0.04232341,0.04139918,0.0,0.02214948,0.0
5,LOC111120752,6621,1.24821107,1.44823981,1.217056103,1.38321339,1.164156759,1.003098334,0.92321465,1.08036477,⋯,1.52881157,1.26449238,1.551219293,1.0686296,1.047228231,1.60366167,1.70301838,0.89465556,1.34362012,1.5604746
6,LOC111128944,1773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01331058,0.0,0.0


## TPM

TPM = (gene RPK / sum of all gene RPKs in the sample) * 1e6, where RPK = counts / gene length in kb


In [5]:
# Keep gene annotation and per-sample counts in separate data frames
gene_info <- select(counts_matrix, Gene_ID, Length)
count_info <- select(counts_matrix, -c(Gene_ID, Length))

# RPK (reads per kilobase): counts divided by gene length in kilobases
rpk <- count_info / (gene_info$Length / 1000)

# Per-sample RPK totals, used as the denominator for TPM
scaling_factors <- colSums(rpk)

# TPM: divide each sample column by its own RPK total, then scale to 1e6
tpm <- rpk
tpm[] <- Map(
  function(sample_rpk, sample_total) sample_rpk / sample_total * 1e6,
  rpk,
  scaling_factors
)

# Put the gene identifiers back in front of the TPM columns
tpm_df <- data.frame(Gene_ID = gene_info$Gene_ID, tpm, check.names = FALSE)

# Inspect
head(tpm_df)


Unnamed: 0_level_0,Gene_ID,B1_B1_O01,B1_Nu_O03,B1_W5_O50,B2_B5_O51,B2_C4_O40,B2_Nu_O12,B3_B4_O41,B3_C3_O30,B3_C6_O66,⋯,W5_C4_G45,W5_H4_G46,W5_W2_G22,W6_B3_G35,W6_B4_G48,W6_H6_G71,W6_Nu_G41,W6_Nu_G45,W6_W3_G36,W6_W4_G48
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,LOC111116054,0.1994808,0.07236005,0.60180479,0.8321459,0.83175842,1.00642945,0.0,1.0119636,1.34953096,⋯,0.9674306,0.20086721,0.80396624,0.5159785,0.17719976,1.8792749,0.07494578,0.20836377,0.10563877,0.0
2,LOC111126949,20.5707747,10.99468203,11.14959211,12.68275,12.6208476,7.66657261,9.119374,8.4191708,7.09051954,⋯,10.1954859,17.31996425,16.55783109,9.7639114,8.63069239,9.7732447,8.59307311,8.01202631,6.91776623,13.0038146
3,LOC111110729,0.2729182,0.6465862,0.39881237,0.224141,0.39512633,0.476012,0.3243677,0.4240062,0.61632433,⋯,0.3217041,0.6398035,0.5198936,0.5073886,0.4772929,0.4567113,0.37810331,0.51223746,0.57359896,0.8483374
4,LOC111112434,0.1156385,0.053387,0.02114331,0.0,0.01948154,0.01325965,0.1262591,0.1599907,0.06465451,⋯,0.124626,0.06351393,0.04236883,0.0,0.01867677,0.1667997,0.17378349,0.0,0.08907421,0.0
5,LOC111120752,5.515324,6.51319151,5.17655738,5.4451072,4.98263618,4.02899542,3.6186961,4.3212267,7.17038033,⋯,5.9274667,5.32225306,6.76113019,4.4066012,3.90582164,6.3201502,7.14884956,4.01664364,5.40337301,6.1528108
6,LOC111128944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05975915,0.0,0.0


In [7]:
# Save the TPM table (Gene_ID plus one column per sample) without row names
write.csv(
  x = tpm_df,
  file = '/project/pi_sarah_gignouxwolfsohn_uml_edu/julia/CE_2024/CE24_RNA-seq/analysis/diff_expression/tpm.csv',
  row.names = FALSE
)