Please write a Python program to convert the cell counts in cell-count.csv to relative frequencies (in percent) of the total cell count for each sample. The total cell count of a sample is the sum of the cells in the five populations of that sample. Please return an output file in CSV format with one line per population per sample, giving the cell count and relative frequency of that population. The output file should have the following columns:

sample: the sample id as in column sample in cell-count.csv
total_count: total cell count of sample
population: name of the immune cell population (e.g. b_cell, cd8_t_cell, etc.)
count: cell count
percentage: relative frequency in percentage

In [None]:
from teiko_tools.analysis import cell_counts_calculator

# Run the packaged calculator on the raw counts file.
# NOTE(review): presumably writes the per-population counts and percentages to
# `file_name_out`; the implementation lives in teiko_tools.analysis and is not
# shown here -- confirm.
cell_counts_calculator(
    file_name="cell-count.csv",
    file_name_out="cell-counts-relative.csv",
)

# Proof of concept

In [4]:
import pandas as pd

# Read the per-sample cell counts into a DataFrame, then preview the first rows.
cell_df = pd.read_csv(filepath_or_buffer="cell-count.csv")
cell_df.head(n=5)

Unnamed: 0,project,subject,condition,age,sex,treatment,response,sample,sample_type,time_from_treatment_start,b_cell,cd8_t_cell,cd4_t_cell,nk_cell,monocyte
0,prj1,sbj1,melanoma,70,F,tr1,y,s1,PBMC,0.0,36000,24000,42000,6000,12000
1,prj1,sbj1,melanoma,70,F,tr1,y,s2,PBMC,7.0,30000,22000,40000,2000,6000
2,prj1,sbj1,melanoma,70,F,tr1,y,s3,PBMC,14.0,35000,26250,37500,10000,16250
3,prj1,sbj2,healthy,65,F,none,,s4,PBMC,,27900,17100,18000,4500,22500
4,prj1,sbj3,melanoma,75,M,tr1,n,s5,PBMC,0.0,60000,30000,37500,4500,18000


In [8]:
# Columns holding the per-population cell counts.
cell_name_columns = ["b_cell", "cd8_t_cell", "cd4_t_cell", "nk_cell", "monocyte"]
# Total cell count per sample: vectorized row-wise sum over the five population
# columns (avoids calling the Python builtin `sum` once per row via `apply`).
# NOTE: `DataFrame.sum` skips NaN by default, so a missing count contributes 0
# to the total rather than making it NaN.
cell_df["total_count"] = cell_df[cell_name_columns].sum(axis=1)
cell_df

Unnamed: 0,project,subject,condition,age,sex,treatment,response,sample,sample_type,time_from_treatment_start,b_cell,cd8_t_cell,cd4_t_cell,nk_cell,monocyte,total_count
0,prj1,sbj1,melanoma,70,F,tr1,y,s1,PBMC,0.0,36000,24000,42000,6000,12000,120000
1,prj1,sbj1,melanoma,70,F,tr1,y,s2,PBMC,7.0,30000,22000,40000,2000,6000,100000
2,prj1,sbj1,melanoma,70,F,tr1,y,s3,PBMC,14.0,35000,26250,37500,10000,16250,125000
3,prj1,sbj2,healthy,65,F,none,,s4,PBMC,,27900,17100,18000,4500,22500,90000
4,prj1,sbj3,melanoma,75,M,tr1,n,s5,PBMC,0.0,60000,30000,37500,4500,18000,150000
5,prj1,sbj3,melanoma,75,M,tr1,n,s6,PBMC,7.0,27500,24200,33000,5500,19800,110000
6,prj1,sbj4,lung,50,F,tr2,y,s7,PBMC,0.0,31350,19000,23750,8550,12350,95000
7,prj1,sbj4,lung,50,F,tr2,y,s8,tumor,0.0,35000,23000,25000,6000,11000,100000
8,prj1,sbj5,healthy,77,M,none,,s9,PBMC,,45500,27300,32500,6500,18200,130000
9,prj2,sbj6,healthy,45,M,none,,s10,PBMC,,28420,21560,21560,8820,17640,98000


In [29]:
# Relative frequency (%) of each population: divide every count column by the
# sample's total (aligned row by row on the index), then scale to percent.
population_pct = cell_df[cell_name_columns].div(cell_df["total_count"], axis=0) * 100
# Prefix the percentages with the identifying columns (sample id and total count).
id_columns = ["sample", "total_count"]
cell_rf_df = pd.concat([cell_df[id_columns], population_pct], axis=1)

# Reshape wide -> long: one row per (sample, population) holding its percentage.
cell_rf_df_long = cell_rf_df.melt(
    id_vars=["sample", "total_count"],
    var_name="population",
    value_name="percentage",
)
cell_rf_df_long

Unnamed: 0,sample,total_count,population,percentage
0,s1,120000,b_cell,30.0
1,s2,100000,b_cell,30.0
2,s3,125000,b_cell,28.0
3,s4,90000,b_cell,31.0
4,s5,150000,b_cell,40.0
...,...,...,...,...
80,s13,115000,monocyte,4.0
81,s14,100000,monocyte,21.0
82,s15,80000,monocyte,15.0
83,s16,100000,monocyte,38.0


In [27]:
# Select the sample id, the total count and the raw per-population counts.
# (Fixed typo: the column is "sample", not "samplxe" -- the misspelled label
# made the column selection below raise KeyError.)
columns_to_grab = ["sample", "total_count", *cell_name_columns]
# Reshape wide -> long: one row per (sample, population) holding its cell count.
cell_df_long = cell_df[columns_to_grab].melt(
    id_vars=["sample", "total_count"],
    var_name="population",
    value_name="count",
)

cell_df_long

Unnamed: 0,sample,total_count,population,count
0,s1,120000,b_cell,36000
1,s2,100000,b_cell,30000
2,s3,125000,b_cell,35000
3,s4,90000,b_cell,27900
4,s5,150000,b_cell,60000
...,...,...,...,...
80,s13,115000,monocyte,4600
81,s14,100000,monocyte,21000
82,s15,80000,monocyte,12000
83,s16,100000,monocyte,38000


In [28]:
# Attach each (sample, population) percentage to its raw count. The left join
# keeps every row of the counts table.
merge_keys = ["sample", "total_count", "population"]
pd.merge(cell_df_long, cell_rf_df_long, how="left", on=merge_keys)

Unnamed: 0,sample,total_count,population,count,percentage
0,s1,120000,b_cell,36000,30.0
1,s2,100000,b_cell,30000,30.0
2,s3,125000,b_cell,35000,28.0
3,s4,90000,b_cell,27900,31.0
4,s5,150000,b_cell,60000,40.0
...,...,...,...,...,...
80,s13,115000,monocyte,4600,4.0
81,s14,100000,monocyte,21000,21.0
82,s15,80000,monocyte,12000,15.0
83,s16,100000,monocyte,38000,38.0


In [1]:
from teiko_tools.analysis import cell_counts_calculator

# Run the packaged calculator on the raw counts file.
# NOTE(review): presumably writes the per-population counts and percentages to
# `file_name_out`; the implementation lives in teiko_tools.analysis and is not
# shown here -- confirm.
cell_counts_calculator(
    file_name="cell-count.csv",
    file_name_out="cell-counts-relative.csv",
)