In [1]:
import pandas as pd
import numpy as np
from scipy import stats
# Lift pandas' column display limit so wide frames are shown in full
# rather than with the middle columns elided.
pd.set_option('display.max_columns', None)

In [2]:
# Differential expression, tumor vs. normal: for every gene column compute a
# log2 fold change of the group means and a Welch t-test p-value, then save the
# per-gene table to CSV.
df = pd.read_csv('data/finalised_rna.csv')

# Columns that describe the sample rather than a gene; dropped before testing.
METADATA_COLUMNS = ['samples.sample_type', 'ID_short']
# Substituted for a group mean that is exactly 0 so the ratio / log2 stay finite.
ZERO_MEAN_FLOOR = 1e-10

# Split the samples by tissue type, keeping only the remaining (gene) columns.
df_normal = df[df['samples.sample_type'] == 'Solid Tissue Normal'].drop(columns=METADATA_COLUMNS)
df_tumor = df[df['samples.sample_type'] == 'Primary Tumor'].drop(columns=METADATA_COLUMNS)

genes = []
log2_fold_changes = []
p_values = []

gene_names = df_normal.columns

for gene in gene_names:
    normal_expression = df_normal[gene].values
    tumor_expression = df_tumor[gene].values

    # NaN-aware means: missing values are skipped here exactly as the t-test
    # below skips them (nan_policy='omit'). With a plain np.mean, one missing
    # sample made log2FC NaN even though the gene still got a valid p-value.
    mean_normal = np.nanmean(normal_expression)
    mean_tumor = np.nanmean(tumor_expression)

    # A mean of exactly zero is *replaced* (not offset) by a tiny floor, so
    # such genes get a very large-magnitude log2FC instead of +/-inf.
    if mean_normal == 0:
        mean_normal = ZERO_MEAN_FLOOR
    if mean_tumor == 0:
        mean_tumor = ZERO_MEAN_FLOOR

    # NOTE(review): a ratio of means presumes linear-scale, non-negative
    # expression values; if the CSV is already log-transformed this should be
    # a difference of means instead - confirm how the data was normalised.
    log2fc = np.log2(mean_tumor / mean_normal)

    # Welch's t-test (equal_var=False): does not assume equal group variances.
    p_value = stats.ttest_ind(
        tumor_expression,
        normal_expression,
        nan_policy='omit',
        equal_var=False,
    ).pvalue

    genes.append(gene)
    log2_fold_changes.append(log2fc)
    p_values.append(p_value)

# NOTE(review): p_value is the raw per-gene value; no multiple-testing
# correction is applied in this cell.
results_df = pd.DataFrame({
    'gene': genes,
    'log2FC': log2_fold_changes,
    'p_value': p_values
})

results_df.to_csv('data/differential_expression_results.csv', index=False)

results_df.head()

Unnamed: 0,gene,log2FC,p_value
0,TSPAN6,0.572868,0.0001018592
1,TNMD,0.58828,0.2938481
2,DPM1,0.781827,3.382474e-09
3,SCYL3,0.931428,4.074653e-05
4,C1orf112,1.63391,7.917902e-06
