In [None]:
!pip install scikit-learn pandas numpy statsmodels

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

In [None]:
# Load the dataset
data = pd.read_excel('lung_expression_data.xlsx')

# Remove rows without a usable gene name.
# pandas parses empty cells and the literal text 'NULL' as NaN by default, and
# NaN != 'NULL' evaluates to True, so comparing against the strings alone would
# keep exactly the rows we want to drop. Filter on missing values first, then on
# the literal placeholders in case they survived parsing.
gene_names = data['GENE']
has_gene_name = gene_names.notna() & ~gene_names.astype(str).str.strip().isin(['', 'NULL'])
# .copy() yields an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
data = data[has_gene_name].copy()

# Split columns into cancerous and normal by column-name prefix.
# str() guards against non-string column labels.
# NOTE(review): the 'L' prefix matches every column starting with "L" — confirm
# no non-sample column can match.
cancerous_cols = [col for col in data.columns if str(col).startswith('AD')]
normal_cols = [col for col in data.columns if str(col).startswith('L')]

In [None]:
# Perform a two-sample t-test for each gene (one row per gene).
# Coerce the expression columns to numeric; cells that cannot be parsed become
# NaN and are ignored per gene through nan_policy='omit'.
cancerous_values = data[cancerous_cols].apply(pd.to_numeric, errors='coerce')
normal_values = data[normal_cols].apply(pd.to_numeric, errors='coerce')

# Test every gene in a single call along axis=1 instead of looping over rows
# in Python.
test_result = ttest_ind(
    cancerous_values.to_numpy(dtype=float),
    normal_values.to_numpy(dtype=float),
    axis=1,
    nan_policy='omit',
)

# Add p_values to data.
# np.ma.filled normalises the result to a plain ndarray (some SciPy versions
# return a masked array when nan_policy='omit'); assign() builds a new frame
# rather than writing into a filtered slice, so re-running the cell is safe.
data = data.assign(p_value=np.ma.filled(test_result.pvalue, np.nan))

# Select genes with p-value < 0.05
# NOTE(review): these are raw per-gene p-values; no multiple-testing correction
# is applied here — confirm that is intended.
selected_genes = data[data['p_value'] < 0.05]

# Last expression of the cell: rendered with the notebook's rich table display.
selected_genes

         GENE          PROBESET Result    AD10     AD2     AD3     AD5  \
8        VRK2       AB000450_at    YES   200.9   151.5   207.6   151.5   
34     STXBP2       AB002559_at    YES   287.1   315.0   320.9   173.0   
37      PSMD9       AB003177_at    YES   580.5   832.0   499.6   477.6   
42     LGALS4     AB006781_s_at    YES   105.0    91.3   541.0   124.9   
52      TONDU  AC000115_cds1_at     NO     7.5   257.1     9.0     7.8   
...       ...               ...    ...     ...     ...     ...     ...   
7029    H3F3B         Z48950_at    YES  5055.9  7499.8  6151.2  7697.4   
7055    IDH3G    Z68129_cds1_at    YES   328.5   433.0   399.5   403.2   
7061  IMOGN38         Z68747_at    YES    84.4    99.1   108.8    90.2   
7092    PTPRR       Z79693_s_at    YES    99.0   102.4    50.6    54.9   
7093    ACADS  Z80345_rna1_s_at     NO   -69.6  -101.9   -24.3    24.8   

          AD6     AD7     AD8  ...    LN66    LN67    LN69    LN70    LN71  \
8       145.9   149.2   238.8  ..

In [None]:
# Write the selected (significant) genes out as an Excel workbook
selected_genes.to_excel(excel_writer="selected_genes.xlsx")