In [18]:
import pandas as pd
import os

# --- Load the expression matrix -------------------------------------------
# Path to the gzipped, tab-separated series-matrix file (relative to the
# notebook's working directory).
file_path = '../data/raw/GSE9476_series_matrix.txt.gz'

print("Loading data... this might take a second.")
df = pd.read_csv(
    file_path,
    sep='\t',       # fields are tab-delimited
    comment='!',    # lines starting with '!' are metadata and are skipped
    index_col=0,    # first column becomes the row index
)

# Report the shape, then preview the first rows with the rich HTML renderer.
print(
    "Data Loaded Successfully!",
    f"Dimensions: {df.shape}",
    "\nFirst 5 rows:",
    sep="\n",
)
display(df.head())


import gzip

# 1. Capture the per-sample source names from the file header.
# The wanted metadata line begins with this key and is followed by one
# tab-separated, double-quoted entry per sample.
label_key = "!Sample_source_name_ch1"

labels = []
# Reuse `file_path` so the matrix and its labels always come from the same file.
with gzip.open(file_path, 'rt') as f:
    for line in f:
        # Match on the line prefix so a line that merely mentions the key
        # somewhere in its text cannot be picked up by accident.
        if line.startswith(label_key):
            # Field 0 is the key itself; the remaining fields are the samples.
            labels = line.strip().split('\t')[1:]
            break

# Fail early: without this line there is nothing to derive a diagnosis from.
if not labels:
    raise ValueError(f"No '{label_key}' line found in {file_path}")

# 2. Clean up the text (remove quotes) and map each sample to a diagnosis.
source_names = [label.replace('"', '') for label in labels]

# Show the raw categories so the mapping below can be checked by eye.
print("Distinct source names:", sorted(set(source_names)))

# Rule: a source name mentioning CD34 -> 'Healthy', anything else -> 'AML'.
# NOTE(review): this labels every non-CD34 sample as AML. If the series also
# contains healthy samples that were not CD34-selected (e.g. unselected bone
# marrow or peripheral blood), they would be mislabelled -- confirm against
# the distinct source names printed above before trusting the groups.
clean_labels = ['Healthy' if 'CD34' in name else 'AML' for name in source_names]

# 3. Safety check: exactly one label per sample column is required.
print(f"Count of Labels found: {len(clean_labels)}")
print(f"Count of Columns in df: {len(df.columns)}")

if len(clean_labels) != len(df.columns):
    print("ERROR: Mismatch. Don't proceed.")
    # Stop here instead of failing later with a confusing KeyError on
    # 'diagnosis' in the groupby below.
    raise ValueError(
        f"Found {len(clean_labels)} labels for {len(df.columns)} sample columns"
    )

print("MATCH! Attaching labels to data...")

# Transpose the numeric matrix first and only then add the labels as a column.
# Adding the string labels as a *row* before transposing would turn every
# gene column of the transposed frame into object dtype, so the per-group
# mean would run on Python objects instead of floats.
# Labels are attached by position: this assumes the header entries are in the
# same order as the sample columns of df -- TODO confirm.
df_t = df.T.assign(diagnosis=clean_labels)

# Kept so that any later cell reading the 'diagnosis' row of df still works.
# NOTE(review): this makes df mixed-type (object dtype); drop this line once
# nothing depends on df.loc['diagnosis'].
df.loc['diagnosis'] = clean_labels

# Compare shapes before/after the flip (df now includes the label row).
print(f"Old Shape: {df.shape} (Genes x Patients)")
print(f"New Shape: {df_t.shape} (Patients x Genes)")

# Preview the patient-by-gene table.
display(df_t.head())


# Mean expression of every probe within each diagnosis group.
df_t.groupby('diagnosis').mean()

Loading data... this might take a second.
Data Loaded Successfully!
Dimensions: (22283, 64)

First 5 rows:


Unnamed: 0_level_0,GSM239170,GSM239323,GSM239324,GSM239326,GSM239328,GSM239329,GSM239331,GSM239332,GSM239333,GSM239334,...,GSM240500,GSM240501,GSM240502,GSM240503,GSM240504,GSM240505,GSM240506,GSM240507,GSM240508,GSM240509
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1007_s_at,3.016704,3.285669,2.929483,2.92282,3.159503,3.163327,2.985901,3.122709,3.070948,3.078003,...,3.134267,3.20705,2.971385,2.843774,3.037604,3.048857,3.113882,3.033006,3.212747,3.448303
1053_at,7.977735,6.532514,6.388007,6.46668,6.432795,6.407322,6.426471,6.376394,6.46907,6.621342,...,6.550238,6.916009,7.15575,6.47164,6.508025,6.631739,6.52861,6.684813,6.556031,6.341901
117_at,4.207281,4.994966,4.401597,4.747115,4.830046,4.213762,4.884418,4.431888,4.849665,4.432967,...,4.518836,4.193237,4.061884,4.175654,4.115092,5.090686,4.456707,3.923634,4.258683,4.327858
121_at,7.256095,7.420807,6.99934,7.094489,7.024333,7.17929,7.159899,7.009978,6.830979,7.208625,...,7.550825,7.289547,6.936977,7.136469,7.134417,7.281586,7.109759,7.110187,7.094449,7.096146
1255_g_at,2.204955,2.331625,2.133305,2.183329,2.127783,2.269698,2.264706,2.25394,2.287924,2.23319,...,2.286511,2.24272,2.199011,2.099221,2.214287,2.198988,2.162079,2.223595,2.256448,2.240602


Count of Labels found: 64
Count of Columns in df: 64
MATCH! Attaching labels to data...
Old Shape: (22284, 64) (Genes x Patients)
New Shape: (64, 22284) (Patients x Genes)


ID_REF,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Hs28SrRNA-M_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,diagnosis
GSM239170,3.016704,7.977735,4.207281,7.256095,2.204955,7.284373,4.265793,2.694352,3.646303,3.040292,...,5.662607,14.228826,13.713684,2.308299,3.315925,2.931302,2.921329,2.310954,2.979276,Healthy
GSM239323,3.285669,6.532514,4.994966,7.420807,2.331625,6.983594,4.970684,2.916325,8.817892,3.384867,...,6.186479,15.042417,14.486783,2.388966,3.40424,3.057272,3.216571,2.408195,3.111186,AML
GSM239324,2.929483,6.388007,4.401597,6.99934,2.133305,6.863371,4.595545,2.57813,11.424879,2.958851,...,5.821644,13.984025,13.287354,2.273666,3.193683,2.819624,2.770506,2.266408,2.856575,AML
GSM239326,2.92282,6.46668,4.747115,7.094489,2.183329,6.865971,4.575545,2.659718,10.747381,3.047881,...,6.145527,14.128244,13.76009,2.280654,3.269302,2.860894,2.870072,2.29356,2.95689,AML
GSM239328,3.159503,6.432795,4.830046,7.024333,2.127783,7.219841,4.547958,2.567437,9.296043,2.878563,...,6.359857,14.182431,13.60005,2.23884,3.232732,2.841193,2.801787,2.259069,2.902642,AML


ID_REF,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Hs28SrRNA-5_at,AFFX-r2-Hs28SrRNA-M_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AML,3.146379,6.422993,6.03117,7.272644,2.252866,7.696075,4.730186,2.793132,8.264505,3.205697,...,6.132754,5.67505,14.539103,13.88757,2.360993,3.357989,3.004821,3.040991,2.363386,3.059377
Healthy,3.132631,7.054754,5.987412,7.325234,2.243145,7.989431,4.579823,2.77677,4.201097,3.119243,...,6.045309,5.544426,14.311515,13.706479,2.338003,3.333773,2.957392,2.98761,2.340729,3.030603
