In [14]:
import os
import pandas as pd
import gzip
import re

# list extracted files
data_dir = "GSE253975_data"
print(os.listdir(data_dir))

# show the only txt files
txt_files = [f for f in os.listdir(data_dir) if f.endswith(".txt.gz")]
for file in txt_files:
    print(file)


['GSM8031360_T4515.txt.gz', 'GSM8031363_T2791.txt.gz', 'GSM8031360_V11U14-043-A1.json.gz', 'GSM8031364_V11U14-040-C1.json.gz', 'GSM8031361_V11U14-042-A1.json.gz', 'GSM8031361_T0081.txt.gz', 'GSM8031364_T5498.txt.gz', 'GSM8031362_V11U14-042-D1.json.gz', 'GSM8031365_V11U14-044-A1.json.gz', 'GSM8031362_T3870.txt.gz', 'GSM8031363_V11U14-042-B1.json.gz', 'GSM8031366_T5359.txt.gz', 'GSM8031365_T4839.txt.gz']
GSM8031360_T4515.txt.gz
GSM8031363_T2791.txt.gz
GSM8031361_T0081.txt.gz
GSM8031364_T5498.txt.gz
GSM8031362_T3870.txt.gz
GSM8031366_T5359.txt.gz
GSM8031365_T4839.txt.gz


In [None]:
# Load One TXT File — Gene Expression Counts Per Spot

file_path = f"{data_dir}/GSM8031360_T4515.txt.gz" 
df = pd.read_csv(file_path, 
                 sep=r"\s+",    # Seperate on any whitespace
                 quotechar='"', # Handle text inside of quotes
                 index_col=0    # Use the first columns (gene names) as row labels
                ) 
print(df.shape, "\n")

# show the first 10 rowsh
with gzip.open(file_path, "rt") as f:
    for i in range(10): 
        print(f.readline())

(36601, 1870) 

"T4515_AAACAGAGCGACTCCT.1" "T4515_AAACAGCTTTCAGAAG.1" "T4515_AAACCGGGTAGGTACC.1" "T4515_AAACCTCATGAAGTTG.1" "T4515_AAACGAGACGGTTGAT.1" "T4515_AAACTGCTGGCTCCAA.1" "T4515_AAACTTGCAAACGTAT.1" "T4515_AAAGACTGGGCGCTTT.1" "T4515_AAAGGGATGTAGCAAG.1" "T4515_AAAGGGCAGCTTGAAT.1" "T4515_AAAGTCGACCCTCAGT.1" "T4515_AAATAACCATACGGGA.1" "T4515_AAATACCTATAAGCAT.1" "T4515_AAATCGTGTACCACAA.1" "T4515_AAATGATTCGATCAGC.1" "T4515_AAATGCTCGTTACGTT.1" "T4515_AAATGGCATGTCTTGT.1" "T4515_AAATGGTCAATGTGCC.1" "T4515_AAATTAACGGGTAGCT.1" "T4515_AAATTAATAAGCGCGA.1" "T4515_AAATTACACGACTCTG.1" "T4515_AAATTACCTATCGATG.1" "T4515_AACAACTGGTAGTTGC.1" "T4515_AACAATTACTCTACGC.1" "T4515_AACAGGAAATCGAATA.1" "T4515_AACAGGATGGGCCGCG.1" "T4515_AACATATCAACTGGTG.1" "T4515_AACATCGATACGTCTA.1" "T4515_AACCAAGACTTCTCTG.1" "T4515_AACCATGGGATCGCTA.1" "T4515_AACCCAGAGACGGAGA.1" "T4515_AACCGAGCTTGGTCAT.1" "T4515_AACCGTTGTGTTTGCT.1" "T4515_AACCTTTAAATACGGT.1" "T4515_AACCTTTACGACGTCT.1" "T4515_AACGATAATGCCGTAG.1" "T4515_AACGA

In [None]:
# Load All The Text Files

txt_files = [f for f in os.listdir(data_dir) if f.endswith(".txt.gz")]

# load each file and store in a dictionary
dfs = {}
for file in txt_files:
    path = os.path.join(data_dir, file)
    
    # read gene expression matrix: genes as rows, spots as columns
    try:
        df = pd.read_csv(
            path, 
            sep=r"\s+"      # split on any whitespace
            # index_col=0      # use the first column (gene names) as row labels
        )

        # give each column a unique name based on its sample
        sample_id = file.split("_")[1].split(".")[0]
        df.columns = [f"{sample_id}_{col}" for col in df.columns]

        # store in dict for later merging
        dfs[file] = df
        print(f"Loaded {file} with shape {df.shape}")
    except Exception as e:
        print(f"Failed to load {file}: {e}")

# merge all samples side-by-side (same genes, different spots)
merged_df = pd.concat(dfs.values(), axis=1)

print("\nFinal merged shape:", merged_df.shape)
merged_df.head()

Loaded GSM8031360_T4515.txt.gz with shape (36601, 1870)
Loaded GSM8031363_T2791.txt.gz with shape (36601, 866)
Loaded GSM8031361_T0081.txt.gz with shape (36601, 562)
Loaded GSM8031364_T5498.txt.gz with shape (36601, 1768)
Loaded GSM8031362_T3870.txt.gz with shape (36601, 1000)
Failed to load GSM8031366_T5359.txt.gz: Compressed file ended before the end-of-stream marker was reached
Loaded GSM8031365_T4839.txt.gz with shape (36601, 799)

Final merged shape: (36601, 6865)


Unnamed: 0,T4515_T4515_AAACAGAGCGACTCCT.1,T4515_T4515_AAACAGCTTTCAGAAG.1,T4515_T4515_AAACCGGGTAGGTACC.1,T4515_T4515_AAACCTCATGAAGTTG.1,T4515_T4515_AAACGAGACGGTTGAT.1,T4515_T4515_AAACTGCTGGCTCCAA.1,T4515_T4515_AAACTTGCAAACGTAT.1,T4515_T4515_AAAGACTGGGCGCTTT.1,T4515_T4515_AAAGGGATGTAGCAAG.1,T4515_T4515_AAAGGGCAGCTTGAAT.1,...,T4839_T4839_TTGCGTCGGCCAACCG.1,T4839_T4839_TTGGAAGAATACAGTC.1,T4839_T4839_TTGGACCATCTGGCAA.1,T4839_T4839_TTGGATATCGTCTACG.1,T4839_T4839_TTGGTCACACTCGTAA.1,T4839_T4839_TTGTAAGGACCTAAGT.1,T4839_T4839_TTGTGAACCTAATCCG.1,T4839_T4839_TTGTGGTAGGAGGGAT.1,T4839_T4839_TTGTGTTTCCCGAAAG.1,T4839_T4839_TTGTTTCCATACAACT.1
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM138A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR4F5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL627309.3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Normilization 

In [18]:
# Load One JSON File — Spatial Coordinates and Metadata Per Spot

import json

file_path = f"{data_dir}/GSM8031360_V11U14-043-A1.json.gz" 

with gzip.open(file_path, "rt") as f:
    data = json.load(f)

# peek at the structure
print(type(data))
print(list(data.keys())[:10])  # top-level keys

print("\n", len(data["oligo"]))
print(data["oligo"][0])

<class 'dict'>
['fiducial', 'oligo', 'transform', 'serialNumber', 'area', 'checksum', 'removeImagePages']

 4992
{'x': 4825, 'y': 30073, 'row': 0, 'col': 0, 'dia': 90.56611, 'imageX': 10188.459, 'imageY': 9735.269}
