In [1]:
# Imports

import os
import re
import gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# List Files

data_dir = "GSE253975_data"
print(os.listdir(data_dir))

# show the only txt files
txt_files = [f for f in os.listdir(data_dir) if f.endswith(".gz")]
for file in txt_files:
    print(file)

['GSM8031360_T4515.txt.gz', 'GSM8031363_T2791.txt.gz', 'GSM8031360_V11U14-043-A1.json.gz', 'GSM8031364_V11U14-040-C1.json.gz', 'GSM8031361_V11U14-042-A1.json.gz', 'GSM8031361_T0081.txt.gz', 'GSM8031364_T5498.txt.gz', 'GSM8031362_V11U14-042-D1.json.gz', 'GSM8031365_V11U14-044-A1.json.gz', 'GSM8031362_T3870.txt.gz', 'GSM8031363_V11U14-042-B1.json.gz', 'GSM8031366_T5359.txt.gz', 'GSM8031365_T4839.txt.gz']
GSM8031360_T4515.txt.gz
GSM8031363_T2791.txt.gz
GSM8031361_T0081.txt.gz
GSM8031364_T5498.txt.gz
GSM8031362_T3870.txt.gz
GSM8031366_T5359.txt.gz
GSM8031365_T4839.txt.gz


In [3]:
# Load One TXT File â€” Gene Expression Counts Per Spot

file_path = f"{data_dir}/GSM8031360_T4515.txt.gz" 
single_df = pd.read_csv(file_path, 
                 sep=r"\s+",    # Seperate on any whitespace
                 quotechar='"', # Handle text inside of quotes
                 index_col=0    # Use the first columns (gene names) as row labels
                ) 
print(single_df.shape, "\n")

# show the first 10 rows
with gzip.open(file_path, "rt") as f:
    for i in range(10): 
        print(f.readline())

(36601, 1870) 

"T4515_AAACAGAGCGACTCCT.1" "T4515_AAACAGCTTTCAGAAG.1" "T4515_AAACCGGGTAGGTACC.1" "T4515_AAACCTCATGAAGTTG.1" "T4515_AAACGAGACGGTTGAT.1" "T4515_AAACTGCTGGCTCCAA.1" "T4515_AAACTTGCAAACGTAT.1" "T4515_AAAGACTGGGCGCTTT.1" "T4515_AAAGGGATGTAGCAAG.1" "T4515_AAAGGGCAGCTTGAAT.1" "T4515_AAAGTCGACCCTCAGT.1" "T4515_AAATAACCATACGGGA.1" "T4515_AAATACCTATAAGCAT.1" "T4515_AAATCGTGTACCACAA.1" "T4515_AAATGATTCGATCAGC.1" "T4515_AAATGCTCGTTACGTT.1" "T4515_AAATGGCATGTCTTGT.1" "T4515_AAATGGTCAATGTGCC.1" "T4515_AAATTAACGGGTAGCT.1" "T4515_AAATTAATAAGCGCGA.1" "T4515_AAATTACACGACTCTG.1" "T4515_AAATTACCTATCGATG.1" "T4515_AACAACTGGTAGTTGC.1" "T4515_AACAATTACTCTACGC.1" "T4515_AACAGGAAATCGAATA.1" "T4515_AACAGGATGGGCCGCG.1" "T4515_AACATATCAACTGGTG.1" "T4515_AACATCGATACGTCTA.1" "T4515_AACCAAGACTTCTCTG.1" "T4515_AACCATGGGATCGCTA.1" "T4515_AACCCAGAGACGGAGA.1" "T4515_AACCGAGCTTGGTCAT.1" "T4515_AACCGTTGTGTTTGCT.1" "T4515_AACCTTTAAATACGGT.1" "T4515_AACCTTTACGACGTCT.1" "T4515_AACGATAATGCCGTAG.1" "T4515_AACGA

In [4]:
# Load One JSON File â€” Spatial Coordinates and Metadata Per Spot

import json
from pprint import pprint

file_path = f"{data_dir}/GSM8031360_V11U14-043-A1.json.gz" 

with gzip.open(file_path, "rt") as f:
    data = json.load(f)

# peek at the structure
print(type(data))
print(list(data.keys())[:10])  # top-level keys

print("\n", len(data["oligo"]))
print(data["oligo"][0])

pprint(data)  # prints neatly formatted structure


<class 'dict'>
['fiducial', 'oligo', 'transform', 'serialNumber', 'area', 'checksum', 'removeImagePages']

 4992
{'x': 4825, 'y': 30073, 'row': 0, 'col': 0, 'dia': 90.56611, 'imageX': 10188.459, 'imageY': 9735.269}
{'area': 'A1',
 'checksum': 'cbe9bba9012161af00fad0d01a173683',
 'fiducial': [{'col': 0,
               'dia': 146.2991,
               'fidName': 'HOURGLASS',
               'imageX': 10784.154,
               'imageY': 10476.255,
               'row': 0,
               'x': 4288,
               'y': 29652},
              {'col': 1,
               'dia': 146.2991,
               'fidName': 'HOURGLASS',
               'imageX': 10601.758,
               'imageY': 10373.979,
               'row': 1,
               'x': 4363,
               'y': 29782},
              {'col': 2,
               'dia': 146.2991,
               'fidName': 'HOURGLASS',
               'imageX': 10781.599,
               'imageY': 10267.272,
               'row': 0,
               'x': 4438,
        

In [5]:
print("Expression matrix shape:", single_df.shape)
print("Spatial JSON spots:", len(data["oligo"]))

Expression matrix shape: (36601, 1870)
Spatial JSON spots: 4992
