In [1]:
# Imports and basic setup
import re
from pathlib import Path
import pandas as pd
from IPython.display import display

# Widen the pandas console/notebook rendering so wide metadata tables stay readable
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 140)

# Record the pandas version next to the results
print('pandas', pd.__version__)

pandas 2.3.2


In [11]:
# Load the series-matrix file and keep only its '!Sample_' header rows.
# The path is absolute so the result does not depend on the current working directory.
series_path = Path('/Users/user/BIOF 3001/Project/data/geo/GSE213478_series_matrix.txt')
text = series_path.read_text(encoding='utf-8')
lines = text.splitlines()

# Header rows describing samples all begin with the literal prefix '!Sample_'
sample_lines = []
for line in lines:
    if line.startswith('!Sample_'):
        sample_lines.append(line)

print('Header sample lines found:', len(sample_lines))

Header sample lines found: 45


In [18]:
# Turn the tab-separated header rows into {key: [row_of_values, ...]}.
# A key that occurs on several header lines keeps one value row per occurrence.
sample_fields = {}
for header_line in sample_lines:
    raw_key, *raw_values = header_line.split('\t')
    field = raw_key.lstrip('!')
    # drop surrounding whitespace first, then the enclosing double quotes
    cleaned = [cell.strip().strip('"') for cell in raw_values]
    if field not in sample_fields:
        sample_fields[field] = []
    sample_fields[field].append(cleaned)

# The sample count is the width of the first id row: accession preferred, title as fallback
for id_key in ('Sample_geo_accession', 'Sample_title'):
    if id_key in sample_fields:
        n = len(sample_fields[id_key][0])
        break
else:
    raise RuntimeError('No sample id lines found in the series_matrix header')

# Helper to make safe column names from Sample_* keys
def safe_col(k):
    """Turn a ``Sample_*`` header key into a column-friendly name.

    Only a *leading* ``Sample_`` prefix is dropped (a later occurrence inside
    the key is kept). Every run of characters other than ASCII letters and
    digits is collapsed to a single underscore, and leading/trailing
    underscores are trimmed.

    Parameters
    ----------
    k : str
        Header key, e.g. ``'Sample_source_name_ch1'``.

    Returns
    -------
    str
        Sanitised name, e.g. ``'source_name_ch1'``.
    """
    prefix = 'Sample_'
    # str.replace() would also strip interior occurrences; slice off the prefix only
    s = k[len(prefix):] if k.startswith(prefix) else k
    return re.sub('[^0-9a-zA-Z]+', '_', s).strip('_')

# Build one record (dict) per sample, i.e. per column index of the header rows.
def _cell(row, i):
    """Return row[i], or '' when the row has fewer than i + 1 entries."""
    return row[i] if i < len(row) else ''

_ID_KEYS = ('Sample_geo_accession', 'Sample_title')
_CHAR_PREFIX = 'Sample_characteristics'

# Everything below depends only on the header, not on the sample index,
# so it is computed once instead of once per sample.
_gsm_row = sample_fields.get('Sample_geo_accession', [[]])[0]
_title_row = sample_fields.get('Sample_title', [[]])[0]
# All characteristics rows (there can be multiple "Sample_characteristics_ch1" lines)
_char_rows = [
    row
    for k, rows in sample_fields.items()
    if k.startswith(_CHAR_PREFIX)
    for row in rows
]
# Other Sample_* fields become separate columns; only the first row of a
# repeated key is used (most Sample_* keys have only one row).
_extra_rows = [
    (safe_col(k), rows[0])
    for k, rows in sample_fields.items()
    if k not in _ID_KEYS and not k.startswith(_CHAR_PREFIX)
]

records = []
for i in range(n):
    rec = {'GSM': _cell(_gsm_row, i), 'title': _cell(_title_row, i)}
    # Non-empty characteristics values of this sample, joined with ' | '
    rec['characteristics'] = ' | '.join(
        v for v in (_cell(row, i) for row in _char_rows) if v != ''
    )
    # Empty values are skipped, so the key is simply absent for this sample
    for col, row in _extra_rows:
        val = _cell(row, i)
        if val != '':
            rec[col] = val
    records.append(rec)

len(records)

987

In [19]:
# Extract tissue, age, sex, donor_id from characteristics using regex (case-insensitive)
# (?<![A-Za-z]) keeps a label from matching as the tail of a longer word
# (e.g. "age:" inside "stage:" / "storage:"), while still allowing prefixes
# such as "donor_age:".
re_tissue = re.compile(r'(?i)(?<![A-Za-z])tissue: *([^|;]+)')
# a single number or a "lo-hi" range; decimals are kept rather than truncated
re_age = re.compile(r'(?i)(?<![A-Za-z])age: *([0-9]+(?:\.[0-9]+)?(?:-[0-9]+(?:\.[0-9]+)?)?)')
# accept numeric encodings (1/2) as well as textual M/F or male/female;
# the trailing \b rejects other words that merely start with m/f (e.g. "missing")
re_sex = re.compile(r'(?i)(?<![A-Za-z])sex: *(male|female|[12MF])\b')
# prefer collaborator_participant_id (e.g., GTEX-...) or participant_id, then fall back to generic labels
re_subject_collab = re.compile(r'(?i)collaborator_participant_id: *([A-Za-z0-9_-]+)')
re_subject_part = re.compile(r'(?i)participant_id: *([A-Za-z0-9_-]+)')
re_subject_generic = re.compile(r'(?i)(?:subjectid|subject_id|patient|patient_id|donor): *([A-Za-z0-9_-]+)')

def parse_age_raw(ar):
    """Convert a raw age token into ``(raw_text, numeric_value)``.

    Parameters
    ----------
    ar : str
        Age as captured from the characteristics text: a single number
        (``'45'``) or a ``lo-hi`` range (``'20-29'``). Falsy input means
        "no age found".

    Returns
    -------
    tuple
        ``('', None)`` for falsy input. Otherwise ``(ar, value)`` where
        ``value`` is the number itself, the midpoint of the first two
        range parts, or ``None`` when the text is not numeric.
    """
    if not ar:
        return '', None
    try:
        if '-' in ar:
            # midpoint of the range, e.g. '20-29' -> 24.5
            lo, hi = (float(part) for part in ar.split('-')[:2])
            return ar, (lo + hi) / 2
        return ar, float(ar)
    except ValueError:
        # Only a failed number conversion is expected here; anything else
        # (including KeyboardInterrupt) must propagate.
        return ar, None

# Donor id lookup order: collaborator_participant_id (GTEX id), then participant_id, then generic labels
_donor_patterns = (re_subject_collab, re_subject_part, re_subject_generic)
# Numeric sex codes and the letters they are mapped to
_sex_codes = {'1': 'M', '2': 'F'}


def _sex_letter(token):
    """Map a captured sex token to 'M' / 'F', or '' when it is neither."""
    if token in _sex_codes:
        return _sex_codes[token]
    initial = token[:1].upper()
    return initial if initial in ('M', 'F') else ''


out = []
for sample in records:
    ch = sample.get('characteristics', '') or ''

    tissue_hit = re_tissue.search(ch)
    tissue = tissue_hit.group(1).strip() if tissue_hit else ''

    age_hit = re_age.search(ch)
    age_raw, age_num = parse_age_raw(age_hit.group(1).strip()) if age_hit else ('', None)

    sex_hit = re_sex.search(ch)
    sex = _sex_letter(sex_hit.group(1).strip()) if sex_hit else ''

    # first pattern (in priority order) that matches supplies the donor id
    donor = next(
        (hit.group(1).strip() for hit in (p.search(ch) for p in _donor_patterns) if hit),
        '',
    )

    row = {
        'GSM': sample.get('GSM', ''),
        'title': sample.get('title', ''),
        'tissue': tissue,
        'age_raw': age_raw,
        'age_num': age_num,
        'sex': sex,
        'donor_id': donor,
        'characteristics': ch,
    }
    # carry over the extra Sample_* columns captured earlier; never overwrite a parsed field
    for extra_key, extra_val in sample.items():
        row.setdefault(extra_key, extra_val)
    out.append(row)

# Assemble the table, then make sure age_num is numeric (anything unparseable becomes NaN)
df = pd.DataFrame(out)
age_numeric = pd.to_numeric(df['age_num'], errors='coerce')
df = df.assign(age_num=age_numeric)
df.shape

(987, 37)

In [20]:
# Persist the parsed metadata as CSV, then show a preview and a quick sanity summary
outp = Path('/Users/user/BIOF 3001/Project/data/geo/gse213478_metadata_parsed.csv')
outp.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(outp, index=False)
print('Wrote:', outp.resolve())
display(df.head(20))

# Samples per tissue (missing values counted as their own category)
tissue_counts = df['tissue'].value_counts(dropna=False)
print('\nTissue counts:')
print(tissue_counts.head(30))

# How many rows lack a parsed age / sex
n_rows = len(df)
n_age_missing = df['age_num'].isna().sum()
n_sex_missing = (df['sex'] == '').sum()
print('\nMissingness summary (age/sex):')
print('age_num missing:', n_age_missing, '/', n_rows)
print('sex missing:', n_sex_missing, '/', n_rows)

Wrote: /Users/user/BIOF 3001/Project/data/geo/gse213478_metadata_parsed.csv


Unnamed: 0,GSM,title,tissue,age_raw,age_num,sex,donor_id,characteristics,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,molecule_ch1,extract_protocol_ch1,label_ch1,label_protocol_ch1,taxid_ch1,hyb_protocol,scan_protocol,description,data_processing,platform_id,contact_name,contact_email,contact_laboratory,contact_department,contact_institute,contact_address,contact_city,contact_state,contact_zip_postal_code,contact_country,supplementary_file,data_row_count
0,GSM6586938,GTEX-11EM3-1326-SM-GW24C,Breast - Mammary Tissue,20-29,24.5,F,GTEX-11EM3,sample_plate: CO-25654826 | sample_plate_posit...,Public on Sep 16 2022,Sep 15 2022,Sep 16 2022,genomic,1,Breast - Mammary Tissue,Homo sapiens,genomic DNA,DNA samples were extracted from GTEx tissue sa...,Cy5 and Cy3,Standard Illumina Protocol,9606,All samples were prepared and analyzed in acc...,Arrays were imaged using BeadArray Reader usin...,sample name:,Raw DNAm data was processed with ChAMP softwar...,GPL21145,"Meritxell,,Oliva",meritxellop@gmail.com,Brandon Pierce & Lin Chen,Public Health Sciences,University of Chicago,5841 S. Maryland Ave,Chicago,Illinois,60637,USA,NONE,0
1,GSM6586939,GTEX-11EMC-2026-SM-GW242,Breast - Mammary Tissue,60-69,64.5,F,GTEX-11EMC,sample_plate: CO-25654826 | sample_plate_posit...,Public on Sep 16 2022,Sep 15 2022,Sep 16 2022,genomic,1,Breast - Mammary Tissue,Homo sapiens,genomic DNA,DNA samples were extracted from GTEx tissue sa...,Cy5 and Cy3,Standard Illumina Protocol,9606,All samples were prepared and analyzed in acc...,Arrays were imaged using BeadArray Reader usin...,sample name:,Raw DNAm data was processed with ChAMP softwar...,GPL21145,"Meritxell,,Oliva",meritxellop@gmail.com,Brandon Pierce & Lin Chen,Public Health Sciences,University of Chicago,5841 S. Maryland Ave,Chicago,Illinois,60637,USA,NONE,0
2,GSM6586940,GTEX-11GSP-0926-SM-GW21X,Breast - Mammary Tissue,60-69,64.5,F,GTEX-11GSP,sample_plate: CO-25654826 | sample_plate_posit...,Public on Sep 16 2022,Sep 15 2022,Sep 16 2022,genomic,1,Breast - Mammary Tissue,Homo sapiens,genomic DNA,DNA samples were extracted from GTEx tissue sa...,Cy5 and Cy3,Standard Illumina Protocol,9606,All samples were prepared and analyzed in acc...,Arrays were imaged using BeadArray Reader usin...,sample name:,Raw DNAm data was processed with ChAMP softwar...,GPL21145,"Meritxell,,Oliva",meritxellop@gmail.com,Brandon Pierce & Lin Chen,Public Health Sciences,University of Chicago,5841 S. Maryland Ave,Chicago,Illinois,60637,USA,NONE,0
3,GSM6586941,GTEX-13D11-1026-SM-GW1D7,Breast - Mammary Tissue,50-59,54.5,F,GTEX-13D11,sample_plate: CO-25654153 | sample_plate_posit...,Public on Sep 16 2022,Sep 15 2022,Sep 16 2022,genomic,1,Breast - Mammary Tissue,Homo sapiens,genomic DNA,DNA samples were extracted from GTEx tissue sa...,Cy5 and Cy3,Standard Illumina Protocol,9606,All samples were prepared and analyzed in acc...,Arrays were imaged using BeadArray Reader usin...,sample name:,Raw DNAm data was processed with ChAMP softwar...,GPL21145,"Meritxell,,Oliva",meritxellop@gmail.com,Brandon Pierce & Lin Chen,Public Health Sciences,University of Chicago,5841 S. Maryland Ave,Chicago,Illinois,60637,USA,NONE,0
4,GSM6586942,GTEX-13PL6-2926-SM-GW1SN,Breast - Mammary Tissue,40-49,44.5,F,GTEX-13PL6,sample_plate: CO-25654823 | sample_plate_posit...,Public on Sep 16 2022,Sep 15 2022,Sep 16 2022,genomic,1,Breast - Mammary Tissue,Homo sapiens,genomic DNA,DNA samples were extracted from GTEx tissue sa...,Cy5 and Cy3,Standard Illumina Protocol,9606,All samples were prepared and analyzed in acc...,Arrays were imaged using BeadArray Reader usin...,sample name:,Raw DNAm data was processed with ChAMP softwar...,GPL21145,"Meritxell,,Oliva",meritxellop@gmail.com,Brandon Pierce & Lin Chen,Public Health Sciences,University of Chicago,5841 S. Maryland Ave,Chicago,Illinois,60637,USA,NONE,0
5,GSM6586943,GTEX-147F3-1626-SM-GW1DU,Breast - Mammary Tissue,50-59,54.5,F,GTEX-147F3,sample_plate: CO-25654153 | sample_plate_posit...,Public on Sep 16 2022,Sep 15 2022,Sep 16 2022,genomic,1,Breast - Mammary Tissue,Homo sapiens,genomic DNA,DNA samples were extracted from GTEx tissue sa...,Cy5 and Cy3,Standard Illumina Protocol,9606,All samples were prepared and analyzed in acc...,Arrays were imaged using BeadArray Reader usin...,sample name:,Raw DNAm data was processed with ChAMP softwar...,GPL21145,"Meritxell,,Oliva",meritxellop@gmail.com,Brandon Pierce & Lin Chen,Public Health Sciences,University of Chicago,5841 S. Maryland Ave,Chicago,Illinois,60637,USA,NONE,0
6,GSM6586944,GTEX-14BIL-2526-SM-GW1SZ,Breast - Mammary Tissue,50-59,54.5,M,GTEX-14BIL,sample_plate: CO-25654823 | sample_plate_posit...,Public on Sep 16 2022,Sep 15 2022,Sep 16 2022,genomic,1,Breast - Mammary Tissue,Homo sapiens,genomic DNA,DNA samples were extracted from GTEx tissue sa...,Cy5 and Cy3,Standard Illumina Protocol,9606,All samples were prepared and analyzed in acc...,Arrays were imaged using BeadArray Reader usin...,sample name:,Raw DNAm data was processed with ChAMP softwar...,GPL21145,"Meritxell,,Oliva",meritxellop@gmail.com,Brandon Pierce & Lin Chen,Public Health Sciences,University of Chicago,5841 S. Maryland Ave,Chicago,Illinois,60637,USA,NONE,0
7,GSM6586945,GTEX-14BIN-1826-SM-GW1X6,Breast - Mammary Tissue,50-59,54.5,F,GTEX-14BIN,sample_plate: CO-25654824 | sample_plate_posit...,Public on Sep 16 2022,Sep 15 2022,Sep 16 2022,genomic,1,Breast - Mammary Tissue,Homo sapiens,genomic DNA,DNA samples were extracted from GTEx tissue sa...,Cy5 and Cy3,Standard Illumina Protocol,9606,All samples were prepared and analyzed in acc...,Arrays were imaged using BeadArray Reader usin...,sample name:,Raw DNAm data was processed with ChAMP softwar...,GPL21145,"Meritxell,,Oliva",meritxellop@gmail.com,Brandon Pierce & Lin Chen,Public Health Sciences,University of Chicago,5841 S. Maryland Ave,Chicago,Illinois,60637,USA,NONE,0
8,GSM6586946,GTEX-14E6C-1326-SM-GW1D1,Breast - Mammary Tissue,40-49,44.5,M,GTEX-14E6C,sample_plate: CO-25654153 | sample_plate_posit...,Public on Sep 16 2022,Sep 15 2022,Sep 16 2022,genomic,1,Breast - Mammary Tissue,Homo sapiens,genomic DNA,DNA samples were extracted from GTEx tissue sa...,Cy5 and Cy3,Standard Illumina Protocol,9606,All samples were prepared and analyzed in acc...,Arrays were imaged using BeadArray Reader usin...,sample name:,Raw DNAm data was processed with ChAMP softwar...,GPL21145,"Meritxell,,Oliva",meritxellop@gmail.com,Brandon Pierce & Lin Chen,Public Health Sciences,University of Chicago,5841 S. Maryland Ave,Chicago,Illinois,60637,USA,NONE,0
9,GSM6586947,GTEX-14H4A-2526-SM-GW1GY,Breast - Mammary Tissue,40-49,44.5,F,GTEX-14H4A,sample_plate: CO-25654154 | sample_plate_posit...,Public on Sep 16 2022,Sep 15 2022,Sep 16 2022,genomic,1,Breast - Mammary Tissue,Homo sapiens,genomic DNA,DNA samples were extracted from GTEx tissue sa...,Cy5 and Cy3,Standard Illumina Protocol,9606,All samples were prepared and analyzed in acc...,Arrays were imaged using BeadArray Reader usin...,sample name:,Raw DNAm data was processed with ChAMP softwar...,GPL21145,"Meritxell,,Oliva",meritxellop@gmail.com,Brandon Pierce & Lin Chen,Public Health Sciences,University of Chicago,5841 S. Maryland Ave,Chicago,Illinois,60637,USA,NONE,0



Tissue counts:
tissue
Colon - Transverse         224
Lung                       223
Ovary                      164
Prostate                   123
Whole Blood                 54
Breast - Mammary Tissue     52
Kidney - Cortex             50
Testis                      50
Muscle - Skeletal           47
Name: count, dtype: int64

Missingness summary (age/sex):
age_num missing: 0 / 987
sex missing: 0 / 987
