This notebook parses the GEO sample IDs from the R download scripts that were retrieved manually from ARCHS4, one script per cell-type category.
The aim is to extract the expression vectors of the selected samples from the HDF5 file downloaded from the ARCHS4 dataset (version v11).

In [1]:
import pandas as pd
import h5py
import os

### Loading samples list

### Parsing R scripts from archs4 to extract sample geo_ids and cell types

In [2]:
def cell_type_from_filename(filename):
    """Return the cell-type label encoded in an R script's file name.

    Only the file extension is removed. ``filename.strip(".r")`` must not be
    used for this: ``str.strip`` treats its argument as a *set of characters*
    and removes them from both ends, so "Liver.r" would become "Live" and
    "retina.r" would become "etina".
    """
    return os.path.splitext(filename)[0]


def parse_geo_ids(lines, header_lines=16, footer_lines=18):
    """Extract the GEO sample IDs listed in one downloaded R script.

    Parameters
    ----------
    lines : list of str
        Content of the script, one entry per line.
    header_lines, footer_lines : int
        Number of lines ignored at the beginning / end of the script. The
        defaults are the offsets this notebook has always used; the sample
        vector is expected to be the first parenthesised expression after
        the header.

    Returns
    -------
    list of str
        The non-empty, unquoted tokens found between the first "(" and the
        next ")". Empty when no parenthesised expression is present.
    """
    body = lines[header_lines:max(len(lines) - footer_lines, 0)]
    # The list is flattened through its repr, so line boundaries show up as
    # quote/space residue that the strip() below removes.
    text = str(body).strip()
    start = text.find("(")
    end = text.find(")", start + 1)
    if start == -1 or end == -1:
        # Nothing to parse; without this guard the slice would yield "[".
        return []
    tokens = (token.strip(" \\'\"") for token in text[start + 1:end].split(","))
    return [geo_id for geo_id in tokens if geo_id != ""]


# Collect (geo_id, type) pairs from every ".r" script below the working
# directory and build the DataFrame once at the end. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop is quadratic.
# NOTE: os.walk yields files in an arbitrary, filesystem-dependent order, so
# the row order of `labels` (and therefore which row a later
# drop_duplicates(keep=...) retains) can differ between machines.
records = []
for subdir, dirs, files in os.walk("./"):
    for file in files:
        if not file.endswith(".r"):
            continue
        with open(os.path.join(subdir, file)) as inf:
            script_lines = inf.read().splitlines()
        cell_type = cell_type_from_filename(file)
        records.extend((geo_id, cell_type) for geo_id in parse_geo_ids(script_lines))

labels = pd.DataFrame(records, columns=["geo_id", "type"])

In [3]:
# Preview the first rows of the parsed (geo_id, type) table.
labels.head()

Unnamed: 0,geo_id,type
0,GSM2132128,Podocyte
1,GSM2132132,Podocyte
2,GSM2132130,Podocyte
3,GSM2132129,Podocyte
4,GSM2132133,Podocyte


In [4]:
# (rows, columns) of the parsed table; one row per sample listed in a script.
labels.shape

(74985, 2)

In [5]:
# Row labels of all samples whose type is "Neuron"; the cell displays their count.
neuron = labels.loc[labels["type"].eq("Neuron")].index
len(neuron)

5326

Searching for duplicated samples (GEO IDs that appear under more than one label)...

In [6]:
# Total number of geo_id entries, repeated IDs included.
labels["geo_id"].size

74985

In [7]:
# Number of distinct GEO IDs; a smaller value than the total means some IDs repeat.
len(labels["geo_id"].unique())

71961

In [8]:
# GEO IDs that occur more than once. Series.duplicated(keep=False) flags every
# occurrence of a repeated value in a single pass; the previous version called
# list.count() once per row, which is quadratic in the number of samples.
# The result is still an unordered list of unique IDs.
duplicated = list(set(labels.loc[labels["geo_id"].duplicated(keep=False), "geo_id"]))

In [9]:
# Number of GEO IDs that appear more than once.
len(duplicated)

3023

In [10]:
# Peek at ten duplicated IDs (their order is arbitrary: the list was built from a set).
duplicated[0:10]

['GSM3026829',
 'GSM2431629',
 'GSM3729259',
 'GSM4766708',
 'GSM2574514',
 'GSM3615530',
 'GSM2287040',
 'GSM3965173',
 'GSM3177890',
 'GSM2493511']

In [15]:
# What are these duplicated samples? They belong to the same experiment; they
# were just labeled both as tissues and as cell lines.
labels.loc[labels["geo_id"].eq("GSM2493511")]

Unnamed: 0,geo_id,type
16449,GSM2493511,Fibroblast
67279,GSM2493511,IMR90


In [16]:
# Second example of an ID that was listed under two different labels.
labels.loc[labels["geo_id"].eq("GSM3177890")]

Unnamed: 0,geo_id,type
6648,GSM3177890,Plasmacytoid Dendritic Cell
26448,GSM3177890,Dendritic


In [17]:
# Index the rows by GEO ID while keeping geo_id available as a regular column.
labels = labels.set_index("geo_id", drop=False)

In [18]:
# Drop duplicated samples, keeping only the LAST occurrence of each geo_id.
# NOTE(review): this comment previously said "the first one", but the code has
# always used keep="last"; confirm which label is the intended survivor.
labels = labels.drop_duplicates(subset="geo_id", keep="last")

In [19]:
# Shape after de-duplication; the row count should equal the number of unique GEO IDs.
labels.shape

(71961, 2)

In [20]:
# Persist the de-duplicated sample/type table without writing the index
# (geo_id is already present as a column).
labels.to_csv("samples_types.csv", index=False)