This notebook parses the GEO sample IDs from the R scripts that were retrieved manually from ARCHS4, one script per cell-type category.
The aim is to build the list of selected samples whose expression vectors will then be extracted from the HDF5 file downloaded from the ARCHS4 dataset (version v11).

In [1]:
import pandas as pd
import os

### Loading samples list

### Parsing the R scripts from ARCHS4 to extract sample GEO IDs and cell types

In [2]:
def parse_sample_ids(script_lines):
    """Extract the sample IDs listed in one ARCHS4 R script.

    Parameters
    ----------
    script_lines : list of str
        The script content, one element per line.

    Returns
    -------
    list of str
        The non-empty tokens found between the first "(" and the first ")"
        of the retained lines, with surrounding quotes removed.
    """
    # Drop the first 16 and the last 18 lines of the script.
    # NOTE(review): presumably this skips the boilerplate around the sample
    # vector in the ARCHS4-generated script -- confirm the offsets still hold.
    # The remaining lines are turned into the text repr of a Python list, so the
    # pieces below can carry list punctuation, quotes and escape backslashes.
    body = str(script_lines[16:-18]).strip()
    listing = body[body.find("(") + 1:body.find(")")]
    # Strip spaces, backslashes and both quote characters from every piece;
    # pieces that were pure punctuation become empty and are discarded.
    cleaned = (piece.strip(" \\'\"") for piece in listing.split(","))
    return [geo_id for geo_id in cleaned if geo_id != ""]


def load_labels(input_dir="../inputs/"):
    """Build the geo_id/type table from every ".r" script under ``input_dir``.

    The directory tree is walked recursively. Each file whose path ends with
    ".r" contributes one row per parsed sample ID; the "type" column holds the
    file name without its extension.

    Parameters
    ----------
    input_dir : str
        Root directory to search. Defaults to "../inputs/".

    Returns
    -------
    pandas.DataFrame
        Columns "geo_id" and "type" with a fresh integer index. Empty (but with
        both columns) when no ".r" file is found.
    """
    frames = []
    for subdir, _dirs, files in os.walk(input_dir):
        for file in files:
            filepath = subdir + os.sep + file
            if not filepath.endswith(".r"):
                continue
            with open(filepath) as inf:
                samples = parse_sample_ids(inf.read().splitlines())
            frame = pd.DataFrame({"geo_id": samples})
            # splitext removes only the extension. str.strip(".r") would remove
            # every leading/trailing "." or "r" character, mangling names such
            # as "Liver.r" -> "Live".
            frame["type"] = os.path.splitext(file)[0]
            frames.append(frame)
    if not frames:
        # pd.concat raises on an empty list; return an empty table instead.
        return pd.DataFrame(columns=["geo_id", "type"])
    return pd.concat(frames, ignore_index=True)


labels = load_labels()

In [3]:
# Preview the first rows of the combined geo_id/type table.
labels.head()

Unnamed: 0,geo_id,type
0,GSM1241249,HCT116
1,GSM1241140,HCT116
2,GSM1241225,HCT116
3,GSM1241162,HCT116
4,GSM1151059,HCT116


In [4]:
# (rows, columns) of the combined table before any de-duplication.
labels.shape

(74985, 2)

In [5]:
# Row labels of every sample whose type is "Neuron", and how many there are.
is_neuron = labels["type"] == "Neuron"
neuron = labels.loc[is_neuron].index
len(neuron)

5326

Searching for duplicated samples, i.e. GEO IDs that appear under more than one type label...

In [6]:
# Total number of geo_id entries, duplicates included.
labels["geo_id"].size

74985

In [7]:
# Number of distinct geo_id values.
labels["geo_id"].nunique(dropna=False)

71961

In [8]:
# Count each GEO ID once, then keep the IDs that occur more than once.
geo_id_counts = labels.geo_id.value_counts()
duplicated = geo_id_counts[geo_id_counts > 1].index

In [9]:
# How many GEO IDs occur more than once.
duplicated.size

3023

In [10]:
# Peek at the first ten duplicated GEO IDs.
duplicated[:10]

Index(['GSM4150378', 'GSM3496101', 'GSM2493016', 'GSM2493037', 'GSM2493036',
       'GSM2493032', 'GSM2493030', 'GSM2493024', 'GSM2493023', 'GSM2493022'],
      dtype='object')

In [11]:
# What are these duplicated samples? Each one is a single GEO sample that was
# listed under two type labels (author's note: same experiment, labeled both
# as a tissue and as a cell line).
labels.loc[labels["geo_id"] == "GSM2493511"]

Unnamed: 0,geo_id,type
30714,GSM2493511,IMR90
40110,GSM2493511,Fibroblast


In [12]:
# Second example of a GEO ID that appears under two type labels.
labels.loc[labels["geo_id"] == "GSM3177890"]

Unnamed: 0,geo_id,type
71788,GSM3177890,Dendritic
74142,GSM3177890,Plasmacytoid Dendritic Cell


In [13]:
# Index the table by geo_id while keeping geo_id available as a regular column.
labels = labels.set_index("geo_id", drop=False)

In [14]:
# Drop duplicated samples, keeping only the LAST occurrence of each geo_id
# (keep="last"), so a sample listed under two types retains the label that
# appears later in the table.
# NOTE(review): the original comment said "keeping just the first one", which
# does not match keep="last" -- confirm which label is meant to be kept.
labels.drop_duplicates(subset="geo_id",keep="last",inplace=True)

In [15]:
# (rows, columns) after de-duplication; one row per distinct geo_id is expected.
labels.shape

(71961, 2)

In [16]:
# Save the de-duplicated table. The index is a copy of the geo_id column (it was
# set with drop=False), so it is omitted. `index` is documented as a bool, so
# pass False explicitly instead of relying on None being treated as falsy.
labels.to_csv("../outputs/samples_types.csv", index=False)