In [2]:
import os
import pandas as pd

In [3]:
csv_dir = "csv"
folders = [folder for folder in os.listdir(csv_dir) if "HuBMAP" in folder or "GTEx" in folder or "CxG" in folder]

number of datasets

In [3]:
count = 0
for folder in folders:
    if folder != ".DS_Store":
        count += len(os.listdir(os.path.join(csv_dir,folder)))
print(count)

5391


number of organs

In [4]:
organs = []
for folder in folders:
    if folder != ".DS_Store":
        organs.append(" ".join(folder.split("_")[:-2]))
print(len(set(organs)))

52


number of cell types

In [5]:
cell_types = []
for folder in folders:
    if folder != ".DS_Store":
        for csv_file in os.listdir(os.path.join(csv_dir,folder)):
            data = pd.read_csv(os.path.join(csv_dir,folder,csv_file))
            cell_types.extend(data['cell_type'].unique())
print(len(set(cell_types)))

677


number of datasets from each source

In [4]:
def count_datasets(folders):
    count = 0
    for folder in folders:
        if folder != ".DS_Store":
            count += len(os.listdir(os.path.join(csv_dir,folder)))
    return count

for source in ['CxG','GTEx','HuBMAP']:
    folders = [folder for folder in os.listdir(csv_dir) if source in folder]
    print("{0}: {1}".format(source, count_datasets(folders)))

CxG: 5219
GTEx: 25
HuBMAP: 147


possible duplicates between sources

In [16]:
def get_all_dataset_info(main_dir):
        info = []
        for folder in os.listdir(main_dir):
            if folder != ".DS_Store":
                for file in os.listdir(os.path.join(main_dir,folder)):
                    data = pd.read_csv(os.path.join(main_dir,folder,file))
                    info.append([os.path.join(main_dir,folder,file),data['count'].sum()])
        return info

dataset_info = pd.DataFrame(get_all_dataset_info(csv_dir),columns=['dataset_path','cell_counts'])

In [17]:
dataset_info

Unnamed: 0,dataset_path,cell_counts
0,csv/Small_Intestine_CxG_Portal/cf18e897-e8b2-4...,1101
1,csv/Small_Intestine_CxG_Portal/e09e81de-a537-4...,6294
2,csv/Small_Intestine_CxG_Portal/7ae94573-bd9b-4...,1933
3,csv/Small_Intestine_CxG_Portal/2993616d-12b8-4...,1199
4,csv/Small_Intestine_CxG_Portal/771fe5d8-a50d-4...,128
...,...,...
5771,csv/Parotid_Gland_CxG_Portal/4676160f-b0f6-4de...,118
5772,csv/Parotid_Gland_CxG_Portal/bd13c169-af97-4d8...,9437
5773,csv/Parotid_Gland_CxG_Portal/13c0b826-87e9-4c8...,1270
5774,csv/Parotid_Gland_CxG_Portal/7995d395-6682-4f7...,3628


In [19]:
dataset_info['source'] = dataset_info.apply(lambda row: row['dataset_path'].split("/")[1].split("_")[-2], axis=1)

In [32]:
dataset_info = dataset_info[dataset_info['source'].isin(['CxG','GTEx','HuBMAP'])]

In [34]:
dataset_info

Unnamed: 0,dataset_path,cell_counts,source
0,csv/Small_Intestine_CxG_Portal/cf18e897-e8b2-4...,1101,CxG
1,csv/Small_Intestine_CxG_Portal/e09e81de-a537-4...,6294,CxG
2,csv/Small_Intestine_CxG_Portal/7ae94573-bd9b-4...,1933,CxG
3,csv/Small_Intestine_CxG_Portal/2993616d-12b8-4...,1199,CxG
4,csv/Small_Intestine_CxG_Portal/771fe5d8-a50d-4...,128,CxG
...,...,...,...
5771,csv/Parotid_Gland_CxG_Portal/4676160f-b0f6-4de...,118,CxG
5772,csv/Parotid_Gland_CxG_Portal/bd13c169-af97-4d8...,9437,CxG
5773,csv/Parotid_Gland_CxG_Portal/13c0b826-87e9-4c8...,1270,CxG
5774,csv/Parotid_Gland_CxG_Portal/7995d395-6682-4f7...,3628,CxG


In [35]:
possible_duplicates = dataset_info.groupby('cell_counts').agg(list)

In [41]:
possible_duplicates['keep'] = possible_duplicates.apply(lambda row: False if len(set(row['source'])) == 1 else True, axis=1)

In [45]:
possible_duplicates = possible_duplicates[possible_duplicates['keep']].drop('keep',axis=1).reset_index()

In [48]:
possible_duplicates

Unnamed: 0,cell_counts,dataset_path,source
0,40,"[csv/Skin_GTEx_Portal/GTEX-1CAMR-5015.csv, csv...","[GTEx, CxG, CxG, CxG, CxG]"
1,208,[csv/Heart_CxG_Portal/45dd32d7-00ff-4a1a-9c48-...,"[CxG, HuBMAP, CxG]"
2,326,"[csv/Skin_GTEx_Portal/GTEX-15EOM-5003.csv, csv...","[GTEx, CxG, CxG, CxG, CxG]"
3,501,[csv/Kidney_CxG_Portal/cd9f2c40-6470-4ef6-98b0...,"[CxG, CxG, HuBMAP]"
4,1456,"[csv/Breast_GTEx_Portal/GTEX-1MCC2-5013.csv, c...","[GTEx, CxG, CxG]"
5,1499,[csv/Small_Intestine_CxG_Portal/f909b36b-fb4e-...,"[CxG, HuBMAP, CxG]"
6,1500,[csv/Eye_CxG_Portal/890b91e0-e613-4de6-acab-96...,"[CxG, CxG, HuBMAP, CxG, CxG]"
7,1502,[csv/Small_Intestine_CxG_Portal/1e7713ad-b371-...,"[CxG, GTEx, CxG]"
8,3183,[csv/Heart_CxG_Portal/a9ccfc4c-14c0-40ce-8c07-...,"[CxG, HuBMAP]"
9,3999,[csv/Large_Intestine_CxG_Portal/2993616d-12b8-...,"[CxG, HuBMAP]"


In [47]:
possible_duplicates.to_csv('possible_duplicates.csv')