In [67]:
import os
import pandas as pd
from collections import Counter

In [3]:
csv_dir = "csv"
folders = [folder for folder in os.listdir(csv_dir) if "HuBMAP" in folder or "GTEx" in folder or "CxG" in folder]

number of datasets

In [3]:
count = 0
for folder in folders:
    if folder != ".DS_Store":
        count += len(os.listdir(os.path.join(csv_dir,folder)))
print(count)

5391


number of organs

In [4]:
organs = []
for folder in folders:
    if folder != ".DS_Store":
        organs.append(" ".join(folder.split("_")[:-2]))
print(len(set(organs)))

52


number of cell types

In [5]:
cell_types = []
for folder in folders:
    if folder != ".DS_Store":
        for csv_file in os.listdir(os.path.join(csv_dir,folder)):
            data = pd.read_csv(os.path.join(csv_dir,folder,csv_file))
            cell_types.extend(data['cell_type'].unique())
print(len(set(cell_types)))

677


number of datasets from each source

In [4]:
def count_datasets(folders):
    count = 0
    for folder in folders:
        if folder != ".DS_Store":
            count += len(os.listdir(os.path.join(csv_dir,folder)))
    return count

for source in ['CxG','GTEx','HuBMAP']:
    folders = [folder for folder in os.listdir(csv_dir) if source in folder]
    print("{0}: {1}".format(source, count_datasets(folders)))

CxG: 5219
GTEx: 25
HuBMAP: 147


possible duplicates between sources

In [96]:
def get_all_dataset_info(main_dir):
        info = []
        for folder in os.listdir(main_dir):
            if folder != ".DS_Store":
                for file in os.listdir(os.path.join(main_dir,folder)):
                    data = pd.read_csv(os.path.join(main_dir,folder,file))
                    info.append([os.path.join(main_dir,folder,file),data['count'].sum()])
        return info

dataset_info = pd.DataFrame(get_all_dataset_info(csv_dir),columns=['dataset_path','cell_counts'])

In [97]:
dataset_info

Unnamed: 0,dataset_path,cell_counts
0,csv/Small_Intestine_CxG_Portal/cf18e897-e8b2-4...,1101
1,csv/Small_Intestine_CxG_Portal/e09e81de-a537-4...,6294
2,csv/Small_Intestine_CxG_Portal/7ae94573-bd9b-4...,1933
3,csv/Small_Intestine_CxG_Portal/2993616d-12b8-4...,1199
4,csv/Small_Intestine_CxG_Portal/771fe5d8-a50d-4...,128
...,...,...
5771,csv/Parotid_Gland_CxG_Portal/4676160f-b0f6-4de...,118
5772,csv/Parotid_Gland_CxG_Portal/bd13c169-af97-4d8...,9437
5773,csv/Parotid_Gland_CxG_Portal/13c0b826-87e9-4c8...,1270
5774,csv/Parotid_Gland_CxG_Portal/7995d395-6682-4f7...,3628


In [98]:
dataset_info['source'] = dataset_info.apply(lambda row: row['dataset_path'].split("/")[1].split("_")[-2], axis=1)

In [99]:
dataset_info = dataset_info[dataset_info['source'].isin(['CxG','GTEx','HuBMAP'])]

In [100]:
dataset_info['organ'] = dataset_info.apply(lambda row: " ".join(row['dataset_path'].split("/")[1].split("_")[:-2]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_info['organ'] = dataset_info.apply(lambda row: " ".join(row['dataset_path'].split("/")[1].split("_")[:-2]), axis=1)


In [101]:
dataset_info

Unnamed: 0,dataset_path,cell_counts,source,organ
0,csv/Small_Intestine_CxG_Portal/cf18e897-e8b2-4...,1101,CxG,Small Intestine
1,csv/Small_Intestine_CxG_Portal/e09e81de-a537-4...,6294,CxG,Small Intestine
2,csv/Small_Intestine_CxG_Portal/7ae94573-bd9b-4...,1933,CxG,Small Intestine
3,csv/Small_Intestine_CxG_Portal/2993616d-12b8-4...,1199,CxG,Small Intestine
4,csv/Small_Intestine_CxG_Portal/771fe5d8-a50d-4...,128,CxG,Small Intestine
...,...,...,...,...
5771,csv/Parotid_Gland_CxG_Portal/4676160f-b0f6-4de...,118,CxG,Parotid Gland
5772,csv/Parotid_Gland_CxG_Portal/bd13c169-af97-4d8...,9437,CxG,Parotid Gland
5773,csv/Parotid_Gland_CxG_Portal/13c0b826-87e9-4c8...,1270,CxG,Parotid Gland
5774,csv/Parotid_Gland_CxG_Portal/7995d395-6682-4f7...,3628,CxG,Parotid Gland


In [102]:
possible_duplicates = dataset_info.groupby(['cell_counts','organ']).agg(list)

In [103]:
possible_duplicates['keep'] = possible_duplicates.apply(lambda row: False if len(set(row['source'])) == 1 else True, axis=1)

In [104]:
possible_duplicates = possible_duplicates[possible_duplicates['keep']].drop('keep',axis=1).reset_index()

In [105]:
possible_duplicates

Unnamed: 0,cell_counts,organ,dataset_path,source
0,40,Skin,"[csv/Skin_GTEx_Portal/GTEX-1CAMR-5015.csv, csv...","[GTEx, CxG]"
1,208,Heart,[csv/Heart_CxG_Portal/45dd32d7-00ff-4a1a-9c48-...,"[CxG, HuBMAP]"
2,501,Kidney,[csv/Kidney_CxG_Portal/cd9f2c40-6470-4ef6-98b0...,"[CxG, CxG, HuBMAP]"
3,1502,Prostate,"[csv/Prostate_GTEx_Portal/GTEX-15CHR-5014.csv,...","[GTEx, CxG]"
4,5998,Kidney,[csv/Kidney_CxG_Portal/16eccda8-805c-4e06-b7b6...,"[CxG, HuBMAP]"
5,6285,Prostate,"[csv/Prostate_GTEx_Portal/GTEX-12BJ1-5007.csv,...","[GTEx, CxG]"
6,10347,Prostate,"[csv/Prostate_GTEx_Portal/GTEX-1HSMQ-5014.csv,...","[GTEx, CxG]"
7,12927,Prostate,"[csv/Prostate_GTEx_Portal/GTEX-1I1GU-5006.csv,...","[GTEx, CxG]"


In [106]:
possible_duplicates.to_csv('possible_duplicates.csv')