# Convert h5ad object to R

In [1]:
import scanpy as sc
import gzip
from pathlib import Path
import os
from scipy.io import mmwrite
import pandas as pd

def h5ad_to_seuratObj(adata_fp,output_dir,dataset_name=None):
    if dataset_name is None:
        dataset_name = Path(adata_fp).stem
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
        
    adata = sc.read_h5ad(adata_fp)
    
    # write OBS
    adata.obs.to_csv(f"{output_dir}/{dataset_name}_obs.csv.gz", compression="gzip")
    # write VAR
    adata.var.to_csv(f"{output_dir}/{dataset_name}_var.csv.gz", compression="gzip")
    # write UMAP coordinates
    umap_df = pd.DataFrame(
        adata.obsm['X_umap'],
        index=adata.obs.index,
        columns=['UMAP_1', 'UMAP_2'])
    umap_df.to_csv(f"{output_dir}/{dataset_name}_UMAP.csv.gz", compression="gzip")

    # write barcodes.tsv.gz
    barcodes = adata.obs.index.to_series()
    barcodes.to_csv(f"{output_dir}/barcodes.tsv.gz", index=False, sep='\t', compression="gzip", header=False)

    # write features.tsv.gz
    if 'gene_ids' in adata.var.columns:
        gene_id_column = 'gene_ids'
    else:
        gene_id_column = 'gene_ids-1'
        
    features = adata.var[[gene_id_column]]
    features['geneName'] = features.index
    features = features[['geneName',gene_id_column]]
    features.to_csv(f"{output_dir}/features.tsv.gz", index=False, sep='\t', compression="gzip", header=False)

    # Write the matrix to matrix.mtx.gz
    matrix_path = os.path.join(output_dir, "matrix.mtx.gz")
    with gzip.open(matrix_path, 'wb') as f:
        mmwrite(f, adata.layers['counts'].T)
    


In [12]:
def h5ad_to_seuratObj(adata_fp,output_dir,dataset_name=None):
    if dataset_name is None:
        dataset_name = Path(adata_fp).stem
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)        
    adata = sc.read_h5ad(adata_fp)    
    # write OBS
    adata.obs.to_csv(f"{output_dir}/{dataset_name}_obs.csv.gz", compression="gzip")
    # write VAR
    adata.var.to_csv(f"{output_dir}/{dataset_name}_var.csv.gz", compression="gzip")
    # write UMAP coordinates
    umap_df = pd.DataFrame(
        adata.obsm['X_umap'],
        index=adata.obs.index,
        columns=['UMAP_1', 'UMAP_2'])
    umap_df.to_csv(f"{output_dir}/{dataset_name}_UMAP.csv.gz", compression="gzip")
    # write barcodes.tsv.gz
    barcodes = adata.obs.index.to_series()
    barcodes.to_csv(f"{output_dir}/barcodes.tsv.gz", index=False, sep='\t', compression="gzip", header=False)
    # write features.tsv.gz
    if 'gene_ids' in adata.var.columns:
        gene_id_column = 'gene_ids'
    else:
        gene_id_column = 'gene_ids-1'        
    features = adata.var[[gene_id_column]]
    features['geneName'] = features.index
    features = features[['geneName','geneName',gene_id_column]]
    features.to_csv(f"{output_dir}/features.tsv.gz", index=False, sep='\t', compression="gzip", header=False)
    # Write the matrix to matrix.mtx.gz
    matrix_path = os.path.join(output_dir, "matrix.mtx.gz")
    with gzip.open(matrix_path, 'wb') as f:
        mmwrite(f, adata.layers['counts'].T)

# Fetal thyroid 2n samples

In [None]:
adata_fp='/nfs/team292/Thyroid_hm11_mt22/cdata_2n_p1.h5ad'
adata = sc.read_h5ad(adata_fp)

output_dir = '/nfs/team292/Thyroid_hm11_mt22/fThyroid_2n'
# h5ad_to_seuratObj(adata_fp=adata_fp,output_dir=output_dir,dataset_name='fThyroid_2n')


# Fetal thyroid age-matched 2n-T21 samples

In [13]:
adata_fp='/nfs/team292/Thyroid_hm11_mt22/cdata_2n_T21_all.h5ad'
adata = sc.read_h5ad(adata_fp)

output_dir = '/nfs/team292/Thyroid_hm11_mt22/fThyroid_2nT21'
h5ad_to_seuratObj(adata_fp=adata_fp,output_dir=output_dir,dataset_name='fThyroid_2nT21')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['geneName'] = features.index


# Mosteiro23 etal

In [25]:
adata_fp='/nfs/team292/Thyroid_hm11_mt22/public_normal_datasets/Mosteiro_2023_cThy.h5ad'
adata = sc.read_h5ad(adata_fp)

# output_dir = '/nfs/team292/Thyroid_hm11_mt22/public_normal_datasets_mtx'
# h5ad_to_seuratObj(adata_fp=adata_fp,output_dir=output_dir,dataset_name='Mosteiro_2023')


In [34]:
barcodes = adata.obs.index.to_series()
output_dir = '/nfs/team292/Thyroid_hm11_mt22/public_normal_datasets_mtx/Mosteiro_2023/mtx/'
barcodes.to_csv(f"{output_dir}/barcodes.tsv.gz", index=False, sep='\t', compression="gzip", header=False)

In [37]:
features = adata.var
features['geneName'] = features.index
features = features[['geneName','geneName']]
features.to_csv(f"{output_dir}/features.tsv.gz", index=False, sep='\t', compression="gzip", header=False)


In [35]:
# Write the matrix to matrix.mtx.gz
matrix_path = os.path.join(output_dir, "matrix.mtx.gz")
with gzip.open(matrix_path, 'wb') as f:
    mmwrite(f, adata.layers['counts'].T)


# Wang22 etal

In [19]:
adata_fp='/nfs/team292/Thyroid_hm11_mt22/public_normal_datasets/Wang_2022_cThy.h5ad'
#adata = sc.read_h5ad(adata_fp)

output_dir = '/nfs/team292/Thyroid_hm11_mt22/public_normal_datasets_mtx/Wang_2022'
h5ad_to_seuratObj(adata_fp=adata_fp,output_dir=output_dir,dataset_name='Wang_2022')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features['geneName'] = features.index


# Hong23 etal

In [23]:
adata_fp='/nfs/team292/Thyroid_hm11_mt22/public_normal_datasets/Hong_2023_cThy.h5ad'
adata = sc.read_h5ad(adata_fp)

# output_dir = '/nfs/team292/Thyroid_hm11_mt22/public_normal_datasets_mtx/Hong_2023'
# h5ad_to_seuratObj(adata_fp=adata_fp,output_dir=output_dir,dataset_name='Hong_2023')


In [24]:
adata.var


Unnamed: 0,mt,ribo,hb,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,n_cells,highly_variable,mean,std
AL627309.1,False,False,False,134,0.002467,0.002464,99.755144,135.0,4.912655,134,False,0.002073,0.035860
AL627309.3,False,False,False,7,0.000128,0.000128,99.987209,7.0,2.079442,7,False,0.000016,0.002172
AL732372.1,False,False,False,3,0.000055,0.000055,99.994518,3.0,1.386294,3,False,0.000065,0.005829
AL669831.5,False,False,False,2442,0.048131,0.047008,95.537770,2634.0,7.876638,2431,False,0.034707,0.154578
FAM87B,False,False,False,37,0.000694,0.000694,99.932390,38.0,3.663562,37,False,0.000133,0.009668
...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC023491.2,False,False,False,26,0.000475,0.000475,99.952491,26.0,3.295837,26,False,0.000629,0.019377
AC004556.1,False,False,False,19,0.000347,0.000347,99.965282,19.0,2.995732,19,False,0.000461,0.020076
AC233755.2,False,False,False,51,0.005537,0.005521,99.906808,303.0,5.717028,43,False,0.000023,0.003247
AC233755.1,False,False,False,91,0.004934,0.004922,99.833717,270.0,5.602119,83,False,0.000068,0.006760


In [None]:
import os
import pandas

# Lu23 etal

In [None]:
dataset_name = 'Lu_2023'
output_dir = '/nfs/team292/Thyroid_hm11_mt22/public_normal_datasets_mtx/Lu_2023'
os.mkdir(output_dir)

adata_fp='/nfs/team292/Thyroid_hm11_mt22/public_normal_datasets/Lu_2023_cThy.h5ad'
adata = sc.read_h5ad(adata_fp)
# write OBS
adata.obs.to_csv(f"{output_dir}/{dataset_name}_obs.csv.gz", compression="gzip")
# write VAR
adata.var.to_csv(f"{output_dir}/{dataset_name}_var.csv.gz", compression="gzip")
# write UMAP coordinates
umap_df = pd.DataFrame(
    adata.obsm['X_umap'],
    index=adata.obs.index,
    columns=['UMAP_1', 'UMAP_2'])
umap_df.to_csv(f"{output_dir}/{dataset_name}_UMAP.csv.gz", compression="gzip")
