In [1]:
import pathlib
from sklearn.model_selection import train_test_split
import anndata
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
from sklearn.metrics import (balanced_accuracy_score, confusion_matrix,
                             fbeta_score, make_scorer)
from sklearn.model_selection import StratifiedKFold

In [2]:
# Read the cell-by-feature AnnData file from disk and display its summary.
adata_path = 'Adata/cell_by_feature.cov_filter.rate.h5ad'
adata = anndata.read_h5ad(adata_path)
adata

AnnData object with n_obs × n_vars = 103982 × 24059 
    obs: 'AllcPath', 'CCC_Rate', 'CG_Rate', 'CG_RateAdj', 'CH_Rate', 'CH_RateAdj', 'FinalReads', 'InputReads', 'MappedReads', 'Region', 'index_name', 'uid', 'BamFilteringRate', 'MappingRate', 'Pos96', 'Plate', 'Col96', 'Row96', 'Col384', 'Row384', 'FACS_Date', 'Slice', 'CellClass', 'l1-umap_0', 'l1-umap_1', 'l1-tsne_0', 'l1-tsne_1', 'MajorType', 'l2-umap_0', 'l2-umap_1', 'l2-tsne_0', 'l2-tsne_1', 'SubType', 'l3-umap_0', 'l3-umap_1', 'l3-tsne_0', 'l3-tsne_1', 'L1CellClass', 'class_tsne_0', 'class_tsne_1', 'class_umap_0', 'class_umap_1', 'Order', 'RegionName', 'MajorRegion', 'SubRegion', 'DetailRegion', 'PotentialOverlap (MMB)', 'Anterior (CCF coords)', 'Posterior (CCF coords)', 'MajorRegionColor', 'SubRegionColor', 'DissectionRegionColor', 'Replicate'
    var: 'chrom', 'start', 'end'

In [44]:
# Per-cluster cap: clusters with more cells than this are randomly
# down-sampled to this size before the train/test split.
max_cells = 1_000

In [45]:
# Build a stratified train/test split of cell ids, one SubType cluster at a time.
#
# For every cluster in adata.obs['SubType'] (clusters whose name contains
# 'Outlier' are ignored):
#   1. clusters larger than `max_cells` are randomly down-sampled to `max_cells`;
#   2. max(15% of the cluster, 5) cells are held out for testing;
#   3. the remaining cells go to training.
# The resulting cell ids (obs index values) accumulate in `train_cells` and
# `test_cells`.
test_fraction = 0.15  # share of each cluster held out for testing
min_test_cells = 5    # lower bound on held-out cells per cluster
split_seed = 0        # seed shared by the down-sampling and the split

train_cells = []
test_cells = []
skipped_clusters = []  # clusters too small to yield both a train and a test cell

for cluster, cells in adata.obs.groupby('SubType')['SubType']:
    if 'Outlier' in cluster:
        continue
    if cells.size < 2:
        # Nothing to split (possibly an empty group from an unobserved
        # categorical level, if SubType is categorical -- TODO confirm).
        skipped_clusters.append(cluster)
        continue
    if cells.size > max_cells:
        cells = cells.sample(max_cells, random_state=split_seed)
    n_test = int(max(cells.size * test_fraction, min_test_cells))
    # train_test_split raises ValueError when test_size >= n_samples, which the
    # minimum of 5 would trigger for clusters of 5 cells or fewer; always keep
    # at least one cell for training.
    n_test = min(n_test, cells.size - 1)
    train_cell, test_cell = train_test_split(cells, test_size=n_test, random_state=split_seed)
    train_cells += train_cell.index.tolist()
    test_cells += test_cell.index.tolist()

if skipped_clusters:
    print('Skipped clusters with fewer than 2 cells:', skipped_clusters)

In [46]:
# Number of cell ids collected into the training split.
len(train_cells)

56667

In [47]:
# Number of cell ids collected into the held-out test split.
len(test_cells)

9923

In [49]:
# Gather the union of marker ids listed in every cluster_markers.txt found
# anywhere below the prediction directory.
# NOTE(review): absolute, user-specific path -- the notebook only runs where
# this directory exists.
prediction_dir = pathlib.Path(
    '/home/hanliu/project/mouse_rostral_brain/study/FinalPredictionModelRecipe/NeuronPrediction/'
)
marker_paths = [*prediction_dir.glob('**/cluster_markers.txt')]

# Each file is read without a header row; its first column becomes the index
# and supplies the marker ids.
unique_markers = set()
for path in marker_paths:
    marker_table = pd.read_csv(path, index_col=0, header=None)
    unique_markers.update(marker_table.index.tolist())

# De-duplicated, sorted list of markers; the cell displays how many there are.
total_marker = sorted(unique_markers)
len(total_marker)

3364

In [50]:
# Restrict adata to the marker features; marker ids are converted to strings
# before being used as var (column) labels.
marker_names = [str(marker) for marker in total_marker]
marker_adata = adata[:, marker_names]

In [51]:
# Slice the marker-restricted data by the train and test cell ids; .copy()
# turns each slice into an independent AnnData object.
train_adata, test_adata = (
    marker_adata[split_cells, :].copy()
    for split_cells in (train_cells, test_cells)
)

In [52]:
# Save both splits to disk as h5ad files.
for split_adata, out_path in (
    (train_adata, 'Adata/train.h5ad'),
    (test_adata, 'Adata/test.h5ad'),
):
    split_adata.write_h5ad(out_path)

In [53]:
# Display the summary (dimensions, obs and var columns) of the training split.
train_adata

AnnData object with n_obs × n_vars = 56667 × 3364 
    obs: 'AllcPath', 'CCC_Rate', 'CG_Rate', 'CG_RateAdj', 'CH_Rate', 'CH_RateAdj', 'FinalReads', 'InputReads', 'MappedReads', 'Region', 'index_name', 'uid', 'BamFilteringRate', 'MappingRate', 'Pos96', 'Plate', 'Col96', 'Row96', 'Col384', 'Row384', 'FACS_Date', 'Slice', 'CellClass', 'l1-umap_0', 'l1-umap_1', 'l1-tsne_0', 'l1-tsne_1', 'MajorType', 'l2-umap_0', 'l2-umap_1', 'l2-tsne_0', 'l2-tsne_1', 'SubType', 'l3-umap_0', 'l3-umap_1', 'l3-tsne_0', 'l3-tsne_1', 'L1CellClass', 'class_tsne_0', 'class_tsne_1', 'class_umap_0', 'class_umap_1', 'Order', 'RegionName', 'MajorRegion', 'SubRegion', 'DetailRegion', 'PotentialOverlap (MMB)', 'Anterior (CCF coords)', 'Posterior (CCF coords)', 'MajorRegionColor', 'SubRegionColor', 'DissectionRegionColor', 'Replicate'
    var: 'chrom', 'start', 'end'

In [54]:
# Display the summary (dimensions, obs and var columns) of the test split.
test_adata

AnnData object with n_obs × n_vars = 9923 × 3364 
    obs: 'AllcPath', 'CCC_Rate', 'CG_Rate', 'CG_RateAdj', 'CH_Rate', 'CH_RateAdj', 'FinalReads', 'InputReads', 'MappedReads', 'Region', 'index_name', 'uid', 'BamFilteringRate', 'MappingRate', 'Pos96', 'Plate', 'Col96', 'Row96', 'Col384', 'Row384', 'FACS_Date', 'Slice', 'CellClass', 'l1-umap_0', 'l1-umap_1', 'l1-tsne_0', 'l1-tsne_1', 'MajorType', 'l2-umap_0', 'l2-umap_1', 'l2-tsne_0', 'l2-tsne_1', 'SubType', 'l3-umap_0', 'l3-umap_1', 'l3-tsne_0', 'l3-tsne_1', 'L1CellClass', 'class_tsne_0', 'class_tsne_1', 'class_umap_0', 'class_umap_1', 'Order', 'RegionName', 'MajorRegion', 'SubRegion', 'DetailRegion', 'PotentialOverlap (MMB)', 'Anterior (CCF coords)', 'Posterior (CCF coords)', 'MajorRegionColor', 'SubRegionColor', 'DissectionRegionColor', 'Replicate'
    var: 'chrom', 'start', 'end'