In [1]:
# ML Input Preparation (RNA + ATAC Gene Activity)

import anndata as ad
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Build a combined (RNA + ATAC gene activity) feature matrix for ML.
#
# Steps: load both AnnData objects, verify that they describe the same cells in
# the same order, restrict both to their shared genes (sorted by name),
# z-score each modality separately, concatenate the two feature blocks
# column-wise, and write the matrix, labels and feature names to disk.

RESULTS_DIR = "results/files"


def _to_dense(matrix):
    """Return `matrix` as a dense NumPy array.

    Objects exposing `.toarray()` (e.g. SciPy sparse matrices) are densified
    through it; anything else is passed through `np.asarray`, so an
    already-dense `.X` no longer raises AttributeError.
    """
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


# Load processed RNA and gene activity AnnData
rna_adata = ad.read_h5ad(f"{RESULTS_DIR}/ag_rna_annotated_cleaned.h5ad")
gene_activity = ad.read_h5ad(f"{RESULTS_DIR}/ag_gene_activity.h5ad")

# Confirm matching cells: rows are stacked side by side below, so both objects
# must list the same cells in the same order.
assert rna_adata.obs_names.equals(gene_activity.obs_names), (
    "RNA and gene-activity AnnData objects do not contain the same cells "
    "in the same order."
)

# Match gene names (intersection), sorted once so that every downstream object
# (subsets, feature names, matrix columns) shares a single gene order.
common_genes = rna_adata.var_names.intersection(gene_activity.var_names).sort_values()
print(f"{len(common_genes)} common genes found.")
if len(common_genes) == 0:
    raise ValueError("No genes are shared between the RNA and gene-activity data.")

# Subset to common genes (columns follow the sorted `common_genes` order)
rna_common = rna_adata[:, common_genes].copy()
atac_common = gene_activity[:, common_genes].copy()

feature_names_rna = [f"{gene}_RNA" for gene in common_genes]
feature_names_atac = [f"{gene}_ATAC" for gene in common_genes]
feature_names = feature_names_rna + feature_names_atac

# Standardise each modality independently (separate scaler per modality),
# then place the RNA block before the ATAC block.
rna_scaled = StandardScaler().fit_transform(_to_dense(rna_common.X))
atac_scaled = StandardScaler().fit_transform(_to_dense(atac_common.X))
X_combined = np.hstack([rna_scaled, atac_scaled])

# Get cell type labels.
# String/categorical labels come out of np.asarray as an object array, which
# np.save can only store via pickle (and np.load then refuses to read unless
# allow_pickle=True). Converting to a fixed-width string array avoids that.
# Numeric labels are left untouched. Note: missing labels become the text "nan".
y = np.asarray(rna_adata.obs['cell_type'])
if y.dtype == object:
    y = y.astype(str)

# Sanity checks before anything is written to disk
assert X_combined.shape == (rna_adata.n_obs, len(feature_names)), "unexpected feature matrix shape"
assert len(y) == X_combined.shape[0], "number of labels does not match number of cells"

# Save combined data and labels
np.save(f"{RESULTS_DIR}/ag_X_combined.npy", X_combined)
np.save(f"{RESULTS_DIR}/ag_y_labels.npy", y)

# Also save as DataFrame for better downstream use
X_df = pd.DataFrame(X_combined, columns=feature_names)
X_df['cell_type'] = y
X_df.to_csv(f"{RESULTS_DIR}/ag_X_combined_with_labels.csv", index=False)
pd.Series(feature_names).to_csv(f"{RESULTS_DIR}/ag_feature_names.csv", index=False, header=False)


1280 common genes found.
