## Feature Selection

In [21]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import os
from sklearn.model_selection import train_test_split,StratifiedKFold
# just for test

### Import data

In [3]:
# define the function that performs feature selection (keeps highly variable genes)
def data_preprocessing(data_df, n_top_genes=5000, flavor='cell_ranger'):
    """Keep only the highly variable columns of an observations-by-variables table.

    The DataFrame is wrapped in an AnnData object (rows become observations,
    columns become variables), ``sc.pp.highly_variable_genes`` annotates
    ``adata.var``, and the object is subset to the columns flagged
    ``highly_variable``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Input table; its index is used for ``obs`` and its columns for ``var``.
    n_top_genes : int, default 5000
        Number of highly variable genes requested from scanpy.
    flavor : str, default 'cell_ranger'
        Selection method forwarded to ``sc.pp.highly_variable_genes``.

    Returns
    -------
    anndata.AnnData
        A view of the AnnData object restricted to the selected columns.
    """
    adata = ad.AnnData(
        X=data_df.values,
        obs=data_df.index.to_frame(),
        var=pd.DataFrame(index=data_df.columns),
    )
    # Adds the boolean 'highly_variable' column (plus statistics) to adata.var.
    sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, flavor=flavor)
    return adata[:, adata.var['highly_variable']]

In [6]:
# Load the datasets: each one has a data table and a label table.

## CellBench10x (5 lung cancer cell lines)
lc_label_df = pd.read_csv('../data/10x_5cl/Labels.csv', header=0)  # first row is the header
lc_data_df = pd.read_csv('../data/10x_5cl/10x_5cl_data.csv', index_col=0)  # first column is the row index

In [7]:
## Baron (Human): human pancreas
pan_label_df = pd.read_csv('../data/Baron Human/Labels.csv', header=0)  # first row is the header
pan_data_df = pd.read_csv('../data/Baron Human/Filtered_Baron_HumanPancreas_data.csv', index_col=0)  # first column is the row index

In [8]:
## Zheng sorted PBMC (immune system)
imm_label_df = pd.read_csv('../data/Zheng sorted/Labels.csv', header=0)  # first row is the header
imm_data_df = pd.read_csv('../data/Zheng sorted/Filtered_DownSampled_SortedPBMC_data.csv', index_col=0)  # first column is the row index
# Last expression of the cell: display the label table.
imm_label_df

Unnamed: 0,x
0,CD14+ Monocyte
1,CD14+ Monocyte
2,CD14+ Monocyte
3,CD14+ Monocyte
4,CD14+ Monocyte
...,...
19995,CD8+/CD45RA+ Naive Cytotoxic
19996,CD8+/CD45RA+ Naive Cytotoxic
19997,CD8+/CD45RA+ Naive Cytotoxic
19998,CD8+/CD45RA+ Naive Cytotoxic


In [9]:
# Feature selection: run the same preprocessing on each of the three data tables.
lc_adata_fselected, pan_adata_fselected, imm_adata_fselected = (
    data_preprocessing(frame)
    for frame in (lc_data_df, pan_data_df, imm_data_df)
)

In [10]:
# Display the feature-selected CellBench AnnData (dimensions and annotation keys).
lc_adata_fselected

View of AnnData object with n_obs × n_vars = 3803 × 5000
    obs: 0
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg'

### Cross-validation

In [22]:
def _features_to_frame(subset):
    """Return ``subset.X`` as a dense DataFrame with one column per variable."""
    matrix = subset.X
    # Only sparse matrices expose .toarray(); a dense ndarray-backed X is used as is.
    if hasattr(matrix, 'toarray'):
        matrix = matrix.toarray()
    return pd.DataFrame(np.asarray(matrix), columns=subset.var_names)


def create_cv_datasets(adata, labels, output_root, test_size=0.2, n_splits=10, random_state=42):
    """Write a stratified hold-out test set and stratified K-fold CSV files.

    The observations are first split into train and test parts, stratified by
    ``labels``. The train part is then divided into ``n_splits`` stratified
    folds. All outputs are written without an index column.

    Files written
    -------------
    ``<output_root>/fold_<i>/train_data.csv``, ``valid_data.csv``,
    ``train_labels.csv``, ``valid_labels.csv`` for ``i = 1..n_splits``, plus
    ``<output_root>/test_features.csv`` and ``<output_root>/test_labels.csv``.

    Parameters
    ----------
    adata : anndata.AnnData
        Data to split; ``X`` may be dense or sparse.
    labels : pandas.DataFrame or pandas.Series
        Labels paired positionally with the rows of ``adata``; also used as
        the stratification target.
    output_root : str
        Directory that receives the files (created if missing).
    test_size : float, default 0.2
        Fraction of observations held out as the test set.
    n_splits : int, default 10
        Number of cross-validation folds built from the training part.
    random_state : int, default 42
        Seed used for both the train/test split and the fold shuffling.

    Returns
    -------
    None
    """
    os.makedirs(output_root, exist_ok=True)

    # Split observation names and labels together so they stay paired.
    X_train, X_test, y_train, y_test = train_test_split(
        adata.obs_names,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

    train_adata = adata[X_train, :]
    test_adata = adata[X_test, :]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Stratified splitting depends only on y, so a zeros placeholder stands in
    # for X and the training matrix is not materialised just to compute indices.
    placeholder = np.zeros(train_adata.n_obs)

    for fold_index, (train_idx, valid_idx) in enumerate(skf.split(placeholder, y_train)):
        fold_dir = os.path.join(output_root, f'fold_{fold_index + 1}')
        os.makedirs(fold_dir, exist_ok=True)

        # train_idx / valid_idx are positional indices into the training part.
        train_features = _features_to_frame(train_adata[train_idx])
        valid_features = _features_to_frame(train_adata[valid_idx])
        train_labels = y_train.iloc[train_idx].reset_index(drop=True)
        valid_labels = y_train.iloc[valid_idx].reset_index(drop=True)

        train_features.to_csv(os.path.join(fold_dir, 'train_data.csv'), index=False)
        valid_features.to_csv(os.path.join(fold_dir, 'valid_data.csv'), index=False)
        train_labels.to_csv(os.path.join(fold_dir, 'train_labels.csv'), index=False)
        valid_labels.to_csv(os.path.join(fold_dir, 'valid_labels.csv'), index=False)

    test_features = _features_to_frame(test_adata)
    test_features.to_csv(os.path.join(output_root, 'test_features.csv'), index=False)
    y_test.reset_index(drop=True).to_csv(os.path.join(output_root, 'test_labels.csv'), index=False)

In [12]:
# Root directory; each dataset's splits go into a sub-directory appended to this path.
output_dir = "../clean_data"

In [23]:
# Write the CellBench train/validation folds and test set.
create_cv_datasets(
    adata=lc_adata_fselected,
    labels=lc_label_df,
    output_root=output_dir + "/cellBench",
)

In [24]:
# Write the Baron train/validation folds and test set.
create_cv_datasets(
    adata=pan_adata_fselected,
    labels=pan_label_df,
    output_root=output_dir + "/baron",
)



In [None]:
# Write the Zheng train/validation folds and test set.
create_cv_datasets(
    adata=imm_adata_fselected,
    labels=imm_label_df,
    output_root=output_dir + "/zheng",
)