In [5]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import os
import math
from sklearn.model_selection import train_test_split,StratifiedKFold

## Feature Selection

### Import data

In [3]:
# Feature selection: keep only the highly variable columns of an expression table
def data_preprocessing(data_df):
    """Wrap a DataFrame in an AnnData object and keep its highly variable features.

    The frame's values become ``X``, its index becomes ``obs`` (via
    ``Index.to_frame()``) and its columns become the index of ``var``.
    ``sc.pp.highly_variable_genes`` is run with ``n_top_genes=5000`` and
    ``flavor='cell_ranger'``; the object is then sliced to the columns flagged
    in ``var['highly_variable']``.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Numeric table. Presumably cells in rows and genes in columns
        (TODO confirm against the CSV layout).

    Returns
    -------
    AnnData
        The column-sliced object ``adata[:, mask]`` (a slice of ``adata``,
        not an explicit copy).
    """
    cell_meta = data_df.index.to_frame()
    gene_meta = pd.DataFrame(index=data_df.columns)
    adata = ad.AnnData(X=data_df.values, obs=cell_meta, var=gene_meta)
    sc.pp.highly_variable_genes(adata, n_top_genes=5000, flavor='cell_ranger')
    hvg_mask = adata.var['highly_variable']
    return adata[:, hvg_mask]

In [6]:
# Import datasets: one expression table and one label table per dataset

## CellBench10x (5 lung cancer cell lines)
# The first CSV column is used as the row index of the expression table.
lc_data_df = pd.read_csv('../data/10x_5cl/10x_5cl_data.csv', index_col=0)
lc_label_df = pd.read_csv('../data/10x_5cl/Labels.csv',header=0) # the first row is the header; no index column is set

In [7]:
## Baron (Human) (human pancreas)
# Expression table: first CSV column becomes the row index.
pan_data_df = pd.read_csv('../data/Baron Human/Filtered_Baron_HumanPancreas_data.csv', index_col=0)
# Label table: first row is the header; no index column is set.
pan_label_df = pd.read_csv('../data/Baron Human/Labels.csv',header=0)

In [8]:
## Zheng sorted PBMC (immune system)
# Expression table: first CSV column becomes the row index.
imm_data_df = pd.read_csv('../data/Zheng sorted/Filtered_DownSampled_SortedPBMC_data.csv', index_col=0)
# Label table: first row is the header; no index column is set.
imm_label_df = pd.read_csv('../data/Zheng sorted/Labels.csv',header=0)
# Bare expression: displays the label table as the cell output.
imm_label_df

Unnamed: 0,x
0,CD14+ Monocyte
1,CD14+ Monocyte
2,CD14+ Monocyte
3,CD14+ Monocyte
4,CD14+ Monocyte
...,...
19995,CD8+/CD45RA+ Naive Cytotoxic
19996,CD8+/CD45RA+ Naive Cytotoxic
19997,CD8+/CD45RA+ Naive Cytotoxic
19998,CD8+/CD45RA+ Naive Cytotoxic


In [9]:
# Feature selection: run data_preprocessing (highly-variable filter) on each expression table
lc_adata_fselected = data_preprocessing(lc_data_df)
pan_adata_fselected = data_preprocessing(pan_data_df)
imm_adata_fselected = data_preprocessing(imm_data_df)

In [10]:
# Display the selected CellBench object to inspect its dimensions and annotations.
lc_adata_fselected

View of AnnData object with n_obs × n_vars = 3803 × 5000
    obs: 0
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg'

# Cross validation

#### After PCA

In [42]:
def create_cv_datasets(features, labels, output_root, test_size=0.2, n_splits=10, random_state=42):
    """
    Write a stratified hold-out test set plus stratified K-fold train/validation splits to CSV.

    The data are first split into a training part and a test part, stratified on
    ``labels``. The training part is then divided with ``StratifiedKFold`` and each
    fold is written to ``<output_root>/fold_<i>/`` (``i`` starts at 1) as
    ``train_features.csv``, ``valid_features.csv``, ``train_labels.csv`` and
    ``valid_labels.csv``. The test part is written to ``<output_root>/test_features.csv``
    and ``<output_root>/test_labels.csv``. All files are written without the index column.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table, one row per sample (rows are selected with ``.iloc``).
    labels : pandas.DataFrame or pandas.Series
        Labels aligned row-by-row with ``features``; also used for stratification.
    output_root : str
        Directory that receives the fold sub-directories and the test files.
    test_size : float, default 0.2
        Share of the samples held out as the test set.
    n_splits : int, default 10
        Number of cross-validation folds built from the training part.
    random_state : int, default 42
        Seed used for both the train/test split and the fold shuffling.
    """

    # Hold out the test set first so that no fold ever sees it
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state, stratify=labels
    )

    # Create Stratified K-Fold object
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    os.makedirs(output_root, exist_ok=True)

    # Create directories and split data
    for fold_index, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
        fold_dir = os.path.join(output_root, f'fold_{fold_index + 1}')
        os.makedirs(fold_dir, exist_ok=True)

        # train_idx / valid_idx are positions within the training part, hence .iloc
        train_features = X_train.iloc[train_idx]
        valid_features = X_train.iloc[valid_idx]
        train_labels = y_train.iloc[train_idx].reset_index(drop=True)
        valid_labels = y_train.iloc[valid_idx].reset_index(drop=True)

        # Save to CSV
        train_features.to_csv(os.path.join(fold_dir, 'train_features.csv'), index=False)
        valid_features.to_csv(os.path.join(fold_dir, 'valid_features.csv'), index=False)
        train_labels.to_csv(os.path.join(fold_dir, 'train_labels.csv'), index=False)
        valid_labels.to_csv(os.path.join(fold_dir, 'valid_labels.csv'), index=False)

    # Save test set features and labels to separate CSV files
    X_test.to_csv(os.path.join(output_root, 'test_features.csv'), index=False)
    y_test.reset_index(drop=True).to_csv(os.path.join(output_root, 'test_labels.csv'), index=False)


In [43]:
# Root folder that receives the cross-validation splits
output_dir = "../clean_data_pca"

# Load data from CSV (first column is used as the row index)
# NOTE(review): the numeric suffixes (31 / 107 / 8) presumably are the number of retained components — confirm.
lc_pca = pd.read_csv("../data_selected/lc_adata_fselected_31.csv",index_col=0)
imm_pca = pd.read_csv("../data_selected/imm_adata_fselected_107.csv",index_col=0)
pan_pca = pd.read_csv("../data_selected/pan_adata_fselected_8.csv",index_col=0)

In [44]:
# Write the test set and the cross-validation folds for CellBench under <output_dir>/cellBench
create_cv_datasets(lc_pca,lc_label_df,output_dir+"/cellBench")

In [45]:
# Write the test set and the cross-validation folds for Baron under <output_dir>/baron
create_cv_datasets(pan_pca,pan_label_df,output_dir+"/baron")



In [46]:
# Write the test set and the cross-validation folds for Zheng under <output_dir>/zheng
create_cv_datasets(imm_pca,imm_label_df,output_dir+"/zheng")

## No cv datasets
### No cross-validation: only split the data into train and test sets (for active learning)

In [25]:
def create_datasets(features, labels, output_root, test_size=0.2, random_state=42):
    """
    Write a single stratified train/test split to CSV (no cross-validation folds).

    Four files are written into ``output_root`` without the index column:
    ``train_features.csv``, ``train_labels.csv``, ``test_features.csv`` and
    ``test_labels.csv``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table, one row per sample.
    labels : pandas.DataFrame or pandas.Series
        Labels aligned row-by-row with ``features``; also used for stratification.
    output_root : str
        Target directory; created if it does not exist.
    test_size : float, default 0.2
        Share of the samples held out as the test set.
    random_state : int, default 42
        Seed for the split.
    """

    # Split train and test, stratified on the labels
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state, stratify=labels
    )

    os.makedirs(output_root, exist_ok=True)

    # Save train and test features and labels to separate CSV files
    X_train.to_csv(os.path.join(output_root, 'train_features.csv'), index=False)
    y_train.reset_index(drop=True).to_csv(os.path.join(output_root, 'train_labels.csv'), index=False)
    X_test.to_csv(os.path.join(output_root, 'test_features.csv'), index=False)
    y_test.reset_index(drop=True).to_csv(os.path.join(output_root, 'test_labels.csv'), index=False)


In [62]:
# Point output_dir at the folder for the plain train/test splits.
# NOTE: this rebinds the same global used by the cross-validation cells, so re-running those afterwards would write here.
output_dir = "../clean_data_kmeans"

In [63]:
# Write the train/test split for CellBench under <output_dir>/cellBench
create_datasets(lc_pca,lc_label_df,output_dir+"/cellBench")

In [64]:
# Write the train/test split for Baron under <output_dir>/baron
create_datasets(pan_pca,pan_label_df,output_dir+"/baron")

In [65]:
# Write the train/test split for Zheng under <output_dir>/zheng
create_datasets(imm_pca,imm_label_df,output_dir+"/zheng")

## Kmeans

In [1]:
# Define the function to perform the Kmeans clustering
# It returns the objective values and the final cluster assignments
# It stops once the objective value no longer changes
def perform_kmeans(mat, initial_center, k, max_iter=None):
    """Run k-means from the given initial centroids until the objective stops changing.

    Each iteration assigns every sample to its nearest centroid (Euclidean
    distance, ties go to the lowest cluster index), recomputes each centroid as
    the mean of its members, and records the within-cluster sum of squared
    distances to the updated centroids.  Iteration stops as soon as two
    consecutive objective values are exactly equal.

    A cluster that receives no samples gets an all-zero centroid for the next
    iteration.

    Parameters
    ----------
    mat : array-like, shape (n_samples, n_features)
        Data matrix.
    initial_center : array-like, shape (k, n_features)
        Starting centroids; not modified.
    k : int
        Number of clusters.
    max_iter : int or None, default None
        Optional upper bound on the number of iterations.  ``None`` keeps
        iterating until the objective repeats.

    Returns
    -------
    obj : list of float
        Objective value after each iteration.
    cluster : numpy.ndarray, shape (n_samples, 1)
        Cluster index of every sample, stored as floats.
    """
    mat = np.asarray(mat, dtype=float)
    center = np.array(initial_center, dtype=float)  # copy: never touch the caller's array
    n_samples = mat.shape[0]

    obj = []
    cluster = np.zeros((n_samples, 1))
    sq_dist = np.empty((n_samples, k))

    while max_iter is None or len(obj) < max_iter:  # iteration loop
        # Assignment step. Squared distances give the same nearest centroid as
        # true distances; looping over the k centroids keeps temporaries at
        # (n_samples, n_features) instead of (n_samples, k, n_features).
        for c in range(k):
            sq_dist[:, c] = ((mat - center[c, :]) ** 2).sum(axis=1)
        labels = sq_dist.argmin(axis=1)  # first minimum wins on ties
        cluster[:, 0] = labels

        # Update step: mean of the members; empty clusters stay at zero
        center = np.zeros((k, mat.shape[1]), float)
        for c in range(k):
            members = labels == c
            n_members = np.count_nonzero(members)
            if n_members != 0:
                center[c, :] = mat[members].sum(axis=0) / n_members

        # Objective: squared distance of every sample to its updated centroid
        obj.append(float(((mat - center[labels]) ** 2).sum()))

        # check converge
        if len(obj) > 1 and obj[-1] == obj[-2]:
            break

    return obj, cluster

In [3]:
# define the function to initialize the centroids
# each coordinate of a centroid is taken from a randomly chosen row
# of the corresponding column of the original data
def initialize_center(mat, k, seed=620):
    """Build ``k`` starting centroids by sampling every coordinate independently.

    For each centroid and each column ``j``, a row index is drawn at random and
    ``mat[row, j]`` is used as the centroid's ``j``-th coordinate, so a centroid
    is generally not an actual data point.

    Parameters
    ----------
    mat : numpy.ndarray, shape (n_samples, n_features)
        Data matrix.
    k : int
        Number of centroids.
    seed : int, default 620
        Seed passed to ``np.random.seed``.  Note that this reseeds NumPy's
        global random state on every call.

    Returns
    -------
    numpy.ndarray, shape (k, n_features)
        The sampled centroids.
    """
    n_rows, n_cols = mat.shape[0], mat.shape[1]
    center = np.zeros((k, n_cols))
    np.random.seed(seed)
    # One row index is needed per column. Drawing max(n_rows, n_cols) indices
    # covers matrices with more columns than rows, while consuming exactly
    # n_rows draws per centroid whenever n_rows >= n_cols.
    n_draws = max(n_rows, n_cols)
    col_idx = np.arange(n_cols)
    for c in range(k):
        row_idx = np.random.randint(0, n_rows, size=n_draws)
        center[c, :] = mat[row_idx[:n_cols], col_idx]
    return center

In [4]:
# Get data after PCA (first CSV column is used as the row index)
# NOTE(review): these are the same three files loaded into the same names in the cross-validation section — confirm the reload is intended.
lc_pca = pd.read_csv('../data_selected/lc_adata_fselected_31.csv', index_col=0)
pan_pca = pd.read_csv('../data_selected/pan_adata_fselected_8.csv', index_col=0)
imm_pca = pd.read_csv('../data_selected/imm_adata_fselected_107.csv', index_col=0)

#### Perform Kmeans on three datasets after PCA

In [7]:
# lc
# Number of clusters = number of distinct values in the first column of the label table.
lc_k = len(lc_label_df.iloc[:,0].unique())
# Cluster the PCA matrix, starting from the centroids produced by initialize_center.
lc_obj, lc_kmeans_cluster = perform_kmeans(np.array(lc_pca), initialize_center(np.array(lc_pca), lc_k), lc_k)

In [8]:
# pan
# Number of clusters = number of distinct values in the first column of the label table.
pan_k = len(pan_label_df.iloc[:,0].unique())
# Cluster the PCA matrix, starting from the centroids produced by initialize_center.
pan_obj, pan_kmeans_cluster = perform_kmeans(np.array(pan_pca), initialize_center(np.array(pan_pca), pan_k), pan_k)

In [None]:
# imm
# NOTE(review): this cell has no execution count in the saved notebook — it may not have run to completion; confirm.
# Number of clusters = number of distinct values in the first column of the label table.
imm_k = len(imm_label_df.iloc[:,0].unique())
# Cluster the PCA matrix, starting from the centroids produced by initialize_center.
imm_obj, imm_kmeans_cluster = perform_kmeans(np.array(imm_pca), initialize_center(np.array(imm_pca), imm_k), imm_k)

## LR

### To run the logistic regression model on the cross-validation datasets, run the following commands:
#### bash cv.sh baron
#### bash cv.sh cellBench
#### bash cv.sh zheng
### The bash script automatically calls lr_final.py and generates a metric.txt for each fold

In [10]:
# Calculate average metrics for the 10 fold

def calculate_average_metrics(directory):
    """Average the train error, test error and F1 score over the metric files in a folder.

    Every regular file directly inside ``directory`` is read.  Files that do not
    contain exactly three lines are ignored.  For the others, the value after
    the first comma on lines 1, 2 and 3 is parsed as a float and taken as the
    train error, test error and F1 score respectively.

    Parameters
    ----------
    directory : str
        Folder holding one metric file per fold.

    Returns
    -------
    tuple
        ``(avg_train_error, avg_test_error, avg_f1_score)``; ``(0, 0, 0)`` when
        no file qualified.
    """
    # Running sums in the order: train error, test error, F1 score
    sums = [0.0, 0.0, 0.0]
    n_parsed = 0

    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        if not os.path.isfile(candidate):
            continue

        with open(candidate, 'r') as handle:
            rows = handle.readlines()

        # Only files with exactly three lines are treated as metric files
        if len(rows) != 3:
            continue

        for slot, row in enumerate(rows):
            sums[slot] += float(row.split(',')[1])
        n_parsed += 1

    if n_parsed == 0:
        return 0, 0, 0

    return tuple(total / n_parsed for total in sums)

In [21]:
# Average the per-fold metric files of the Baron dataset
directory = '../clean_data_pca/baron/metrics'

baron_averages = calculate_average_metrics(directory)

In [13]:
# Average the per-fold metric files of the CellBench dataset
directory = '../clean_data_pca/cellBench/metrics'

cell_averages = calculate_average_metrics(directory)

In [14]:
# Average the per-fold metric files of the Zheng dataset
directory = '../clean_data_pca/zheng/metrics'

zheng_averages = calculate_average_metrics(directory)

In [23]:
def write_csv(output,averages):
    """Write averaged metrics to a two-column CSV file.

    The file contains the header ``Metric,Value`` followed by one row each for
    ``Train Error``, ``Test Error`` and ``F1 Score``.

    Parameters
    ----------
    output : str
        Path of the CSV file to create or overwrite.  Missing parent
        directories are created.
    averages : sequence
        Indexable with at least three entries, in the order
        (train error, test error, F1 score).
    """
    # open() does not create directories; make sure the target folder exists
    parent = os.path.dirname(output)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(output, 'w') as file:
        file.write("Metric,Value\n")
        file.write(f"Train Error,{averages[0]}\n")
        file.write(f"Test Error,{averages[1]}\n")
        file.write(f"F1 Score,{averages[2]}\n")

In [24]:
# Save the averaged (train error, test error, F1 score) tuples, one CSV per dataset
write_csv("../LR_results/baron_avg_metrics.csv",baron_averages)
write_csv("../LR_results/cell_avg_metrics.csv",cell_averages)
write_csv("../LR_results/zheng_avg_metrics.csv",zheng_averages)