In [3]:
import numpy as np
import pandas as pd

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.cluster import SpectralClustering

from sklearn.decomposition import PCA, SparsePCA, KernelPCA
from sklearn.manifold import TSNE

from rpy2.robjects import r, pandas2ri
from rpy2.robjects.vectors import StrVector

pandas2ri.activate()

import magic
import scprep

%matplotlib inline

# from sklearnex import patch_sklearn
# patch_sklearn()

import warnings

from sklearn.cluster import KMeans
from tqdm import tqdm

# r('''library(Seurat)''')

In [4]:
def _score_partition(labels, assignment):
    """Return the (ARI, AMI, NMI) agreement between ``labels`` and ``assignment``."""
    return (
        adjusted_rand_score(labels, assignment),
        adjusted_mutual_info_score(labels, assignment),
        normalized_mutual_info_score(labels, assignment),
    )


def get_cluster_metrics(pred, labels):
    """Cluster ``pred`` with several methods and return the best agreement scores.

    The methods tried are, in order: the Seurat pipeline (through rpy2,
    best-effort), KMeans, and SpectralClustering with the ``'cosine'``,
    ``'linear'`` and ``'poly'`` affinities. Every clustering is compared with
    ``labels`` using ARI, AMI and NMI.

    Parameters
    ----------
    pred : array-like with a ``.T`` attribute
        Matrix to cluster. It is transposed before being handed to R
        (presumably because Seurat expects features in rows -- confirm).
    labels : array-like
        Reference labels, one per row of ``pred``. The number of distinct
        labels is used as ``n_clusters`` for KMeans and SpectralClustering.

    Returns
    -------
    tuple
        ``(max ARI, max AMI, max NMI)``. Each maximum is taken independently,
        so the three values may come from different clustering methods.

    Notes
    -----
    * A failure of the Seurat step is reported with a ``RuntimeWarning`` and
      otherwise ignored, so the function still works without R/Seurat.
    * A SpectralClustering run that raises or emits any warning is scored as
      ``(0, 0, 0)``.
    * The caller's warning filters are left untouched.
    """
    scores = []
    n_clusters = len(np.unique(labels))

    # Best-effort Seurat clustering: requires the R package to be loaded in
    # the embedded R session, which this function does not do itself.
    try:
        r.assign("data", pred.T)
        seurat = r('''
            countsData = data.frame(data)
            pbmc <- CreateSeuratObject(counts = countsData, project = "thal_single_cell", min.cells = 1, min.features = 1)
            pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", verbose=FALSE)
            all.genes <- rownames(pbmc)
            pbmc <- ScaleData(pbmc, features = all.genes, verbose=FALSE)
            pbmc <- RunPCA(pbmc, verbose=FALSE)
            pbmc <- FindNeighbors(pbmc, verbose=FALSE)
            pbmc <- FindClusters(pbmc, verbose=FALSE)
            Idents(pbmc)
        ''')
        scores.append(_score_partition(labels, seurat))
    except Exception as exc:
        # Surface the reason instead of silently dropping the Seurat result.
        warnings.warn(
            "Seurat clustering skipped: {!r}".format(exc),
            RuntimeWarning,
            stacklevel=2,
        )

    kmeans_assignment = KMeans(
        n_clusters=n_clusters, random_state=42, n_init='auto'
    ).fit_predict(pred)
    scores.append(_score_partition(labels, kmeans_assignment))

    # Promote warnings to errors only inside this block so that a run which
    # warns is treated as failed. catch_warnings() restores the previous
    # filters on exit, even if an exception propagates.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        for affinity in ('cosine', 'linear', 'poly'):
            try:
                spectral_assignment = SpectralClustering(
                    n_clusters=n_clusters,
                    random_state=42,
                    affinity=affinity,
                ).fit_predict(pred)
                scores.append(_score_partition(labels, spectral_assignment))
            except Exception:
                # Covers both real errors and warnings promoted to errors;
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                scores.append((0, 0, 0))

    ari_res, ami_res, nmi_res = zip(*scores)
    return max(ari_res), max(ami_res), max(nmi_res)

In [5]:
def get_data(i, data_root='../../data', min_fraction=0.05):
    """Load one dataset, drop sparse rows/columns and return the normalised matrix.

    Parameters
    ----------
    i : str
        Dataset name; the file read is ``<data_root>/<i>/data.csv.gz``.
    data_root : str, optional
        Directory containing one sub-directory per dataset.
    min_fraction : float, optional
        A column is kept when its number of positive entries exceeds
        ``int(n_rows * min_fraction)``; a row is kept when its number of
        positive entries exceeds ``int(n_columns * min_fraction)``. Both
        thresholds are computed on the unfiltered table.

    Returns
    -------
    X_norm : pandas.DataFrame
        Filtered table after ``log(x + 1)``, scprep library-size
        normalisation and scprep square-root transform, in that order.
    labels : pandas.Index
        Row index of the filtered table (presumably the per-cell annotation
        stored in the first CSV column -- confirm against the data files).
    """
    df = pd.read_csv('{}/{}/data.csv.gz'.format(data_root, i), index_col=0)

    # np.sign maps every entry to -1/0/+1, so the sums below count positive
    # entries when the table is non-negative.
    expressed = np.sign(df)

    # Use explicit axes: np.sum(DataFrame) forwards axis=None to pandas, whose
    # meaning is deprecated and slated to become a whole-frame reduction.
    cols = expressed.sum(axis=0) > int(df.shape[0] * min_fraction)
    rows = expressed.sum(axis=1) > int(df.shape[1] * min_fraction)

    df = np.log(df.loc[rows, cols] + 1)

    # NOTE(review): the log transform is applied before library-size
    # normalisation and is followed by a square root; kept as-is, confirm
    # this ordering is intended.
    df_norm = scprep.normalize.library_size_normalize(df.copy())
    df_norm = scprep.transform.sqrt(df_norm)

    X_norm = pd.DataFrame(df_norm, columns=df.columns)
    labels = df.index
    return X_norm, labels

In [None]:
# Names of the dataset directories to evaluate; each name is passed to
# get_data(), which uses it to build the path of the CSV file to read.
dir_list = [
    'manno', 'Xu', 'brosens', 'jakel', 'Fujii', 'loureiro', 
    'carey', 'hcabm40k', 'jiang', 'Selewa'
]

# One (max ARI, max AMI, max NMI) tuple per dataset, in dir_list order.
# The dataset name is not stored alongside the scores, so pair `res` with
# `dir_list` by position when reading the results.
res = []
for i in tqdm(dir_list):
    X_norm, labels = get_data(i)
    # NOTE(review): MAGIC is built with default arguments (no explicit seed is
    # passed here) -- confirm whether run-to-run reproducibility is required.
    pred = magic.MAGIC().fit_transform(X_norm)
    res.append(get_cluster_metrics(pred, labels))