In [1]:
import numpy as np
import pandas as pd
import polars as pl

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, normalized_mutual_info_score
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.cluster import SpectralClustering

from sklearn.decomposition import PCA, SparsePCA, KernelPCA
from sklearn.manifold import TSNE

from rpy2.robjects import r, pandas2ri
from rpy2.robjects.vectors import StrVector

# NOTE(review): presumably switches on rpy2's global pandas <-> R conversion.
# Nothing in the visible cells calls into R (the Seurat load further down is
# commented out) -- confirm this is still needed.
pandas2ri.activate()

import magic
import scprep

%matplotlib inline

# from sklearnex import patch_sklearn
# patch_sklearn()

import warnings

from sklearn.cluster import KMeans
from tqdm import tqdm

# r('''library(Seurat)''')

In [2]:
def get_cluster_metrics(pred, labels):
    """Score a feature matrix against reference labels using several clusterers.

    ``pred`` is clustered once with KMeans and once with SpectralClustering for
    each of the 'cosine', 'linear' and 'poly' affinities, always requesting as
    many clusters as there are distinct values in ``labels``. Every clustering
    is compared with ``labels`` using the adjusted Rand index (ARI), adjusted
    mutual information (AMI) and normalized mutual information (NMI).

    Parameters
    ----------
    pred : array-like
        Feature matrix handed to ``fit_predict`` of each clusterer.
    labels : array-like
        Reference label for each row of ``pred``.

    Returns
    -------
    tuple
        ``(max ARI, max AMI, max NMI)``. Each maximum is taken independently,
        so the three values may come from different clusterings.

    Notes
    -----
    A spectral-clustering run that raises an exception, or emits any warning
    (warnings are promoted to errors for those runs), contributes 0 to the
    score lists instead of aborting the evaluation.
    """
    n_clusters = len(np.unique(labels))

    ari_res = []
    ami_res = []
    nmi_res = []

    def _record(assignments):
        # Append the three agreement scores of one clustering, in order.
        ari_res.append(adjusted_rand_score(labels, assignments))
        ami_res.append(adjusted_mutual_info_score(labels, assignments))
        nmi_res.append(normalized_mutual_info_score(labels, assignments))

    # Baseline: KMeans runs with the caller's normal warning behaviour.
    _record(KMeans(n_clusters=n_clusters, random_state=42).fit_predict(pred))

    affinities = ['cosine', 'linear', 'poly']

    # catch_warnings() restores the caller's warning filters on exit, even if
    # something propagates out of the loop. The previous resetwarnings() call
    # wiped every filter that had been installed before this function ran.
    with warnings.catch_warnings():
        warnings.filterwarnings("error")

        for affinity in tqdm(affinities):
            try:
                _record(
                    SpectralClustering(
                        n_clusters=n_clusters,
                        random_state=42,
                        affinity=affinity,
                    ).fit_predict(pred)
                )
            except Exception:
                # Promoted warnings and ordinary failures count as a zero
                # score. KeyboardInterrupt / SystemExit are deliberately not
                # caught, so the user can still stop a long-running cell.
                ari_res.append(0)
                ami_res.append(0)
                nmi_res.append(0)

    return max(ari_res), max(ami_res), max(nmi_res)

In [5]:
def get_data(path='../data/pbmc40k/data.csv.gz', min_fraction=0.001):
    """Load the expression table, filter sparse columns and normalize it.

    Steps, in order:

    1. Read the CSV at ``path`` with polars.
    2. Keep every column (from index 1 onward) whose summed ``np.sign`` over
       all rows exceeds ``min_fraction * n_rows``. For non-negative values
       this is the number of non-zero entries -- assumes the table holds
       counts; TODO confirm.
    3. Take the column named ``''`` as the label vector (presumably the
       unnamed first CSV column; verify against the data file).
    4. Apply ``log(x + 1)``, then ``scprep`` library-size normalization, then
       the ``scprep`` square-root transform.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the PBMC 40k path that was
        previously hard-coded.
    min_fraction : float, optional
        Detection threshold for step 2, as a fraction of the number of rows.
        Defaults to the previously hard-coded 0.001.

    Returns
    -------
    tuple
        ``(df_norm, labels)``: the transformed matrix and a NumPy array of
        labels.
    """
    raw = pl.read_csv(path)

    # np.sign maps every entry to -1 / 0 / +1, so the per-column sum is a
    # detection count as long as no value is negative.
    presence = np.sign(raw[:, 1:])
    is_kept = np.sum(presence, axis=0) > raw.shape[0] * min_fraction
    kept_cols = [name for name, keep in zip(raw.columns[1:], is_kept) if keep]

    labels = np.array(raw[''])

    logged = np.log(raw[:, kept_cols] + 1)
    df_norm = scprep.normalize.library_size_normalize(logged)
    df_norm = scprep.transform.sqrt(df_norm)
    return df_norm, labels

In [6]:
# Load the filtered, normalized matrix together with the per-row labels.
X_norm, labels = get_data()

In [9]:
# Displays (max ARI, max AMI, max NMI) over KMeans and the spectral-clustering runs.
get_cluster_metrics(X_norm, labels)

  super()._check_params_vs_input(X, default_n_init=10)


(0.5894313389685876, 0.6578688141600596, 0.6580663671114987)