In [33]:
import anndata
import scanpy as sc
import numpy as np
from sklearn.decomposition import PCA as pca
import argparse
import matplotlib.pyplot as plt



def read_data(data_path):
    """Load the CSV file at ``data_path`` with ``anndata.read_csv``.

    Args:
        data_path: Location of the CSV file; passed through unchanged.

    Returns:
        The object produced by ``anndata.read_csv`` for that path.
    """
    adata = anndata.read_csv(data_path)
    return adata

def preprocess_data(adata: anndata.AnnData, scale: bool = True):
    """Filter cells, normalize, log-transform and optionally scale ``adata``.

    The scanpy calls below have their return values discarded, so this
    function relies on them modifying ``adata`` in place; the same object
    is returned for convenience. Only cells are filtered here (no gene
    filtering is performed).

    Args:
        adata: Annotated data matrix to preprocess.
        scale: When true, additionally scale the log-transformed values
            (zero-centred, clipped at 10) and replace any NaN entries of
            ``adata.X`` with 0.

    Returns:
        The processed ``adata``.
    """
    # Cell-level quality thresholds, applied in this order.
    for threshold in ({"min_counts": 5000}, {"min_genes": 500}):
        sc.pp.filter_cells(adata, **threshold)

    # NOTE(review): normalize_per_cell is scanpy's legacy normalization entry
    # point; consider normalize_total once equivalence is confirmed.
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    # Assigned after normalization and before log1p/scaling.
    adata.raw = adata

    sc.pp.log1p(adata)
    if not scale:
        return adata

    sc.pp.scale(adata, max_value=10, zero_center=True)
    # Any NaN left in the matrix after scaling is replaced with 0.
    nan_mask = np.isnan(adata.X)
    adata.X[nan_mask] = 0
    return adata


def PCA(X, num_components: int, random_state=None):
    """Project ``X`` onto its leading principal components.

    Thin wrapper around scikit-learn's ``PCA`` (imported in this file as
    ``pca``).

    Args:
        X: Data matrix to fit and transform.
        num_components: Number of principal components to keep.
        random_state: Optional seed forwarded to scikit-learn's ``PCA`` so
            that runs using a randomized SVD solver are reproducible. The
            default ``None`` keeps the previous (unseeded) behaviour.

    Returns:
        The transformed data returned by ``fit_transform``.
    """
    reducer = pca(n_components=num_components, random_state=random_state)
    return reducer.fit_transform(X)

# Pipeline: read the CSV, preprocess it, reduce it with PCA, and cache the
# result as a pickled DataFrame.
# pandas is imported here because `pd` is used below but is not imported at
# the top of this cell; without it a fresh-kernel run fails with NameError.
import pandas as pd

data_path = 'data/scRNAseq_human_pancreas.csv'
# NOTE(review): `heart` holds the pancreas dataset named in `data_path`; the
# variable name is kept so that any other cell referencing it keeps working.
heart = read_data(data_path)
heart = preprocess_data(heart)
# Keep the first 100 principal components.
X = PCA(heart.X, 100)
df = pd.DataFrame(X)
df.to_pickle('processed_data.pkl')

In [1]:
import pandas as pd
import anndata
import scanpy as sc
import numpy as np
from sklearn.decomposition import PCA as pca
import argparse
import matplotlib.pyplot as plt
from kmeans import KMeans
# Reload the features cached in 'processed_data.pkl'.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by a trusted run of this notebook.
loaded_data = pd.read_pickle('processed_data.pkl')

# Fit the project's KMeans (constructed with 3) on the raw NumPy matrix.
# `fit` is unpacked as (clustering, centroids).
knn = KMeans(3)
feature_matrix = loaded_data.to_numpy()
learnt_clustering, learnt_centroids = knn.fit(feature_matrix)

random centroids
[[-1.64517803e+01  9.06427956e+00 -6.56731176e+00  1.49685822e+01
  -1.88712466e+00 -2.13022637e+00 -1.23636303e+01 -6.44362307e+00
  -5.94400024e+00  8.95148754e-01 -7.89816046e+00 -1.58444977e+00
   3.38181597e-03  5.92348695e-01 -1.50622797e+00  4.13811302e+00
   2.87819767e+00 -9.51347053e-02 -3.51382065e+00  2.15371147e-01
  -3.76634431e+00 -4.86321747e-01 -2.56057525e+00  1.86067080e+00
   3.55125046e+00  2.53530288e+00 -1.18630791e+00  8.57142329e-01
   1.89069891e+00  9.52825844e-01  4.27287780e-02 -1.05329716e+00
   9.91625786e-01 -1.06595445e+00  9.84307528e-01 -4.09593850e-01
  -1.62349844e+00  2.37178469e+00  2.99360824e+00 -4.08984721e-01
   1.65419209e+00 -4.60975552e+00 -5.89209139e-01  2.98847771e+00
   1.04231381e+00 -5.01494348e-01  1.79273058e-02 -1.38872850e+00
  -3.79595399e-01 -1.25096500e+00 -6.95186436e-01 -5.69978619e+00
   6.09663665e-01  1.31692767e+00 -1.63644648e+00 -3.63141149e-01
   2.40827298e+00  5.86602986e-01 -1.91048652e-01  3.705104

In [2]:
# Show the fitted centroids first, then the clustering, one per line.
print(learnt_centroids, learnt_clustering, sep="\n")

[[-1.64517803e+01  9.06427956e+00 -6.56731176e+00  1.49685822e+01
  -1.88712466e+00 -2.13022637e+00 -1.23636303e+01 -6.44362307e+00
  -5.94400024e+00  8.95148754e-01 -7.89816046e+00 -1.58444977e+00
   3.38181597e-03  5.92348695e-01 -1.50622797e+00  4.13811302e+00
   2.87819767e+00 -9.51347053e-02 -3.51382065e+00  2.15371147e-01
  -3.76634431e+00 -4.86321747e-01 -2.56057525e+00  1.86067080e+00
   3.55125046e+00  2.53530288e+00 -1.18630791e+00  8.57142329e-01
   1.89069891e+00  9.52825844e-01  4.27287780e-02 -1.05329716e+00
   9.91625786e-01 -1.06595445e+00  9.84307528e-01 -4.09593850e-01
  -1.62349844e+00  2.37178469e+00  2.99360824e+00 -4.08984721e-01
   1.65419209e+00 -4.60975552e+00 -5.89209139e-01  2.98847771e+00
   1.04231381e+00 -5.01494348e-01  1.79273058e-02 -1.38872850e+00
  -3.79595399e-01 -1.25096500e+00 -6.95186436e-01 -5.69978619e+00
   6.09663665e-01  1.31692767e+00 -1.63644648e+00 -3.63141149e-01
   2.40827298e+00  5.86602986e-01 -1.91048652e-01  3.70510489e-01
   1.24952