In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
# Put the directory two levels above the notebook on the import path
# (the notebook imports `src.*` packages afterwards), then show the
# current working directory as the cell output.
module_path = os.path.abspath('../..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
os.getcwd()

'/srv/mfs/hausserlab/fabio/data_analysis/notebooks/nature_cancer_building_blocks_analysis'

In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from src.utils.archetypes import ArchetypalAnalysis
from src.CellAbundance import CellAbundance, generate_abundance_matrix, join_abundance_matrices
from src.utils.visualization import plot_scatter_pca, plot_cumulative_explained_variance, plot_PCAs, plot_CVEs

In [14]:
# ----------------------------------------------------------------------
# CONTROL PANEL -- every tunable parameter of the notebook lives here
# ----------------------------------------------------------------------

# Identifiers of the patients to process (order is preserved downstream).
patient_ids = [
    88, 428, 357, 389, 303, 445, 374, 33, 549, 537,
    477, 393, 429, 329, 443, 305, 105, 469, 555, 532,
    6, 40, 415, 7, 221, 10, 395, 494, 332, 409,
    87, 479, 359, 36, 265, 273, 229, 261, 184, 181,
    521, 102, 468, 509, 92, 94, 331, 260, 250, 233,
    234, 236, 231, 170, 365, 175, 154, 19, 132, 98,
    58, 61, 16, 63, 432, 361, 263, 426, 366, 35,
    340, 1, 336,
]

# Arguments forwarded to generate_abundance_matrix.
# NOTE(review): presumably the number of sites sampled per patient -- confirm.
N_SITE = 100
# Previously tried value: 100.
RADIUS = 25
# Previously tried value: 'abs'.
METHOD = 'gaussian'

# Cell populations to count.
CELL_TYPES = [
    'Fibroblasts',
    'Endothelial',
    'T cells',
    'B cells',
    'Cancer',
    'Vascular SMA+',
    'Myoepithelial',
    'Macrophages',
]

# Input data location and destination folder of the exported CSV files.
ROOT_DATA_PATH = "../../data/tnbc_nature_cancer_dataset"
ROOT_OUTPUT_CSV_PATH = "../../output/csv_files_nature"

# Seed handed to the site generation step.
RANDOM_SEED = 1022

In [15]:
# Generate the sites and the cell abundance for the selected patients.
# The result is iterated by the following cells; each element exposes a
# `patient_id` and an `abundance_matrix`.
abbundance_matrix = generate_abundance_matrix(
    CELL_TYPES,
    patient_ids,
    N_SITE,
    RADIUS,
    METHOD,
    root=ROOT_DATA_PATH,
    random_seed=RANDOM_SEED,
)

In [16]:
# Build the dataset with all the sites from the patients and the cells
# belonging to each site (point one), then export it as CSV.
#
# The per-patient frames are collected in a list and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# re-copied the whole accumulated frame on every iteration.
site_cell_frames = []
for ca in abbundance_matrix:
    df_ca = ca.get_site_cell_map_dataframe()
    df_ca['patient_id'] = int(ca.patient_id)  # tag every row with its patient
    site_cell_frames.append(df_ca)

# pd.concat raises on an empty list; keep the original "empty frame" result.
# ignore_index=True yields the same fresh 0..n-1 index as reset_index(drop=True).
if site_cell_frames:
    df = pd.concat(site_cell_frames, ignore_index=True)
else:
    df = pd.DataFrame()

# Create the output folder if needed so the export cannot fail after the
# expensive site generation step.
os.makedirs(ROOT_OUTPUT_CSV_PATH, exist_ok=True)
df.to_csv("{}/site_cells.csv".format(ROOT_OUTPUT_CSV_PATH), index=False, header=True)

In [17]:
# Display the combined site/cell table built in the previous cell.
df

Unnamed: 0,site_idx,site_x_centers,site_y_centers,x_cell,y_cell,cell_type,cell_id,patient_id
0,0,361.229,262.424,390.575,228.115,Fibroblasts,60,88
1,0,361.229,262.424,328.742,233.364,Fibroblasts,64,88
2,0,361.229,262.424,321.46,259.745,Fibroblasts,85,88
3,0,361.229,262.424,337.844,260.094,Fibroblasts,86,88
4,0,361.229,262.424,357.902,264.492,T cells,91,88
...,...,...,...,...,...,...,...,...
94466,97,303.69,245.172,340.459,252.82,Cancer,647,336
94467,97,303.69,245.172,350.988,245.527,Cancer,627,336
94468,97,303.69,245.172,350.213,234.59,Cancer,598,336
94469,97,303.69,245.172,339.03,264.048,Cancer,677,336


In [18]:
# Abundance matrix: one row per site, one column per entry of each patient's
# `abundance_matrix`, plus the site and patient identifiers; exported as CSV.
#
# The per-patient frames are collected in a list and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# re-copied the whole accumulated frame on every iteration.
abundance_frames = []
for ca in abbundance_matrix:
    abbundance_df = pd.DataFrame(ca.abundance_matrix)
    abbundance_df['site_idx'] = np.arange(len(abbundance_df))  # 0..n_sites-1 within the patient
    abbundance_df['patient_idx'] = ca.patient_id
    abundance_frames.append(abbundance_df)

# pd.concat raises on an empty list; keep the original "empty frame" result.
stacked = pd.concat(abundance_frames) if abundance_frames else pd.DataFrame()

# reset_index() without drop=True is kept on purpose: it emits the extra
# 'index' column that the exported CSV has always contained.
df_2 = stacked.reset_index()

# Create the output folder if needed so the export cannot fail after the
# expensive site generation step.
os.makedirs(ROOT_OUTPUT_CSV_PATH, exist_ok=True)
df_2.to_csv("{}/abbundance_matrix.csv".format(ROOT_OUTPUT_CSV_PATH), index=False, header=True)

In [19]:
# Display the stacked abundance table built in the previous cell.
df_2

Unnamed: 0,index,0,1,2,3,4,5,6,7,site_idx,patient_idx
0,0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0,88
1,1,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,1,88
2,2,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,2,88
3,3,1.778474,0.283981,0.000000,0.0,0.527223,0.132167,0.000000,0.000000,3,88
4,4,0.245570,0.000000,0.277133,0.0,3.643198,0.000000,1.156329,0.000000,4,88
...,...,...,...,...,...,...,...,...,...,...,...
7295,95,4.455369,1.070409,0.375205,0.0,2.488352,0.000000,1.112848,0.461928,95,336
7296,96,6.857301,1.445886,0.000000,0.0,0.345864,0.102890,0.000000,2.111808,96,336
7297,97,2.872044,0.431237,0.550682,0.0,2.322325,0.666016,0.000000,0.210873,97,336
7298,98,1.540786,0.359752,0.226287,0.0,5.622828,0.000000,0.000000,0.324268,98,336


In [20]:
# Archetypal analysis on the first three principal components.

# Merge the per-patient abundance data into one site matrix.
# NOTE(review): `patients_ids` is not used in the visible cells -- confirm
# whether a later cell needs it.
sites, patients_ids = join_abundance_matrices(abbundance_matrix)

# PCA with default settings (all components kept); `pc` holds the
# transformed sites.
pca = PCA()
pc = pca.fit_transform(sites)

# Four archetypes, fitted on the first three PCA columns only.
AA_3D = ArchetypalAnalysis(
    n_archetypes=4,
    tolerance=0.001,
    max_iter=200,
    random_state=0,
    C=0.0001,
    initialize='random',
    redundancy_try=30,
)
AA_3D.fit_transform(pc[:, :3])

array([[-1.6513612 , -0.57122028, -0.0582035 ],
       [-1.6513612 , -0.57122028, -0.0582035 ],
       [-1.6513612 , -0.57122028, -0.0582035 ],
       ...,
       [ 1.30750449,  1.72650856,  0.07957911],
       [ 4.19748402, -0.37937627,  0.05108449],
       [ 0.65778239,  0.16700683, -0.02336665]])

In [21]:
# Export the archetype matrix: `AA_3D.alfa` transposed, one column per
# archetype, tagged with the site and patient identifiers of df_2.
archetype_weights = AA_3D.alfa.T

# Derive the column labels ('1', '2', ...) from the actual number of
# archetype columns instead of hard-coding four names, so this cell keeps
# working when n_archetypes is changed.
archetype_labels = [str(k) for k in range(1, archetype_weights.shape[1] + 1)]
df_archetypes = pd.DataFrame(archetype_weights, columns=archetype_labels)

# Column assignment aligns on the index; both frames carry a default
# 0..n-1 index here.
# NOTE(review): assumes the rows of AA_3D.alfa.T are in the same order as
# the rows of df_2 -- confirm against join_abundance_matrices.
df_archetypes['site_idx'] = df_2['site_idx']
df_archetypes['patient_idx'] = df_2['patient_idx']

# Create the output folder if needed so the export cannot fail after the
# archetype fit.
os.makedirs(ROOT_OUTPUT_CSV_PATH, exist_ok=True)
df_archetypes.to_csv("{}/archetypes_matrix.csv".format(ROOT_OUTPUT_CSV_PATH), index=False, header=True)

In [22]:
# Display the archetype table built in the previous cell.
df_archetypes

Unnamed: 0,1,2,3,4,site_idx,patient_idx
0,0.000000,0.000000,0.000000,1.000000,0,88
1,0.000000,0.000000,0.000000,1.000000,1,88
2,0.000000,0.000000,0.000000,1.000000,2,88
3,0.019092,0.150599,0.000000,0.830309,3,88
4,0.205522,0.018976,0.018499,0.757003,4,88
...,...,...,...,...,...,...
7295,0.115141,0.372607,0.048867,0.463386,95,336
7296,0.000000,0.560838,0.124008,0.315154,96,336
7297,0.114467,0.238314,0.043521,0.603697,97,336
7298,0.310131,0.124695,0.032929,0.532246,98,336
