In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys

# Put the directory two levels above the current working directory on the
# import path, so that the `src` package imported by the next cell resolves.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

# Display the working directory that the relative paths are resolved against.
os.getcwd()

'/srv/mfs/hausserlab/fabio/data_analysis/notebooks/karen_building_blocks_analysis'

In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from src.utils.archetypes import ArchetypalAnalysis
from src.CellAbundance import CellAbundance, generate_abundance_matrix, join_abundance_matrices
from src.utils.visualization import plot_scatter_pca, plot_cumulative_explained_variance, plot_PCAs, plot_CVEs

In [3]:
##########################################################
#      CONTROL PANEL       #  set your parameters here   #
##########################################################

# Patients 1..41, with patient 30 left out.
patient_ids = [pid for pid in range(1, 42) if pid != 30]

# Arguments forwarded to generate_abundance_matrix (site count, radius, method).
N_SITE = 100
RADIUS = 100
METHOD = 'abs'

# Cell-type labels; spelling must match the labels stored in the data files.
CELL_TYPES = [
    'CD8-T',
    'Other immune',
    'DC / Mono',
    'CD3-T',
    'B',
    'NK',
    'Kreatin-positive tumor',
    'Tumor',
    'CD4-T',
    'Mesenchymal-like',
    'Macrophages',
    'Endothelial',
    'Tregs',
    'Unidentified',
    'DC',
    'Mono / Neu',
    'Neutrophils',
]

# Input data location and destination folder for the exported CSV files.
ROOT_DATA_PATH = "../../../output/cell_positions_data"
ROOT_OUTPUT_CSV_PATH = "../../../output/csv_files"

# Seed passed to generate_abundance_matrix.
RANDOM_SEED = 1022

In [4]:
# Generate the sites and their cell abundances for every patient in the
# control panel. The result is iterated patient by patient in the cells below.
abbundance_matrix = generate_abundance_matrix(
    CELL_TYPES,
    patient_ids,
    N_SITE,
    RADIUS,
    METHOD,
    root=ROOT_DATA_PATH,
    random_seed=RANDOM_SEED,
)

In [23]:
# Build the dataset with all the sites from the patients and the cells
# belonging to each site (point one), then export it as CSV.
#
# One frame is collected per patient and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# table on every iteration. `.assign` adds the patient column on a copy, so
# the frame returned by get_site_cell_map_dataframe() is not mutated.
site_cell_frames = [
    ca.get_site_cell_map_dataframe().assign(patient_id=int(ca.patient_id))
    for ca in abbundance_matrix
]

# pd.concat raises on an empty list; fall back to an empty frame in that case.
# ignore_index=True yields the same fresh 0..n-1 row index as the former
# reset_index(drop=True).
if site_cell_frames:
    df = pd.concat(site_cell_frames, ignore_index=True)
else:
    df = pd.DataFrame()

df.to_csv("{}/site_cells.csv".format(ROOT_OUTPUT_CSV_PATH), index=False, header=True)

In [24]:
# Preview the site/cell table that was just written to site_cells.csv.
df

Unnamed: 0,site_idx,site_x_centers,site_y_centers,x_cell,y_cell,cell_type,cell_id,patient_id
0,0,192.282,192.282,130.455,306.54,Macrophages,2023,1
1,0,192.282,192.282,138.45,302.64,B,2003,1
2,0,192.282,192.282,150.345,310.635,Kreatin-positive tumor,2036,1
3,0,192.282,192.282,152.88,298.35,Kreatin-positive tumor,1963,1
4,0,192.282,192.282,154.83,288.015,CD3-T,1873,1
...,...,...,...,...,...,...,...,...
1043310,99,314.243,314.243,384.735,589.875,DC / Mono,3988,41
1043311,99,314.243,314.243,391.56,587.925,Macrophages,3969,41
1043312,99,314.243,314.243,333.645,604.695,Macrophages,4101,41
1043313,99,314.243,314.243,356.85,612.885,Macrophages,4182,41


In [12]:
# Abundance matrix: one row per site with its per-column counts, tagged with
# the site index (position within the patient) and the patient id.
# Presumably the count columns follow the CELL_TYPES order - TODO confirm.
#
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated table on every iteration.
abundance_frames = []
for ca in abbundance_matrix:
    abbundance_df = pd.DataFrame(ca.abundance_matrix)
    abbundance_df['site_idx'] = np.arange(len(abbundance_df))
    abbundance_df['patient_idx'] = ca.patient_id
    abundance_frames.append(abbundance_df)

# pd.concat raises on an empty list; fall back to an empty frame in that case.
df_2 = pd.concat(abundance_frames) if abundance_frames else pd.DataFrame()

# NOTE(review): reset_index() without drop=True keeps the per-patient row
# number as an extra 'index' column, which ends up in the CSV and appears to
# duplicate 'site_idx'. Kept as-is so the exported schema does not change.
df_2 = df_2.reset_index()
df_2.to_csv("{}/abbundance_matrix.csv".format(ROOT_OUTPUT_CSV_PATH), index=False, header=True)

In [13]:
# Preview the abundance table that was just written to abbundance_matrix.csv.
df_2

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,site_idx,patient_idx
0,0,3,11,7,5,5,0,143,1,1,4,12,3,0,2,0,0,0,0,1
1,1,10,10,0,16,276,2,78,0,14,0,3,0,0,0,0,0,0,1,1
2,2,2,4,9,8,5,0,139,0,3,2,9,2,0,2,0,0,1,2,1
3,3,6,3,13,11,13,0,141,0,7,1,18,0,0,1,0,0,0,3,1
4,4,11,21,6,15,12,2,131,0,4,12,34,11,0,5,0,0,0,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3994,95,36,15,47,14,4,1,60,0,31,25,31,2,0,2,0,11,12,95,41
3995,96,32,25,32,29,9,1,59,1,15,23,22,1,0,3,0,10,13,96,41
3996,97,32,20,38,27,10,1,41,1,19,32,29,1,0,3,0,16,10,97,41
3997,98,25,16,28,27,4,0,74,1,8,22,23,1,0,2,0,6,11,98,41


In [16]:
# Archetypal analysis on the PCA-projected site abundances.

# Presumably stacks the sites of all patients into a single matrix and
# returns the matching patient ids - TODO confirm against join_abundance_matrices.
sites, patients_ids = join_abundance_matrices(abbundance_matrix)

# Full PCA (all components kept); only the leading ones are used below.
pca = PCA()
pc = pca.fit_transform(sites)

# Fit 4 archetypes in the space of the first three principal components.
AA_3D = ArchetypalAnalysis(
    n_archetypes=4,
    tolerance=0.001,
    max_iter=200,
    random_state=0,
    C=0.0001,
    initialize='random',
    redundancy_try=30,
)
AA_3D.fit_transform(pc[:, :3])



array([[  20.0638632 ,   -7.35117968,  -19.72478522],
       [-106.32866059,  207.84373371, -105.30130853],
       [  16.92781075,   -6.50024647,  -20.48839519],
       ...,
       [ -90.58545189,  -29.04531936,   -5.96026737],
       [ -53.03796365,  -27.38110777,   -9.13366421],
       [-111.64872168,  -46.36751719,  -20.89705525]])

In [18]:
# Export the archetype coefficients: AA_3D.alfa is transposed so that the four
# archetypes become the columns (presumably one row per site - TODO confirm).
df_archetypes = pd.DataFrame(AA_3D.alfa.T, columns=['1', '2', '3', '4'])

# The site/patient labels are copied from df_2 and matched by row index, so
# both tables must have the same number of rows. Without this check a mismatch
# would silently produce NaN or dropped labels in the exported file.
assert len(df_archetypes) == len(df_2), (
    "archetype matrix has {} rows but abundance table has {}".format(len(df_archetypes), len(df_2))
)
df_archetypes['site_idx'] = df_2['site_idx']
df_archetypes['patient_idx'] = df_2['patient_idx']
df_archetypes.to_csv("{}/archetypes_matrix.csv".format(ROOT_OUTPUT_CSV_PATH), index=False, header=True)

In [19]:
# Preview the archetype table that was just written to archetypes_matrix.csv.
df_archetypes

Unnamed: 0,1,2,3,4,site_idx,patient_idx
0,0.017151,0.006542,0.414560,0.561747,0,1
1,0.482869,0.000000,0.226668,0.290467,1,1
2,0.020456,0.007183,0.406281,0.566079,2,1
3,0.030850,0.034051,0.407601,0.527498,3,1
4,0.018399,0.047597,0.362740,0.571264,4,1
...,...,...,...,...,...,...
3994,0.013353,0.165897,0.145537,0.675212,95,41
3995,0.018220,0.112136,0.146100,0.723543,96,41
3996,0.020170,0.121304,0.088966,0.769560,97,41
3997,0.008842,0.082625,0.194786,0.713747,98,41
