In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
from scipy.stats import mode
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import random
import math
import json
import time
import os
import scipy.stats as stats
from scipy.stats import kendalltau
import re
plt.rcParams['font.family'] = 'DejaVu Sans'
from typing import List, Optional

import utils

In [2]:
data = pd.read_csv("data/synthetic/5|50.csv")
biomarker_df = data[data['biomarker'] == "HIP-FCI (1)"].reset_index(drop=True)
biomarker_df.head()

Unnamed: 0,participant,biomarker,measurement,diseased
0,0,HIP-FCI (1),-6.478694,True
1,1,HIP-FCI (1),0.97191,True
2,2,HIP-FCI (1),-1.679449,True
3,3,HIP-FCI (1),-9.749598,True
4,4,HIP-FCI (1),6.355445,True


In [3]:
biomarker_df.measurement.values

array([ -6.47869406,   0.97190956,  -1.6794493 ,  -9.74959777,
         6.35544488,   1.24766926,   1.12557536,   9.8238457 ,
        -7.14583467,   2.44920504,   8.75654243,  -0.18754759,
        -0.22802603,  -8.43810928,  -1.52692228, -13.79810626,
        -8.65548913,  -7.05471431,  -3.24417868,  -6.65935328,
         7.5461844 ,  -2.8848049 ,   5.8309055 , -12.35114877,
       -10.59805685, -12.33427578,   9.16934171,  -6.15459225,
       -13.14885106,  -4.75569505,  -0.12540885,  -2.41557142,
         7.66126459,   2.19719472, -14.56447599,  -5.97423647,
        -5.49056799, -14.34299008,   4.95096849, -12.27604413,
        -2.13659689,  -8.67773602,  -8.16773116,   6.11274656,
        -5.02316793,  -8.31486091,  -5.34962163,  -9.23721014,
       -17.45340079,  -3.12139616])

In [4]:
# reshape to satisfy sklearn requirements
measurements = np.array(biomarker_df['measurement']).reshape(-1, 1)

# dataframe for non-diseased participants
healthy_df = biomarker_df[biomarker_df['diseased'] == False]


In [5]:
kmeans_setup = KMeans(2, random_state=0, n_init="auto")
hierarchical_clustering_setup = AgglomerativeClustering(n_clusters=2)

In [6]:
clustering_result_kmeans = kmeans_setup.fit(measurements)
clustering_result_kmeans.labels_

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0], dtype=int32)

In [7]:
clustering_result_kmeans.predict(measurements)

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0], dtype=int32)

In [8]:
theta_mean, theta_std, phi_mean, phi_std = utils.compute_theta_phi_for_biomarker(
    data, "HIP-FCI (1)", clustering_setup = None)

TypeError: compute_theta_phi_for_biomarker() got an unexpected keyword argument 'clustering_setup'

In [None]:
theta_mean, theta_std, phi_mean, phi_std

(-8.670515224654201, 3.7456773038825366, 3.1380607558332643, 3.979854599286057)

In [9]:
biomarkers = data.biomarker.unique()
utils.get_theta_phi_estimates(data, biomarkers)

{'HIP-FCI (1)': {'theta_mean': -8.670515224654201,
  'theta_std': 3.7456773038825366,
  'phi_mean': 3.1380607558332643,
  'phi_std': 3.9798545992860563},
 'PCC-FCI (2)': {'theta_mean': 3.8477722684768514,
  'theta_std': 2.240421770525342,
  'phi_mean': 11.802146434595763,
  'phi_std': 2.6699676610506673},
 'HIP-GMI (3)': {'theta_mean': 0.2834803893887619,
  'theta_std': 0.2632128475918643,
  'phi_mean': 0.2834803893887619,
  'phi_std': 0.2632128475918643},
 'FUS-GMI (4)': {'theta_mean': 0.4931581629003902,
  'theta_std': 0.06414911515382,
  'phi_mean': 0.4931581629003902,
  'phi_std': 0.06414911515382},
 'FUS-FCI (5)': {'theta_mean': -19.698716486692724,
  'theta_std': 5.426383241871959,
  'phi_mean': -9.264096908187947,
  'phi_std': 2.4969637281638892}}