In [1]:
from scipy.stats import zscore

from phd_journal.roma.plotting import *
from phd_journal.roma.combat_variations import *
from phd_journal.roma.read import *

#### I/O

In [2]:
# Feature folder of each site, keyed by site name (insertion order is kept).
datasets_paths = dict(
    Newcastle="/Volumes/MMIS-Saraiv/Datasets/Newcastle/EC/features_source_ind-bands",
    Izmir="/Volumes/MMIS-Saraiv/Datasets/Izmir/EC/features_source_ind-bands",
    # NOTE(review): unlike the other two sites, this path has no "EC" subfolder — confirm it is intentional.
    Sapienza="/Volumes/MMIS-Saraiv/Datasets/Sapienza/features_source_ind-bands",
)

In [3]:
# Metadata CSV of each site, keyed by site name (insertion order is kept).
datasets_metadata_paths = dict(
    Izmir="/Volumes/MMIS-Saraiv/Datasets/Izmir/metadata.csv",
    Sapienza="/Volumes/MMIS-Saraiv/Datasets/Sapienza/metadata.csv",
    Newcastle="/Volumes/MMIS-Saraiv/Datasets/Newcastle/metadata.csv",
)

#### Read

In [4]:
# Read datasets: one read_dataset call per site, labelled with the site name
datasets = {
    site_name: read_dataset(features_dir, label=site_name)
    for site_name, features_dir in datasets_paths.items()
}

In [5]:
# Read metadata: one read_metadata call per site, labelled with the site name
datasets_metadata = {}
for site_name, csv_path in datasets_metadata_paths.items():
    metadata = read_metadata(csv_path, label=site_name)
    # SITE is the part of each index label that precedes the first '-'
    site_codes = [subject_id.split('-')[0] for subject_id in metadata.index]
    metadata['SITE'] = site_codes
    datasets_metadata[site_name] = metadata

In [36]:
# For each site, print its name followed by the number of metadata rows
# whose DIAGNOSIS is 'HC' and then 'AD'.
for site in ('Sapienza', 'Izmir', 'Newcastle'):
    print(site)
    site_metadata = datasets_metadata[site]
    for diagnosis in ("HC", "AD"):
        matching_rows = site_metadata[site_metadata['DIAGNOSIS'] == diagnosis]
        print(diagnosis, len(matching_rows))

#### Before log transformation

In [6]:
# Plot the datasets as read (no log transformation applied yet), with log_scale=False.
plot_mean_std_indep(datasets, datasets_metadata, log_scale=False)

In [7]:
# Mean differences on the datasets as read; the `log` argument is left at the function's default here.
plot_mean_diffs(datasets)

In [8]:
# Check normality before the log transformation
check_normality(datasets, datasets_metadata)
# Q-Q plots are disabled at this stage; to enable them: create_qq_plots(datasets)

#### Log transformation

In [9]:
# Approximate normality by log transformation.
# NOTE: `datasets` is rebound to the transformed frames, so re-running this cell
# without re-reading the data applies the transformation a second time.
log_transformed = {}
for site_name, features in datasets.items():
    log_transformed[site_name] = intra_dataset_norm(features, method='log')
datasets = log_transformed

In [10]:
# Check normality after log: `datasets` now holds the output of the log-transformation cell,
# so the same check is repeated and Q-Q plots are drawn on the transformed values.
check_normality(datasets, datasets_metadata)
create_qq_plots(datasets)

In [11]:
# Same plot as before the log step, now on the transformed `datasets`.
# log_scale=False is kept — presumably because the values are already in log space (TODO confirm).
plot_mean_std_indep(datasets, datasets_metadata, log_scale=False)

In [12]:
# Mean differences on the log-transformed datasets.
# NOTE(review): log=True presumably tells the plot that its inputs are in log space — confirm against plot_mean_diffs.
plot_mean_diffs(datasets, log=True)

#### Harmonisation

In [13]:
# Join all datasets and metadata (sites are stacked in dict insertion order)
features_all = pd.concat(datasets.values())
metadata_all = pd.concat(datasets_metadata.values())
# Keep only the metadata of the subjects in the feature matrix, in the same row order
all_metadata = metadata_all.loc[features_all.index]
assert len(features_all) == len(all_metadata)
# NeuroHarmonize, with all four cov_* flags (age, gender, education, diagnosis) enabled
X = neuro_harmonize(
    features_all,
    all_metadata,
    cov_age=True,
    cov_gender=True,
    cov_education=True,
    cov_diagnosis=True,
)
# Split the harmonised matrix back per site, using each original dataset's row index
datasets_after = {site_name: X.loc[original.index] for site_name, original in datasets.items()}

In [14]:
# Z-score the pooled matrix X (X is rebound to the normalised version)
X = intra_dataset_norm(X, method='z-score')
# Split it back per site, using each original dataset's row index
datasets_after_zscore = {
    site_name: X.loc[original.index]
    for site_name, original in datasets.items()
}

In [15]:
# Undo log transformation by raising 10 to each harmonised value.
# NOTE(review): this assumes intra_dataset_norm(method='log') is a base-10 log — TODO confirm.
datasets_after_nolog = {}
for site_name, harmonised in datasets_after.items():
    datasets_after_nolog[site_name] = np.power(10, harmonised)

#### After harmonisation

In [16]:
# Check normality after harmonisation (on the neuro_harmonize output split per site)
check_normality(datasets_after, datasets_metadata)
# Q-Q plots are disabled here; to enable them on the harmonised data: create_qq_plots(datasets_after)

In [17]:
# Plot all regions after harmonisation (log_scale is left at the function's default,
# unlike the pre-harmonisation calls, which pass log_scale=False)
plot_mean_std_indep(datasets_after, datasets_metadata)

In [18]:
# Plot all regions (Z-score): harmonised data after the z-score normalisation step
plot_mean_std_indep(datasets_after_zscore, datasets_metadata)

In [19]:
# Plot all regions (no log): harmonised data with the log transformation undone via 10**x
plot_mean_std_indep(datasets_after_nolog, datasets_metadata)

In [20]:
# Mean differences after harmonisation, with log=True as in the pre-harmonisation log-space plot
plot_mean_diffs(datasets_after, log=True)

In [21]:
# Before (`datasets`, log-transformed) vs after (`datasets_after`) harmonisation,
# with respect to the DIAGNOSIS and SITE metadata columns
correlation_with_var(datasets, datasets_after, datasets_metadata, ("DIAGNOSIS", "SITE"))

In [22]:
# Before vs after harmonisation, with DIAGNOSIS and SITE as the target variables.
# NOTE(review): relevant_features=None presumably means "use every feature" — confirm against classification_with_var.
classification_with_var(datasets, datasets_after, datasets_metadata, ("DIAGNOSIS", "SITE"), relevant_features=None)

In [23]:
# Quick look at the Izmir metadata. Only the first rows are shown so the saved
# notebook output does not carry the whole participant table.
datasets_metadata['Izmir'].head()

In [24]:
# Before vs after harmonisation, with MMSE and AGE as the target variables.
# NOTE(review): relevant_features=None presumably means "use every feature" — confirm against regression_with_var.
regression_with_var(datasets, datasets_after, datasets_metadata, ("MMSE", "AGE"), relevant_features=None)

In [25]:
# Two-component view of the data before vs after harmonisation, method='pca'
plot_2components(datasets, datasets_after, datasets_metadata, method='pca')

In [26]:
# Two-component view of the data before vs after harmonisation, method='lda'
plot_2components(datasets, datasets_after, datasets_metadata, method='lda')

In [27]:
# Two-component view of the data before vs after harmonisation, method='tsne'.
# NOTE(review): if this is t-SNE it is stochastic, and no seed is set in this notebook —
# check whether plot_2components fixes one, otherwise the figure changes between runs.
plot_2components(datasets, datasets_after, datasets_metadata, method='tsne')

In [28]:
# Distance matrix of the data before (`datasets`) vs after (`datasets_after`) harmonisation
plot_distance_matrix(datasets, datasets_after, datasets_metadata)