In [2]:
import numpy as np
import pandas as pd
import os
import math

In [3]:
# Data sources.
data_root = '../data'
data_timeseries = '../data/raw_ts'
data_phenotype = '../data/phenotype.csv'
data_ct = '../data/CT.csv'
data_euler = '../data/Euler.csv'
data_computed_fcms = '../data/processed_ts'

In [8]:
phenotypes = pd.read_csv(data_phenotype, sep=',')
phenotypes.index = np.vectorize(lambda x: 'UKB' + str(x))(phenotypes['eid'])

In [77]:
import enum


class Phenotype(enum.Enum):
    SEX = 'SEX'
    AGE = 'AGE'
    FULL_TIME_EDUCATION = 'FTE'
    FLUID_INTELLIGENCE = 'FI'
    PROSPECTIVE_MEMORY_RESULT = 'MEM'
    MENTAL_HEALTH = 'MEN'
    BIPOLAR_DISORDER_STATUS = 'BIP'
    NEUROTICISM_SCORE = 'NEU'
    SMOKING_STATUS = 'SMO'

    # noinspection PyMethodParameters
    def get_biobank_codes(feature):
        biobank_features = {
            Phenotype.SEX: ['31-0.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=31
            Phenotype.AGE: ['21003-2.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=21003
            Phenotype.FULL_TIME_EDUCATION: ['845-0.0', '845-1.0', '845-2.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=845
            Phenotype.FLUID_INTELLIGENCE: ['20016-0.0', '20016-1.0', '20016-2.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20016
            Phenotype.PROSPECTIVE_MEMORY_RESULT: ['20018-2.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20018
            Phenotype.MENTAL_HEALTH: ['20544-0.' + str(i) for i in range(1, 17)],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20544
            Phenotype.BIPOLAR_DISORDER_STATUS: ['20122-0.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20122
            Phenotype.NEUROTICISM_SCORE: ['20127-0.0'],  # http://biobank.ndph.ox.ac.,uk/showcase/field.cgi?id=20127
            Phenotype.SMOKING_STATUS: ['20116-2.0']  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20116
        }

        return biobank_features[feature]

In [56]:
def create_custom_similarity_function(feature_list):
    """
    Creates the similarity metric based on the phenotype feature list.

    If a feature has several entries in the UK Biobank, take either the most recent available estimate or, if the
    entries correspond to categories, consider the matching category values.

    The final score is an average of all the indicator scores for each feature, i.e. if two subjects have all of the
    features matching, the total score will be 1, and if none of the features match then the value will be 0. Edge
    creation then depends on the similarity threshold defined in graph construction.

    If both features are unknown, assume there is no match.
    # TODO support some deviations, e.g. if the values are in the same percentile range etc.

    Args:
        feature_list: list of features taken as Phenotype enumerated values.

    Returns:
        The similarity function taking in the phenotype list and returning the similarity score.
    """
    def get_similarity(phenotypes, subject_i, subject_j):
        total_score = 0
        for feature_set in feature_list:
            feature = feature_set.value
            if np.array_equal(feature, Phenotype.MENTAL_HEALTH.value):
                # TODO compare the rest of the categories
                # First value in the mental health feature array gives the overall diagnosis as string.
                total_score += int(phenotypes.loc[subject_i, feature[0]] == phenotypes.loc[subject_j, feature[0]])
            elif len(feature) > 1:
                # handle the more/less recent values
                instance_i = feature[0]
                for f in reversed(feature):
                    if phenotypes.loc[subject_i, f] != 'NaN':
                        instance_i = f
                        break
                instance_j = feature[0]
                for f in reversed(feature):
                    if phenotypes.loc[subject_j, f] != 'NaN':
                        instance_j = f
                        break
                total_score += int(phenotypes.loc[subject_i, instance_i] == phenotypes.loc[subject_j, instance_j])
            else:
                total_score += int(phenotypes.loc[subject_i, feature[0]] == phenotypes.loc[subject_j, feature[0]])
        return total_score * 1.0 / len(feature_list)

    return get_similarity

In [6]:
sex_similarity = create_custom_similarity_function([Phenotype.SEX])

Truth values from the dataset:

In [21]:
phenotype.loc[['UKB1000028', 'UKB1000260', 'UKB1000133'], Phenotype.SEX.value]

Unnamed: 0,31-0.0
UKB1000028,Female
UKB1000260,Female
UKB1000133,Male


In [9]:
sex_similarity(phenotypes, 'UKB1000028', 'UKB1000260')

1.0

In [10]:
sex_similarity(phenotypes, 'UKB1000028', 'UKB1000133')

0.0

Metric dependent on value recency: fluid intelligence might have been recorded in several, one, or no assessments. Take the most recent score if it is available.

In [11]:
fluid_intelligence_similarity = create_custom_similarity_function([Phenotype.FLUID_INTELLIGENCE])

Truth values from the dataset:

In [16]:
phenotype.loc[['UKB1007166', 'UKB1008180', 'UKB6015441', 'UKB6016745'], Phenotype.FLUID_INTELLIGENCE.value]

Unnamed: 0,20016-0.0,20016-1.0,20016-2.0
UKB1007166,7.0,,
UKB1008180,,7.0,
UKB6015441,5.0,5.0,
UKB6016745,,,


In [17]:
fluid_intelligence_similarity(phenotypes, 'UKB1007166', 'UKB1008180')

1.0

In [18]:
fluid_intelligence_similarity(phenotypes, 'UKB6015441', 'UKB6016745')

0.0

Combined metrics: sex and full-time education (the latter has several entries)

In [57]:
sex_fteducation = create_custom_similarity_function([Phenotype.SEX, Phenotype.FULL_TIME_EDUCATION])

Truth values from the dataset:

In [70]:
features = np.concatenate((Phenotype.FULL_TIME_EDUCATION.value, Phenotype.SEX.value))
phenotype.loc[['UKB1008180', 'UKB6016745', 'UKB1007166', 'UKB1000028', 'UKB6025942', 'UKB1003453', 'UKB1008552'], features]

Unnamed: 0,845-1.0,845-2.0,31-0.0
UKB1008180,16.0,,Female
UKB6016745,,,Female
UKB1007166,,,Female
UKB1000028,16.0,16.0,Female
UKB6025942,17.0,16.0,Female
UKB1003453,16.0,16.0,Female
UKB1008552,,16.0,Male


Only half of the features match:

In [59]:
sex_fteducation(phenotype, 'UKB1008180', 'UKB6016745')

0.5

Two NaNs cannot be matched so still half of the features:

In [60]:
sex_fteducation(phenotype, 'UKB1007166', 'UKB6016745')

0.5

Both features match

In [62]:
sex_fteducation(phenotype, 'UKB1000028', 'UKB6025942')

1.0

Education but not gender match

In [69]:
sex_fteducation(phenotype, 'UKB1003453', 'UKB1008552')

0.5

In [72]:
phenotype.loc[['UKB1008180', 'UKB6016745', 'UKB1007166', 'UKB1000028', 'UKB6025942', 'UKB1003453', 'UKB1008552'], Phenotype.FULL_TIME_EDUCATION.value]

Unnamed: 0,845-1.0,845-2.0
UKB1008180,16.0,
UKB6016745,,
UKB1007166,,
UKB1000028,16.0,16.0
UKB6025942,17.0,16.0
UKB1003453,16.0,16.0
UKB1008552,,16.0


In [74]:
feature = Phenotype.MENTAL_HEALTH

In [76]:
feature == Phenotype.MENTAL_HEALTH.value

False

In [78]:
def precompute_similarity_features(feature_list):
    """Precomputes the columns of the phenotype dataset for faster subject comparison.

    :return: dataframe containing the values used for similarity comparison, row-indexed by subject ID and
    column-indexed by phenotype code name (e.g. 'AGE', 'FTE' etc.)
    """
    # TODO create a dataframe containing all relevamnt similarity features and directly look them up
    # TODO without referring to the phenotype in get_similarity, just the subjects.

    phenotypes = pd.read_csv(data_phenotype, sep=',')
    phenotypes.index = ['UKB' + str(eid) for eid in phenotypes['eid']]

    biobank_feature_list = []
    for feature in feature_list:
        biobank_feature_list.extend(Phenotype.get_biobank_codes(feature))

    phenotype_processed = phenotypes[biobank_feature_list]

    def get_most_recent(ukb_feature, subject_id):
        instance = ukb_feature[0]
        for f in reversed(ukb_feature):
            if phenotypes.loc[subject_id, f] != 'NaN':
                instance = f
                break
        return instance

    for feature in feature_list:
        biobank_feature = Phenotype.get_biobank_codes(feature)
        if feature == Phenotype.MENTAL_HEALTH:
            # TODO compare the rest of the categories
            # First value in the mental health feature array gives the overall diagnosis as string.
            phenotype_processed[feature.value] = phenotype_processed[biobank_feature[0]]
        elif len(biobank_feature) > 1:
            # handle the more/less recent values
            si = pd.Index.to_series(phenotype_processed)
            phenotype_processed[feature.value] = si.apply(lambda s: get_most_recent(biobank_feature, s))

In [79]:
feature_list = [Phenotype.MENTAL_HEALTH, Phenotype.FULL_TIME_EDUCATION]

In [96]:
phenotypes = pd.read_csv(data_phenotype, sep=',')
phenotypes.index = ['UKB' + str(eid) for eid in phenotypes['eid']]

biobank_feature_list = []
for feature in feature_list:
    biobank_feature_list.extend(Phenotype.get_biobank_codes(feature))

phenotype_processed = phenotypes[biobank_feature_list].copy()
phenotype_processed

Unnamed: 0,20544-0.1,20544-0.2,20544-0.3,20544-0.4,20544-0.5,20544-0.6,20544-0.7,20544-0.8,20544-0.9,20544-0.10,20544-0.11,20544-0.12,20544-0.13,20544-0.14,20544-0.15,20544-0.16,845-0.0,845-1.0,845-2.0
UKB1000028,,,,,,,,,,,,,,,,,16.0,16.0,16.0
UKB1000133,,,,,,,,,,,,,,,,,15.0,,
UKB1000260,Panic attacks,,,,,,,,,,,,,,,,,,
UKB1000329,,,,,,,,,,,,,,,,,,,
UKB1000430,Depression,15.0,,,,,,,,,,,,,,,21.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6025704,,,,,,,,,,,,,,,,,16.0,16.0,18.0
UKB6025720,,,,,,,,,,,,,,,,,18.0,,18.0
UKB6025771,,,,,,,,,,,,,,,,,,,
UKB6025805,"Anxiety, nerves or GAD",,,,,,,,,,,,,,,,18.0,,18.0


In [97]:
def get_most_recent(ukb_feature, subject_id):
    instance = ukb_feature[0]
    for f in reversed(ukb_feature):
        if phenotypes.loc[subject_id, f] != 'NaN':
            instance = f
            break
    return phenotypes.loc[subject_id, instance]

for feature in feature_list:
    biobank_feature = Phenotype.get_biobank_codes(feature)
    if feature == Phenotype.MENTAL_HEALTH:
        # TODO compare the rest of the categories
        # First value in the mental health feature array gives the overall diagnosis as string.
        phenotype_processed.loc[:, feature.value] = phenotype_processed[biobank_feature[0]].copy()
    elif len(biobank_feature) > 1:
        # handle the more/less recent values
        si = phenotype_processed.index.to_series()
        phenotype_processed.loc[:, feature.value] = si.apply(lambda s: get_most_recent(biobank_feature, s))

In [98]:
phenotype_processed

Unnamed: 0,20544-0.1,20544-0.2,20544-0.3,20544-0.4,20544-0.5,20544-0.6,20544-0.7,20544-0.8,20544-0.9,20544-0.10,...,20544-0.12,20544-0.13,20544-0.14,20544-0.15,20544-0.16,845-0.0,845-1.0,845-2.0,MEN,FTE
UKB1000028,,,,,,,,,,,...,,,,,,16.0,16.0,16.0,,16.0
UKB1000133,,,,,,,,,,,...,,,,,,15.0,,,,
UKB1000260,Panic attacks,,,,,,,,,,...,,,,,,,,,Panic attacks,
UKB1000329,,,,,,,,,,,...,,,,,,,,,,
UKB1000430,Depression,15.0,,,,,,,,,...,,,,,,21.0,,,Depression,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6025704,,,,,,,,,,,...,,,,,,16.0,16.0,18.0,,18.0
UKB6025720,,,,,,,,,,,...,,,,,,18.0,,18.0,,18.0
UKB6025771,,,,,,,,,,,...,,,,,,,,,,
UKB6025805,"Anxiety, nerves or GAD",,,,,,,,,,...,,,,,,18.0,,18.0,"Anxiety, nerves or GAD",18.0


In [99]:
phenotype_processed.drop(biobank_feature_list, axis=1, inplace=True)

In [100]:
phenotype_processed

Unnamed: 0,MEN,FTE
UKB1000028,,16.0
UKB1000133,,
UKB1000260,Panic attacks,
UKB1000329,,
UKB1000430,Depression,
...,...,...
UKB6025704,,18.0
UKB6025720,,18.0
UKB6025771,,
UKB6025805,"Anxiety, nerves or GAD",18.0


In [94]:
Phenotype.MENTAL_HEALTH in Phenotype

True

In [95]:
'MEN' in Phenotype

False

In [None]:
phenotype_processed.loc[:, feature.value] = phenotype_processed[biobank_feature[0]].copy()