In [2]:
import numpy as np
import pandas as pd
import os
import math

In [3]:
# Data sources.
data_root = '../data'
data_timeseries = '../data/raw_ts'
data_phenotype = '../data/phenotype.csv'
data_ct = '../data/CT.csv'
data_euler = '../data/Euler.csv'
data_computed_fcms = '../data/processed_ts'

In [8]:
phenotypes = pd.read_csv(data_phenotype, sep=',')
phenotypes.index = np.vectorize(lambda x: 'UKB' + str(x))(phenotypes['eid'])

In [77]:
import enum


class Phenotype(enum.Enum):
    SEX = 'SEX'
    AGE = 'AGE'
    FULL_TIME_EDUCATION = 'FTE'
    FLUID_INTELLIGENCE = 'FI'
    PROSPECTIVE_MEMORY_RESULT = 'MEM'
    MENTAL_HEALTH = 'MEN'
    BIPOLAR_DISORDER_STATUS = 'BIP'
    NEUROTICISM_SCORE = 'NEU'
    SMOKING_STATUS = 'SMO'

    # noinspection PyMethodParameters
    def get_biobank_codes(feature):
        biobank_features = {
            Phenotype.SEX: ['31-0.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=31
            Phenotype.AGE: ['21003-2.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=21003
            Phenotype.FULL_TIME_EDUCATION: ['845-0.0', '845-1.0', '845-2.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=845
            Phenotype.FLUID_INTELLIGENCE: ['20016-0.0', '20016-1.0', '20016-2.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20016
            Phenotype.PROSPECTIVE_MEMORY_RESULT: ['20018-2.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20018
            Phenotype.MENTAL_HEALTH: ['20544-0.' + str(i) for i in range(1, 17)],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20544
            Phenotype.BIPOLAR_DISORDER_STATUS: ['20122-0.0'],  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20122
            Phenotype.NEUROTICISM_SCORE: ['20127-0.0'],  # http://biobank.ndph.ox.ac.,uk/showcase/field.cgi?id=20127
            Phenotype.SMOKING_STATUS: ['20116-2.0']  # http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20116
        }

        return biobank_features[feature]

In [56]:
def create_custom_similarity_function(feature_list):
    """
    Creates the similarity metric based on the phenotype feature list.

    If a feature has several entries in the UK Biobank, take either the most recent available estimate or, if the
    entries correspond to categories, consider the matching category values.

    The final score is an average of all the indicator scores for each feature, i.e. if two subjects have all of the
    features matching, the total score will be 1, and if none of the features match then the value will be 0. Edge
    creation then depends on the similarity threshold defined in graph construction.

    If both features are unknown, assume there is no match.
    # TODO support some deviations, e.g. if the values are in the same percentile range etc.

    Args:
        feature_list: list of features taken as Phenotype enumerated values.

    Returns:
        The similarity function taking in the phenotype list and returning the similarity score.
    """
    def get_similarity(phenotypes, subject_i, subject_j):
        total_score = 0
        for feature_set in feature_list:
            feature = feature_set.value
            if np.array_equal(feature, Phenotype.MENTAL_HEALTH.value):
                # TODO compare the rest of the categories
                # First value in the mental health feature array gives the overall diagnosis as string.
                total_score += int(phenotypes.loc[subject_i, feature[0]] == phenotypes.loc[subject_j, feature[0]])
            elif len(feature) > 1:
                # handle the more/less recent values
                instance_i = feature[0]
                for f in reversed(feature):
                    if phenotypes.loc[subject_i, f] != 'NaN':
                        instance_i = f
                        break
                instance_j = feature[0]
                for f in reversed(feature):
                    if phenotypes.loc[subject_j, f] != 'NaN':
                        instance_j = f
                        break
                total_score += int(phenotypes.loc[subject_i, instance_i] == phenotypes.loc[subject_j, instance_j])
            else:
                total_score += int(phenotypes.loc[subject_i, feature[0]] == phenotypes.loc[subject_j, feature[0]])
        return total_score * 1.0 / len(feature_list)

    return get_similarity

In [6]:
sex_similarity = create_custom_similarity_function([Phenotype.SEX])

Truth values from the dataset:

In [21]:
phenotype.loc[['UKB1000028', 'UKB1000260', 'UKB1000133'], Phenotype.SEX.value]

Unnamed: 0,31-0.0
UKB1000028,Female
UKB1000260,Female
UKB1000133,Male


In [9]:
sex_similarity(phenotypes, 'UKB1000028', 'UKB1000260')

1.0

In [10]:
sex_similarity(phenotypes, 'UKB1000028', 'UKB1000133')

0.0

Metric dependent on value recency: fluid intelligence might have been recorded in several, one, or no assessments. Take the most recent score if it is available.

In [11]:
fluid_intelligence_similarity = create_custom_similarity_function([Phenotype.FLUID_INTELLIGENCE])

Truth values from the dataset:

In [16]:
phenotype.loc[['UKB1007166', 'UKB1008180', 'UKB6015441', 'UKB6016745'], Phenotype.FLUID_INTELLIGENCE.value]

Unnamed: 0,20016-0.0,20016-1.0,20016-2.0
UKB1007166,7.0,,
UKB1008180,,7.0,
UKB6015441,5.0,5.0,
UKB6016745,,,


In [17]:
fluid_intelligence_similarity(phenotypes, 'UKB1007166', 'UKB1008180')

1.0

In [18]:
fluid_intelligence_similarity(phenotypes, 'UKB6015441', 'UKB6016745')

0.0

Combined metrics: sex and full-time education (the latter has several entries)

In [57]:
sex_fteducation = create_custom_similarity_function([Phenotype.SEX, Phenotype.FULL_TIME_EDUCATION])

Truth values from the dataset:

In [70]:
features = np.concatenate((Phenotype.FULL_TIME_EDUCATION.value, Phenotype.SEX.value))
phenotype.loc[['UKB1008180', 'UKB6016745', 'UKB1007166', 'UKB1000028', 'UKB6025942', 'UKB1003453', 'UKB1008552'], features]

Unnamed: 0,845-1.0,845-2.0,31-0.0
UKB1008180,16.0,,Female
UKB6016745,,,Female
UKB1007166,,,Female
UKB1000028,16.0,16.0,Female
UKB6025942,17.0,16.0,Female
UKB1003453,16.0,16.0,Female
UKB1008552,,16.0,Male


Only half of the features match:

In [59]:
sex_fteducation(phenotype, 'UKB1008180', 'UKB6016745')

0.5

Two NaNs cannot be matched so still half of the features:

In [60]:
sex_fteducation(phenotype, 'UKB1007166', 'UKB6016745')

0.5

Both features match

In [62]:
sex_fteducation(phenotype, 'UKB1000028', 'UKB6025942')

1.0

Education but not gender match

In [69]:
sex_fteducation(phenotype, 'UKB1003453', 'UKB1008552')

0.5

In [72]:
phenotype.loc[['UKB1008180', 'UKB6016745', 'UKB1007166', 'UKB1000028', 'UKB6025942', 'UKB1003453', 'UKB1008552'], Phenotype.FULL_TIME_EDUCATION.value]

Unnamed: 0,845-1.0,845-2.0
UKB1008180,16.0,
UKB6016745,,
UKB1007166,,
UKB1000028,16.0,16.0
UKB6025942,17.0,16.0
UKB1003453,16.0,16.0
UKB1008552,,16.0


In [74]:
feature = Phenotype.MENTAL_HEALTH

In [76]:
feature == Phenotype.MENTAL_HEALTH.value

False

In [79]:
feature_list = [Phenotype.MENTAL_HEALTH]

In [96]:
phenotypes = pd.read_csv(data_phenotype, sep=',')
phenotypes.index = ['UKB' + str(eid) for eid in phenotypes['eid']]

biobank_feature_list = []
for feature in feature_list:
    biobank_feature_list.extend(Phenotype.get_biobank_codes(feature))

phenotype_processed = phenotypes[biobank_feature_list].copy()
phenotype_processed

Unnamed: 0,20544-0.1,20544-0.2,20544-0.3,20544-0.4,20544-0.5,20544-0.6,20544-0.7,20544-0.8,20544-0.9,20544-0.10,20544-0.11,20544-0.12,20544-0.13,20544-0.14,20544-0.15,20544-0.16,845-0.0,845-1.0,845-2.0
UKB1000028,,,,,,,,,,,,,,,,,16.0,16.0,16.0
UKB1000133,,,,,,,,,,,,,,,,,15.0,,
UKB1000260,Panic attacks,,,,,,,,,,,,,,,,,,
UKB1000329,,,,,,,,,,,,,,,,,,,
UKB1000430,Depression,15.0,,,,,,,,,,,,,,,21.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6025704,,,,,,,,,,,,,,,,,16.0,16.0,18.0
UKB6025720,,,,,,,,,,,,,,,,,18.0,,18.0
UKB6025771,,,,,,,,,,,,,,,,,,,
UKB6025805,"Anxiety, nerves or GAD",,,,,,,,,,,,,,,,18.0,,18.0


In [97]:
def get_most_recent(ukb_feature, subject_id):
    instance = ukb_feature[0]
    for f in reversed(ukb_feature):
        if phenotypes.loc[subject_id, f] != 'NaN':
            instance = f
            break
    return phenotypes.loc[subject_id, instance]

for feature in feature_list:
    biobank_feature = Phenotype.get_biobank_codes(feature)
    if feature == Phenotype.MENTAL_HEALTH:
        # TODO compare the rest of the categories
        # First value in the mental health feature array gives the overall diagnosis as string.
        
        phenotype_processed.loc[:, feature.value] = phenotype_processed[biobank_feature[0]].copy()
    elif len(biobank_feature) > 1:
        # handle the more/less recent values
        si = phenotype_processed.index.to_series()
        phenotype_processed.loc[:, feature.value] = si.apply(lambda s: get_most_recent(biobank_feature, s))

In [98]:
phenotype_processed

Unnamed: 0,20544-0.1,20544-0.2,20544-0.3,20544-0.4,20544-0.5,20544-0.6,20544-0.7,20544-0.8,20544-0.9,20544-0.10,...,20544-0.12,20544-0.13,20544-0.14,20544-0.15,20544-0.16,845-0.0,845-1.0,845-2.0,MEN,FTE
UKB1000028,,,,,,,,,,,...,,,,,,16.0,16.0,16.0,,16.0
UKB1000133,,,,,,,,,,,...,,,,,,15.0,,,,
UKB1000260,Panic attacks,,,,,,,,,,...,,,,,,,,,Panic attacks,
UKB1000329,,,,,,,,,,,...,,,,,,,,,,
UKB1000430,Depression,15.0,,,,,,,,,...,,,,,,21.0,,,Depression,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6025704,,,,,,,,,,,...,,,,,,16.0,16.0,18.0,,18.0
UKB6025720,,,,,,,,,,,...,,,,,,18.0,,18.0,,18.0
UKB6025771,,,,,,,,,,,...,,,,,,,,,,
UKB6025805,"Anxiety, nerves or GAD",,,,,,,,,,...,,,,,,18.0,,18.0,"Anxiety, nerves or GAD",18.0


In [99]:
phenotype_processed.drop(biobank_feature_list, axis=1, inplace=True)

In [100]:
phenotype_processed

Unnamed: 0,MEN,FTE
UKB1000028,,16.0
UKB1000133,,
UKB1000260,Panic attacks,
UKB1000329,,
UKB1000430,Depression,
...,...,...
UKB6025704,,18.0
UKB6025720,,18.0
UKB6025771,,
UKB6025805,"Anxiety, nerves or GAD",18.0


In [94]:
Phenotype.MENTAL_HEALTH in Phenotype

True

In [95]:
'MEN' in Phenotype

False

In [103]:
mental = phenotype[Phenotype.get_biobank_codes(Phenotype.MENTAL_HEALTH)]

```
Coding	Meaning
1	Social anxiety or social phobia
2	Schizophrenia
3	Any other type of psychosis or psychotic illness
4	A personality disorder
5	Any other phobia (eg disabling fear of heights or spiders)
6	Panic attacks
7	Obsessive compulsive disorder (OCD)
10	Mania, hypomania, bipolar or manic-depression
11	Depression
12	Bulimia nervosa
13	Psychological over-eating or binge-eating
14	Autism, Asperger's or autistic spectrum disorder
15	Anxiety, nerves or generalized anxiety disorder
16	Anorexia nervosa
17	Agoraphobia
18	Attention deficit or attention deficit and hyperactivity disorder (ADD/ADHD)
-818	Prefer not to answer (group A)
-819	Prefer not to answer (group B)
```

In [134]:
mental[mental['20544-0.9'].notnull()]

Unnamed: 0,20544-0.1,20544-0.2,20544-0.3,20544-0.4,20544-0.5,20544-0.6,20544-0.7,20544-0.8,20544-0.9,20544-0.10,20544-0.11,20544-0.12,20544-0.13,20544-0.14,20544-0.15,20544-0.16
UKB5029194,Social Anxiety or phobia,2.0,3.0,4.0,5.0,6.0,7.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0


In [135]:
mental[mental['20544-0.1'].notnull()]

Unnamed: 0,20544-0.1,20544-0.2,20544-0.3,20544-0.4,20544-0.5,20544-0.6,20544-0.7,20544-0.8,20544-0.9,20544-0.10,20544-0.11,20544-0.12,20544-0.13,20544-0.14,20544-0.15,20544-0.16
UKB1000260,Panic attacks,,,,,,,,,,,,,,,
UKB1000430,Depression,15.0,,,,,,,,,,,,,,
UKB1001269,"Anxiety, nerves or GAD",,,,,,,,,,,,,,,
UKB1002158,Depression,,,,,,,,,,,,,,,
UKB1002352,Panic attacks,11.0,15.0,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6018825,"Anxiety, nerves or GAD",,,,,,,,,,,,,,,
UKB6021644,"Mania, hypomania, bipolar or manic-depression",11.0,,,,,,,,,,,,,,
UKB6022452,"Anxiety, nerves or GAD",,,,,,,,,,,,,,,
UKB6024859,"Anxiety, nerves or GAD",,,,,,,,,,,,,,,


In [136]:
mental['20544-0.1'].value_counts()

Depression                                       2570
Anxiety, nerves or GAD                            925
Panic attacks                                     588
Social Anxiety or phobia                          160
Other phobia                                      112
OCD                                                40
Other type of Psychosis or psychotic ilness        40
Mania, hypomania, bipolar or manic-depression      38
Anorexia nervosa                                   20
Prefer not to answer(Group B)                      19
Other personality disorder                         17
Prefer not to answer(Group A)                      16
Bulimia nervosa                                    13
Schizophrenia                                       7
Autism, Asperger or binge-eating                    5
Agoraphobia                                         3
Psychological over-eating or binge-eating           2
ADD/ADHD                                            2
Name: 20544-0.1, dtype: int6

In [141]:
subject_id = 'UKB1002352'
mental.loc[subject_id].to_numpy()

array(['Panic attacks', 11.0, 15.0, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan], dtype=object)

In [140]:
mental.loc[subject_id, Phenotype.MENTAL_HEALTH.value] = mental.loc[subject_id].to_numpy()

ValueError: Must have equal len keys and value when setting with an ndarray

In [148]:
mental_to_code = {'Depression': 11, 
 'Anxiety, nerves or GAD': 15, 
 'Panic attacks': 6,
 'Social Anxiety or phobia': 1,
 'Other phobia': 5,
 'OCD': 7,
 'Other type of Psychosis or psychotic ilness': 3,
 'Mania, hypomania, bipolar or manic-depression': 10,
 'Anorexia nervosa': 16,
 'Prefer not to answer(Group B)': -819,
 'Other personality disorder': 4,
 'Prefer not to answer(Group A)': -818,
 'Bulimia nervosa': 12,
 'Schizophrenia': 2,
 'Autism, Asperger or binge-eating': 14,
 'Agoraphobia': 17,
 'Psychological over-eating or binge-eating': 13,
 'ADD/ADHD': 18}

In [155]:
mental.loc[:, '20544-0.1'] = mental['20544-0.1'].apply(lambda x: mental_to_code[x] if x in mental_to_code.keys() else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [157]:
mental[mental['20544-0.1'].isnull()]

Unnamed: 0,20544-0.1,20544-0.2,20544-0.3,20544-0.4,20544-0.5,20544-0.6,20544-0.7,20544-0.8,20544-0.9,20544-0.10,20544-0.11,20544-0.12,20544-0.13,20544-0.14,20544-0.15,20544-0.16,MEN
UKB1000028,,,,,,,,,,,,,,,,,
UKB1000133,,,,,,,,,,,,,,,,,
UKB1000329,,,,,,,,,,,,,,,,,
UKB1001530,,,,,,,,,,,,,,,,,
UKB1003453,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6025231,,,,,,,,,,,,,,,,,
UKB6025704,,,,,,,,,,,,,,,,,
UKB6025720,,,,,,,,,,,,,,,,,
UKB6025771,,,,,,,,,,,,,,,,,


In [158]:
mental['20544-0.1'].value_counts()

 11.0     2570
 15.0      925
 6.0       588
 1.0       160
 5.0       112
 7.0        40
 3.0        40
 10.0       38
 16.0       20
-819.0      19
 4.0        17
-818.0      16
 12.0       13
 2.0         7
 14.0        5
 17.0        3
 18.0        2
 13.0        2
Name: 20544-0.1, dtype: int64

In [167]:
mental.drop(['MEN' + str(i) for i in range(1, 19)], axis=1, inplace=True)

In [168]:
mental

Unnamed: 0,20544-0.1,20544-0.2,20544-0.3,20544-0.4,20544-0.5,20544-0.6,20544-0.7,20544-0.8,20544-0.9,20544-0.10,20544-0.11,20544-0.12,20544-0.13,20544-0.14,20544-0.15,20544-0.16,MEN
UKB1000028,,,,,,,,,,,,,,,,,
UKB1000133,,,,,,,,,,,,,,,,,
UKB1000260,6.0,,,,,,,,,,,,,,,,
UKB1000329,,,,,,,,,,,,,,,,,
UKB1000430,11.0,15.0,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6025704,,,,,,,,,,,,,,,,,
UKB6025720,,,,,,,,,,,,,,,,,
UKB6025771,,,,,,,,,,,,,,,,,
UKB6025805,15.0,,,,,,,,,,,,,,,,


In [173]:
si = phenotype_processed.index.to_series()
for i in range(1, 19):
    mental.loc[:, Phenotype.MENTAL_HEALTH.value + str(i)] = si.apply(lambda s: 1 if i in mental.loc[s].to_numpy() else 0)
mental

Unnamed: 0,20544-0.1,20544-0.2,20544-0.3,20544-0.4,20544-0.5,20544-0.6,20544-0.7,20544-0.8,20544-0.9,20544-0.10,...,MEN9,MEN10,MEN11,MEN12,MEN13,MEN14,MEN15,MEN16,MEN17,MEN18
UKB1000028,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
UKB1000133,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
UKB1000260,6.0,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
UKB1000329,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
UKB1000430,11.0,15.0,,,,,,,,,...,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6025704,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
UKB6025720,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
UKB6025771,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
UKB6025805,15.0,,,,,,,,,,...,0,0,0,0,0,0,1,0,0,0


In [174]:
mental_lookup = mental.drop(['20544-0.' + str(i) for i in range(1, 17)], axis=1)

Unnamed: 0,MEN,MEN1,MEN2,MEN3,MEN4,MEN5,MEN6,MEN7,MEN8,MEN9,MEN10,MEN11,MEN12,MEN13,MEN14,MEN15,MEN16,MEN17,MEN18
UKB1000028,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000133,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000260,,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000329,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000430,,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6025704,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB6025720,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB6025771,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB6025805,,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [175]:
mental_lookup.drop(['MEN'], axis=1, inplace=True)
mental_lookup

Unnamed: 0,MEN1,MEN2,MEN3,MEN4,MEN5,MEN6,MEN7,MEN8,MEN9,MEN10,MEN11,MEN12,MEN13,MEN14,MEN15,MEN16,MEN17,MEN18
UKB1000028,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000260,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000329,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000430,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6025704,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB6025720,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB6025771,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB6025805,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [178]:
subject_i = 'UKB1000430'
subject_j = 'UKB6025805'
np.dot(mental_lookup.loc[subject_i], mental_lookup.loc[subject_j])

2

In [179]:
subject_i = 'UKB1000260'
subject_j = 'UKB6025805'
np.dot(mental_lookup.loc[subject_i], mental_lookup.loc[subject_j])

1

In [184]:
mental_lookup[mental_lookup['MEN15'] == 1]

Unnamed: 0,MEN1,MEN2,MEN3,MEN4,MEN5,MEN6,MEN7,MEN8,MEN9,MEN10,MEN11,MEN12,MEN13,MEN14,MEN15,MEN16,MEN17,MEN18
UKB1000430,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
UKB1001269,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
UKB1002352,1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0
UKB1007157,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
UKB1007772,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6018196,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
UKB6018825,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
UKB6022452,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
UKB6024859,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [185]:
mental_lookup.loc[:, Phenotype.MENTAL_HEALTH.value] = si.apply(lambda s: int(np.sum(mental_lookup.loc[s, ['MEN' + str(i) for i in range(1, 19)]]) > 0))

In [186]:
mental_lookup

Unnamed: 0,MEN1,MEN2,MEN3,MEN4,MEN5,MEN6,MEN7,MEN8,MEN9,MEN10,MEN11,MEN12,MEN13,MEN14,MEN15,MEN16,MEN17,MEN18,MEN
UKB1000028,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000260,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
UKB1000329,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000430,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UKB6025704,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB6025720,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB6025771,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB6025805,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1


In [189]:
mental_lookup.loc[['UKB1000260', 'UKB6025805', 'UKB1000028', 'UKB1000133']]

Unnamed: 0,MEN1,MEN2,MEN3,MEN4,MEN5,MEN6,MEN7,MEN8,MEN9,MEN10,MEN11,MEN12,MEN13,MEN14,MEN15,MEN16,MEN17,MEN18,MEN
UKB1000260,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
UKB6025805,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
UKB1000028,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
UKB1000133,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [190]:
subject_i = 'UKB1000260'
subject_j = 'UKB6025805'
r = np.dot(mental_lookup.loc[subject_i], mental_lookup.loc[subject_j])
r == 0 or r > 1

True

In [191]:
subject_i = 'UKB1000028'
subject_j = 'UKB1000133'
r = np.dot(mental_lookup.loc[subject_i], mental_lookup.loc[subject_j])
r == 0 or r > 1

True