In [None]:
#default_exp phenotypes.Field

# NOTE(review): `Config` is neither defined nor imported in the visible cells, so this
# cell fails on a fresh kernel unless it is provided elsewhere — confirm or remove.
config=Config()
data_dict = config.data_dict
# NOTE(review): self-assignment; it only succeeds if `pheno_df` already exists in the kernel.
pheno_df = pheno_df

In [None]:
#export
import pandas as pd
import dask.dataframe as dd
import logging
import numpy as np
from combinatorial_gwas.data_catalog import get_catalog, get_parameters

In [None]:
#export
# Project parameters; the loading cell below indexes this with the keys
# "data_dict_file", "pheno_file", "id_col" and "coding_file_path_template".
parameters= get_parameters()
# Bare expression so the notebook displays the loaded values.
parameters

{'example_test_data_ratio': 0.2,
 'example_num_train_iter': 10000,
 'example_learning_rate': 0.01,
 'template_gwas_result_file_link': 'https://broad-ukb-sumstats-us-east-1.s3.amazonaws.com/round2/additive-tsvs/{phenotype_code}.gwas.imputed_v3.both_sexes.tsv.bgz',
 'genetic_file_path_template': '/lab/corradin_biobank/Raw_UKB_downloads/BGEN/ukb_imp_chr{chrom_number}_v3.bgen',
 'sample_file_template': '/lab/corradin_biobank/Raw_UKB_downloads/sample_files/ukb45624_imp_chr21_v3_s487275.sample',
 'data_dict_file': '/lab/corradin_biobank/Phenotypes//Data_Dictionary_Showcase.csv',
 'pheno_file': '/lab/corradin_biobank/samples//neale_gwas_both_sexes_parquet/',
 'coding_file_path_template': '/lab/corradin_biobank/Phenotypes//data_codes/datacode-*.tsv',
 'id_col': 'f.eid'}

In [None]:
#@delegate_as(dd.core.DataFrame, to='dd_cls')
#export
# Load the data dictionary (pandas) and the phenotype table (dask) named in `parameters`,
# and index the phenotype table by the ID column when one is configured.
try:
    data_dict = pd.read_csv(parameters["data_dict_file"])
    # Remember the source path on the frame; Field.__init__ reads `data_dict.file`
    # when building its "cannot find field" error message.
    data_dict.file = parameters["data_dict_file"]
    pheno_df = dd.read_parquet(parameters["pheno_file"])
    if parameters["id_col"]:
        if parameters["id_col"] not in pheno_df.columns:
            # Report the phenotype file that was searched (the message previously
            # repeated the ID column name in place of the file path).
            raise KeyError(f"Cannot find ID column {parameters['id_col']} in file {parameters['pheno_file']}")
        logging.warning("Found ID column, setting index. This might take a bit long. Please be patient.")
        pheno_df[parameters["id_col"]] = pheno_df[parameters["id_col"]].astype(int)
        pheno_df = pheno_df.set_index(parameters["id_col"])
    coding_file_path_template = parameters["coding_file_path_template"]

# NOTE(review): `parameters[...]` lookups on a dict raise KeyError, not AttributeError,
# so a missing setting is presumably not caught here — confirm which failure this
# handler is meant to cover before changing it.
except AttributeError:
    raise AttributeError("Could not find file names from the settings module, please set `data_dict_file` and `pheno_file` attribute")



In [None]:
#export
class DelegatedAttribute:
    """Descriptor that forwards get/set/delete of one attribute to a delegate object.

    The delegate is looked up on the owning instance by name each time, so
    `instance.<attr_name>` behaves like `instance.<delegate_name>.<attr_name>`.

    Parameters
    ----------
    delegate_name : str
        Name of the attribute on the owning instance that holds the delegate.
    attr_name : str
        Name of the attribute to read/write/delete on the delegate.
    """

    def __init__(self, delegate_name, attr_name):
        self.attr_name = attr_name
        self.delegate_name = delegate_name

    def __get__(self, instance, owner):
        # Accessed on the class itself: hand back the descriptor.
        if instance is None:
            return self
        # Equivalent to `instance.<delegate_name>.<attr_name>`.
        # (Debug prints that ran on every attribute read were removed.)
        return getattr(self.delegate(instance), self.attr_name)

    def __set__(self, instance, value):
        # Equivalent to `instance.<delegate_name>.<attr_name> = value`.
        setattr(self.delegate(instance), self.attr_name, value)

    def __delete__(self, instance):
        # Equivalent to `del instance.<delegate_name>.<attr_name>`.
        delattr(self.delegate(instance), self.attr_name)

    def delegate(self, instance):
        """Return the delegate object stored on `instance` under `delegate_name`."""
        return getattr(instance, self.delegate_name)

    def __str__(self):
        return ""
    
# def delegate_as(delegate_cls):
#     # gather the attributes of the delegate class to add to the decorated class
#     attributes = delegate_cls.__dict__.keys()

#     def inner(cls):
#         # create property for storing the delegate
#         setattr(cls, 'delegate', delegate_cls)
#         # set all the attributes
#         for attr in attributes:
#             setattr(cls, attr, DelegatedAttribute(to, attr))
#         return cls
  
#     return inner

def delegate_as(delegate_cls, to='delegate'):
    """Class decorator factory: forward `delegate_cls`'s attributes to an instance attribute.

    For every name in `delegate_cls.__dict__` that the decorated class does not already
    define in its own `__dict__`, a `DelegatedAttribute(to, name)` descriptor is installed
    on the decorated class.

    Parameters
    ----------
    delegate_cls : type
        Class whose own `__dict__` keys determine which attributes are delegated.
    to : str
        Name of the instance attribute that holds the delegate object.

    Returns
    -------
    callable
        A decorator that takes a class, installs the descriptors and returns that class.
    """
    # Only names defined directly on delegate_cls are considered (inherited ones are
    # not in `__dict__`).
    # NOTE(review): this includes dunder names such as `__init__` when the decorated
    # class does not define them itself — confirm that is intended.
    delegate_attrs = set(delegate_cls.__dict__.keys())

    def inner(cls):
        # don't bother adding attributes that the class already has
        attrs = delegate_attrs - set(cls.__dict__.keys())
        # (Debug prints, including a hard-coded `"dict_line" in attrs` probe, were removed.)
        for attr in attrs:
            setattr(cls, attr, DelegatedAttribute(to, attr))
        return cls
    return inner

In [None]:
#export

#@delegate_as(dd.core.DataFrame, to="df")
#@delegate_df_cls(dd.core.DataFrame)


#@add_numerics(dd.core.DataFrame, dunder_delegate_attr="col", other_delegate_attr="df", dunder_list = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'mod', 'pow', 'radd', 'rsub', 'rmul', 'rtruediv', 'rfloordiv', 'rmod', 'rpow', 'lt', 'gt', 'le', 'ge', 'ne', 'eq'])
class Field():
    """One phenotype field: its data-dictionary row plus its columns of `pheno_df`.

    The constructor looks the field up in the module-level `data_dict`, copies every
    data-dictionary column onto the instance as an attribute (the methods below read
    `Field`, `FieldID`, `Coding`, `ValueType`, `Array` and `Instances`), selects the
    columns of the module-level `pheno_df` whose names start with `f.<FieldID>.`, and
    renames them after `name`.
    """

    def __init__(self,pheno, *, name="Unamed_Field", instances=None, arrays=None):
        """Build a Field from a FieldID (int) or a Field description (anything else).

        Parameters
        ----------
        pheno : int or str
            An int is matched against the `FieldID` column of `data_dict`, any other
            value against the `Field` column.
        name : str
            Name used for the columns of `self.df` (see the `name` setter).
        instances, arrays : container of int, optional
            When truthy, keep only columns whose instance / array index is in the container.

        Raises
        ------
        ValueError
            If no row of `data_dict` matches `pheno`.
        KeyError
            If the selected columns cannot be taken from `pheno_df`.
        """
        # Boolean masks instead of string-built query expressions: a field name that
        # contains a quote character no longer breaks the expression.
        if isinstance(pheno, int):
            # Compare on the string form so the lookup matches whether FieldID was
            # read as an integer or as a string column (the old query compared
            # against the quoted string form).
            dict_line_df = data_dict[data_dict["FieldID"].astype(str) == str(pheno)]
        else:
            dict_line_df = data_dict[data_dict["Field"] == pheno]

        if dict_line_df.empty:
            raise ValueError(f"Cannot find Field/FieldID '{pheno}' in data dict file '{data_dict.file}' ")
        self.dict_line = DictLine(dict_line_df)

        #make dict_line attributes accessible from Field object
        for col in self.dict_line.df.columns:
            setattr(self, col, getattr(self.dict_line, f"_{col}"))

        #reminder of what attribute is available from self.dict_line
        self.help = self.dict_line.df.columns.tolist()
        self.pheno_str = self.Field
        self.pheno_cols = self.dict_line.get_pheno_cols(pheno_df.columns, instances=instances, arrays=arrays)

        try:
            self.df = pheno_df.loc[:, self.pheno_cols].fillna(np.nan)
        except KeyError:
            raise KeyError(f"Cannot find phenotype '{self.pheno_str}'', ID: {self.FieldID} in the phenotype file. Please make sure your phenotype file contains all following columns: {self.pheno_cols} ")

        # Goes through the `name` setter, which also renames the columns of self.df.
        self.name = name
        #some fields don't have coding
        if self.Coding:
            #replacing coding int with more versatile coding object
            self.Coding = self.get_coding(coding_file_path_template)
            self.get_codes = self.Coding.get_codes
        else:
            print(f"Data field {self} has no Coding. The Coding attribute will be 'None'")

    def copy(self):
        """Return a new object sharing this one's attributes, but with its own copy of `df`."""
        obj = type(self).__new__(self.__class__)
        obj.__dict__.update(self.__dict__)
        obj.df = self.df.copy()
        return obj

    @classmethod
    def init_multi_type(cls, data, name):
        """Create a field named `name` from a kwargs dict, a str/int, or an existing instance.

        Parameters
        ----------
        data : dict, str, int or instance of `cls`
            dict: constructor keyword arguments; must contain a "pheno" entry of type
            str or int. str/int: passed to the constructor as `pheno`. Instance of
            `cls`: copied and renamed.
        name : str
            Name given to the resulting field.

        Raises
        ------
        TypeError
            If `data` (or the "pheno" entry of a dict) has an unsupported type.
        KeyError
            If `data` is a dict without a "pheno" entry.
        """
        accepted_types = (str, int)

        ##handling dict
        if isinstance(data, dict):
            field = data["pheno"]
            if not isinstance(field, accepted_types):
                # Previously this case fell through and silently returned None.
                raise TypeError(f"Invalid pheno data type {type(field)}, can only accept input of type {accepted_types} ")
            # Work on a copy so the caller's dict is not modified.
            updated_kwargs = dict(data)
            updated_kwargs["name"] = name
            return cls(**updated_kwargs)

        if isinstance(data, accepted_types):
            return cls(data, name=name)

        #create copy of instance with a different name
        if isinstance(data, cls):
            new_field = data.copy()
            new_field.name = name
            return new_field

        # Report the type of `data`; the old message referenced a variable that is
        # only bound in the dict branch and therefore raised UnboundLocalError here.
        raise TypeError(f"Invalid pheno data type {type(data)}, can only accept input of type {accepted_types} ")

    @classmethod
    def make_fields_dict(cls, data: dict)-> dict:
        """Map `{name: spec}` to `{name: field}` using `init_multi_type` for each entry.

        Raises
        ------
        TypeError
            If `data` is not a dict.
        """
        if isinstance(data, dict):
            iter_obj = data.items()
            return {name: cls.init_multi_type(dict_or_obj, name) for name, dict_or_obj in iter_obj}
        raise TypeError("Can only accept dictionary ")

    @property
    def name(self, ):
        """The field's display name; assigning to it also renames the columns of `self.df`."""
        return self._name

    @name.setter
    def name(self,new_name):
        def make_full_name(orig_name, new_name=None):
            # Build "<new_name>_<instance>_<array>" from an existing column name.
            if new_name is None:
                new_name = "Unamed_field"
            #for first time parsing from source
            if "." in orig_name:
                # dotted source name: instance and array are the 3rd and 4th components
                name_list = orig_name.split(".")
                instance = name_list[2]
                array = name_list[3]
            else:
                # already renamed: instance and array are the last two "_"-separated parts
                name_list = orig_name.split("_")
                instance = name_list[-2]
                array = name_list[-1]
            return f"{new_name}_{instance}_{array}"

        #if only one instance and array, then we simplify the column name
        if self.Array == self.Instances == 1:
            self.df.columns = [new_name for col in self.df.columns]
        else:
            self.df.columns = [ make_full_name(col, new_name=new_name) for col in self.df.columns]

        self._name = new_name

    def rename(self, new_name):
        """Set `name` (renaming the columns) and return self so calls can be chained."""
        self.name = new_name
        return self

    @property
    def pheno_str_no_space(self):
        """`pheno_str` with spaces replaced by underscores."""
        return self.pheno_str.replace(" ", "_")

    def __repr__(self):
        return f"Field(Name:{self.name}, Pheno: `{self.pheno_str}`, ID: {self.FieldID}, Original Column(s): {self.pheno_cols}, Named Column(s): {self.all_cols_df.columns})"

    def __str__(self):
        return self.__repr__()

    def get_coding(self, coding_file_path_template):
        """Return a `Coding` for this field, with `*` in the template replaced by the coding number."""
        coding_file_name = coding_file_path_template.replace("*", str(self.Coding))
        return(Coding(coding_file_name, self.Coding))

    #need to call .compute() before you can perform masking
    def get_attr_childs(self, attr,*, input_field, output_field):
        """Return (and print) the codes `self.Coding.get_codes` yields for `attr`."""
        all_related_fields = self.Coding.get_codes(attr, input_field, output_field)
        print(f"All the related fields that are classified as '{attr}' in coding file \n '{self.Coding.coding_file}' \n are {all_related_fields}")
        return all_related_fields #(self.df[self.pheno_str_no_space]).isin(all_related_fields)

    @property
    def value_counts(self):
        # NOTE(review): `self.col` is not assigned anywhere in the visible class, so this
        # presumably raises AttributeError unless `col` is set externally — confirm.
        return self.col.value_counts()

    @property
    def all_cols_df(self):
        """All columns of `self.df`, cast to float when `ValueType` is "continuous" (case-insensitive)."""
        col = self.df.loc[:,self.df.columns]
        if (self.ValueType).lower() == "continuous":
            return col.astype("float")
        return col


class DictLine():
    """Wrap a (normally one-row) DataFrame and expose each column as a `_<column>` property."""

    def __init__(self, dict_line_df):
        self.df = dict_line_df

        # The properties are installed on the class, not the instance; each one reads
        # from whichever instance's `df` it is accessed through.
        owner = self.__class__
        for column in self.df.columns:
            setattr(owner, f"_{column}", self.get_dict_field(column, self.df))

    def get_dict_field(self,dict_field, dict_line):
        """Return a property that reads the single value of column `dict_field`.

        The property yields None when the column does not hold exactly one value
        (`.item()` raises ValueError). For the "Coding" column a NaN becomes None and
        any other value is converted to int. `dict_line` is accepted but not used.
        """
        def read_value(self):
            try:
                value = self.df[dict_field].item()
            except ValueError:
                # more (or fewer) than one row
                return None

            if dict_field != "Coding":
                return value
            # some fields do not have coding
            return None if np.isnan(value) else int(value)

        return property(read_value)

    def get_pheno_cols(self, col_list, instances=None, arrays=None):
        """Return the names in `col_list` that start with `f.<FieldID>.`.

        When `instances` / `arrays` is truthy, keep only names whose 3rd / 4th
        dot-separated component (as int) is contained in it.
        """
        selected = []
        for candidate in col_list:
            if candidate.startswith(f"f.{self._FieldID}."):
                selected.append(candidate)

        # (index of the component in the dotted name, allowed values)
        for position, allowed in ((2, instances), (3, arrays)):
            if allowed:
                selected = [name for name in selected if int(name.split(".")[position]) in allowed]

        return selected

class Coding():
    """A coding table read from a tab-separated file, with a lookup that walks child nodes."""

    def __init__(self, coding_file, coding_num):
        self.coding_num = coding_num
        self.coding_file = coding_file
        # tab-separated coding table
        self.df = pd.read_csv(coding_file, sep="\t")

    def __repr__(self):
        return f"{self.__class__}. Coding num: {self.coding_num}, coding file: {self.coding_file}"

    def get_codes(self, selection, input_field, output_field):
        """Collect `output_field` for the row matching `selection` and for all its descendants.

        `selection` is matched against column `input_field`. Descendants are the rows
        whose `parent_id` equals the matched row's `node_id`, followed recursively.

        Raises
        ------
        ValueError
            If either field name is not one of "coding", "meaning", "node_id", or if
            `selection` does not occur in column `input_field`.
        """
        valid_coding_fields = ["coding","meaning","node_id"]
        for i_o_field in (input_field, output_field):
            if i_o_field in valid_coding_fields:
                continue
            raise ValueError(f"Could not find field {i_o_field} field in coding file {self.coding_file}, choose from the following options: {valid_coding_fields}")

        known_values = self.df[input_field].values
        if selection not in known_values:
            raise ValueError(f"Value not found in field {input_field} of coding file")

        matching_rows = self.df[self.df[input_field] == selection]
        code_line = DictLine(matching_rows)

        # the selection's own value comes first, then those of its descendants
        codes = [getattr(code_line,f"_{output_field}")]

        children = self.df.query(f"parent_id=={code_line._node_id}")
        child_nodes = children[input_field].values.tolist()

        #recurse on the child nodes to get codings of level below the selection
        for child in child_nodes:
            codes += self.get_codes(child, input_field, output_field)
        return codes

In [None]:
# Look a field up by its description (a non-int `pheno` is matched on the `Field` column).
test = Field("Monocyte count", name= "monocyte_count")
# Bare expression so the notebook shows the object's repr.
test

Data field Field(Name:monocyte_count, Pheno: `Monocyte count`, ID: 30130, Original Column(s): ['f.30130.0.0', 'f.30130.1.0', 'f.30130.2.0'], Named Column(s): Index(['monocyte_count_0_0', 'monocyte_count_1_0', 'monocyte_count_2_0'], dtype='object')) has no Coding. The Coding attribute will be 'None'


Field(Name:monocyte_count, Pheno: `Monocyte count`, ID: 30130, Original Column(s): ['f.30130.0.0', 'f.30130.1.0', 'f.30130.2.0'], Named Column(s): Index(['monocyte_count_0_0', 'monocyte_count_1_0', 'monocyte_count_2_0'], dtype='object'))

In [None]:
# `test.df` is selected from the dask-backed `pheno_df`; `.compute()` evaluates it.
test.df.compute()

Unnamed: 0_level_0,monocyte_count_0_0,monocyte_count_1_0,monocyte_count_2_0
f.eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000025,0.54,,
1000038,0.32,,
1000042,0.5,0.3,
1000056,0.53,,
1000061,0.97,,
...,...,...,...
5873158,0.3,,
5873167,0.5,0.1,0.24
5873175,0.42,,
5873180,0.56,,


In [None]:
# `Field` defines no `compute` method (the delegation decorators above are commented
# out), so evaluate the underlying dask frame through `.df`, as the monocyte cell does.
icd10_df = Field(41202, name= "ICD10_primary").df.compute()
icd10_df

Unnamed: 0_level_0,ICD10_primary_0_0,ICD10_primary_0_1,ICD10_primary_0_2,ICD10_primary_0_3,ICD10_primary_0_4,ICD10_primary_0_5,ICD10_primary_0_6,ICD10_primary_0_7,ICD10_primary_0_8,ICD10_primary_0_9,...,ICD10_primary_0_56,ICD10_primary_0_57,ICD10_primary_0_58,ICD10_primary_0_59,ICD10_primary_0_60,ICD10_primary_0_61,ICD10_primary_0_62,ICD10_primary_0_63,ICD10_primary_0_64,ICD10_primary_0_65
f.eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000025,I839,S832,,,,,,,,,...,,,,,,,,,,
1000038,O268,O800,R69,,,,,,,,...,,,,,,,,,,
1000042,M754,S5200,T848,,,,,,,,...,,,,,,,,,,
1000056,K409,R33,,,,,,,,,...,,,,,,,,,,
1000061,C851,C859,,,,,,,,,...,,,,,,,,,,
1000074,,,,,,,,,,,...,,,,,,,,,,
1000093,G510,H024,I214,K635,L570,M0086,M179,M1991,R31,,...,,,,,,,,,,
1000115,,,,,,,,,,,...,,,,,,,,,,
1000149,,,,,,,,,,,...,,,,,,,,,,
1000151,S6230,S662,T846,,,,,,,,...,,,,,,,,,,


In [None]:
# NOTE(review): `icd10_secondary_df` is not defined in any visible cell, so this fails
# on a fresh kernel unless it is created elsewhere — confirm.
# Writes the joined frame as a tab-separated file, keeping the index.
icd10_df.join(icd10_secondary_df).to_csv("ICD10_pheno_matrix.tsv", sep = "\t", index = True)

In [None]:
# Look a field up by its integer FieldID; the bare expression displays the object.
Field(41202, name= "ICD10")

Unnamed: 0_level_0,ICD10_0_0,ICD10_0_1,ICD10_0_2,ICD10_0_3,ICD10_0_4,ICD10_0_5,ICD10_0_6,ICD10_0_7,ICD10_0_8,ICD10_0_9,...,ICD10_0_56,ICD10_0_57,ICD10_0_58,ICD10_0_59,ICD10_0_60,ICD10_0_61,ICD10_0_62,ICD10_0_63,ICD10_0_64,ICD10_0_65
f.eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000025,I839,S832,,,,,,,,,...,,,,,,,,,,
1000038,O268,O800,R69,,,,,,,,...,,,,,,,,,,
1000042,M754,S5200,T848,,,,,,,,...,,,,,,,,,,
1000056,K409,R33,,,,,,,,,...,,,,,,,,,,
1000061,C851,C859,,,,,,,,,...,,,,,,,,,,


In [None]:
# Count non-null entries per column. `Field` defines no `notnull` method, so go
# through the underlying dask frame in `.df`.
Field(41202, name= "ICD10").df.notnull().sum().compute()

ICD10_0_0     291787
ICD10_0_1     225364
ICD10_0_2     170244
ICD10_0_3     127300
ICD10_0_4      95032
ICD10_0_5      71119
ICD10_0_6      53653
ICD10_0_7      40382
ICD10_0_8      30516
ICD10_0_9      23276
ICD10_0_10     17848
ICD10_0_11     13791
ICD10_0_12     10735
ICD10_0_13      8424
ICD10_0_14      6641
ICD10_0_15      5283
ICD10_0_16      4260
ICD10_0_17      3415
ICD10_0_18      2712
ICD10_0_19      2192
ICD10_0_20      1766
ICD10_0_21      1447
ICD10_0_22      1211
ICD10_0_23      1005
ICD10_0_24       840
ICD10_0_25       701
ICD10_0_26       588
ICD10_0_27       489
ICD10_0_28       398
ICD10_0_29       318
               ...  
ICD10_0_36       107
ICD10_0_37        91
ICD10_0_38        79
ICD10_0_39        64
ICD10_0_40        52
ICD10_0_41        45
ICD10_0_42        39
ICD10_0_43        30
ICD10_0_44        27
ICD10_0_45        24
ICD10_0_46        18
ICD10_0_47        14
ICD10_0_48        13
ICD10_0_49         9
ICD10_0_50         8
ICD10_0_51         6
ICD10_0_52   