## 1. Carga de datos y limpieza inicial

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
data_dir = 'results/'

In [3]:
#data = pd.read_csv(data_dir + 'GwasCat_associations.r2020-05-03.parsed.csv.gz')
data = pd.read_table(data_dir + 'gwas_cat.filtrado.tsv.gz')

In [4]:
data.head()

Unnamed: 0,fenotipo,alelo_riesgo,categoria_fenotipo,genes_reportados,genes_mapeados,contexto,OR_or_beta,EFO_URI
0,Crohn's disease,rs12722489-C,Digestive system disorder,IL2RA,IL2RA,intron,1.11,http://www.ebi.ac.uk/efo/EFO_0000384
1,Crohn's disease,rs151181-G,Digestive system disorder,IL27 | SH2B1 | EIF3C | LAT | CD19,CLN3 | AC138894.1,intron,1.07,http://www.ebi.ac.uk/efo/EFO_0000384
2,Crohn's disease,rs2058660-G,Digestive system disorder,IL12RL2 | IL18R1 | IL1RL1 | IL18RAP,IL18RAP,intron,1.19,http://www.ebi.ac.uk/efo/EFO_0000384
3,Crohn's disease,rs6556412-A,Digestive system disorder,IL12B,AC008691.1,intron,1.18,http://www.ebi.ac.uk/efo/EFO_0000384
4,Crohn's disease,rs2797685-A,Digestive system disorder,VAMP3,PER3 | Z98884.1,intron,1.05,http://www.ebi.ac.uk/efo/EFO_0000384


In [5]:
data.shape

(112624, 8)

In [6]:
data.fenotipo.unique()[:20]

array(["Crohn's disease", 'Biochemical measures',
       'Obesity (early onset extreme)',
       'Soluble levels of adhesion molecules', 'Multiple sclerosis',
       'Alcohol consumption (transferrin glycosylation)',
       'Sudden cardiac arrest', 'Colorectal cancer', 'Ulcerative colitis',
       'Psoriasis', 'Nicotine dependence',
       'Congenital heart malformation', 'Pulmonary function', 'Height',
       'Osteoporosis-related phenotypes', 'Lung cancer', 'Glioma',
       'Melanoma', "Alzheimer's disease", 'Bone mineral density (hip)'],
      dtype=object)

Seleccionamos aquellas instancias que tienen en la columna `alelo_riesgo` un SNP.

In [7]:
mask_snps = data.alelo_riesgo.str.match("^rs[0-9]+-[ATCG\?]$")
data = data[mask_snps]
data.shape

(108554, 8)

Miramos y descartamos valores faltantes

In [8]:
data.isna().sum()

fenotipo                  0
alelo_riesgo              0
categoria_fenotipo       65
genes_reportados      42933
genes_mapeados         1727
contexto               1722
OR_or_beta                0
EFO_URI                  65
dtype: int64

No nos interesan aquellas instancias que tienen valore faltantes en la columna `OR_or_beta`

In [9]:
data.dropna(subset = ['OR_or_beta'], inplace = True)
data.shape

(108554, 8)

In [10]:
data.isna().sum()

fenotipo                  0
alelo_riesgo              0
categoria_fenotipo       65
genes_reportados      42933
genes_mapeados         1727
contexto               1722
OR_or_beta                0
EFO_URI                  65
dtype: int64

#### ESTO SE PUEDE MEJORAR: tiene que haber una única componente gigante

**Opcional 1**: sacamos aquellos alelos que aparezcan una sola vez. Sirve más que nada para achicar la base de datos.

In [11]:
if False:
    umbral = 1
    mask_alelos_poca_frecuencia = data.alelo_riesgo.value_counts() <=1
    mask_alelos_poca_frecuencia.head()

    alelos_poca_frecuencia = list(mask_alelos_poca_frecuencia.index[mask_alelos_poca_frecuencia.values])
    print(len(alelos_poca_frecuencia))

    mask_descartables = data.alelo_riesgo.isin(alelos_poca_frecuencia)
    mask_descartables

    print(data.shape)
    data = data[~mask_descartables]
    print(data.shape)

**No-Tan-Opcional 2:** Sacamos aquellos fenotipos que aparecen una sola vez:

In [12]:
umbral = 1
mask_fenotipos_poca_frecuencia = data.fenotipo.value_counts() <=1
mask_fenotipos_poca_frecuencia.head()

Blood protein levels                           False
Heel bone mineral density                      False
Height                                         False
Metabolite levels                              False
Educational attainment (years of education)    False
Name: fenotipo, dtype: bool

In [13]:
fenotipos_poca_frecuencia = list(mask_fenotipos_poca_frecuencia.index[mask_fenotipos_poca_frecuencia.values])
print(len(fenotipos_poca_frecuencia))

587


In [14]:
mask_descartables = data.fenotipo.isin(fenotipos_poca_frecuencia)
mask_descartables

0         False
1         False
2         False
3         False
4         False
          ...  
112619    False
112620    False
112621    False
112622    False
112623    False
Name: fenotipo, Length: 108554, dtype: bool

In [15]:
print(data.shape)
data = data[~mask_descartables]
print(data.shape)

(108554, 8)
(107967, 8)


### 2. Tripletes `alelo_riesgo`, `fenotipo` y `OR_or_beta` - Matriz de utilidad/¿incidencia?

In [16]:
data_tripletes = data[['alelo_riesgo', 'fenotipo', 'OR_or_beta']]
data_tripletes.shape

(107967, 3)

In [17]:
data_tripletes

Unnamed: 0,alelo_riesgo,fenotipo,OR_or_beta
0,rs12722489-C,Crohn's disease,1.1100
1,rs151181-G,Crohn's disease,1.0700
2,rs2058660-G,Crohn's disease,1.1900
3,rs6556412-A,Crohn's disease,1.1800
4,rs2797685-A,Crohn's disease,1.0500
...,...,...,...
112619,rs902627-A,Highest math class taken (MTAG),0.0095
112620,rs3117582-C,Lung adenocarcinoma,1.2200
112621,rs1051730-T,Lung adenocarcinoma,1.3100
112622,rs31489-C,Lung adenocarcinoma,1.1200


**Matriz de Utilidad/¿incidencia?**

In [18]:
n_alelos = data_tripletes.alelo_riesgo.nunique()
n_fenotipos = data_tripletes.fenotipo.nunique()

data_bipartita = pd.DataFrame(np.zeros((n_alelos, n_fenotipos)), columns = data_tripletes.fenotipo.value_counts().index, dtype = np.int8)
data_bipartita.head()

Unnamed: 0,Blood protein levels,Heel bone mineral density,Height,Metabolite levels,Educational attainment (years of education),Body mass index,Post bronchodilator FEV1/FVC ratio,Educational attainment (MTAG),General cognitive ability,Post bronchodilator FEV1,...,Hen's egg allergy,Pediatric areal bone mineral density (radius vs spine & hip discordant skeletal phenotype),Response to allopurinol in gout (change in serum uric acid levels),Trichloroethylene-induced hypersensitivity syndrome,Accelerometer-based physical activity measurement (fraction of time with accelerations >425 milli-gravities),Breast cancer (menopausal hormone therapy interaction),Insulin-related traits,Triglyceride levels x smoking (ever vs never smokers) interaction (2df test),Heart rate response to beta blockers,Thyrotoxic hypokalemic periodic paralysis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
### EN EL DICCIONARIO APARECEN ORDENADOS POR FRECUENCIA
dict_id_to_alelo = {}
for i,alelo in enumerate(data_tripletes.alelo_riesgo.value_counts().index):
    dict_id_to_alelo[i] = alelo
    
dict_alelo_to_id = {v: k for k, v in dict_id_to_alelo.items()}

In [20]:
for _, row in data_tripletes.iterrows():
    alelo_row = row.alelo_riesgo
    fenotipo_row = row.fenotipo
    id_alelo = dict_alelo_to_id[alelo_row]
    data_bipartita.loc[id_alelo, fenotipo_row] = 1

**Chequeos**

La suma de todos los elementos de `data_bipartita` debe dar la cantidad de filas en `data_tripletes`

In [21]:
data_bipartita.sum().sum() == data_tripletes.shape[0]

True

La suma por columnas el `value_counts()` de `data_tripletes.fenotipo`

In [22]:
(data_tripletes.fenotipo.value_counts().values == data_bipartita.sum().values).all()

True

La suma por filas el `value_counts()` de `data_tripletes.alelo_riesgo`

In [23]:
(data_tripletes.alelo_riesgo.value_counts().values == data_bipartita.sum(axis = 1).values).all()

True

In [24]:
data_bipartita

Unnamed: 0,Blood protein levels,Heel bone mineral density,Height,Metabolite levels,Educational attainment (years of education),Body mass index,Post bronchodilator FEV1/FVC ratio,Educational attainment (MTAG),General cognitive ability,Post bronchodilator FEV1,...,Hen's egg allergy,Pediatric areal bone mineral density (radius vs spine & hip discordant skeletal phenotype),Response to allopurinol in gout (change in serum uric acid levels),Trichloroethylene-induced hypersensitivity syndrome,Accelerometer-based physical activity measurement (fraction of time with accelerations >425 milli-gravities),Breast cancer (menopausal hormone therapy interaction),Insulin-related traits,Triglyceride levels x smoking (ever vs never smokers) interaction (2df test),Heart rate response to beta blockers,Thyrotoxic hypokalemic periodic paralysis
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
89453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
89454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
89455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Separación de datos para testeo

#### Separación 1 - Alelos "nuevos"

Tomamos alelos al azar y los sacamos de la matriz de utilidad. De esta forma, no aportarán a las similitudes de los fenotipos.

Vamos a sacar alelos que tengan al menos grado 2 y como máximo grados 6 

In [68]:
np.random.seed(42)
n_alelos_a_sacar = 100
alelos_a_sacar = []

lista_alelos_desordenada = list(dict_id_to_alelo.keys())
lista_alelos_desordenada = np.random.choice(lista_alelos_desordenada,len(lista_alelos_desordenada), replace = False)

grados = data_bipartita.sum(axis = 1)
for alelo in lista_alelos_desordenada:
#     pass
    grado = grados.iloc[alelo]
    
    if grado >=3 and grado <=5:
        alelos_a_sacar.append(alelo)
        
    if len(alelos_a_sacar) == n_alelos_a_sacar:
        break

In [69]:
data_bipartita_test_1 = data_bipartita.iloc[alelos_a_sacar,:]
print(data_bipartita_test_1.shape)
print(data_bipartita_test_1.sum().sum())

(100, 2957)
345


In [70]:
data_bipartita_train = data_bipartita.drop(alelos_a_sacar)
print(data_bipartita_train.shape)

(89357, 2957)


#### ESTO SE PUEDE MEJORAR, PERO POR AHORA CHEQUEAMOS QUE NO QUEDE UN FENOTIPO DE GRADO 1
Nuevamente, lo mejor sería que el conjunto de train tenga una única componente.

In [71]:
data_bipartita_train.sum().min()

2

Sacamos del conjunto de test algunos 

In [72]:
np.random.seed(42)
for idx, row in data_bipartita_test_1.iterrows():
    
    fenotipos_asociados = list(row[row == 1].index)
    fenotipos_asociados = np.random.choice(fenotipos_asociados, len(fenotipos_asociados), replace = False)
    contador = 0
    for fenotipo in fenotipos_asociados:
        data_bipartita_test_1.loc[idx, fenotipo] = 0
        contador +=1
        if contador == len(fenotipos_asociados) - 1:
            break

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_bipartita_test_1.loc[idx, fenotipo] = 0


In [73]:
data_bipartita_test_1.sum().sum()

100

### 3. Armamos la red --> ¿Pasar a Gephy la visualización?

Se puede saltear por ahora, no lo usamos

In [None]:
import networkx as nx

In [None]:
red = nx.Graph()

In [None]:
red.add_edges_from(data_tripletes[['alelo_riesgo', 'fenotipo']].values)

In [None]:
### MUY PESADO EN MEMORIA
matriz_adyacencia = nx.to_pandas_adjacency(red, dtype = np.int8)

In [None]:
# prueba = np.ones((99365, 99365), dtype = np.uint8)
# del prueba

In [None]:
matriz_adyacencia

In [None]:
data_bipartita

### 4. Filtro Colaborativo Implícito

https://medium.com/radon-dev/item-item-collaborative-filtering-with-binary-or-unary-data-e8f0b465b2c3

In [74]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

En primer lugar, normalizamos con respecto a la cantidad de enlaces que tiene cada alelo. En las películas, esto se justifica como 

> *This is the idea of normalizing the user vectors so that a user with many ratings contributes less to any individual rating. This is to say that a like from a user who has only liked 10 items is more valuable to us than a like from someone who likes everything she comes across.*

In [75]:
#------------------------
# ITEM-ITEM CALCULATIONS
#------------------------

# As a first step we normalize the user vectors to unit vectors.

# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(data_bipartita_train).sum(axis=1))

# # unitvector = (x / magnitude, y / magnitude, z / magnitude, ...)
data_bipartita_normalizada = data_bipartita_train.divide(magnitude, axis='index')
data_bipartita_normalizada

Unnamed: 0,Blood protein levels,Heel bone mineral density,Height,Metabolite levels,Educational attainment (years of education),Body mass index,Post bronchodilator FEV1/FVC ratio,Educational attainment (MTAG),General cognitive ability,Post bronchodilator FEV1,...,Hen's egg allergy,Pediatric areal bone mineral density (radius vs spine & hip discordant skeletal phenotype),Response to allopurinol in gout (change in serum uric acid levels),Trichloroethylene-induced hypersensitivity syndrome,Accelerometer-based physical activity measurement (fraction of time with accelerations >425 milli-gravities),Breast cancer (menopausal hormone therapy interaction),Insulin-related traits,Triglyceride levels x smoking (ever vs never smokers) interaction (2df test),Heart rate response to beta blockers,Thyrotoxic hypokalemic periodic paralysis
0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.2,0.000000,0.0,0.2,0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.2,0.0,0.0,0.0,0.0,0.200000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89452,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89453,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89454,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89455,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Luego,calculamos la similaridad coseno para cada columna, obteniendo así qué fenotipos son parecidos.

In [76]:
def calculate_similarity(data_items):
    """Calculate the column-wise cosine similarity for a sparse
    matrix. Return a new dataframe matrix with similarities.
    """
    data_sparse = sparse.csr_matrix(data_items)
    similarities = cosine_similarity(data_sparse.transpose())
    sim = pd.DataFrame(data=similarities, index= data_items.columns, columns= data_items.columns)
    return sim

# Build the similarity matrix
data_matrix = calculate_similarity(data_bipartita_normalizada)
data_matrix.head()

Unnamed: 0,Blood protein levels,Heel bone mineral density,Height,Metabolite levels,Educational attainment (years of education),Body mass index,Post bronchodilator FEV1/FVC ratio,Educational attainment (MTAG),General cognitive ability,Post bronchodilator FEV1,...,Hen's egg allergy,Pediatric areal bone mineral density (radius vs spine & hip discordant skeletal phenotype),Response to allopurinol in gout (change in serum uric acid levels),Trichloroethylene-induced hypersensitivity syndrome,Accelerometer-based physical activity measurement (fraction of time with accelerations >425 milli-gravities),Breast cancer (menopausal hormone therapy interaction),Insulin-related traits,Triglyceride levels x smoking (ever vs never smokers) interaction (2df test),Heart rate response to beta blockers,Thyrotoxic hypokalemic periodic paralysis
Blood protein levels,1.0,0.0,0.000444,0.0,0.0,0.000685,0.00014,0.0,0.0,0.000522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Heel bone mineral density,0.0,1.0,0.002303,0.00024,0.000189,0.000173,0.0,0.000585,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Height,0.000444,0.002303,1.0,7.8e-05,0.000323,0.000754,0.00022,0.000399,0.0,0.000274,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Metabolite levels,0.0,0.00024,7.8e-05,1.0,0.000336,0.0,0.0,0.0,0.000352,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Educational attainment (years of education),0.0,0.000189,0.000323,0.000336,1.0,0.001954,0.0,0.283447,0.003053,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Veamos similaridades

In [77]:
# Lets get the top 11 similar artists for Beyonce
# print(data_matrix.loc['Drinking behavior'].nlargest(25))

print(data_matrix.loc['Educational attainment (MTAG)'].nlargest(30))

Educational attainment (MTAG)                                                     1.000000
Educational attainment (years of education)                                       0.283447
Highest math class taken (MTAG)                                                   0.148955
Cognitive performance (MTAG)                                                      0.062867
Self-reported math ability (MTAG)                                                 0.021035
Intelligence (MTAG)                                                               0.011431
Household income (MTAG)                                                           0.010181
Cognitive performance                                                             0.008260
Intelligence                                                                      0.007398
Highest math class taken                                                          0.006975
Urea levels                                                                       0.004796

In [78]:
# Construct a new dataframe with the 10 closest neighbours (most similar)
# for each artist.

n_neighbours = 20
data_neighbours = pd.DataFrame(index=data_matrix.columns, columns=range(1,n_neighbours+1))
for i in range(0, len(data_matrix.columns)):
    data_neighbours.iloc[i,:n_neighbours] = data_matrix.iloc[0:,i].sort_values(ascending=False)[:n_neighbours].index
data_neighbours.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Blood protein levels,Blood protein levels,Planum temporale asymmetry index,Soluble levels of adhesion molecules,L-arginine levels,Insulinogenic index,Neovascular age-related macular degeneration,NT-proBNP levels in acute coronary syndrome,ADAMTS13 activity,B-type natriuretic peptide levels,Neonatal cytokine/chemokine levels (fetal gene...,Butyrylcholinesterase levels,Vitamin B12 levels,Blood protein levels in cardiovascular risk,Expressive vocabulary in infants,Metabolic traits,Age-related macular degeneration (choroidal ne...,HIV-1 susceptibility,Composite immunoglobulin trait (IgA/IgG),Serum cancer antigen 19.9 levels,Serum protein levels (sST2)
Heel bone mineral density,Heel bone mineral density,Serum sclerostin levels,Fractures,Thyroid function,Systolic blood pressure change trajectories,Serum urea levels,Multisite chronic pain,Femoral neck bone mineral density,Cardiovascular disease risk factors,Total body bone mineral density,Systemic lupus erythematosus or rheumatoid art...,Height,Maternal nondisjunction of chromosome 21 (MI e...,Heel bone mineral density (MTAG),Total body bone mineral density (MTAG),Serum alkaline phosphatase levels,Stem cell factor levels,Lumbar spine bone mineral density (trabecular),Maternal nondisjunction of chromosome 21 (moth...,Educational attainment (college completion)
Height,Height,Infant length,Birth length,Hip circumference adjusted for BMI,Puberty onset (breast development),Waist circumference adjusted for BMI in non-sm...,Glucagon levels in response to oral glucose to...,Waist circumference adjusted for BMI (adjusted...,Developmental dysplasia of the hip,Birth weight,Waist circumference adjusted for BMI in active...,Age at voice drop,Anthropometric traits,Head circumference (infant),Infant head circumference (MTAG),Weight,Waist circumference adjusted for body mass index,Body fat distribution (arm fat ratio),Birth weight (MTAG),Menarche and menopause (age at onset)
Metabolite levels,Metabolite levels,Parkinson's disease in GBA mutation carriers,Statin-induced myopathy (severe),Red blood cell fatty acid levels,Serum urea levels,Acylcarnitine levels,Urinary sodium to creatinine ratio,Plasma factor V levels in venous thrombosis (c...,Fasting blood glucose,Lipoprotein-associated phospholipase A2 activi...,Plasma free amino acid levels (adjusted for tw...,Folate pathway vitamin levels,Change in serum metabolite levels,QRS interval (sulfonylurea treatment interaction),Hematological and biochemical traits,Change in serum metabolite levels (CMS),Alanine transaminase levels,Total bilirubin levels in HIV-1 infection,Low HDL-cholesterol levels,Serum metabolite levels
Educational attainment (years of education),Educational attainment (years of education),Educational attainment (MTAG),Highest math class taken (MTAG),Cognitive performance (MTAG),Household income (MTAG),Intelligence (MTAG),Self-reported math ability (MTAG),Educational attainment (college completion),Autism and major depressive disorder (MTAG),Risk-taking tendency (4-domain principal compo...,Household income,Smoking behaviour (cigarette pack-years),Intelligence,Regular attendance at a gym or sports club,Urea levels,Cigarettes smoked per day (MTAG),Eating disorders,Alcohol consumption,Educational attainment,Highest math class taken


In [79]:
# np.random.seed(43)
alelo_prueba = np.random.choice(data_tripletes.alelo_riesgo.unique())
alelo_prueba = 'rs6739779-C'
alelo_prueba


'rs6739779-C'

In [80]:
alelo_prueba_index = dict_alelo_to_id[alelo_prueba]
print(alelo_prueba_index)

known_alelo_fenotipos = data_bipartita_normalizada.iloc[alelo_prueba_index]
known_alelo_fenotipos = known_alelo_fenotipos[known_alelo_fenotipos >0].index.values
print(known_alelo_fenotipos)

9626
['Educational attainment (MTAG)' 'Highest math class taken (MTAG)']


In [81]:
# Construct the neighbourhood from the most similar items to the
# ones our alelo it's related
most_similar_fenotipos = data_neighbours.loc[known_alelo_fenotipos]
most_similar_fenotipos

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Educational attainment (MTAG),Educational attainment (MTAG),Educational attainment (years of education),Highest math class taken (MTAG),Cognitive performance (MTAG),Self-reported math ability (MTAG),Intelligence (MTAG),Household income (MTAG),Cognitive performance,Intelligence,Highest math class taken,Urea levels,Smoking initiation (ever regular vs never regu...,Self-reported math ability,General cognitive ability,Age of smoking initiation (MTAG),Educational attainment (college completion),Age-related cognitive decline (attention/proce...,Smoking cessation (MTAG),Household income,Neuroticism (MTAG)
Highest math class taken (MTAG),Highest math class taken (MTAG),Educational attainment (MTAG),Self-reported math ability (MTAG),Cognitive performance (MTAG),Educational attainment (years of education),Highest math class taken,Self-reported math ability,Cognitive performance,Intelligence (MTAG),Intelligence,Extremely high intelligence,Household income (MTAG),Kawasaki disease,Glioblastoma,Smoking initiation (ever regular vs never regu...,Smoking cessation,Educational attainment (college completion),Glaucoma,Allergic rhinitis,BMI in non-smokers


In [82]:
similar_list = most_similar_fenotipos.values.tolist()
similar_list = list(set([item for sublist in similar_list for item in sublist]))
similar_list


['Cognitive performance',
 'Household income (MTAG)',
 'Educational attainment (MTAG)',
 'Age-related cognitive decline (attention/processing speed) (slope of z-scores)',
 'Cognitive performance (MTAG)',
 'Self-reported math ability',
 'Self-reported math ability (MTAG)',
 'Smoking initiation (ever regular vs never regular) (MTAG)',
 'Intelligence',
 'Educational attainment (college completion)',
 'Glioblastoma',
 'Glaucoma',
 'Urea levels',
 'Neuroticism (MTAG)',
 'General cognitive ability',
 'Allergic rhinitis',
 'Household income',
 'Highest math class taken (MTAG)',
 'BMI in non-smokers',
 'Smoking cessation',
 'Extremely high intelligence',
 'Educational attainment (years of education)',
 'Age of smoking initiation (MTAG)',
 'Highest math class taken',
 'Kawasaki disease',
 'Intelligence (MTAG)',
 'Smoking cessation (MTAG)']

In [83]:
neighbourhood = data_matrix[similar_list].loc[similar_list]
neighbourhood

Unnamed: 0,Cognitive performance,Household income (MTAG),Educational attainment (MTAG),Age-related cognitive decline (attention/processing speed) (slope of z-scores),Cognitive performance (MTAG),Self-reported math ability,Self-reported math ability (MTAG),Smoking initiation (ever regular vs never regular) (MTAG),Intelligence,Educational attainment (college completion),...,Highest math class taken (MTAG),BMI in non-smokers,Smoking cessation,Extremely high intelligence,Educational attainment (years of education),Age of smoking initiation (MTAG),Highest math class taken,Kawasaki disease,Intelligence (MTAG),Smoking cessation (MTAG)
Cognitive performance,1.0,0.001053,0.00826,0.0,0.146346,0.000382,0.010251,0.00241,0.089125,0.0,...,0.010186,0.0,0.0,0.002845,0.003643,0.0,0.001132,0.0,0.035952,0.0
Household income (MTAG),0.001053,1.0,0.010181,0.0,0.011688,0.0,0.000747,0.00323,0.008822,0.017149,...,0.004919,0.0,0.0,0.0,0.017487,0.0,0.0,0.0,0.027864,0.001923
Educational attainment (MTAG),0.00826,0.010181,1.0,0.003181,0.062867,0.004185,0.021035,0.00446,0.007398,0.003353,...,0.148955,0.002002,0.0,0.001119,0.283447,0.003663,0.006975,0.0,0.011431,0.003158
Age-related cognitive decline (attention/processing speed) (slope of z-scores),0.0,0.0,0.003181,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002575,0.0,0.0,0.0,0.0,0.0
Cognitive performance (MTAG),0.146346,0.011688,0.062867,0.0,1.0,0.004948,0.045281,0.000732,0.042574,0.00332,...,0.074647,0.0,0.0,0.001661,0.035912,0.0,0.007706,0.0,0.032505,0.0
Self-reported math ability,0.000382,0.0,0.004185,0.0,0.004948,1.0,0.109468,0.0,0.001866,0.0,...,0.020191,0.0,0.0,0.0,0.001662,0.0,0.040554,0.0,0.00091,0.0
Self-reported math ability (MTAG),0.010251,0.000747,0.021035,0.0,0.045281,0.109468,1.0,0.001328,0.003951,0.0,...,0.123139,0.002477,0.0,0.0,0.011141,0.0,0.012877,0.0,0.004982,0.0
Smoking initiation (ever regular vs never regular) (MTAG),0.00241,0.00323,0.00446,0.0,0.000732,0.0,0.001328,1.0,0.0,0.0,...,0.002733,0.0,0.001879,0.0,0.002751,0.105771,0.0,0.0,0.000985,0.01818
Intelligence,0.089125,0.008822,0.007398,0.0,0.042574,0.001866,0.003951,0.0,1.0,0.001597,...,0.008038,0.004767,0.0,0.0,0.003944,0.0,0.001985,0.0,0.066037,0.0
Educational attainment (college completion),0.0,0.017149,0.003353,0.0,0.00332,0.0,0.0,0.0,0.001597,1.0,...,0.002359,0.0,0.0,0.0,0.009049,0.0,0.0,0.0,0.004357,0.0


In [84]:
# A user vector containing only the neighbourhood items and
# the known user likes.
fenotipos_probables = data_bipartita_normalizada.iloc[alelo_prueba_index].loc[similar_list]
fenotipos_probables

Cognitive performance                                                             0.000000
Household income (MTAG)                                                           0.000000
Educational attainment (MTAG)                                                     0.707107
Age-related cognitive decline (attention/processing speed) (slope of z-scores)    0.000000
Cognitive performance (MTAG)                                                      0.000000
Self-reported math ability                                                        0.000000
Self-reported math ability (MTAG)                                                 0.000000
Smoking initiation (ever regular vs never regular) (MTAG)                         0.000000
Intelligence                                                                      0.000000
Educational attainment (college completion)                                       0.000000
Glioblastoma                                                                      0.000000

In [85]:
# Calculate the score.
score = neighbourhood.dot(fenotipos_probables).div(neighbourhood.sum(axis=1))
score

Cognitive performance                                                             0.009751
Household income (MTAG)                                                           0.008838
Educational attainment (MTAG)                                                     0.506695
Age-related cognitive decline (attention/processing speed) (slope of z-scores)    0.002237
Cognitive performance (MTAG)                                                      0.065612
Self-reported math ability                                                        0.014486
Self-reported math ability (MTAG)                                                 0.075338
Smoking initiation (ever regular vs never regular) (MTAG)                         0.004443
Intelligence                                                                      0.008511
Educational attainment (college completion)                                       0.003876
Glioblastoma                                                                      0.004220

In [86]:
# Drop the known likes.
# score = score.drop(known_alelo_fenotipos).sort_values()
score.sort_values(ascending = False, inplace = True)
score

Highest math class taken (MTAG)                                                   0.530427
Educational attainment (MTAG)                                                     0.506695
Educational attainment (years of education)                                       0.167441
Self-reported math ability (MTAG)                                                 0.075338
Cognitive performance (MTAG)                                                      0.065612
Highest math class taken                                                          0.024323
Self-reported math ability                                                        0.014486
Intelligence (MTAG)                                                               0.011505
Cognitive performance                                                             0.009751
Household income (MTAG)                                                           0.008838
Intelligence                                                                      0.008511

In [87]:
print(known_alelo_fenotipos)
print(score.nlargest(30))

['Educational attainment (MTAG)' 'Highest math class taken (MTAG)']
Highest math class taken (MTAG)                                                   0.530427
Educational attainment (MTAG)                                                     0.506695
Educational attainment (years of education)                                       0.167441
Self-reported math ability (MTAG)                                                 0.075338
Cognitive performance (MTAG)                                                      0.065612
Highest math class taken                                                          0.024323
Self-reported math ability                                                        0.014486
Intelligence (MTAG)                                                               0.011505
Cognitive performance                                                             0.009751
Household income (MTAG)                                                           0.008838
Intelligence          

### Sobre Conjunto de Test

"A mano" por ahora

In [104]:
# np.random.seed(50)
alelo_prueba_index = np.random.choice(data_bipartita_test_1.index)
alelo_prueba_index

3242

In [105]:
alelo_prueba = dict_id_to_alelo[alelo_prueba_index]
alelo_prueba

'rs319690-T'

In [106]:
known_alelo_fenotipos = data_bipartita_test_1.loc[alelo_prueba_index]
known_alelo_fenotipos = known_alelo_fenotipos[known_alelo_fenotipos >0].index.values
print(known_alelo_fenotipos)

['Diastolic blood pressure']


In [107]:
fenotipos_reportados_alelo = data_bipartita.loc[alelo_prueba_index]
fenotipos_reportados_alelo = fenotipos_reportados_alelo[fenotipos_reportados_alelo >0].index.values
print(fenotipos_reportados_alelo)

['Systolic blood pressure' 'Diastolic blood pressure' 'Blood pressure']


In [108]:
# Construct the neighbourhood from the most similar items to the
# ones our alelo it's related
most_similar_fenotipos = data_neighbours.loc[known_alelo_fenotipos]
most_similar_fenotipos

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Diastolic blood pressure,Diastolic blood pressure,Systolic blood pressure,Mean arterial pressure,Hypertension,Pulse pressure,Mean arterial pressure (long-term average),Systolic blood pressure (long-term average),Diastolic blood pressure (long-term average),Medication use (agents acting on the renin-ang...,Drinking behavior,Myocardial infarction,Blood pressure,Soluble ICAM-1,Medication use (beta blocking agents),Coronary artery disease (myocardial infarction...,Activated partial thromboplastin time,Medication use (antihypertensives),Macrophage inflammatory protein 1a levels,Autoimmune hepatitis type-1,Heart rate


In [109]:
similar_list = most_similar_fenotipos.values.tolist()
similar_list = list(set([item for sublist in similar_list for item in sublist]))
similar_list


['Soluble ICAM-1',
 'Medication use (antihypertensives)',
 'Mean arterial pressure',
 'Activated partial thromboplastin time',
 'Macrophage inflammatory protein 1a levels',
 'Diastolic blood pressure',
 'Drinking behavior',
 'Systolic blood pressure (long-term average)',
 'Coronary artery disease (myocardial infarction, percutaneous transluminal coronary angioplasty, coronary artery bypass grafting, angina or chromic ischemic heart disease)',
 'Pulse pressure',
 'Heart rate',
 'Myocardial infarction',
 'Blood pressure',
 'Systolic blood pressure',
 'Mean arterial pressure (long-term average)',
 'Hypertension',
 'Medication use (beta blocking agents)',
 'Autoimmune hepatitis type-1',
 'Diastolic blood pressure (long-term average)',
 'Medication use (agents acting on the renin-angiotensin system)']

In [110]:
neighbourhood = data_matrix[similar_list].loc[similar_list]
neighbourhood

Unnamed: 0,Soluble ICAM-1,Medication use (antihypertensives),Mean arterial pressure,Activated partial thromboplastin time,Macrophage inflammatory protein 1a levels,Diastolic blood pressure,Drinking behavior,Systolic blood pressure (long-term average),"Coronary artery disease (myocardial infarction, percutaneous transluminal coronary angioplasty, coronary artery bypass grafting, angina or chromic ischemic heart disease)",Pulse pressure,Heart rate,Myocardial infarction,Blood pressure,Systolic blood pressure,Mean arterial pressure (long-term average),Hypertension,Medication use (beta blocking agents),Autoimmune hepatitis type-1,Diastolic blood pressure (long-term average),Medication use (agents acting on the renin-angiotensin system)
Soluble ICAM-1,1.0,0.0,0.0,0.0,0.0,0.006007,0.0,0.0,0.009962,0.005672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Medication use (antihypertensives),0.0,1.0,0.011021,0.0,0.0,0.004832,0.0,0.0,0.0,0.013688,0.0,0.0,0.0,0.017546,0.0,0.010527,0.0,0.0,0.0,0.0
Mean arterial pressure,0.0,0.011021,1.0,0.000969,0.0,0.136234,0.0,0.071059,0.004946,0.03843,0.0,0.004239,0.005358,0.111029,0.072149,0.147493,0.010861,0.0,0.062898,0.005047
Activated partial thromboplastin time,0.0,0.0,0.000969,1.0,0.0,0.004884,0.0,0.0,0.0,0.000401,0.0,0.0,0.0,0.000386,0.0,0.0,0.0,0.0,0.0,0.0
Macrophage inflammatory protein 1a levels,0.0,0.0,0.0,0.0,1.0,0.004642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Diastolic blood pressure,0.006007,0.004832,0.136234,0.004884,0.004642,1.0,0.008741,0.023446,0.005059,0.040736,0.004236,0.006814,0.006293,0.141934,0.024041,0.059575,0.005467,0.004244,0.018868,0.011654
Drinking behavior,0.0,0.0,0.0,0.0,0.0,0.008741,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007936,0.0,0.0,0.0,0.0,0.0,0.0
Systolic blood pressure (long-term average),0.0,0.0,0.071059,0.0,0.0,0.023446,0.0,1.0,0.0,0.029417,0.0,0.0,0.0,0.028283,0.771663,0.048281,0.0,0.0,0.279524,0.0
"Coronary artery disease (myocardial infarction, percutaneous transluminal coronary angioplasty, coronary artery bypass grafting, angina or chromic ischemic heart disease)",0.009962,0.0,0.004946,0.0,0.0,0.005059,0.0,0.0,1.0,0.004519,0.0,0.120614,0.0,0.006772,0.0,0.0,0.008872,0.0,0.0,0.004427
Pulse pressure,0.005672,0.013688,0.03843,0.000401,0.0,0.040736,0.0,0.029417,0.004519,1.0,0.007749,0.002407,0.012379,0.087663,0.029869,0.032302,0.003497,0.0,0.026039,0.005768


In [111]:
data_bipartita_test_1.loc[alelo_prueba_index]

Blood protein levels                                                            0
Heel bone mineral density                                                       0
Height                                                                          0
Metabolite levels                                                               0
Educational attainment (years of education)                                     0
                                                                               ..
Breast cancer (menopausal hormone therapy interaction)                          0
Insulin-related traits                                                          0
Triglyceride levels x smoking (ever vs never smokers) interaction (2df test)    0
Heart rate response to beta blockers                                            0
Thyrotoxic hypokalemic periodic paralysis                                       0
Name: 3242, Length: 2957, dtype: int8

In [112]:
# A user vector containing only the neighbourhood items and
# the known user likes.
fenotipos_probables = data_bipartita_test_1.loc[alelo_prueba_index].loc[similar_list]
fenotipos_probables

Soluble ICAM-1                                                                                                                                                                0
Medication use (antihypertensives)                                                                                                                                            0
Mean arterial pressure                                                                                                                                                        0
Activated partial thromboplastin time                                                                                                                                         0
Macrophage inflammatory protein 1a levels                                                                                                                                     0
Diastolic blood pressure                                                                                                

In [113]:
# Calculate the score.
score = neighbourhood.dot(fenotipos_probables).div(neighbourhood.sum(axis=1))
score

Soluble ICAM-1                                                                                                                                                                0.005880
Medication use (antihypertensives)                                                                                                                                            0.004569
Mean arterial pressure                                                                                                                                                        0.081008
Activated partial thromboplastin time                                                                                                                                         0.004852
Macrophage inflammatory protein 1a levels                                                                                                                                     0.004621
Diastolic blood pressure                                                             

In [114]:
# Drop the known likes.
# score = score.drop(known_alelo_fenotipos).sort_values()
score.sort_values(ascending = False, inplace = True)
score

Diastolic blood pressure                                                                                                                                                      0.658888
Systolic blood pressure                                                                                                                                                       0.092953
Mean arterial pressure                                                                                                                                                        0.081008
Hypertension                                                                                                                                                                  0.039343
Pulse pressure                                                                                                                                                                0.030388
Medication use (agents acting on the renin-angiotensin system)                       

In [115]:
print(known_alelo_fenotipos)
print(fenotipos_reportados_alelo)
print(score.nlargest(50))

['Diastolic blood pressure']
['Systolic blood pressure' 'Diastolic blood pressure' 'Blood pressure']
Diastolic blood pressure                                                                                                                                                      0.658888
Systolic blood pressure                                                                                                                                                       0.092953
Mean arterial pressure                                                                                                                                                        0.081008
Hypertension                                                                                                                                                                  0.039343
Pulse pressure                                                                                                                                                         

### Cosas para hacer

1. Chequear sobre `categoria_fenotipo` que no sea trivial
1. Agregar segunda forma de evaluación: borrando '1' de la matriz de train
1. Chequear que al separar train y test no se rompa la red
1. Metodizar la evaluación para obtener una métrica
1. Chequear que no estemos cayendo en la parte "fácil" de la red --> ¿pregunta biológica?