### Human Phenotype Ontology Analysis

Find the similarity between terms in the HPO

In [14]:
%matplotlib inline
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import cosine

from itertools import combinations

import cPickle as pickle

In [2]:
datadir='/Users/kenneth/PhD_Data2017/HPO'

In [None]:
def get_data(datadir=datadir, filename):
    return pd.read_csv(os.path.join(datadir, filename))

In [9]:
for (path, dirs, files) in os.walk(datadir):
    print files

['.DS_Store', 'ALL_SOURCES_ALL_FREQUENCIES_diseases_to_genes_to_phenotypes.txt', 'ALL_SOURCES_ALL_FREQUENCIES_genes_to_phenotype.txt', 'ALL_SOURCES_ALL_FREQUENCIES_phenotype_to_genes.txt', 'diseases_to_genes.txt', 'genes_to_diseases.txt', 'HP.csv', 'HP.csv.gz']


### Create dictionaries

- Create dictionary of phenotypes to genes
- Create dictionary of genes to genes
- Create dictionary of diseases to genes
- Create dictionary of diseases to phenotypes

In [3]:
dfPhenGene = pd.read_csv(os.path.join(datadir, 'ALL_SOURCES_ALL_FREQUENCIES_phenotype_to_genes.txt'), sep='\t', skiprows=1)

Unnamed: 0,HP:0001459,1-3 toe syndactyly,2737,GLI3
0,HP:0006088,1-5 finger complete cutaneous syndactyly,64327,LMBR1
1,HP:0010708,1-5 finger syndactyly,6469,SHH


In [4]:
dfPhenGene.shape

(374365, 4)

In [5]:
columns =['hpoid', 'hpo_name', 'gene_id', 'gene_symbol']

In [6]:
dfPhenGene.columns=columns

In [7]:
dfTop = dfPhenGene.head(100)

In [8]:
dictG = {k:g['gene_id'].tolist() for k,g in  dfTop.groupby(dfTop.hpoid)}


#### Create a dictionary of phenotypes by genes {phen: [gene1, gene2...genen]}

In [9]:
dictPhen = {k:g['gene_id'].tolist() for k,g in  dfPhenGene.groupby(dfPhenGene.hpoid)}



In [39]:
for k, v in dictPhen.iteritems():
#     print k, v
    pass
    
print dictPhen['HP:0006099']

374365/7505

# print dictPhen.keys()

print 

[26229, 9469]



#### Serialize phenotype dictionary into an object

In [15]:
def serializeDict(mydict, myfile ):
    with open(myfile, "wb") as handle:
        pickle.dump(mydict, handle)


In [16]:
serializeDict(dictPhen, "Phenotypes_to_Genes.cpk")

In [None]:
# %%timeit
hpo_pairs = combinations(dictPhen.keys(), 2)

for i in hpo_pairs:
    


In [32]:
# len([i for i in hpo_pairs])
count = 0
for i in hpo_pairs:
    if count < 20:
        print dictPhen[i[0]]
#         print hpo_similarity(dictPhen[i[0]], dictPhen[i[1]])
        count+=1

In [41]:
sims=[hpo_similarity(dictPhen[i[0]], dictPhen[i[1]]) for i in hpo_pairs]

In [43]:
len(sims)

NameError: name 'sims' is not defined

In [57]:
similarities = [hpo_similarity(dictPhen[i[0]],dictPhen[i[1]] ) for i in hpo_pairs]

In [24]:
l1=[1,2,3,4,11]
l2=[1,2,3,6,4,10]

print l1, l2


[1, 2, 3, 4, 11] [1, 2, 3, 6, 4, 10]


In [46]:
lsDisease=['doid1','doid2','doid3']

np.array(lsDisease)

array(['doid1', 'doid2', 'doid3'], 
      dtype='|S5')

In [47]:
from collections import Counter
def buildVector(ls1, ls2):
    counter1 = Counter(ls1)
    counter2 = Counter(ls2)
    
    all_items = set(counter1.keys()).union(set(counter2.keys()))
    vector1 = [counter1[k] for k in all_items]
    vector2 = [counter2[k] for k in all_items]
    return  vector1, vector2
    
vspace = buildVector(l1, l2)

print cosine(vspace[0], vspace[1])



0.26970325666


In [28]:
from numpy.linalg import norm
from numpy import dot, vdot
from sklearn.metrics.pairwise import cosine_similarity


print 1-cosine(vspace[0], vspace[1])
dot(vspace[0], vspace[1])/(norm(vspace[0])* norm(vspace[1]))
# val1 = vspace[0].reshape(1, -1)

print cosine_similarity(np.array(vspace[0]).reshape(1,-1), np.array(vspace[1]).reshape(1,-1))

0.73029674334
[[ 0.73029674]]


### Serialize dictionaries into objects

In [48]:
import cPickle as pickle
def picklize(func):
    def inner(mydict, myfile):
        print "Pickling File: {0}".format(myfile)
        with open(myfile, 'wb') as handle:
            pickle.dump(mydict, handle)   
        return func(mydict,myfile)
    return inner

@picklize
def geneToDisease(mydict, myfile):
    print "Genes to Disease"
    
geneToDisease({'one':1, 'two':2}, "GeneFile.txt")

Pickling File: GeneFile.txt
Genes to Disease


In [49]:
# Use a decorator to build vectors from the list
def build_vector(func):
    def inner(lsOne, lsTwo):
        counter1 = Counter(lsOne)
        counter2 = Counter(lsTwo)
        all_items = set(counter1.keys()).union(set(counter2.keys()))
        vector1 = np.array([counter1[k] for k in all_items]).reshape(1,-1)
        vector2 = np.array([counter2[k] for k in all_items]).reshape(1,-1)
        return func(vector1, vector2)
    return  inner
        

In [50]:
@build_vector
def hpo_similarity(ls1, ls2):
    return cosine_similarity(ls1, ls2)

print hpo_similarity(l1, l2)[0]

[ 0.73029674]


In [64]:
# dictG

phen_pairs = combinations(dictG.keys(), 2)

# list(phen_pairs)
sims=[hpo_similarity(dictG[i[0]], dictG[i[1]]) for i in phen_pairs]

print  np.asarray(sims, dtype=np.float).reshape(1,-1)[0]

[ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.57735027  0.          0.          0.          0.
  0.5         0.          0.          0.28347335  0.25        0.28867513
  0.          0.          0.          0.          0.35355339  0.5         0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.57735027  0.          0.57735027
  0.          0.57735027  0.21821789  0.28867513  0.33333333  0.          0.
  0.          0.          0.40824829  0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.   

In [70]:
p=combinations(dictPhen.keys(),)

In [None]:
print 

### Create Similarity matrices of diseases based on phenotypes

### Create Similarity matrices of diseases based on genes

### Create Similarity matrices of genes based on phenotypes