## Network Analysis for A. thaliana Data
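This notebook assembles a dataset of *A. thaliana* genes from previously reshaped text-description and ontology-annotation files, keeps only the genes that belong to at least one pathway or subset grouping, annotates the cleaned descriptions with ontology terms, and then builds pairwise similarity matrices between genes with several methods run in parallel.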

In [9]:
# Imports and kernel-wide configuration for the whole notebook.
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this kernel.
pd.set_option('mode.chained_assignment', None)
# Print every MultiIndex label on every row instead of blanking repeated labels.
pd.set_option('display.multi_sparse', False)
import numpy as np
import time
import sys
import os
import multiprocessing as mp
from sklearn.model_selection import train_test_split

# Extend the module search path with a relative directory before importing from `oats`.
# NOTE(review): the path is relative to the working directory the kernel was started in,
# so these imports presumably only resolve when the notebook is run from its own folder.
sys.path.append("../../oats")
from oats.utils.utils import save_to_pickle, load_from_pickle
from oats.utils.utils import function_wrapper, to_hms

# Marker printed once every import above has succeeded.
print("done loading imports")

done loading imports


In [10]:
from oats.datasets.dataset import Dataset

# Build a single dataset from the files that the other notebooks already formatted.
# Going by the file names these are the gene text descriptions followed by the GO and
# PO annotation tables; they are added in this order.
reshaped_csv_paths = (
    "../data/reshaped_files/ath_tair_gene_text.csv",
    "../data/reshaped_files/ath_tair_gene_annot_go.csv",
    "../data/reshaped_files/ath_tair_gene_annot_po.csv",
)
dataset = Dataset()
for csv_path in reshaped_csv_paths:
    dataset.add_data(pd.read_csv(csv_path, lineterminator="\n"))

# Collapse and filter the combined rows, then print the summary counts.
dataset.collapse_by_first_gene_name()
dataset.filter_has_description()
dataset.filter_has_annotation()
dataset.describe()

Number of rows in the dataframe: 5229
Number of unique IDs:            5229
Number of unique descriptions:   3051
Number of unique gene name sets: 5229
Number of species represented:   1


In [11]:
from oats.datasets.groupings import Groupings
from oats.utils.utils import merge_list_dicts


genes = dataset.get_gene_dictionary()

# Read in the pickled objects that organize genes into groupings.
groupings_kegg = load_from_pickle(path="../data/pickles/kegg_pathways.pickle")
groupings_pmn = load_from_pickle(path="../data/pickles/pmn_pathways.pickle")
groupings_subset = load_from_pickle(path="../data/pickles/lloyd_subsets.pickle")
groupings_class = load_from_pickle(path="../data/pickles/lloyd_classes.pickle")

# Merge the forward dictionaries (object ID -> group IDs) of the KEGG, PMN and subset
# groupings into one mapping. The class groupings are loaded above but not merged here.
forward_dicts = [
    grouping.get_forward_dict(genes)
    for grouping in (groupings_kegg, groupings_pmn, groupings_subset)
]
id_to_pathway_ids = merge_list_dicts(*forward_dicts)

# Report how many IDs belong to at least one group and preview the first 25 of them.
mapped_items = [
    (gene_id, pathway_ids)
    for (gene_id, pathway_ids) in id_to_pathway_ids.items()
    if len(pathway_ids) > 0
]
total = len(id_to_pathway_ids)
has_mapping = len(mapped_items)
print("Of the {} genes in the dataset, {} are mapped to atleast one pathway.".format(total, has_mapping))
for gene_id, pathway_ids in mapped_items[:25]:
    # Each row is truncated to 80 characters of space-separated group IDs.
    print("{:6}{}".format(gene_id, " ".join(pathway_ids)[:80]))

Of the 5229 genes in the dataset, 661 are mapped to atleast one pathway.
16    PWY-5080 PWY-7036
18    PWY-6
19    PWY-6733
21    PWY-7270 ETHYL-PWY
22    PWY-181
24    PWY-5667 TRIGLSYN-PWY
31    PWY-6295 PWY-6733 PWY-84
36    PWY-282
44    PWY-6898 PWY-7356 PWY-6908
49    LEU-DEG2-PWY
55    PWY-622
57    HEME-BIOSYNTHESIS-II CHLOROPHYLL-SYN
61    PWY-6745
68    PWY-581 PWYDQC-4
69    PWY-5080
77    PWY-6446 PWY-6444 PWY-3181
79    PWY-5136
89    PWY-6363 PWY-6364
90    PWY-5272 PWY-1782 PWY-1741
91    PWY-6363 PWY-6364
101   PWY-5136 PWY-6837
103   PWY-6773
104   PWY-5129
105   PWY-5667 TRIGLSYN-PWY
109   PWY-6475


In [12]:
# Keep only the genes that are mapped to at least one group, then draw a fixed-seed
# random sample of 30 of them; the summary is printed after each step.
ids_with_groups = [
    gene_id
    for gene_id, pathway_ids in id_to_pathway_ids.items()
    if len(pathway_ids) > 0
]
dataset.filter_with_ids(ids_with_groups)
dataset.describe()
dataset.filter_random_k(k=30, seed=1938)
dataset.describe()

Number of rows in the dataframe: 661
Number of unique IDs:            661
Number of unique descriptions:   588
Number of unique gene name sets: 661
Number of species represented:   1
Number of rows in the dataframe: 30
Number of unique IDs:            30
Number of unique descriptions:   29
Number of unique gene name sets: 30
Number of species represented:   1


In [13]:
# Fetch the mapping from IDs to gene objects and preview the names of the first ten.
genes = dataset.get_gene_dictionary()
first_genes = list(genes.items())[:10]
for gene_id, gene in first_genes:
    print("{:6}{}".format(gene_id, " ".join(gene.names)))

0     AT4G23850
1     AT5G32470
2     AT1G23320
3     AT1G77240
4     AT4G18780
5     AT4G33010
6     AT5G43940
7     AT5G56610
8     AT4G17360
9     AT4G25700


In [14]:
# Fetch the mapping from IDs to annotations (lists of ontology terms) and preview the
# first ten, truncating each row to 88 characters.
annotations = dataset.get_annotations_dictionary()
first_annotations = list(annotations.items())[:10]
for object_id, term_ids in first_annotations:
    print("{:6}{}".format(object_id, " ".join(term_ids)[:88]))

0     GO:0009805 GO:0001676 GO:0006633 GO:0005634 GO:0002213 GO:0005794 GO:0005886 GO:0004467 
1     GO:0003674 GO:0005829 GO:0008150 GO:0009507
2     GO:0016846 GO:0009793 GO:0080097 GO:0050362 GO:0010588 GO:0080022 GO:0048825 GO:0005737
3     GO:0005739
4     GO:0045492 GO:0030244 GO:0009834 GO:0016757 GO:0010413 GO:0010116 GO:0050832 GO:0009832 
5     GO:0009570 GO:0006098 GO:0048046 GO:0009695 GO:0009072 GO:0019252 GO:0042545 GO:0043085 
6     GO:0051903 GO:0009611 GO:0046292 GO:0005777 GO:0019288 GO:0080007 GO:0009684 GO:0006569 
7     GO:0008138 GO:0005737 GO:0006470 GO:0046855 GO:0043407 GO:0004439
8     GO:0009853 GO:0008864 GO:0000023 GO:0043085 GO:0005739 GO:0019252 GO:0016742 GO:0009058 
9     GO:0016119 GO:0010291 GO:0016123 GO:0009507


In [15]:
from oats.nlp.preprocess import get_clean_description

# Fetch the mapping from IDs to text descriptions and pass every description through
# get_clean_description, keeping the same IDs as keys.
raw_descriptions = dataset.get_description_dictionary()
descriptions = {}
for object_id, raw_text in raw_descriptions.items():
    descriptions[object_id] = get_clean_description(raw_text)

# Preview the first 80 characters of the first ten cleaned descriptions.
for object_id, clean_text in list(descriptions.items())[:10]:
    print("{:6}{}".format(object_id, clean_text[:80]))

0     no visible phenotype
1     true leaves of unsupplemented plants variegated yellowgreen
2     no visible phenotype weakly ethylene insensitive in roots of 3dayold darkgrown s
3     no visible phenotype
4     does not exhibit any obvious increase in  its susceptibility to wilting in gener
5     no significant difference in photosynthetic performance relative to wildtype pla
6     darkgrown seedlings are blocked in hypocotyl elongation after heat stress darkgr
7     short roots disorganized columella cells irregularly divided cells and cellular 
8     no visible phenotype severe photorespiratory phenotypes plants are small pale de
9     chlorophyll ab ratio does not differ significantly from that of wildtype in leav


In [16]:


# Save the dataset as a pickle so it can be read by the multithreading script.
#save_to_pickle(obj=dataset, path="../data/pickles/dataset_001.pickle")

# The contents of the multithreaded script.

from oats.annotation.ontology import Ontology
from oats.annotation.annotation import (
    annotate_using_rabin_karp,
    annotate_using_noble_coder,
    write_annotations_to_tsv_file,
    read_annotations_from_tsv_file,
)
from oats.graphs.similarity import (
    get_similarity_df_using_fastsemsim,
    get_similarity_df_using_doc2vec,
    get_similarity_df_using_bagofwords,
    get_similarity_df_using_setofwords,
    get_similarity_df_using_annotations_unweighted_jaccard,
    get_similarity_df_using_annotations_weighted_jaccard,
)
from oats.graphs.data import combine_dfs_with_name_dict, subset_df_with_ids

print("a")

# Paths used by the ontology and document-embedding steps.
merged_ontology_file = "../ontologies/mo.obo"
annotations_file = "../data/scratch/annot.tsv"
doc2vec_model_file = "../gensim/enwiki_dbow/doc2vec.bin"

# Load the merged ontology.
mo = Ontology(merged_ontology_file)
print("m1")

# Annotate the cleaned descriptions against the ontology.
# NOTE: this rebinds `annotations`, replacing the dictionary that was read from the
# dataset in an earlier cell.
annotations = annotate_using_rabin_karp(object_dict=descriptions, ontology=mo)
print("m2")

# Write the new annotations out to the TSV file named above.
write_annotations_to_tsv_file(annotations_dict=annotations, annotations_output_path=annotations_file)
print("b")

a
m1
m2
b


In [21]:
from oats.graphs.similarity import get_similarity_df_using_doc2vec_scipy

# Setup for creating the pairwise similarity matrices using separate cores.
# Each entry is (name, function, arguments to unpack as a list). Keeping the name beside
# its function means names and results cannot drift out of order, and the same function
# can be listed more than once with different arguments (a dict keyed on the function
# object would silently keep only the last such entry).
similarity_methods = [
    ("ontology", get_similarity_df_using_fastsemsim, [merged_ontology_file, annotations_file, descriptions, True]),
    ("doc2vec", get_similarity_df_using_doc2vec, [doc2vec_model_file, descriptions, True]),
    ("doc2vec_c", get_similarity_df_using_doc2vec_scipy, [doc2vec_model_file, descriptions, True]),
    # Methods that are currently disabled:
    #("bagofwords", get_similarity_df_using_bagofwords, [descriptions, True]),
    #("setofwords", get_similarity_df_using_setofwords, [descriptions, True]),
    #("unweighted_jaccard", get_similarity_df_using_annotations_unweighted_jaccard, [annotations, mo, True]),
    #("weighted_jaccard", get_similarity_df_using_annotations_weighted_jaccard, [annotations, mo, True]),
]
names = [name for (name, _, _) in similarity_methods]
# Retained so that anything still reading the function -> arguments mapping keeps working.
functions_and_args = {function: args for (_, function, args) in similarity_methods}
print("c")

# Use parallel processing to build all the similarity matrices.
# The pool is managed by a `with` block so the worker processes are always cleaned up,
# including when a worker raises and the error is re-raised here by `.get()`.
# NOTE(review): the saved output of this cell ends in
# "IndexError: invalid index to scalar variable."; the truncated output does not show
# which step raised it. The message printed below names the method if it came from a worker.
start_time_mp = time.perf_counter()
num_workers = max(1, min(len(similarity_methods), mp.cpu_count()))
with mp.Pool(num_workers) as pool:
    pending = [
        pool.apply_async(function_wrapper, args=(function, args))
        for (_, function, args) in similarity_methods
    ]
    results = []
    for (name, handle) in zip(names, pending):
        try:
            results.append(handle.get())
        except Exception:
            # Say which method failed, then let the original error propagate unchanged.
            print("Building the '{}' similarity matrix failed.".format(name), file=sys.stderr)
            raise
    pool.close()
    pool.join()
total_time_mp = time.perf_counter()-start_time_mp

# Create a mapping between method names and the similarity matrices that were generated.
# Each result is indexed as (similarity dataframe, duration in seconds).
name_to_df_mapping = {name:result[0] for (name,result) in zip(names,results)}
df = combine_dfs_with_name_dict(name_to_df_mapping)


# Look at how long it took to build each pairwise similarity matrix.
print("\n\n")
print("Durations of generating each pairwise similarity matrix (hh:mm:ss)")
print("-----------------------------------------------------------------")
durations = [result[1] for result in results]
total_duration = sum(durations)
# Guard the ratio so an empty or instantaneous run cannot divide by zero.
savings = total_time_mp/total_duration if total_duration > 0 else float("nan")
for (name,duration) in zip(names,durations):
    print("{:15} {}".format(name, to_hms(duration)))
print("-----------------------------------------------------------------")
print("{:15} {}".format("total", to_hms(total_duration)))
print("{:15} {} ({:.2%} of single thread time)".format("multiprocess", to_hms(total_time_mp), savings))
print("\n\n")


print(df.head(40))

c


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


1 [0.2151 2 [0.4301 3 [0.6452 4 [0.8602 5 [1.0753 6 [1.2903 7 [1.5054 8 [1.7204 9 [1.9355 10 [2.1505 %11 [2.3656 %12 [2.5806 %13 [2.7957 %14 [3.0108 %15 [3.2258 %16 [3.4409 %17 [3.6559 %18 [3.8710 %19 [4.0860 %20 [4.3011 %21 [4.5161 %22 [4.7312 %23 [4.9462 %24 [5.1613 %25 [5.3763 %26 [5.5914 %27 [5.8065 %28 [6.0215 %29 [6.2366 %30 [6.4516 %31 [6.6667 %32 [6.8817 %33 [7.0968 %34 [7.3118 %35 [7.5269 %36 [7.7419 %37 [7.9570 %38 [8.1720 %39 [8.3871 %40 [8.6022 %41 [8.8172 %42 [9.0323 %43 [9.2473 %44 [9.4624 %45 [9.6774 %46 [9.8925 %47 [10.1075 48 [10.3226 49 [10.5376 50 [10.7527 51 [10.9677 52 [11.1828 53 [11.3978 54 [11.6129 55 [11.8280 56 [12.0430 57 [12.2581 58 [12.4731 59 [12.6882 60 [12.9032 61 [13.1183 62 [13.3333 63 [13.5484 64 [13.7634 65 [13.9785 66 [14.1935 67 [14.4086 68 [14.6237 69 [14.8387 70 [15.0538 71 [15.2688 72 [15.4839 73 [15.6989 74 [15.9140 75 [16.1290 76 [16.3441 77 [16.5591 78 [16.7742 79 [16.9892 80 [17.2043 81 [17.4194 82 [17.6344 83 [17.8495 84 [18.0645 85 [18.279



IndexError: invalid index to scalar variable.