In [None]:
%% network expansion algorithm
% author: Joshua Goldford
% date: 11-6-2015

% inputs: R := reactant binary matrix (m x n)
%         P := product binary matrix (m x n)
%         x := starting point (binary vector, m x 1)
%         b := total number of reactants in each reaction (n x 1 vector;
%         can be computed via column sums on R, i.e. sum(R,1)')
% output: out := output structure containing following fields:
%              x := binary vector representing presence/absence of metabolites 
%                   in converged network
%              y := binary vector representing presence/absence of reactions 
%                   in converged network
%              X := (metabolite accumulation matrix) binary mxr matrix 
%                   representing the presence/absence of metabolites at 
%                   iteration r
%              Y := (reaction accumulation matrix) binary nxr matrix 
%                   representing the presence/absence of reactions at 
%                   iteration r


function [out] = netExp(R,P,x,b)
% NETEXP  Iteratively expand a metabolic network from a seed set.
%
%   out = netExp(R,P,x,b)
%
%   R : reactant binary matrix (metabolites x reactions)
%   P : product binary matrix  (metabolites x reactions)
%   x : binary column vector marking the seed metabolites
%   b : column vector with the number of reactants of each reaction
%
%   A reaction fires once all of its reactants are present (R'*x == b);
%   every product of a firing reaction is added to the metabolite set.
%   Iteration stops when the number of metabolites no longer grows.
%
%   out.x : metabolites present in the converged network
%   out.y : reactions present in the converged network (all zeros when
%           the seed set is empty and no iteration is performed)
%   out.X : metabolite accumulation matrix; column 1 is the seed set and
%           one column is appended per iteration
%   out.Y : reaction accumulation matrix; one column per iteration
%           ([] when no iteration is performed)

% initialize variables:
% total number of metabolites in the seed set
k = sum(x);
% metabolite count of the previous iteration; starting at 0 forces a
% first pass whenever the seed set is non-empty
k0 = 0;
% iteration 1 consists of the seed set
X = x;
% initialize reaction accumulation matrix
Y = [];
% default reaction vector: with an empty seed set (k == 0) the loop body
% never executes, so y must already exist when the output is assembled
y = zeros(size(R,2),1);

% while the metabolite set has not converged
while k > k0
    % update previous number of metabolites
    k0 = k;
    % R'*x counts, for each reaction, how many of its reactants are
    % present in the network; if this equals the total number of
    % reactants of the reaction, the reaction is added to the network
    y = double(R'*x == b);

    % P*y counts, for each metabolite, the active reactions producing
    % it; if this is > 0 the metabolite is producible
    x_new = double(P*y > 0);
    % merge with the previous set of metabolites (needed to retain the
    % seed set, which may not be produced by any reaction)
    x = double(x | x_new);
    % new total number of metabolites in the network
    k = sum(x);

    % append to the accumulation matrices
    X = [X,x];
    Y = [Y,y];
end

% pack variables into the output structure
out.x = x;
out.y = y;
out.X = X;
out.Y = Y;

end

In [120]:
import json
import collections
import re
import os
import glob

In [3]:
def load_json(fname):
    """
    Read a JSON file and return the decoded object.

    :param fname: the filepath to the json file
    :return: whatever ``json.load`` produces for the file's content
    """
    handle = open(fname)
    try:
        parsed = json.load(handle)
    finally:
        # always release the file handle, even if decoding fails
        handle.close()
    return parsed

In [20]:
# Load one metadata JSON file per domain. Paths are relative to the
# notebook's working directory. Note that the bacteria file is the
# "_subset" file, unlike the other two.
archaea_metadata = load_json("../jgi/metadata/archaea_metadata.json")
bacteria_metadata = load_json("../jgi/metadata/bacteria_metadata_subset.json")
eukarya_metadata = load_json("../jgi/metadata/eukarya_metadata.json")

In [6]:
# List the top-level keys of the archaea metadata object.
for k in archaea_metadata:
    print k

sort
totalRecords
checked
pageSize
recordsReturned
records
startIndex
filtChecked
allSelect
dir


In [8]:
# Number of entries stored under the 'records' key.
len([k for k in archaea_metadata['records']])

1625

In [10]:
# Print the field names of the first record only; the `break` stops the
# outer loop after one record.
for k in archaea_metadata['records']:
    for category in k:
        print category
    break

Enzymeassembled
IMGProductAssignment
MetaCycCountassembled
ContactNameDisp
BiosyntheticClusterGeneassembledDisp
ProportalWOASalinity
GenomeProperty
SampleBodySubsiteDisp
SpecificEcosystem
COGClusterCountassembledDisp
RevisedCountDisp
GOLDSequencingProjectIDDisp
SampleBodySiteDisp
DomainDisp
Sporulation
UncharCountDisp
FundingProgram
PfamClusterCountassembledDisp
GOLDAnalysisProjectID
ITSSPIDDisp
Comments
MetabolismDisp
RNAassembled
SporulationDisp
SequencingCenter
JGIProjectIDITSSPIDDisp
Select
CodingBaseCountNP127
Alt2ContactEmails
KOGCountDisp
BioticRelationshipsDisp
KOGCount
CladeDisp
TransmembraneCount
MyIMGAnnotation
23SrRNACountassembledDisp
wFuncPredCountassembled
NitrateConcentrationDisp
ProportalIsolation
HostGender
ProportalOcean
NCBIProjectIDDisp
GOLDSequencingDepth
ChromosomalCassetteGeneCountDisp
IMGPartsListCountDisp
StrainDisp
OrthologsCountDisp
LonghurstCodeDisp
Fused
CodingBaseCountassembledDisp
JGIAnalysisProductName
PfamCountassembledDisp
PMOProjectIDDisp
ParalogGrou

In [11]:
# Print the raw 'pH' field of every archaea record.
# NOTE(review): 'zzz' in the output looks like the placeholder for a missing
# value -- confirm against the data source.
for entry in archaea_metadata['records']:
    print entry['pH']

zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
5.2 - 7
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
5.8 - 9.0
zzz
zzz
zzz
zzz
zzz
zzz
6.5
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
6.02
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
2.7
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
7.0-7.5
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
6.5
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
6.5 - 7.0
zzz
zzz
zzz
zzz
zzz
zzz
8.5
zzz
zzz
zzz
6.5 - 8
zzz
6.5
zzz
2.7
zzz
2-3
zzz
7.5
6.5
7
zzz
zzz
zzz
zzz
6
zzz
zzz
zzz
zzz
zzz
5 - 9
zzz
zzz
zzz
zzz
0-3.5
zzz
9.0 - 9.5
zzz
6.5
zzz
zzz
zzz
zzz
7.4
zzz
zzz
zzz
zzz
zzz
7
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
6.5
8.0
zzz
zzz
zzz
zzz
zzz
zzz
6.5
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
9
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
2-3
7
zzz
zzz
zzz
6.8
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
zzz
7
zzz
zzz
z

In [116]:
# Count how many archaea records carry each distinct raw 'pH' string.
collections.Counter([entry['pH'] for entry in archaea_metadata['records']])

Counter({u'0-3.5': 1,
         u'0.5 - 4': 1,
         u'0.7': 1,
         u'1 - 4': 1,
         u'1 - 6': 1,
         u'2': 1,
         u'2 - 3': 1,
         u'2 - 4.5': 1,
         u'2-3': 9,
         u'2.0': 2,
         u'2.0 - 4.5': 2,
         u'2.5 - 3': 1,
         u'2.7': 3,
         u'3': 3,
         u'3.0': 1,
         u'3.5': 1,
         u'3.5 - 4.0': 1,
         u'3.6': 1,
         u'4 - 6': 1,
         u'4.0': 1,
         u'4.0 - 4.5': 1,
         u'4.5 - 8.5': 1,
         u'4.6 - 7.5': 1,
         u'5 - 9': 2,
         u'5.0-7.4': 1,
         u'5.2 - 7': 1,
         u'5.2?8.0': 1,
         u'5.4': 4,
         u'5.5': 3,
         u'5.5 - 6.0': 1,
         u'5.5 - 7': 1,
         u'5.5 - 7.5': 1,
         u'5.5 - 8': 1,
         u'5.5 - 8.5': 1,
         u'5.5-8.0': 1,
         u'5.8': 1,
         u'5.8 - 9.0': 2,
         u'6': 2,
         u'6 - 8': 1,
         u'6 - 8.5': 1,
         u'6-8': 1,
         u'6.0': 1,
         u'6.0 - 6.5': 1,
         u'6.0 - 8.0': 1,
      

In [115]:
# Count how many bacteria records carry each distinct raw 'pH' string.
collections.Counter([entry['pH'] for entry in bacteria_metadata['records']])

Counter({u'1.3 - 4.0': 1,
         u'1.8': 2,
         u'1.9': 1,
         u'10': 1,
         u'11.0': 1,
         u'2': 3,
         u'2-2.5 (optimum)': 1,
         u'2.5': 1,
         u'3': 4,
         u'3 - 4': 2,
         u'3.1-6.5, optimum 4.5-5.0': 1,
         u'3.2-7.5': 1,
         u'3.5 - 9.0': 1,
         u'3.5-3.8': 1,
         u'3.5-6.0': 1,
         u'3.6': 1,
         u'3.6 - 7.8': 1,
         u'3.6-6.9': 1,
         u'3.7-7.1': 1,
         u'3.8-9.0': 1,
         u'3.85 - 6.35': 1,
         u'3.9': 1,
         u'4 - 10': 1,
         u'4 - 4.5': 1,
         u'4 - 5': 1,
         u'4-10': 2,
         u'4-4.5': 1,
         u'4-5': 3,
         u'4-6': 1,
         u'4-7': 1,
         u'4-8': 2,
         u'4-9': 4,
         u'4-9.6': 2,
         u'4.0-8.0': 1,
         u'4.0-8.5': 3,
         u'4.0-9.0': 1,
         u'4.0-9.5': 4,
         u'4.1-7.8': 2,
         u'4.19': 13,
         u'4.2 - 7.2': 1,
         u'4.2 - 8.2': 1,
         u'4.2-8.3': 1,
         u'4.29': 3,
      

In [114]:
# Count how many eukarya records carry each distinct raw 'pH' string.
collections.Counter([entry['pH'] for entry in eukarya_metadata['records']])

Counter({u'7': 2, u'7.5': 1, u'zzz': 702})

In [19]:
# Count how many archaea records carry each distinct raw 'Salinity' string.
collections.Counter([entry['Salinity'] for entry in archaea_metadata['records']])

Counter({u'0.16-0.22 M': 3,
         u'Halophile': 110,
         u'Halotolerant': 1,
         u'zzz': 1511})

In [31]:
# Same tally of bacteria 'pH' strings as computed in an earlier cell.
# NOTE(review): duplicate cell; one of the two can be removed.
collections.Counter([entry['pH'] for entry in bacteria_metadata['records']])

Counter({u'1.3 - 4.0': 1,
         u'1.8': 2,
         u'1.9': 1,
         u'10': 1,
         u'11.0': 1,
         u'2': 3,
         u'2-2.5 (optimum)': 1,
         u'2.5': 1,
         u'3': 4,
         u'3 - 4': 2,
         u'3.1-6.5, optimum 4.5-5.0': 1,
         u'3.2-7.5': 1,
         u'3.5 - 9.0': 1,
         u'3.5-3.8': 1,
         u'3.5-6.0': 1,
         u'3.6': 1,
         u'3.6 - 7.8': 1,
         u'3.6-6.9': 1,
         u'3.7-7.1': 1,
         u'3.8-9.0': 1,
         u'3.85 - 6.35': 1,
         u'3.9': 1,
         u'4 - 10': 1,
         u'4 - 4.5': 1,
         u'4 - 5': 1,
         u'4-10': 2,
         u'4-4.5': 1,
         u'4-5': 3,
         u'4-6': 1,
         u'4-7': 1,
         u'4-8': 2,
         u'4-9': 4,
         u'4-9.6': 2,
         u'4.0-8.0': 1,
         u'4.0-8.5': 3,
         u'4.0-9.0': 1,
         u'4.0-9.5': 4,
         u'4.1-7.8': 2,
         u'4.19': 13,
         u'4.2 - 7.2': 1,
         u'4.2 - 8.2': 1,
         u'4.2-8.3': 1,
         u'4.29': 3,
      

In [40]:
# Scratch check: slicing with [::-1] reverses the sequence.
range(3)[::-1]

[2, 1, 0]

In [43]:
# Scratch check: `1 and 0` evaluates to 0 (falsy), so nothing is printed.
if 1 and 0:
    print True

In [79]:
## Tested and works
def check_overlap_number(range_1,i):
    """
    Tell whether the value ``i`` lies inside ``range_1``, bounds included.

    :param range_1: collection of numbers; its smallest and largest members
                    are used as the bounds, so element order does not matter
    :param i: the number to test
    :return: True if min(range_1) <= i <= max(range_1), otherwise False
    """
    # The chained comparison short-circuits like the two-part `and` test:
    # max() is only evaluated when the lower bound already holds.
    return bool(min(range_1) <= i <= max(range_1))

In [83]:
# Sanity check: a value equal to the lower bound counts as inside the range.
check_overlap_number((9.0,11.0),9.0)

True

In [80]:
## Tested and works
def check_overlap_ranges(range_1,range_2):
    """
    Tell whether two numeric ranges share at least one point.

    Each range is a collection of numbers whose min and max act as its
    bounds. Ranges that merely touch at an endpoint count as overlapping.

    :param range_1: first range
    :param range_2: second range
    :return: True if the ranges overlap, otherwise False
    """
    # Stable sort by lower bound: range_2 only moves to the front when its
    # minimum is strictly smaller than range_1's.
    lower_first, lower_second = sorted((range_1, range_2), key=min)
    # They overlap when the later-starting range begins no later than the
    # earlier-starting range ends.
    return bool(min(lower_second) <= max(lower_first))

In [98]:
def get_entries_within_ph_range(records, desired_ph_range):
    """
    Select the records whose 'pH' field falls in, or overlaps, a pH range.

    The 'pH' field is a string split on '-':
      * one numeric part ("9.0") is treated as a single pH value and kept
        when it lies inside ``desired_ph_range`` (bounds included);
      * two numeric parts ("6.5 - 9.6") are treated as a (min, max) range
        and kept when that range overlaps ``desired_ph_range``;
      * anything else (non-numeric text, more than two parts) is skipped.

    :param records: iterable of dict-like entries, each with a 'pH' string
    :param desired_ph_range: pair of numbers bounding the pH range of interest
    :return: list of the matching entries, in their original order
    """
    entries_with_phs_in_range = list()
    for entry in records:
        ph_parts = entry['pH'].split('-')
        if len(ph_parts) not in (1, 2):
            # e.g. '3.1-6.5, optimum 4.5-5.0' -- not a plain value or range
            continue

        # Only the string-to-float conversion is allowed to fail quietly;
        # errors raised by the overlap helpers (for instance a malformed
        # desired_ph_range) are no longer swallowed.
        try:
            ph_values = [float(part) for part in ph_parts]
        except ValueError:
            continue

        if len(ph_values) == 1:
            in_desired_ph_range = check_overlap_number(desired_ph_range, ph_values[0])
        else:
            in_desired_ph_range = check_overlap_ranges(desired_ph_range, tuple(ph_values))

        if in_desired_ph_range:
            entries_with_phs_in_range.append(entry)

    return entries_with_phs_in_range

In [107]:
def format_urls_for_scraping(homepage_url,entries_with_phs_in_range):
    """
    Build one URL per entry from its 'GenomeNameSampleNameDisp' anchor tag.

    Each entry's 'GenomeNameSampleNameDisp' field is expected to contain an
    anchor of the form ``<a href='main.cgi...'>``. Whatever follows
    ``main.cgi`` inside the href is appended to ``homepage_url``.

    :param homepage_url: URL prefix that the extracted suffix is appended to
    :param entries_with_phs_in_range: iterable of dict-like entries, each
                                      with a 'GenomeNameSampleNameDisp' string
    :return: list of URLs, one per entry, in the same order as the entries
    :raises ValueError: if an entry's field contains no such anchor
    """
    # [^']* stops at the quote that closes the href, so a later "'>" in the
    # field (e.g. inside the link text) cannot leak into the URL the way a
    # greedy .* would. Compiled once, outside the loop.
    link_pattern = re.compile(r"<a href='main\.cgi([^']*)'>")

    organism_urls = list()

    for entry in entries_with_phs_in_range:
        htmlandjunk = entry['GenomeNameSampleNameDisp']
        match = link_pattern.search(htmlandjunk)
        if match is None:
            # Fail with the offending value instead of an opaque
            # AttributeError on None.group.
            raise ValueError(
                "no main.cgi link found in GenomeNameSampleNameDisp: %r" % (htmlandjunk,)
            )
        organism_urls.append(homepage_url + match.group(1))

    return organism_urls

In [117]:
# Select the bacteria records whose 'pH' value or range overlaps 9.0-11.0.
records = bacteria_metadata['records']
ph_range = (9.0,11.0)

entries_with_phs_in_range = get_entries_within_ph_range(records,ph_range)

In [118]:
# Number of records selected by the pH filter.
len(entries_with_phs_in_range)

266

In [123]:
## 266 bacteria vs. 56 that were successfully scraped
# First line: number of bacteria metadata records matching ph_range.
# Second line: number of JSON files present in the bacteria ph_jsons directory.
print len(get_entries_within_ph_range(bacteria_metadata['records'],ph_range))
print len(glob.glob('../jgi/2018-09-29/ph_jsons/bacteria/*.json'))

266
56


In [124]:
## 28 archaea vs. 28 that were successfully scraped
# First line: number of archaea metadata records matching ph_range.
# Second line: number of JSON files present in the archaea ph_jsons directory.
print len(get_entries_within_ph_range(archaea_metadata['records'],ph_range))
print len(glob.glob('../jgi/2018-09-29/ph_jsons/archaea/*.json'))

28
28


In [None]:
# NOTE(review): evaluating `records` displays the entire list; prefer a
# slice such as records[:3] if this cell is kept.
records

In [112]:
# homepage_url already ends in main.cgi; format_urls_for_scraping appends
# the part of each entry's href that follows main.cgi.
# NOTE(review): the saved output is [], although the filter cell shows 266
# entries -- this cell appears to have been run against an earlier, empty
# selection (execution counts are out of order); re-run top to bottom.
homepage_url = 'https://img.jgi.doe.gov/cgi-bin/m/main.cgi'
format_urls_for_scraping(homepage_url,entries_with_phs_in_range)

[]

In [100]:
# Print the raw 'GenomeNameSampleNameDisp' field of each selected entry
# (per the output below, an HTML anchor whose href starts with main.cgi).
for entry in entries_with_phs_in_range:
    print entry['GenomeNameSampleNameDisp']

<a href='main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=2523533532'>Actinomadura rifamycini DSM 43936</a>
<a href='main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=2524614884'>Ornithinimicrobium pekingense DSM 21552</a>
<a href='main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=2518645608'>Saccharibacillus kuerlensis DSM 22868</a>
<a href='main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=2582581268'>Halomonas ilicicola DSM 19980</a>
<a href='main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=2574180435'>Atlantibacter subterranea DSM 16208</a>
<a href='main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=2565956515'>Pararheinheimera texasensis DSM 17496</a>
<a href='main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=2518645611'>Halomonas lutea DSM 23508</a>
<a href='main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=2519899518'>Hirschia maritima DSM 19733</a>
<a href='main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=2585428077'>Calor

In [97]:
# Show the raw 'pH' strings of the selected entries to eyeball the filter.
[i['pH'] for i in entries_with_phs_in_range]

[u'6-12',
 u'6-9',
 u'5-10',
 u'6-9',
 u'4.0-9.5',
 u'6.5-9.6',
 u'5-9',
 u'8.1-9.1',
 u'5.5-9.0',
 u'6-9.5',
 u'6.4-10.0',
 u'9.0',
 u'7.2-10.2',
 u'5-9',
 u'6-11',
 u'4.0-9.0',
 u'6-9',
 u'5.6-9.8',
 u'7.5-10.2',
 u'5-10',
 u'4.0-9.5',
 u'9.1',
 u'7-9',
 u'7-11',
 u'6-11',
 u'9.0',
 u'5.5-10.0',
 u'6-10',
 u'5.5-12.0',
 u'9.5',
 u'6-10',
 u'6-9',
 u'5-10',
 u'6.0-9.5',
 u'6.0-10.0',
 u'4.5 - 9.0',
 u'7-9',
 u'6.5-11.5',
 u'5-11',
 u'6-10',
 u'6.5-9.0',
 u'6-9',
 u'4-9',
 u'5-10',
 u'7.5-9.0',
 u'4-9',
 u'5-10',
 u'6-11',
 u'6-9',
 u'7-9',
 u'8.6-10.4',
 u'5.5-9.5',
 u'5.5-10.0',
 u'5-9',
 u'6.0 - 9.5',
 u'5 - 9',
 u'7-10',
 u'5.1-9.1',
 u'6-9',
 u'5.0-9.0',
 u'8 - 10',
 u'6.5-9.3',
 u'6 - 9',
 u'6.0 - 10.0',
 u'6-9',
 u'5.5-11.0',
 u'6-10',
 u'6-11',
 u'5-9.5',
 u'6.0-10.0',
 u'5.5 - 9.5',
 u'6.7-9.7',
 u'6.0-9.7',
 u'6-10',
 u'6-9',
 u'5.8-10.0',
 u'7-10',
 u'7.5-10.2',
 u'7-11',
 u'5.0-9.5',
 u'5-9',
 u'9',
 u'5-10',
 u'7-10',
 u'4-9.6',
 u'5-10',
 u'6-11',
 u'6 - 9',
 u'9.0',
 u'6

In [64]:
# Scratch pass over the distinct bacteria 'pH' strings: print every value
# that parses as a single number or as a (min, max) pair split on '-', and
# skip the rest.
for i in collections.Counter([entry['pH'] for entry in bacteria_metadata['records']]):
    ph_parts = i.split('-')
    try:
        if len(ph_parts) == 1:
            pH = float(ph_parts[0])
            print(pH)
        else:
            ph_min = float(ph_parts[0])
            ph_max = float(ph_parts[1])
            print((ph_min, ph_max))
    except ValueError:
        # Not numeric (e.g. 'zzz' or annotated text such as
        # '2-2.5 (optimum)'); skip it, but let any other error surface
        # instead of hiding it behind a bare except.
        pass

(6.8, 7.0)
(6.0, 8.1)
(6.5, 6.7)
(6.0, 10.0)
(5.7, 10.0)
(8.0, 10.2)
(8.0, 10.5)
(6.0, 7.0)
(7.0, 11.0)
(7.0, 11.1)
(6.0, 9.5)
(4.2, 7.2)
(7.0, 13.0)
(7.0, 8.0)
(4.0, 5.0)
(5.3, 9.3)
(8.0, 10.0)
(7.5, 12.0)
(5.2, 8.6)
(7.0, 8.5)
(6.5, 8.0)
(5.0, 6.5)
(5.8, 9.0)
(5.9, 9.0)
6.71
(8.0, 8.5)
(6.0, 8.0)
(6.0, 9.0)
(6.0, 7.0)
(5.8, 7.8)
(5.5, 12.0)
(6.1, 10.1)
(7.6, 8.0)
(6.8, 8.2)
(4.0, 5.0)
(6.8, 8.0)
(6.8, 8.8)
(5.0, 6.2)
(6.5, 8.5)
(5.1, 9.1)
(6.5, 8.0)
(5.0, 9.0)
(6.7, 8.0)
(6.7, 8.3)
(6.0, 7.2)
(6.0, 7.5)
(6.0, 7.8)
(8.0, 8.5)
(5.0, 8.0)
(5.0, 8.5)
(8.1, 9.1)
(4.0, 9.0)
(4.0, 9.5)
(3.5, 3.8)
(6.6, 8.4)
(7.5, 8.0)
(6.4, 8.8)
(5.8, 8.0)
(6.4, 8.5)
(6.4, 8.2)
(6.4, 8.3)
(6.4, 8.1)
(6.0, 9.7)
(6.5, 9.3)
(6.5, 9.0)
(5.8, 8.2)
(6.5, 9.6)
(6.5, 9.4)
(5.5, 7.5)
(6.5, 9.8)
(5.5, 10.5)
(5.5, 10.0)
(1.3, 4.0)
(5.6, 8.3)
(5.6, 8.6)
(5.6, 8.4)
(6.8, 8.0)
(6.0, 10.0)
(6.0, 11.0)
(6.0, 12.0)
5.3
5.5
5.9
(4.5, 8.0)
(6.2, 7.0)
(6.0, 10.0)
5.0
(3.7, 7.1)
(5.5, 10.0)
(6.5, 7.0)
(6.5, 8.0)
(6.0, 10.0)
(5.