In [1]:
import json
import sqlite3
from collections import Counter

import pandas as pd

In [2]:
%%bash
# Delete test.db if it is present.
# -f makes rm exit successfully (and stay quiet) when the file does not exist,
# while other failures are still reported.
rm -f test.db

In [3]:
# Relative path to the directory holding the input data files.
DATA_ROOT = "../exascale_data"
# Connection to the SQLite database file used by the cells below.
con = sqlite3.connect(database="test.db")

In [4]:
# DDL for the three tables used by this notebook. Each statement is
# guarded with IF NOT EXISTS, so an existing table is left untouched.
clusters_table = """CREATE TABLE IF NOT EXISTS clusters (
    "index" INTEGER,
    "cluster_id" TEXT,
    "cluster_prefix" TEXT,
    "node_ids" TEXT
)"""

edges_table = """CREATE TABLE IF NOT EXISTS edges (
    "index" INTEGER,
    "node1" TEXT,
    "node2" TEXT,
    "score" REAL,
    "edge_type" TEXT,
    "directed" INTEGER
)"""

nodes_table = """CREATE TABLE IF NOT EXISTS "nodes" (
"index" INTEGER,
  "GID" TEXT,
  "defline" TEXT,
  "symbols" TEXT,
  "names" TEXT,
  "KO_effect" TEXT,
  "GO" TEXT,
  "GOdesc" TEXT,
  "mapman_code" TEXT,
  "mapman_name" TEXT,
  "mapman_desc" TEXT
)"""

# Execute the statements in order: clusters, edges, nodes.
cur_clusters, cur_edges, cur_nodes = [
    con.execute(ddl) for ddl in (clusters_table, edges_table, nodes_table)
]

# Sanity check: none of the three cursors should hand back any rows.
for cursor in (cur_clusters, cur_edges, cur_nodes):
    assert cursor.fetchall() == []

# Column names as reported by reading the nodes table back through pandas.
columns_nodes = pd.read_sql("""SELECT * FROM nodes""", con).columns.tolist()
columns_nodes

['index',
 'GID',
 'defline',
 'symbols',
 'names',
 'KO_effect',
 'GO',
 'GOdesc',
 'mapman_code',
 'mapman_name',
 'mapman_desc']

In [5]:
# Load the tab-separated Mentha protein-protein interaction edge file and preview it.
ppi_edges_path = f"{DATA_ROOT}/prerelease/edge_data/mentha_AT-PPI-3702_040319.to-kbase-spec.tsv"
df1 = pd.read_csv(ppi_edges_path, sep="\t")
df1.head()

Unnamed: 0,node1,node2,score,edge_type,directed
0,AT2G18790,AT1G09530,1.0,protein-protein-interaction_Mentha_A_thaliana_...,1
1,AT4G33430,AT4G39400,1.0,protein-protein-interaction_Mentha_A_thaliana_...,1
2,AT3G20740,AT1G02580,0.999,protein-protein-interaction_Mentha_A_thaliana_...,1
3,AT5G35410,AT4G33000,0.999,protein-protein-interaction_Mentha_A_thaliana_...,1
4,AT3G62980,AT3G23050,0.999,protein-protein-interaction_Mentha_A_thaliana_...,1


In [6]:
# Load the tab-separated ATRM TF-to-target (literature curated) edge file and preview it.
tf_edges_path = f"{DATA_ROOT}/prerelease/edge_data/ATRM_TF_to_Target_LitCurated_01082020.to-kbase-spec.tsv"
df0 = pd.read_csv(tf_edges_path, sep="\t")
df0.head()

Unnamed: 0,node1,node2,score,edge_type
0,AT5G67420,AT4G09820,1,transcription-factor-regulatory-interaction_li...
1,AT2G20180,AT4G23750,1,transcription-factor-regulatory-interaction_li...
2,AT2G20180,AT1G80340,1,transcription-factor-regulatory-interaction_li...
3,AT2G20180,AT2G29090,1,transcription-factor-regulatory-interaction_li...
4,AT2G20180,AT2G40220,1,transcription-factor-regulatory-interaction_li...


In [7]:
# Append the rows of df0 to the existing `edges` table.
df0.to_sql(name="edges", con=con, if_exists="append")

In [8]:
# Append the rows of df1 to the existing `edges` table.
df1.to_sql(name="edges", con=con, if_exists="append")

In [9]:
# Toy example: attach a constant-valued column to a frame with a column-wise concat.
df2 = pd.DataFrame({"name": ["User 4", "User 5", "User 6"]})
# One "a" per row of df2 so both frames have the same length.
df3 = pd.DataFrame({"letter": ["a"] * len(df2)})
pd.concat(objs=[df2, df3], axis=1)

Unnamed: 0,name,letter
0,User 4,a
1,User 5,a
2,User 6,a


In [10]:
# Load the I2 cluster file (tab-separated).
i2_clusters_path = f"{DATA_ROOT}/prerelease/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv"
df5 = pd.read_csv(i2_clusters_path, sep="\t")
# One "markov_i2" label per cluster row (i2 - inflation of 2).
df6 = pd.DataFrame({"cluster_prefix": ["markov_i2"] * len(df5)})
# Column-wise concat puts the label column next to the cluster columns.
df7 = pd.concat(objs=[df5, df6], axis=1)
df7.tail()

Unnamed: 0,cluster_id,node_ids,cluster_prefix
3097,Cluster3098,"AT2G20080,AT2G20660",markov_i2
3098,Cluster3099,"AT3G62280,AT5G23210",markov_i2
3099,Cluster3100,"AT1G75890,AT3G15630",markov_i2
3100,Cluster3101,"AT1G67670,AT1G78710",markov_i2
3101,Cluster3102,"AT1G54870,AT5G54740",markov_i2


In [11]:
# Show the five `clusters` rows with the highest "index" values.
top_clusters_sql = """select * from clusters order by "index" desc limit 5"""
pd.read_sql(top_clusters_sql, con)

Unnamed: 0,index,cluster_id,cluster_prefix,node_ids


In [12]:
# Append the rows of df7 to the existing `clusters` table.
df7.to_sql(name="clusters", con=con, if_exists="append")

In [13]:
# Load the I4 cluster file (tab-separated).
i4_clusters_path = f"{DATA_ROOT}/prerelease/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv"
df8 = pd.read_csv(i4_clusters_path, sep="\t")
# One "markov_i4" label per cluster row (i4 - inflation of 4).
df9 = pd.DataFrame({"cluster_prefix": ["markov_i4"] * len(df8)})
# Column-wise concat puts the label column next to the cluster columns.
dfx = pd.concat(objs=[df8, df9], axis=1)
dfx.tail()

Unnamed: 0,cluster_id,node_ids,cluster_prefix
3688,Cluster3689,"AT2G20080,AT2G20660",markov_i4
3689,Cluster3690,"AT3G62280,AT5G23210",markov_i4
3690,Cluster3691,"AT1G75890,AT3G15630",markov_i4
3691,Cluster3692,"AT1G67670,AT1G78710",markov_i4
3692,Cluster3693,"AT1G54870,AT5G54740",markov_i4


In [14]:
# Append the rows of dfx to the existing `clusters` table.
dfx.to_sql(name="clusters", con=con, if_exists="append")

In [15]:
# Show the five `clusters` rows with the highest "index" values.
top_clusters_sql = """select * from clusters order by "index" desc limit 5"""
pd.read_sql(top_clusters_sql, con)

Unnamed: 0,index,cluster_id,cluster_prefix,node_ids
0,3692,Cluster3693,markov_i4,"AT1G54870,AT5G54740"
1,3691,Cluster3692,markov_i4,"AT1G67670,AT1G78710"
2,3690,Cluster3691,markov_i4,"AT1G75890,AT3G15630"
3,3689,Cluster3690,markov_i4,"AT3G62280,AT5G23210"
4,3688,Cluster3689,markov_i4,"AT2G20080,AT2G20660"


In [16]:
# Load the tab-separated master annotation file.
nodes_path = f"{DATA_ROOT}/prerelease/node_tables/Ath_master_annotation_v01.txt"
df_nodes = pd.read_csv(nodes_path, sep="\t")
# if_exists="replace": any existing `nodes` table is dropped and rebuilt from df_nodes.
df_nodes.to_sql(name="nodes", con=con, if_exists="replace")

In [17]:
# Count the rows stored for each GID and show the five largest counts.
# Any count above 1 would mean rows for that node have to be merged.
rows_per_gid_sql = """
SELECT "GID", COUNT(*) as counts
    FROM nodes
    GROUP BY "GID"
    ORDER BY count(*) DESC
    LIMIT 5
"""
pd.read_sql(rows_per_gid_sql, con)

Unnamed: 0,GID,counts
0,YI,1
1,YAK,1
2,XV,1
3,XTC2,1
4,XTC1,1


In [18]:
# For every column after "index", find the GID with the most distinct values
# in that column. If the top count is <= 1 for each column, any
# representative row per node will do.
top_distinct_per_column = [
    pd.read_sql(f"""
SELECT '{key}' as key,
        COUNT(DISTINCT({key})) as counts,
        "GID"
    FROM nodes
    GROUP BY "GID"
    ORDER BY COUNT(DISTINCT({key})) DESC
""", con).head(1)  # keep only the row with the largest count
    for key in columns_nodes[1:]
]
pd.concat(top_distinct_per_column)

Unnamed: 0,key,counts,GID
0,GID,1,YI
0,defline,1,ATMG01410
0,symbols,1,ATMG01410
0,names,1,ATMG01410
0,KO_effect,1,AT5G67590
0,GO,1,YI
0,GOdesc,1,YI
0,mapman_code,1,ATMG01410
0,mapman_name,1,ATMG01410
0,mapman_desc,1,ATMG01410


In [19]:
# Read the whole `nodes` table back and show its last ten rows.
catted = pd.read_sql(sql="""SELECT * from nodes""", con=con)
catted.tail(n=10)

Unnamed: 0,index,GID,defline,symbols,names,KO_effect,GO,GOdesc,mapman_code,mapman_name,mapman_desc
39809,39809,AT4G09895,,,,,,,35.2,not assigned.unknown,long noncoding RNA
39810,39810,AT4G09905,,,,,,,35.2,not assigned.unknown,long noncoding RNA
39811,39811,AT4G09915,,,,,,,35.2,not assigned.unknown,long noncoding RNA
39812,39812,AT4G09925,,,,,,,35.2,not assigned.unknown,microRNA NA
39813,39813,AT4G09935,,,,,,,35.2,not assigned.unknown,long noncoding RNA
39814,39814,AT4G09945,,,,,,,35.2,not assigned.unknown,long noncoding RNA
39815,39815,AT4G09955,,,,,,,35.2,not assigned.unknown,Natural antisense transcript overlaps with AT4...
39816,39816,AT4G09975,,,,,,,35.2,not assigned.unknown,Natural antisense transcript overlaps with AT4...
39817,39817,AT4G09985,,,,,,,35.2,not assigned.unknown,Natural antisense transcript overlaps with AT4...
39818,39818,AT4G09995,,,,,,,35.2,not assigned.unknown,long noncoding RNA


***
Did this actually solve the problem? The problem: `ATCG00690` is not in the merged nodes table CSV, so check whether it is present in the `nodes` table loaded here.

In [20]:
# Fetch the row(s) for the gene that was reported missing.
# The GID is bound as a query parameter rather than written into the SQL as
# a double-quoted token: SQLite parses "..." as an identifier first and only
# falls back to a string literal as a legacy quirk.
problem_gene = pd.read_sql(
    """SELECT * from nodes where "GID" = ?""",
    con,
    params=["ATCG00690"],
)
problem_gene

Unnamed: 0,index,GID,defline,symbols,names,KO_effect,GO,GOdesc,mapman_code,mapman_name,mapman_desc
0,33392,ATCG00690,photosystem II reaction center protein T,PSBT | PSBTC,photosystem II reaction center protein T |,,GO:0015979|GO:0016168|GO:0009523|GO:0009535|GO...,photosynthesis|chlorophyll binding|photosystem...,1.1.1.2,PS.lightreaction.photosystem II.PSII polypepti...,photosystem II reaction center protein T


Yes. Yes it did. *phew*

***

In [21]:
# Print the gene IDs in sorted order, each wrapped in single quotes and separated by ", ".
sorted_gids = sorted(('AT1G18590', 'AT2G34420', 'AT5G05850', 'ATCG00280', 'ATCG00350', 'ATCG00680', 'ATCG00690'))
quoted_gids = "', '".join(sorted_gids)
print(f"'{quoted_gids}'")

'AT1G18590', 'AT2G34420', 'AT5G05850', 'ATCG00280', 'ATCG00350', 'ATCG00680', 'ATCG00690'


In [22]:
# GIDs that are expected to be present in the `nodes` table.
node_checks = ('AT1G18590', 'AT2G34420', 'AT5G05850', 'ATCG00280', 'ATCG00350', 'ATCG00680', 'ATCG00690')
# One "?" placeholder per GID; the values are bound by the driver instead of
# being spliced into the SQL text as double-quoted tokens (which SQLite would
# treat as identifiers first and as string literals only via a legacy fallback).
placeholders = ", ".join(["?"] * len(node_checks))
checks = pd.read_sql(
    f"""SELECT * from nodes where "GID" in ({placeholders})""",
    con,
    params=list(node_checks),
)
print(f"{len(node_checks)} checks, {len(checks)} results")
# Every requested GID must come back exactly once.
assert len(node_checks) == len(checks)

7 checks, 7 results


In [23]:
# Display the contents of `checks`.
checks

Unnamed: 0,index,GID,defline,symbols,names,KO_effect,GO,GOdesc,mapman_code,mapman_name,mapman_desc
0,2012,AT1G18590,sulfotransferase 17,ATSOT17 | ATST5C | SOT17,SULFOTRANSFERASE 17 | ARABIDOPSIS SULFOTRANSFE...,,GO:0005634|GO:0051923|GO:0008146|GO:0005737|GO...,nucleus|sulfation|sulfotransferase activity|cy...,26.25,misc.sulfotransferase,sulfotransferase 17
1,12372,AT2G34420,photosystem II light harvesting complex gene B1B2,LHB1B2 | LHCB1.5,photosystem II light harvesting complex gene B...,,GO:0009941|GO:0009507|GO:0009535|GO:0009768|GO...,chloroplast envelope|chloroplast|chloroplast t...,1.1.1.1,PS.lightreaction.photosystem II.LHC-II,photosystem II light harvesting complex protei...
2,26367,AT5G05850,plant intracellular ras group-related LRR 1,PIRL1,plant intracellular ras group-related LRR 1,,GO:0009555|GO:0055046|GO:0005886,pollen development|microgametogenesis|plasma m...,30.99,signalling.unspecified,plant intracellular ras group-related LRR 1
3,33351,ATCG00280,photosystem II reaction center protein C,PSBC,photosystem II reaction center protein C,,GO:0010287|GO:0009534|GO:0009535|GO:0019684|GO...,plastoglobule|chloroplast thylakoid|chloroplas...,1.1.1.2,PS.lightreaction.photosystem II.PSII polypepti...,photosystem II reaction center protein C
4,33358,ATCG00350,"Photosystem I, PsaA/PsaB protein",PSAA,,,GO:0009535|GO:0009536|GO:0009534|GO:0009579|GO...,chloroplast thylakoid membrane|plastid|chlorop...,1.1.2.2,PS.lightreaction.photosystem I.PSI polypeptide...,Photosystem I%2C PsaA/PsaB protein
5,33391,ATCG00680,photosystem II reaction center protein B,PSBB,photosystem II reaction center protein B,,GO:0009535|GO:0003729|GO:0010207|GO:0009534|GO...,chloroplast thylakoid membrane|mRNA binding|ph...,1.1.1.2,PS.lightreaction.photosystem II.PSII polypepti...,photosystem II reaction center protein B
6,33392,ATCG00690,photosystem II reaction center protein T,PSBT | PSBTC,photosystem II reaction center protein T |,,GO:0015979|GO:0016168|GO:0009523|GO:0009535|GO...,photosynthesis|chlorophyll binding|photosystem...,1.1.1.2,PS.lightreaction.photosystem II.PSII polypepti...,photosystem II reaction center protein T
