In [5]:
from neo4j import GraphDatabase
import json
from tqdm.notebook import tqdm, trange
import pandas as pd
from fingest.nlp2neo4j import srl_ingest

## Load config variables

In [6]:
f = open('config.json', 'r')
config_obj = json.load(f)
neo4j_obj = config_obj['neo4j']
corenlp_obj = config_obj['corenlp']

n4j_user, n4j_password, n4j_port, n4j_host = neo4j_obj['user'], neo4j_obj['password'], str(neo4j_obj['port']), neo4j_obj['host']

corenlp_host, corenlp_port = corenlp_obj['host'], str(corenlp_obj['port'])

url = "neo4j://" + n4j_host + ":" + n4j_port

driver = GraphDatabase.driver(url, auth=(n4j_user, n4j_password))

In [7]:
def drop_it_all(tx):
    tx.run("MATCH (n) DETACH DELETE (n) ")
    
def add_company(tx, ticker, info):
           
    ## Add ticker part
    cypher = "MERGE (a:Ticker {id: $id"
    args = {
        "id": ticker
    }
    if "shortName" in info:
        if info["shortName"] is not None:
            args["short_name"] = info["shortName"]
            cypher += ", short_name: $short_name"
        else:
            print("shortName is NULL")
            return
            
    if "longBusinessSummary" in info:
        args["business_summary"] = info["longBusinessSummary"]
        cypher += ", business_summary: $business_summary"

    cypher += "})"
    ## End adding ticker
    
    if "industry" in info:
        args["industry_name"] = info["industry"] 
        cypher += " MERGE (i:Industry {name: $industry_name}) MERGE (a)-[:PART_OF]->(i)"

    
    if "sector" in info:
        args["sector_name"] = info["sector"] 
        cypher += " MERGE (s:Sector {name: $sector_name}) MERGE (a)-[:PART_OF]->(s)"

    if "country" in info:
        args["country_name"] = info["country"] 
        cypher += " MERGE (c:Country {name: $country_name}) MERGE (a)-[:HOSTED_IN]->(c)"
        
        
    if "industry" in info and "sector" in info:
        cypher += " MERGE (i)-[:WITHIN]->(s)"
        

    tx.run(cypher, args)

    
def add_inst_holders(tx, ticker, inst_holders_df):
    for index, row in inst_holders_df.iterrows():
        args = {}
        #print(row)
        if 'Holder' in args:
            args["inst_name"] = row['Holder']
        else:
            return
        args["ticker_id"] = ticker
        args["percentage"] = row['% Out']
        args["shares"] = row["Shares"]
        args["value"] = row["Value"]        
        cypher = "MERGE (t:Ticker {id: $ticker_id}) MERGE (h:InstHolder {name: $inst_name}) MERGE (t)-[:HELD_BY {percentage:$percentage, shares:$shares, value:$value}]-(h)"
        tx.run(cypher, args)    
    
def add_news(tx, ticker, news):
    
    for item in news:
        args = {}
        args["ticker_id"] = ticker
        args["title"] = item["title"]
        args["news_id"] = item["uuid"]
        args["publisher_name"] = item["publisher"]
        args["publish_time"] = item["providerPublishTime"]
        
        
        cypher = "MERGE (t:Ticker {id: $ticker_id}) MERGE (n:News {id: $news_id, title: $title}) MERGE (p:Publisher {name:$publisher_name}) MERGE (t)-[:MENTIONED_IN]->(n) MERGE (n)-[:PUBLISHED_BY {time:$publish_time}]-(p)"
        tx.run(cypher, args)
    

def add_news_detail(tx, ticker, news):
    
    endpoint = "".join(["http://", corenlp_host, ":", corenlp_port])
    
    ingest = srl_ingest(endpoint)

    for item in news:
        title_text = item["title"]
        args = {}
        args["title"] = item["title"]
        args["news_id"] = item["uuid"]
        
        sents = ingest.generate_roles(title_text)
        for sent in sents:
            for role in sent:
                #print(role)
                args["subject_text"], args["relation_text"], args["object_text"] = role
                cypher = "MERGE (n:News {id: $news_id, title: $title}) MERGE (s:Subject {text:$subject_text}) MERGE (r:Relation {text:$relation_text}) MERGE (o:Object {text:$object_text}) MERGE (n)-[:HAS_SUBJECT]->(s) MERGE (n)-[:HAS_RELATION]->(r) MERGE (n)-[:HAS_OBJECT]->(o)"
                tx.run(cypher, args)
    


In [None]:


with open('ticker.info.json', 'r') as json_info:
    json_obj = json.load(json_info)

print("Loaded ticker info")
with driver.session() as session:
    session.write_transaction(drop_it_all)
    for ticker in tqdm(json_obj):
        if "info" not in json_obj[ticker]:
            continue
        info = json_obj[ticker]["info"]
        news = json_obj[ticker]["news"] if "news" in json_obj[ticker] else None
        
        
        # Only ingest if there is no short name
        if "shortName" in info:
            session.write_transaction(add_company, ticker, info)            
            if news is not None:
                session.write_transaction(add_news, ticker, news)
                session.write_transaction(add_news_detail, ticker, news)
            if "institutional_holders" in json_obj[ticker]:            
                inst_holders_obj = json.loads(json_obj[ticker]['institutional_holders'])
                inst_holders_data = inst_holders_obj["data"]            
                inst_holders_df = pd.DataFrame(inst_holders_data)
                session.write_transaction(add_inst_holders, ticker, inst_holders_df)

driver.close()

Loaded ticker info


  0%|          | 0/3890 [00:00<?, ?it/s]

shortName is NULL
shortName is NULL
shortName is NULL
shortName is NULL
shortName is NULL
shortName is NULL
