In [1]:
# import libraries
from rdflib import Graph,Literal,RDF,URIRef
from rdflib.namespace import FOAF,XSD,RDFS
from rdflib import Namespace
import pandas as pd
import string
import random
import numpy as np
import json


# ignore the warning
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Project namespace: classes (concepts) such as /Tweet or /Company, as well as
# properties and individuals, are all addressed relative to this base URI.
PJ_SDM = Namespace("http://proj_sdm.org/")

# Empty graph that the following cells populate.
g = Graph()
# Register the "pj_sdm" prefix for the namespace, for better readability.
g.bind(prefix="pj_sdm", namespace=PJ_SDM)

In [3]:
# This sanitizer exists because a URI must not contain any forbidden characters.
def URLparse(url:str):
    """Return `url` with every URI-unsafe character replaced by an underscore.

    Every character in `string.punctuation` (which already includes both
    quote characters) and the plain space character is mapped to "_".
    All other characters are returned unchanged.
    """
    unsafe_chars = string.punctuation + " "
    # One translation pass replaces the chain of individual str.replace calls.
    underscore_table = str.maketrans(dict.fromkeys(unsafe_chars, "_"))
    return url.translate(underscore_table)

# Data Sources

In [4]:
# Read the company table from disk and preview its first rows.
companies_csv_path = 'data/top_25NSE.csv'
companies_df = pd.read_csv(companies_csv_path)
companies_df.head()

Unnamed: 0,Company Name,Last Price,% Chg,52 wk\nHigh,52 wk\nLow,Market Cap\n(Rs. cr)
0,RELIANCE,2595.85,0.67,2751.35,1876.7,1755858.03
1,TCS,3707.45,-1.13,4043.0,3004.0,1371404.8
2,HDFC,1430.9,-0.81,1725.0,1292.0,793384.31
3,WIPRO,1876.55,-0.54,1953.9,1311.3,789415.54
4,ITC,699.25,-0.7,867.0,531.15,485892.84


In [5]:
# Load the sample tweet database.
# The encoding is given explicitly: without it open() falls back to the
# platform default, so text containing non-ASCII characters could be decoded
# differently (or fail to decode) depending on the machine.
with open('data/sdm_sample_db_tw.json', encoding='utf-8') as json_file:
    sample_sdm_db = json.load(json_file)

# Add company

In [6]:
# Assert every listed company as an individual of the Company class.
# (The typing could also be inferred; it is stated explicitly here.)
for company_name in companies_df["Company Name"]:
    company_uri = URIRef(f"http://proj_sdm.org/{company_name}")
    g.add((company_uri, RDF.type, PJ_SDM.Company))
    print('added - ',company_uri)

added -  http://proj_sdm.org/RELIANCE
added -  http://proj_sdm.org/TCS
added -  http://proj_sdm.org/HDFC
added -  http://proj_sdm.org/WIPRO
added -  http://proj_sdm.org/ITC
added -  http://proj_sdm.org/ONGC
added -  http://proj_sdm.org/HINDALCO
added -  http://proj_sdm.org/NTPC
added -  http://proj_sdm.org/IOC
added -  http://proj_sdm.org/GRASIM
added -  http://proj_sdm.org/M&amp;M
added -  http://proj_sdm.org/DLF
added -  http://proj_sdm.org/CIPLA
added -  http://proj_sdm.org/SIEMENS
added -  http://proj_sdm.org/BPCL
added -  http://proj_sdm.org/SRF
added -  http://proj_sdm.org/BRITANNIA
added -  http://proj_sdm.org/MINDTREE
added -  http://proj_sdm.org/GAIL
added -  http://proj_sdm.org/ZOMATO
added -  http://proj_sdm.org/MARICO
added -  http://proj_sdm.org/MPHASIS
added -  http://proj_sdm.org/IRCTC
added -  http://proj_sdm.org/UPL
added -  http://proj_sdm.org/GLAND


# Add Tweet Stock and Time

In [7]:
# Sample list of well-known people, used as placeholder tweet authors and
# mentioned users. Every entry must be a non-empty name: an empty string would
# be sanitized to "" and produce a user whose URI is the bare namespace.
famous_users = ['Mukesh ambani','Shiv Nadar','Dilip Shangvi','Satya Nadella','Azim Premji','Sachin Tendulkar','Virat Kohli','Barak Obama']

In [8]:
def get_time():
    """Return a random but valid (hour, day, month, year) tuple for 2022.

    Returns:
        tuple[int, int, int, int]: hour in 0..23, day in 1..(days in month),
        month in 1..12 and the fixed year 2022.
    """
    import calendar  # local import: only this helper needs it

    y = 2022
    h = random.randint(0,23)
    m = random.randint(1,12)
    # Bound the day by the real month length so dates like 31 February cannot occur.
    days_in_month = calendar.monthrange(y, m)[1]
    d = random.randint(1,days_in_month)
    return h,d,m,y

In [9]:
def get_ohlc():
    """Return random but internally consistent (open, high, low, close) prices.

    Four integers are drawn from 1..300. The first two become open and close;
    high and low are the maximum and minimum of all four, so that
    low <= open <= high and low <= close <= high always hold.

    Returns:
        tuple[int, int, int, int]: (open, high, low, close).
    """
    prices = [random.randint(1,300) for _ in range(4)]
    o, c = prices[0], prices[1]
    h = max(prices)
    l = min(prices)
    return o,h,l,c

In [10]:
# One randomly generated timestamp is shared by every stock and tweet record below.
hour,day,month,year = get_time()

# time ideally is a blank node, just giving it a URI for better readability
unique_time_str = "_".join(str(part) for part in (hour,day,month,year))
unique_time_uri = PJ_SDM[unique_time_str]
# although this type would be inferred automatically, it is asserted for readability
g.add((unique_time_uri, RDF.type, PJ_SDM.Time))

# Attach the time components to the time individual itself, not to the
# PJ_SDM.Time class.
g.add((unique_time_uri, PJ_SDM.has_hour, Literal(int(hour))))
g.add((unique_time_uri, PJ_SDM.has_day, Literal(int(day))))
g.add((unique_time_uri, PJ_SDM.has_month, Literal(int(month))))
g.add((unique_time_uri, PJ_SDM.has_year, Literal(int(year))))

# Skip blank placeholder names: URLparse("") is "", which would turn the bare
# namespace URI into a user with an empty name.
candidate_users = [name for name in famous_users if name.strip()]


for data_dict in sample_sdm_db:
    comp = data_dict['companies']
    comp_obj_ref = PJ_SDM[comp]

    # get ohlc data and add it to the graph
    # (named *_price so the built-ins, notably open(), are not shadowed for the
    # rest of the kernel session)
    open_price,high_price,low_price,close_price = get_ohlc()
    unique_stock_time = 'Stock_data_'+comp+unique_time_str
    unique_stock_time_uri = PJ_SDM[unique_stock_time]
    # although this type would be inferred automatically, it is asserted for readability
    g.add((unique_stock_time_uri, RDF.type, PJ_SDM.Stock_data))

    # add connections between stock_data, time and company; the stocks are for
    # this particular hour, day, month and year.
    # Granularity can easily be changed to minutes or seconds.
    g.add((unique_stock_time_uri,PJ_SDM.stock_time,unique_time_uri))
    # NOTE(review): stock_pricing links the company to the shared time node, not
    # to the Stock_data node, so the stock record is never tied to its company.
    # Confirm the intended range of stock_pricing against the TBox.
    g.add((comp_obj_ref,PJ_SDM.stock_pricing,unique_time_uri))

    # stock attributes
    g.add((unique_stock_time_uri, PJ_SDM.open, Literal(float(open_price))))
    g.add((unique_stock_time_uri, PJ_SDM.high, Literal(float(high_price))))
    g.add((unique_stock_time_uri, PJ_SDM.low, Literal(float(low_price))))
    g.add((unique_stock_time_uri, PJ_SDM.close, Literal(float(close_price))))
    # NOTE(review): the company list is read from top_25NSE.csv, yet the market
    # recorded here is DSE. Confirm which exchange is intended.
    g.add((unique_stock_time_uri, PJ_SDM.in_stock_market, PJ_SDM.DSE))

    # add tweets information
    for tw_info in data_dict['top_tweets']:

        # tweet attributes
        text = tw_info['text']
        support_count = tw_info['support_count']
        rank = tw_info['rank']
        sentiment = tw_info['sentiment']

        unique_tweet_time = 'Tweet_'+str(rank)+'_'+comp+unique_time_str
        unique_tweet_time_uri = PJ_SDM[unique_tweet_time]
        # although this type would be inferred automatically, it is asserted for readability
        g.add((unique_tweet_time_uri, RDF.type, PJ_SDM.Tweet))

        # add connections between tweet, time and company; the tweets are for
        # this particular hour, day, month and year.
        g.add((unique_tweet_time_uri,PJ_SDM.tweet_time,unique_time_uri))
        g.add((comp_obj_ref,PJ_SDM.related_tweet,unique_tweet_time_uri))

        g.add((unique_tweet_time_uri, PJ_SDM.text, Literal(str(text))))
        g.add((unique_tweet_time_uri, PJ_SDM.support_count, Literal(int(support_count))))
        g.add((unique_tweet_time_uri, PJ_SDM.rank, Literal(int(rank))))
        g.add((unique_tweet_time_uri, PJ_SDM.sentiment, Literal(float(sentiment))))

        # add two randomly chosen mentioned users (the same user may be drawn twice;
        # the graph is a set, so duplicate triples collapse)
        last_user_idx = 0
        for _ in range(2):
            last_user_idx = random.randint(0,len(candidate_users)-1)
            mentioned_user = candidate_users[last_user_idx]
            mentioned_user_uri = PJ_SDM[URLparse(mentioned_user)]
            g.add((mentioned_user_uri,PJ_SDM.name,Literal(str(mentioned_user))))
            g.add((unique_tweet_time_uri,PJ_SDM.related_user,mentioned_user_uri))

        # the author is picked at a fixed offset from the last mentioned user
        tweet_user = candidate_users[(last_user_idx+5)%len(candidate_users)]
        tweet_user_uri = PJ_SDM[URLparse(tweet_user)]
        g.add((tweet_user_uri,PJ_SDM.name,Literal(str(tweet_user))))
        g.add((unique_tweet_time_uri,PJ_SDM.tweet_by,tweet_user_uri))

In [11]:
# Write the populated graph to disk in Turtle syntax.
file_name = "abox.ttl"
save_format = "ttl"
g.serialize(destination=file_name, format=save_format)

<Graph identifier=N4650d10e0ef64471b522ec7c6f63b373 (<class 'rdflib.graph.Graph'>)>