# Instagram

In [1]:
# required libraries
import pandas as pd
import os
from pathlib import Path

In [2]:
# parameters and URLs
path = str(Path(os.path.abspath(os.getcwd())).parent.absolute())
youtubeUrl = path + '/DB2/top_200_youtubers.csv'
instaUrl = path + '/DB2/Top_50_Most_Followed_Instagram_Accounts.csv'
twitterUrl = path + '/DB2/Top_1000_Celebrity_Twitter_Accounts.csv'
grammyUrl = path + '/DB2/Grammy_Award.csv'

# country codes
countriesURL = path + '/DB2/wikipedia-iso-country-codes.csv'

# saving folder
savePath =  path + '/DB2/'

In [3]:
# Load the CSV files in memory
instagram = pd.read_csv(instaUrl, sep=',', index_col='instaid',encoding='cp1252')


In [4]:
#load the country codes
countries = pd.read_csv(countriesURL, sep=',', index_col='Name', keep_default_na=False, na_values=['_'])

In [5]:
instagram.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, ID1550 to ID1571
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Rank        50 non-null     int64  
 1   Username    50 non-null     object 
 2   Owner       50 non-null     object 
 3   Followers   50 non-null     float64
 4   Profession  50 non-null     object 
 5   Country     50 non-null     object 
dtypes: float64(1), int64(1), object(4)
memory usage: 2.7+ KB


In [6]:
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD



In [7]:
# Construct the country and the movie ontology namespaces not known by RDFlib
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
IN = Namespace("Instagram#")

#create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("countries", CNS)
g.bind("in", IN)

In [8]:
# Check date
import datetime


In [9]:
%%time 
#measure execution time

#iterate over the instagram dataframe
for index, row in instagram.iterrows():

    Insta = URIRef(IN[index])

    g.add((Insta, RDF.type, IN.Insta))
    g.add((Insta, IN['rank'], Literal(row['Rank'], datatype=XSD.integer)))
    g.add((Insta, IN['username'], Literal(row['Username'], datatype=XSD.string)))
    g.add((Insta, IN['owner'], Literal(row['Owner'], datatype=XSD.string)))
    g.add((Insta, IN['folllowers'], Literal(row['Followers'], datatype=XSD.integer)))

   
    for pf in row['Profession'].split(','):
        prof = URIRef(IN[pf.strip()])
        g.add((Insta,IN['hasprofession'], prof))    
        
    for c in str(row['Country']).split(','):
        cName = c.strip()
        # check if the country exists
        # country.index == x returns an array of booleans, thus we need to use the any() method
        if((countries.index == cName).any() == True):
            #get the country code, convert to string and get the lower case to match the country codes in the ontology 
            code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
            # create the RDF node
            Country = URIRef(CNS[code])
            # add the edge connecting the Movie and the Country 
            g.add((Insta, IN['hasCountry'], Country))  



CPU times: total: 31.2 ms
Wall time: 56.3 ms


In [10]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'instagram.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))
    


--- saving serialization ---
CPU times: total: 31.2 ms
Wall time: 51.4 ms


# Youtube

In [11]:
youtube = pd.read_csv(youtubeUrl, sep=',', index_col='youtubeid', keep_default_na=False)

In [12]:
youtube.info()

<class 'pandas.core.frame.DataFrame'>
Index: 593 entries, ID921 to ID1512
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country       593 non-null    object 
 1   Channel_Name  593 non-null    object 
 2   Category      593 non-null    object 
 3   username      593 non-null    object 
 4   followers     593 non-null    int64  
 5   Likes         593 non-null    float64
 6   Views         593 non-null    float64
 7   Views_Avg     593 non-null    float64
 8   Comments_Avg  593 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 46.3+ KB


In [13]:
g = Graph()

In [14]:
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
YT = Namespace("Youtube#")

g.bind("countries", CNS)
g.bind("yt", YT)

People are modeled with the FOAF ontology. 
Refer to [FOAF Documentation](http://xmlns.com/foaf/spec/)

In [15]:
%%time 

for index, row in youtube.iterrows():

    YouTube = URIRef(YT[index])
    
    g.add((YouTube, RDF.type, YT.YouTube))
    g.add((YouTube, YT['channelname'], Literal(row['Channel_Name'], datatype=XSD.string)))
    g.add((YouTube, YT['category'], Literal(row['Category'], datatype=XSD.string)))
    g.add((YouTube, YT['username'], Literal(row['username'], datatype=XSD.string)))
    g.add((YouTube, YT['followers'], Literal(row['followers'], datatype=XSD.integer)))
    g.add((YouTube, YT['likes'], Literal(row['Likes'], datatype=XSD.float)))
    g.add((YouTube, YT['views'], Literal(row['Views'], datatype=XSD.float)))
    g.add((YouTube, YT['viewsavg'], Literal(row['Views_Avg'], datatype=XSD.float)))
    g.add((YouTube, YT['commentsavg'], Literal(row['Comments_Avg'], datatype=XSD.float)))
    
    i=0
    for c in str(row['Country']).split(',') :
        cName = c.strip()
        for i in range(0,len(youtube)):
            if ( cName  == countries['Alpha-2 code'][i]):
                code = str(countries['Alpha-2 code'][i]).lower()
                cty = URIRef(CNS[code])
                g.add((YouTube, YT['hasCountry'], cty)) 
                break
    

        

CPU times: total: 1.11 s
Wall time: 1.24 s


In [17]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'youtube.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 62.5 ms
Wall time: 116 ms


# Twitter

In [18]:
# Load the CSV files in memory
twitter = pd.read_csv(twitterUrl, sep=',', index_col='twitterid', keep_default_na=False, na_values=['_'])

In [19]:
twitter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 920 entries, ID0001 to ID0920
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   twitter_username  920 non-null    object
 1   twitter_userid    920 non-null    object
 2   domain            920 non-null    object
 3   name              920 non-null    object
 4   followers_count   920 non-null    object
 5   tweet_count       920 non-null    object
dtypes: object(6)
memory usage: 50.3+ KB


In [20]:
#create a new graph
g = Graph()

In [21]:
#regular expressions
TW = Namespace("Twitter#")
g.bind("tw", TW)

In [23]:
%%time 
#measure execution time

#iterate over the person dataframe
for index, row in twitter.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the person id as URI
    Tweet = URIRef(TW[index])
    g.add((Tweet, RDF.type, TW.Twitter))
    g.add((Tweet, TW['username'], Literal(row['twitter_username'], datatype=XSD.string)))
    g.add((Tweet, TW['userid'], Literal(row['twitter_userid'], datatype=XSD.string)))
    g.add((Tweet, TW['domain'], Literal(row['domain'], datatype=XSD.string)))
    g.add((Tweet, TW['name'], Literal(row['name'], datatype=XSD.string)))
    g.add((Tweet, TW['followers'], Literal(row['followers_count'], datatype=XSD.string)))
    g.add((Tweet, TW['tweetscount'], Literal(row['tweet_count'], datatype=XSD.string)))



CPU times: total: 516 ms
Wall time: 510 ms


In [781]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'twitter.ttl', 'w', encoding="utf-8") as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 297 ms
Wall time: 326 ms


# Grammy

In [24]:
grammy = pd.read_csv(grammyUrl, sep=',', index_col='songid', encoding='cp1252')
grammy.astype({'Year': 'int32'}).dtypes

Song_Name    object
Artist       object
Year          int32
Winner         bool
dtype: object

In [162]:
grammy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 110 entries, ID0001 to ID0110
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Song_Name  110 non-null    object
 1   Artist     110 non-null    object
 2   Year       110 non-null    int64 
 3   Winner     110 non-null    bool  
dtypes: bool(1), int64(1), object(2)
memory usage: 3.5+ KB


In [163]:
g = Graph()

In [164]:
GM = Namespace("Grammy#")
g.bind("gm", GM)

In [165]:
%%time 
#measure execution time

#iterate over the person dataframe
for index, row in grammy.iterrows():
    # Create the node to add to the Graph
    # the node has the namespace + the person id as URI
    Award = URIRef(GM[index])
    g.add((Award, RDF.type, GM.Grammy))
    g.add((Award, GM['songname'], Literal(row['Song_Name'], datatype=XSD.string)))
    g.add((Award, GM['artists'], Literal(row['Artist'], datatype=XSD.string)))
    g.add((Award, GM['year'], Literal(row['Year'], datatype=XSD.string)))
    g.add((Award, GM['winner'], Literal(row['Winner'], datatype=XSD.gYear)))
    
    pp = ''
    for p in row['Artist'].split(' '):
        pp = pp+p
    print(len(pp))
    for pf in pp.split(','):
        Art = URIRef(GM[pf.strip()])
        g.add((Award,GM['hasartists'], Art))   
        
 

TypeError: 'int' object is not callable

In [166]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'grammy.ttl', 'w', encoding="utf-8") as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 15.6 ms
Wall time: 2.08 ms


## Singers

In [97]:
g = Graph()

SN = Namespace("Singers#")
g.bind("countries", CNS)
g.bind("sn", SN)
singerlist = []



for index, row in instagram.iterrows():
    namelt = []
    for name in row['Profession'].split(','):
        namelt.append(name)

    if len(namelt)>1:
        for i in range(0,len(namelist)):
            if namelt[i] == 'Musician':
                name = ''
                for n in row['Owner'].split(' '):
                    name = name +n
                singerlist.append(name)
    elif namelt[0] == 'Musician':
        name = ''
        for n in row['Owner'].split(' '):
            name = name +n
        singerlist.append(name)



for index,row in youtube.iterrows():
    namelist = []
    for name in row['Category'].split(','):
        namelist.append(name)
    for i in range(0,len(namelist)): 
        if namelist[i] == 'Music' or namelist[i]== 'Pop music'  or namelist[i]=='Hip hop music' or \
        namelist[i]== 'Rock music' or namelist[i]== 'Music of Asia' or namelist[i]== 'Music of Latin America'\
        or namelist[i]== 'Electronic music' or namelist[i]== 'Rhythm and blues':
            name = ''
            for n in row['username'].split(' '):
                name = name + n
            if name not in singerlist:
                singerlist.append(name)
        else:
            pass
        
# print(singerlist)
# print('length = ',len(singerlist))

for singer in singerlist:
    Singers = URIRef(SN[singer])
    g.add((Singers, RDF.type, SN.Singer))
    checklt =[]
    for index, row in instagram.iterrows():
        name = ''
        for n in row['Owner'].split(' '):
            name = name +n
        checklt.append(name)
        if name == singer:
            g.add((Singers, SN['username'], Literal(row['Username'], datatype=XSD.string)))
            for c in str(row['Country']).split(','):
                cName = c.strip()
                if((countries.index == cName).any() == True):
                    code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
                    Country = URIRef(CNS[code])
                    g.add((Singers, SN['hasCountry'], Country)) 
    for index, row in youtube.iterrows():
        name = ''
        for n in row['username'].split(' '):
            name = name +n
        if name == singer and name not in checklist:
            g.add((Singers, SN['channelname'], Literal(row['Channel_Name'], datatype=XSD.string)))
            i=0
            for c in str(row['Country']).split(',') :
                cName = c.strip()
                for i in range(0,len(youtube)):
                    if ( cName  == countries['Alpha-2 code'][i]):
                        code = str(countries['Alpha-2 code'][i]).lower()
                        cty = URIRef(CNS[code])
                        g.add((Singers, SN['hasCountry'], cty)) 
                        break


length =  72


In [98]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'singer.ttl', 'w', encoding="utf-8") as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 31.2 ms
Wall time: 31.4 ms


## Celebrity

In [69]:
g = Graph()

CB = Namespace("Celebrity#")
g.bind("countries", CNS)
g.bind("cb", CB)
celeblist = []


for index, row in instagram.iterrows():
    namelist = []
    for name in row['Owner'].split(' '):
        namelist.append(name)
    name = ''
    for i in range(0,len(namelist)): 
        name = name + namelist[i]

    celeblist.append(name)

for index, row in twitter.iterrows():
    namelist = []
    for name in row['name'].split(' '):
        namelist.append(name)
    name = ''
    for i in range(0,len(namelist)): 
        name = name + namelist[i]
    if name not in celeblist:
        celeblist.append(name)
        
# print(celeblist)
# print('lengeth = ',len(celeblist))

for celeb in celeblist:
    Celebrity = URIRef(CB[celeb])
    g.add((Celebrity, RDF.type, CB.Celebrity))
    checklist =[]
    for index, row in instagram.iterrows():
        namelist = []
        for name in row['Owner'].split(' '):
            namelist.append(name)
        name = ''
        for i in range(0,len(namelist)): 
            name = name + namelist[i]
        checklist.append(name)
        if name == celeb:
            g.add((Celebrity, CB['username'], Literal(row['Username'], datatype=XSD.string)))
            for c in str(row['Country']).split(','):
                cName = c.strip()
                if((countries.index == cName).any() == True):
                    code = str(countries[countries.index == cName]['Alpha-2 code'][0]).lower()
                    Country = URIRef(CNS[code])
                    g.add((Celebrity, CB['hasCountry'], Country)) 
        
        
    for index, row in twitter.iterrows():
        namelist = []
        for name in row['name'].split(' '):
            namelist.append(name)
        name = ''
        for i in range(0,len(namelist)): 
            name = name + namelist[i]
        if name == celeb and name not in checklist:
            g.add((Celebrity, CB['username'], Literal(row['twitter_username'], datatype=XSD.string))) 
            break

lengeth =  945


In [71]:
%%time
# print all the data in the Turtle format
print("--- saving serialization ---")
with open(savePath + 'celebrity.ttl', 'w', encoding="utf-8") as file:
    file.write(g.serialize(format='turtle'))

--- saving serialization ---
CPU times: total: 93.8 ms
Wall time: 101 ms
