# Analyse Graph Centrality Measures (GCM) on DBpedia data 

In [2]:
# Importing packages
import pandas as pd
#from SPARQLWrapper import SPARQLWrapper, JSON, N3
#import lxml
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
#from networkx import Graph as NXGraph
#from rdflib import Graph as RDFGraph
#from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
import statistics
import collections
#import danker # https://github.com/athalhammer/danker
from scipy import stats
from scipy.stats import kstest
# https://www.statology.org/normality-test-python/

from urllib import parse
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

import mlnotify 
# https://github.com/aporia-ai/mlnotify
# Use %%notify at beginning of cell

import os.path, time
# Use %%time at beginning of cell

print('------------')
!python --version
print("Pandas " + pd.__version__)
print("Numpy " + np.__version__)
print("Seaborn " + sns.__version__)
#print("Networkx " + nx.__version__)
print('------------')
print("All packages loaded and ready to roll :-)")

------------
Python 3.7.6
Pandas 1.3.4
Numpy 1.18.1
Seaborn 0.11.2
------------
All packages loaded and ready to roll :-)


In [3]:
# Load GCM data
# Two pre-computed centrality tables are read; both are joined on their 'resource' column further below.

# NetworkX
# (used below for degree and PageRank; eigenvector centrality was not computed with NetworkX)
df_gcm = pd.read_csv('../data/gcm_computed.nosync/nx-gcm.csv')

# iGraph
# (columns: resource, degree, eigenvector, pagerank)
df_ig_gcm = pd.read_csv('../data/gcm_computed.nosync/ig-gcm.csv')

#print(df_gcm.head())
#print('======================')
#print(df_ig_gcm.head())

In [None]:
# Standardize GCMs (zero mean, unit variance per column) into a separate frame
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

# The instance is deliberately NOT called `scaler`: this notebook also defines a helper
# function scaler(data_frame), and sharing the name makes whichever cell ran last
# silently replace the other one.
gcm_scaler = StandardScaler()

# Work on a copy so the raw iGraph values in df_ig_gcm stay available
df_ig_gcm_scaled = df_ig_gcm.copy()
df_ig_gcm_scaled[['degree', 'eigenvector', 'pagerank']] = gcm_scaler.fit_transform(df_ig_gcm[['degree', 'eigenvector', 'pagerank']])
df_ig_gcm_scaled.head()

In [25]:
# Structural overview of the iGraph GCM frame
print("Data types:")
print(df_ig_gcm.dtypes)
print('------------------------')
print("Memory usage:")
print(df_ig_gcm.memory_usage(deep=True))
print('------------------------')
print("Dataframe shape is {}".format(df_ig_gcm.shape))

print("========================")

# Kurtosis and skewness per centrality measure, separated by a dashed line
gcm_labels = (
    ('degree', "Degree Centrality"),
    ('eigenvector', "Eigenvector Centrality"),
    ('pagerank', "PageRank"),
)
for position, (gcm_col, gcm_label) in enumerate(gcm_labels):
    if position > 0:
        print('------------------------')
    print("%s Kurtosis: %s" % (gcm_label, stats.kurtosis(df_ig_gcm[gcm_col])))
    print("%s Skewness: %s" % (gcm_label, stats.skew(df_ig_gcm[gcm_col])))

Data types:
resource        object
degree           int64
eigenvector    float64
pagerank       float64
dtype: object
------------------------
Memory usage:
Index                128
resource       964958363
degree          65946664
eigenvector     65946664
pagerank        65946664
dtype: int64
------------------------
Dataframe shape is (8243333, 4)
Degree Centrality Kurtosis: 3845700.692636409
Degree Centrality Skewness: 1758.6476950353158
------------------------
Eigenvector Centrality Kurtosis: 2535495.292056733
Eigenvector Centrality Skewness: 1186.2822293812544
------------------------
PageRank Kurtosis: 2865512.8079392132
PageRank Skewness: 1578.3902507400448


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric iGraph GCM columns
df_ig_gcm.describe()

Unnamed: 0,degree,eigenvector,pagerank
count,8243333.0,8243333.0,8243333.0
mean,6.85722,0.0001550843,1.213102e-07
std,691.0175,0.0004676179,1.375352e-05
min,1.0,0.0,2.435047e-08
25%,2.0,1.842308e-11,5.151196e-08
50%,3.0,2.267567e-09,6.047217e-08
75%,5.0,8.826938e-08,8.760632e-08
max,1624880.0,1.0,0.02791082


In [4]:
# (OLD) Method for splitting the resource to get resource/name (based on the URI) to match the PageRank dataframe

def node_path (row):
    """Return the part of the URI path in ``row['resource']`` after its first '/'.

    Example: ``http://dbpedia.org/resource/Berlin`` -> ``resource/Berlin``.
    A path that contains no '/' at all is returned unchanged.
    """
    uri_path = parse.urlsplit(row['resource']).path
    head, slash, tail = uri_path.partition('/')
    return tail if slash else head

In [36]:
# Function for printing a GCMs mean, stdev, as well as the max and min node

def gcmStats(data_frame, gcm_string):
    """Print mean, standard deviation and the extreme nodes of one GCM column.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a 'resource' column and the numeric column named by ``gcm_string``.
    gcm_string : str
        Name of the centrality column, e.g. 'degree', 'eigenvector' or 'pagerank'.

    Returns
    -------
    None
        The statistics are written to stdout. NaN values are ignored; when several
        nodes share the extreme value, the first one in row order is reported.
    """
    gcm_values = data_frame[gcm_string].to_numpy()
    resources = data_frame['resource']

    # Locate the extremes by position in a single pass each. The previous version
    # deep-copied the frame and fully sorted it twice just to read the first row.
    top_pos = int(np.nanargmax(gcm_values))
    low_pos = int(np.nanargmin(gcm_values))

    print(gcm_string.upper()," CENTRALITY")
    print("==========================")
    # Name the measure that was actually analysed (the text used to say "degree" for every column)
    print("The mean {} centrality is {}, with stdev {}".format(gcm_string, np.nanmean(gcm_values), np.nanstd(gcm_values)))
    print("The maximum node is {} - with value {}".format(resources.iat[top_pos], gcm_values[top_pos]))
    print("The minimum node is {} - with value {}".format(resources.iat[low_pos], gcm_values[low_pos]))

In [37]:
# Mean, stdev and extreme nodes of the iGraph degree centrality
gcmStats(df_ig_gcm, 'degree')

DEGREE  CENTRALITY
The mean degree centrality is 6.857219525160515, with stdev 691.0174712767385
The maximum node is http://dbpedia.org/ontology/CareerStation - with value 1624880
The minimum node is http://dbpedia.org/resource/Robert_Dalrymple_Ross - with value 1


In [38]:
# Mean, stdev and extreme nodes of the iGraph eigenvector centrality
gcmStats(df_ig_gcm, 'eigenvector')

EIGENVECTOR  CENTRALITY
The mean degree centrality is 0.00015508427622203387, with stdev 0.0004676179048092296
The maximum node is http://dbpedia.org/ontology/CareerStation - with value 1.0
The minimum node is http://dbpedia.org/resource/705_Erminia__Beltasteroid-stub__1 - with value 0.0


In [39]:
# Mean, stdev and extreme nodes of the iGraph PageRank
gcmStats(df_ig_gcm, 'pagerank')

PAGERANK  CENTRALITY
The mean degree centrality is 1.213101545212344e-07, with stdev 1.3753518418526512e-05
The maximum node is http://dbpedia.org/ontology/CareerStation - with value 0.0279108165979626
The minimum node is http://dbpedia.org/resource/Rush_Tower,_Missouri - with value 2.4350470029608183e-08


## Test on normal distribution

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html

In [None]:
# kstest(x, 'norm') on its own compares x against the *standard* normal N(0, 1), which
# tests "is x standard-normal", not "is x normal". The sample mean and standard deviation
# are therefore passed as loc/scale. Because they are estimated from the same data, the
# KS p-value is only approximate (Lilliefors situation).
nx_degree = df_gcm['degree']
ig_degree = df_ig_gcm['degree']

print("Normal distribution for Degree Centrality")
print('======================')
print('NetworkX')
print(stats.normaltest(nx_degree))
print(kstest(nx_degree, 'norm', args=(nx_degree.mean(), nx_degree.std())), '[Kolmogorov-Smirnov-Test]') # Kolmogorov-Smirnov-Test for normality

print('----------------------')
print('iGraph')
print(stats.normaltest(ig_degree))
print(kstest(ig_degree, 'norm', args=(ig_degree.mean(), ig_degree.std())), '[Kolmogorov-Smirnov-Test]') # Kolmogorov-Smirnov-Test for normality

# Output recorded from an earlier run. The Kstest values below were obtained with the
# former comparison against N(0, 1), not with the fitted loc/scale used above.
#Normal distribution for Degree Centrality
#======================
#NetworkX
#NormaltestResult(statistic=69557965.55953461, pvalue=0.0)
#KstestResult(statistic=0.5000000483957555, pvalue=0.0) [Kolmogorov-Smirnov-Test]
#----------------------
#iGraph
#NormaltestResult(statistic=69557965.55953465, pvalue=0.0)
#KstestResult(statistic=0.8413447460685429, pvalue=0.0) [Kolmogorov-Smirnov-Test]

In [None]:
# kstest(x, 'norm') on its own compares x against the *standard* normal N(0, 1), which
# tests "is x standard-normal", not "is x normal". The sample mean and standard deviation
# are therefore passed as loc/scale. Because they are estimated from the same data, the
# KS p-value is only approximate (Lilliefors situation).
ig_eigenvector = df_ig_gcm['eigenvector']

print("Normal distribution for Eigenvector Centrality")
print('======================')
print('NetworkX')
print("Not computed in NetworkX")

print('----------------------')
print('iGraph')
print(stats.normaltest(ig_eigenvector))
print(kstest(ig_eigenvector, 'norm', args=(ig_eigenvector.mean(), ig_eigenvector.std())), '[Kolmogorov-Smirnov-Test]') # Kolmogorov-Smirnov-Test for normality

# Output recorded from an earlier run. The Kstest value below was obtained with the
# former comparison against N(0, 1), not with the fitted loc/scale used above.
#Normal distribution for Eigenvector Centrality
#======================
#NetworkX
#Not computed in NetworkX
#----------------------
#iGraph
#NormaltestResult(statistic=63653056.418342546, pvalue=0.0)
#KstestResult(statistic=0.5, pvalue=0.0) [Kolmogorov-Smirnov-Test]

In [None]:
# kstest(x, 'norm') on its own compares x against the *standard* normal N(0, 1), which
# tests "is x standard-normal", not "is x normal". The sample mean and standard deviation
# are therefore passed as loc/scale. Because they are estimated from the same data, the
# KS p-value is only approximate (Lilliefors situation).
nx_pagerank = df_gcm['pagerank']
ig_pagerank = df_ig_gcm['pagerank']

print("Normal distribution for PageRank")
print('======================')
print('NetworkX')
print(stats.normaltest(nx_pagerank))
print(kstest(nx_pagerank, 'norm', args=(nx_pagerank.mean(), nx_pagerank.std())), '[Kolmogorov-Smirnov-Test]') # Kolmogorov-Smirnov-Test for normality

print('----------------------')
print('iGraph')
print(stats.normaltest(ig_pagerank))
print(kstest(ig_pagerank, 'norm', args=(ig_pagerank.mean(), ig_pagerank.std())), '[Kolmogorov-Smirnov-Test]') # Kolmogorov-Smirnov-Test for normality

# Output recorded from an earlier run. The Kstest values below were obtained with the
# former comparison against N(0, 1), not with the fitted loc/scale used above.
#Normal distribution for PageRank
#======================
#NetworkX
#NormaltestResult(statistic=68373823.03822671, pvalue=0.0)
#KstestResult(statistic=0.5000000072594243, pvalue=0.0) [Kolmogorov-Smirnov-Test]
#----------------------
#iGraph
#NormaltestResult(statistic=67903070.56349243, pvalue=0.0)
#KstestResult(statistic=0.500000009714432, pvalue=0.0) [Kolmogorov-Smirnov-Test]

In [4]:
# Compute the Correlation coefficient between the GCMs:
# np.corrcoef returns the 2x2 Pearson correlation matrix of each pair; the off-diagonal
# entry is the coefficient between the two measures (ev = eigenvector, dg = degree, pr = PageRank).

print("Correlation coefficient for ev-dg: ", np.corrcoef(df_ig_gcm.eigenvector, df_ig_gcm.degree))
print("Correlation coefficient for pr-dg: ", np.corrcoef(df_ig_gcm.pagerank, df_ig_gcm.degree))
print("Correlation coefficient for pr-ev: ", np.corrcoef(df_ig_gcm.pagerank, df_ig_gcm.eigenvector))

Correlation coefficient for ev-dg:  [[1.         0.60861409]
 [0.60861409 1.        ]]
Correlation coefficient for pr-dg:  [[1.         0.95806588]
 [0.95806588 1.        ]]
Correlation coefficient for pr-ev:  [[1.         0.52535622]
 [0.52535622 1.        ]]


## Get DBpedia categories

In [None]:
# Load pre-queried (via the SPARQL web interface) DBpedia categories
# https://dbpedia.org/sparql/

def load_category(file_name, uri_column):
    """Read one pre-queried category CSV and name its URI column 'resource'.

    file_name  : name of the CSV file inside ../data/sparql_cat_queries/
    uri_column : column of that file holding the DBpedia URIs; it is renamed to
                 'resource' so the frame can be joined with the GCM dataframes.

    Returns a new DataFrame (nothing is modified in place, so re-running is safe).
    """
    category_df = pd.read_csv("../data/sparql_cat_queries/" + file_name)
    return category_df.rename(columns={uri_column: "resource"})

# 90s films from the aggregated Categories: 1990_films, 1991_films, 1992_films, ...
df_film90 = load_category("sparql_90film.csv", "90film")
#df_film90['resource'] = df_film90.apply(lambda row: node_path(row), axis=1) #split resource to get resource/name (based on URI) to match PageRank dataframe

# actors from the "Occupation": "Actor"@en
df_actor = load_category("sparql_actor.csv", "actor")

# Music Genre from the "Type": "dbo:MusicGenre"
df_musicGenre = load_category("sparql_musicGenre.csv", "mGenre")

# Books from the "Type":"dbo:book"
df_book = load_category("sparql_book.csv", "book")

# Book authors from the "Type": author UNION book dbo:author
df_bookAut = load_category("sparql_bookAuthor.csv", "author")

# Politicians from the "Type": Person/Politician
df_pol = load_category("sparql_politician.csv", "pol")

# POTUS from the "Category": Presidents_of_the_United_States
df_potus = load_category("sparql_potus.csv", "potus")

# Cities from the "Category": "dbo:City"
df_city = load_category("sparql_city.csv", "city")

# Lakes from the "Category": "dbo:Lake"
df_lake = load_category("sparql_lake.csv", "lake")

# Mountains from the "Category": "dbo:Mountain"
df_mountain = load_category("sparql_mountain.csv", "mount")

# EU capitals from the "Category": Capitals_in_Europe
df_euCap = load_category("sparql_euCap.csv", "euCap")

# EU countries from the "Category": Countries_in_Europe
df_euCou = load_category("sparql_euCountry.csv", "euCountry")

# Asian countries from the "Category": "dbc:Countries_in_Asia", "dbc:East_Asian_countries" and "dbc:Central_Asian_countries"
df_asCou = load_category("sparql_asCountry.csv", "asCountry")

# Continents from the "Category": "dbo:Continents"
df_continent = load_category("sparql_continent.csv", "conti")

# Birds from the "Class": "dbr:Bird"
df_bird = load_category("sparql_bird.csv", "bird")

# African animals from the "Category": "dbc:Vertebrates_of_Africa"
df_africanAnimal = load_category("sparql_africanAnimal.csv", "afrAnimal")

# Furniture from the "Category": "dbc:Furniture"
df_furniture = load_category("sparql_furniture.csv", "fur")

# Vegetables from the "Category": "dbc:Vegetables"
df_vegetable = load_category("sparql_vegetable.csv", "veg")

# Types of fast food from the "Category": "dbc:Fast_food"
df_fastFood = load_category("sparql_fastfood.csv", "fastFood")

# European Landmark/Sight from multiple "Category": "dbc:Landmarks_in_x" with x being a country from europe
df_euSight = load_category("sparql_euLandmark.csv", "euroSight")

# DAX company from the "Type": "dbo:Company" and "dbp:tradedAs" -> "dbr:DAX"
df_daxCo = load_category("sparql_daxCompany.csv", "DAXco")

# US company from ?
df_usCo = load_category("sparql_usCompany.csv", "usCo")

# US tech company from the "Category": "dbc:Technology_companies_of_the_United_States"
df_usTechCo = load_category("sparql_usTechCompany.csv", "usTech")

## Analyse categories

In [None]:
# Function for standardized scaling and normalisation of GCM values
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

def scaler(data_frame):
    """Return a copy of ``data_frame`` with standardized GCM columns.

    The 'degree', 'eigenvector' and 'pagerank' columns of the copy are replaced by the
    output of a StandardScaler fitted on those same columns; the input frame is not modified.
    """
    gcm_columns = ['degree', 'eigenvector', 'pagerank']
    scaled_df = data_frame.copy()
    scaled_df[gcm_columns] = StandardScaler().fit_transform(scaled_df[gcm_columns])
    return scaled_df

#df_ig_gcm = scaler(df_ig_gcm)

In [None]:
# Function for merging the sparql category with GCM data

# Merge the queried category df to the GCM metrics based on <resource> with a left outer join
# (Keep every row in the category dataframe -> missing values get NaN)

def catMerger(sparql_data, gcm_data):
    """Attach GCM values, standardized within the category, to a SPARQL category frame.

    Parameters
    ----------
    sparql_data : pandas.DataFrame
        Category frame with a 'resource' column; every row is kept (left outer join).
    gcm_data : pandas.DataFrame
        GCM frame with 'resource', 'degree', 'eigenvector' and 'pagerank' columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns of ``sparql_data`` plus the three GCM columns,
        standardized with a StandardScaler fitted on the merged (category) rows only.
        Resources without a GCM match have no values to scale (NaN after the join).
    """
    gcm_columns = ['degree', 'eigenvector', 'pagerank']

    # pd.merge builds a new frame and leaves both inputs untouched, so no defensive
    # copies are made (copying the multi-million-row GCM frame on every call was pure overhead).
    merged_df = pd.merge(sparql_data,
                         gcm_data[['resource'] + gcm_columns],
                         on='resource',
                         how='left')

    # scale and normalize the gcm metrics (same as scaler function)
    merged_df[gcm_columns] = StandardScaler().fit_transform(merged_df[gcm_columns])

    print("Dataframe merged with shape: {}".format(merged_df.shape))
    print('============================')

    return merged_df


In [None]:
# Function for merging the survey category data with the gcm category data

# Merge the survey category df to the GCM metrics per category based on <resource> with a left outer join
# (Keep every row in the category dataframe -> missing values get NaN)

def gcmSurfer(survey_data, cat_gcm_data):
    """Join the relative survey frequency of each answer with its GCM values.

    Parameters
    ----------
    survey_data : pandas.DataFrame
        One row per survey answer; the answer URI is in column 'uri' (or 'resource').
    cat_gcm_data : pandas.DataFrame
        Category frame with 'resource', 'name', 'degree', 'eigenvector', 'pagerank'.

    Returns
    -------
    pandas.DataFrame
        Columns 'resource', 'amount' (share of all answers), 'name' and the three GCMs.
        Answers that could not be matched (any NaN after the left join) are dropped;
        the number of dropped rows is printed.
    """
    # rename() returns a new frame, so the caller's survey data stays untouched
    df_survey = survey_data.rename(columns={"uri": "resource"}) #rename column to match GCM dataframe

    # Count how often an answer was given in the survey (as a share of all answers)
    survey_share = df_survey['resource'].value_counts(normalize=True)
    df_survey_counted = pd.DataFrame({'resource': survey_share.index, 'amount': survey_share.values})

    df_survey_gcm = pd.merge(df_survey_counted,
                             cat_gcm_data[['resource','name', 'degree','eigenvector','pagerank']],
                             on='resource',
                             how='left')

    total_rows = df_survey_gcm.shape[0]
    df_matched = df_survey_gcm.dropna()
    nan_rows = total_rows - df_matched.shape[0]
    # An empty survey would otherwise divide by zero here
    nan_percent = round((nan_rows/total_rows)*100, 2) if total_rows else 0.0

    print("Rows with NaN values dropped: {}".format(nan_rows) + " from {}".format(total_rows) + " => {}% ".format(nan_percent)) #return amount of rows containing NaN values
    print("Dataframe merged with shape: {}".format(df_matched.shape))
    print('============================')

    return df_matched

In [None]:
# Function for creating "Camps" inside a category df used for ordinal regression

# takes two parameter: dataframe, num_splits
# dataframe = input dataframe containing URI, amount (of survey mentions), GCMs (degree, eigenvector, pagerank)
# num_splits = number defining the camp size, e.g. 3 -> three camps with 1 = top 33%; 2 = mid 33%; 3 = low 33%

def camper(data_frame, num_splits):
    """Return a copy of ``data_frame`` with a 'camp' column used for ordinal regression.

    Rows are assigned by their position in the frame: with ``num_splits`` = 3 the first
    third of the rows gets camp 1, the middle third camp 2 and the last third camp 3.
    The frame is therefore expected to be ordered already (e.g. by survey mentions).

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Input frame; it is not modified.
    num_splits : int
        Number of camps.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with a float 'camp' column holding values 1..num_splits.
    """
    helper_df = data_frame.copy()
    n_rows = len(helper_df.index)

    # Row position at which each camp starts
    split_v = np.array([int(n_rows/num_splits * i) for i in np.arange(num_splits)], dtype=int)

    # Use row positions, not index labels: after dropna() the labels have gaps and are
    # larger than the positions, which pushed most rows into the last camp.
    positions = np.arange(n_rows)
    # Number of camp starts <= position  ==  camp number of that row
    helper_df['camp'] = np.searchsorted(split_v, positions, side='right').astype(float)
    return helper_df

In [None]:
def ordinalReg(df_survey_gcm, list_gcm, num_splits): # list_gcm -> ['degree', 'eigenvector', 'pagerank']
    """Fit an ordered probit model of the survey 'camp' on the selected GCM columns.

    df_survey_gcm : survey/GCM frame; a copy is split into camps via camper()
    list_gcm      : list of GCM column names used as regressors
    num_splits    : number of camps passed on to camper()

    Prints a short header and returns the summary of the fitted model (BFGS optimizer).
    """
    camped_df = camper(df_survey_gcm.copy(), num_splits)

    ordered_probit = OrderedModel(camped_df['camp'], camped_df[list_gcm], distr='probit')

    print("Ordinal Regression:  {}".format(list_gcm))
    print('-------------------------------------')
    fitted = ordered_probit.fit(method='bfgs')

    return fitted.summary()

In [None]:
# Function for preparing the survey category data for linear regression

# Merge the raw (not normalized) survey mention counts with the GCM metrics per category
# based on <resource> with a left outer join; rows without a match are dropped afterwards

def prepSurvGcmForLinReg(survey_data, cat_gcm_data):
    """Join the raw survey mention count of each answer with its GCM values.

    Parameters
    ----------
    survey_data : pandas.DataFrame
        One row per survey answer; the answer URI is in column 'uri' (or 'resource').
    cat_gcm_data : pandas.DataFrame
        Category frame with 'resource', 'name', 'degree', 'eigenvector', 'pagerank'.

    Returns
    -------
    pandas.DataFrame
        Columns 'resource', 'amount' (absolute number of mentions), 'name', the three
        GCMs and 'amount_rank' (currently the constant 1 for every row). Answers that
        could not be matched (any NaN after the left join) are dropped.
    """
    # rename() returns a new frame, so the caller's survey data stays untouched
    df_survey = survey_data.rename(columns={"uri": "resource"}) #rename column to match GCM dataframe

    # Count how often an answer was given in the survey (absolute counts, not normalized)
    survey_count = df_survey['resource'].value_counts()
    df_survey_counted = pd.DataFrame({'resource': survey_count.index, 'amount': survey_count.values})

    df_survey_gcm = pd.merge(df_survey_counted,
                             cat_gcm_data[['resource','name', 'degree','eigenvector','pagerank']],
                             on='resource',
                             how='left')

    # Drop rows that were not matched with the category
    total_rows = df_survey_gcm.shape[0]
    df_matched = df_survey_gcm.dropna()
    nan_rows = total_rows - df_matched.shape[0]
    # An empty survey would otherwise divide by zero here
    nan_percent = round((nan_rows/total_rows)*100, 2) if total_rows else 0.0
    print("Rows with NaN values dropped: {}".format(nan_rows) + " from {}".format(total_rows) + " => {}% ".format(nan_percent)) #return amount of rows containing NaN values

    # assign() returns a new frame instead of writing into the dropna() result
    df_matched = df_matched.assign(amount_rank=1)

    print("Dataframe merged with shape: {}".format(df_matched.shape))

    return df_matched

In [None]:
## Linear regression

In [None]:
# Join the EU-capital category with the iGraph GCMs (catMerger standardizes them within the category)
euCap_result = catMerger(df_euCap, df_ig_gcm)
euCap_result.head()

In [None]:
# Rebuild the EU-capital GCM table and load the survey answers (';'-separated CSV)
euCap_result = catMerger(df_euCap, df_ig_gcm)
df_survey_euCap = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_european_capital_cities_impr.csv", sep=';')

# Share of survey mentions per capital joined with its GCMs; unmatched answers are dropped inside gcmSurfer
df_euCap_survey_gcm = gcmSurfer(df_survey_euCap, euCap_result)

### EU capital

In [None]:
# EU-capital category joined with the iGraph GCMs; this is the table used throughout this section
euCap_result = catMerger(df_euCap, df_ig_gcm)

In [None]:
# Test normal distribution
# catMerger left-joins, so capitals without a GCM match carry NaN. They are dropped here
# because stats.normaltest propagates NaN by default and would then return nan.
stats.normaltest(euCap_result['pagerank'].dropna())

#NetworkX: NormaltestResult(statistic=57.7625461797501, pvalue=2.8643261421028853e-13)

In [None]:
# Bar plots of the EU capitals, sorted ascending by PageRank and by Degree Centrality
euCap_PR_sorted = euCap_result.sort_values('pagerank')
euCap_DC_sorted = euCap_result.sort_values('degree')

euCap_barplots = [
    (euCap_PR_sorted, 'pagerank', "PageRank", "../plots/plots_euCap/euCap_barplot_PR_asc.png"),
    (euCap_DC_sorted, 'degree', "Degree Centrality", "../plots/plots_euCap/euCap_barplot_DC_asc.png"),
]

# One figure per measure; each is written to its own PNG
for sorted_df, gcm_col, gcm_label, out_file in euCap_barplots:
    plt.figure(figsize=(10,6))
    plt.bar('name', gcm_col, data=sorted_df)
    plt.xlabel("Capital cities in Europe", size=15)
    plt.xticks(rotation=90)
    plt.ylabel(gcm_label, size=15)
    plt.title("Bar plot with capital cities in Europe and their " + gcm_label, size=18)
    plt.savefig(out_file, bbox_inches="tight")

In [None]:
# Read the EU-capital survey answers; the answer column is named like the GCM key column

df_survey_euCap = (
    pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_european_capital_cities_impr.csv", sep=';')
    .rename(columns={"uri": "resource"})  #rename column to match GCM dataframe
)
#df_survey_euCap = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_european_capital_cities_prep.csv", sep=';')

df_survey_euCap.head()

In [None]:
# No-op when the previous cell already renamed the column; kept so this cell also works on the raw survey frame
df_survey_euCap.rename(columns={"uri": "resource"}, inplace=True) #rename column to match GCM dataframe

# Count how often an answer was given in the survey and save into new dataframe

counted_survey_euCap = df_survey_euCap['resource'].value_counts(normalize=True) #normalize values

df_euCap_survey_counted = pd.DataFrame({'resource':counted_survey_euCap.index, 'amount':counted_survey_euCap.values})

#euCap_result.name = euCap_result.name.str.lower()

# Left join: answers without a match in the category keep NaN in the name/GCM columns (nothing is dropped here)
df_euCap_survey_gcm = pd.merge(df_euCap_survey_counted,
                 euCap_result[['resource','name', 'degree','eigenvector','pagerank']],
                 on='resource', 
                 how='left')

# assign all answers that were not matchable a PR of 0 (although per definition of PR not possible) to allow plotting. Alternative: drop from df?
#df_euCap_survey_gcm.pagerank = df_euCap_survey_gcm.pagerank.fillna(0)
#df_euCap_survey_gcm.degree = df_euCap_survey_gcm.degree.fillna(0)

df_euCap_survey_gcm

In [None]:
# Sort the answers ascending by their share of survey mentions
df_euCap_survey_counted_and_sorted = df_euCap_survey_counted.sort_values('amount')

plt.figure(figsize=(10,6))
# bar plot with matplotlib (x: answer URI, y: share of mentions)
plt.bar('resource', 'amount',data=df_euCap_survey_counted_and_sorted)
plt.xlabel("Capital cities in Europe", size=15)
plt.xticks(rotation=90)
plt.ylabel("Mentions in Survey", size=15)
plt.title("Bar plot with capital cities in Europe and their mentions in the survey", size=18)
plt.savefig("../plots/plots_euCap/euCapital_barplot_SV_asc.png", bbox_inches="tight")

In [None]:
# Regression input: only answers that have a name and all GCM values (dropna returns a new frame)
df_euCap_regression = df_euCap_survey_gcm.dropna()
df_euCap_regression

In [None]:
# Standardize GCMs
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

# The instance is deliberately NOT called `scaler`: this notebook also defines a helper
# function scaler(data_frame), and rebinding the name here would replace that function.
gcm_scaler = StandardScaler()

# NOTE: this overwrites the raw values in df_ig_gcm; every cell that runs afterwards
# sees standardized GCMs.
df_ig_gcm[['degree', 'eigenvector', 'pagerank']] = gcm_scaler.fit_transform(df_ig_gcm[['degree', 'eigenvector', 'pagerank']])
# Preview only; the frame has millions of rows
df_ig_gcm.head()

In [None]:
# Linear regression of PageRank on the share of survey mentions (EU capitals)
lm_euCap = linear_model.LinearRegression()

# sklearn expects 2-D arrays: one column, one row per capital
pr_array_euCap = df_euCap_regression['pagerank'].to_numpy().reshape(-1, 1)
sv_array_euCap = df_euCap_regression['amount'].to_numpy().reshape(-1, 1)

lm_euCap.fit(sv_array_euCap, pr_array_euCap) # fitting the model

print("The coefficient is:", lm_euCap.coef_)
print("The intercept is:",lm_euCap.intercept_)


#Plot the data
sns.scatterplot(data=df_euCap_regression, x="amount", y="pagerank", size="pagerank", legend=False, hue="pagerank", palette="deep")

#Annotate data points
# Label the first six *plotted* points. The labels used to be looked up by index label in
# the unfiltered df_euCap_survey_gcm, which can hold NaN rows or have index gaps and
# therefore does not line up with the points drawn from df_euCap_regression.
annotated = df_euCap_regression.head(6)
for label, x_val, y_val in zip(annotated['name'], annotated['amount'], annotated['pagerank']):
    plt.annotate(label, (x_val-0.015, y_val-0.00002))

# Fitted regression line
y_pred = lm_euCap.predict(sv_array_euCap)
plt.plot(sv_array_euCap, y_pred, color = "b")

#plt.title('Scatter plot with EU capital cities and their PageRank', size=18)
plt.xlabel('amount of mentions in survey')
plt.ylabel('PageRank in DBpedia')

plt.savefig("../plots/plots_euCap/euCap_regression_surveyXpr_asc.png", bbox_inches="tight")
plt.show()

In [None]:
%%time

#import statsmodels.api as sm

#regression_sv_pr = pd.DataFrame(pr_array_euCap, sv_array_euCap)

# Fit and summarize OLS model
mod = sm.OLS(pr_array_euCap, sv_array_euCap)

res = mod.fit()

print(res.summary())

In [None]:
# Linear regression of Eigenvector Centrality on the share of survey mentions (EU capitals)
lm_euCap = linear_model.LinearRegression()

# sklearn expects 2-D arrays: one column, one row per capital
ev_array_euCap = df_euCap_regression['eigenvector'].to_numpy().reshape(-1, 1)
sv_array_euCap = df_euCap_regression['amount'].to_numpy().reshape(-1, 1)

lm_euCap.fit(sv_array_euCap, ev_array_euCap) # fitting the model

print("The coefficient is:", lm_euCap.coef_)
print("The intercept is:",lm_euCap.intercept_)


#Plot the data
sns.scatterplot(data=df_euCap_regression, x="amount", y="eigenvector", size="eigenvector", legend=False, hue="eigenvector", palette="deep")

#Annotate data points
# Label the first nine *plotted* points. The labels used to be looked up by index label in
# the unfiltered df_euCap_survey_gcm, which can hold NaN rows or have index gaps and
# therefore does not line up with the points drawn from df_euCap_regression.
annotated = df_euCap_regression.head(9)
for label, x_val, y_val in zip(annotated['name'], annotated['amount'], annotated['eigenvector']):
    plt.annotate(label, (x_val, y_val))

# Fitted regression line
y_pred = lm_euCap.predict(sv_array_euCap)
plt.plot(sv_array_euCap, y_pred, color = "g")

#plt.title('Scatter plot with EU capital cities and their PageRank', size=18)
plt.xlabel('amount of mentions in survey')
plt.ylabel('Eigenvector Centrality in DBpedia')

#plt.savefig("../plots/plots_euCap/euCap_regression_surveyXpr_asc.png", bbox_inches="tight")
plt.show()

In [None]:
# Linear regression of Degree Centrality on the share of survey mentions (EU capitals)
lm_euCap = linear_model.LinearRegression()

# sklearn expects 2-D arrays: one column, one row per capital
dc_array_euCap = df_euCap_regression['degree'].to_numpy().reshape(-1, 1)
sv_array_euCap = df_euCap_regression['amount'].to_numpy().reshape(-1, 1)

lm_euCap.fit(sv_array_euCap, dc_array_euCap) # fitting the model

print("The coefficient is:", lm_euCap.coef_)
print("The intercept is:",lm_euCap.intercept_)


#Plot the data
sns.scatterplot(data=df_euCap_regression, x="amount", y="degree", size="degree", legend=False, hue="degree", palette="deep")

#Annotate data points
# Label the first six *plotted* points. The labels used to be looked up by index label in
# the unfiltered df_euCap_survey_gcm, which can hold NaN rows or have index gaps and
# therefore does not line up with the points drawn from df_euCap_regression.
annotated = df_euCap_regression.head(6)
for label, x_val, y_val in zip(annotated['name'], annotated['amount'], annotated['degree']):
    plt.annotate(label, (x_val+0.01, y_val-0.02))

# Fitted regression line
y_pred = lm_euCap.predict(sv_array_euCap)
plt.plot(sv_array_euCap, y_pred, color = "r")

#plt.title('Scatter plot with EU capital cities and their PageRank', size=18)
plt.xlabel('amount of mentions in survey')
plt.ylabel('DC in DBpedia')

#plt.savefig("../plots/plots_euCap/euCap_regression_surveyXdc_asc.png", bbox_inches="tight")
plt.show()

### EU country

In [None]:
# EU countries
# Left-join the category onto the NetworkX GCMs (df_gcm), taking only degree and PageRank.
# Unlike the EU-capital section this does not go through catMerger, so no standardization is applied in this cell.
euCou_result = pd.merge(df_euCou,
                 df_gcm[['resource','degree', 'pagerank']],
                 on='resource', 
                 how='left')

euCou_result

In [None]:
# Barplot PageRank (countries sorted ascending)
euCou_PR_sorted = euCou_result.sort_values('pagerank')

plt.figure(figsize=(10,6))
plt.bar('name', 'pagerank',data=euCou_PR_sorted)
plt.xlabel("Countries in Europe", size=15)
plt.xticks(rotation=90)
plt.ylabel("PageRank", size=15)
plt.title("Bar plot with countries in Europe and their PageRank", size=18)
plt.savefig("../plots/plots_euCou/euCountry_barplot_PR_asc.png", bbox_inches="tight")

# Barplot Degree Centrality (countries sorted ascending)
euCou_DC_sorted = euCou_result.sort_values('degree')

plt.figure(figsize=(10,6))
plt.bar('name', 'degree',data=euCou_DC_sorted)
plt.xlabel("Countries in Europe", size=15)
plt.xticks(rotation=90)
plt.ylabel("Degree Centrality", size=15)
plt.title("Bar plot with countries in Europe and their Degree Centrality", size=18)
# Saved next to the other country plots; the path used to point at the EU-capital folder (plots_euCap)
plt.savefig("../plots/plots_euCou/euCountry_barplot_DC_asc.png", bbox_inches="tight")

In [None]:
# Read the EU-country survey answers; the answer column is named like the GCM key column
df_survey_euCou = (
    pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_european_countries_prep.csv", sep=';')
    .rename(columns={"uri": "resource"})  #rename column to match GCM dataframe
)

df_survey_euCou.head()

In [None]:
#count how often an answer was given in the survey (as a share of all answers) and save into new dataframe
counted_survey_euCou = df_survey_euCou['resource'].value_counts(normalize=True)

df_euCou_survey_counted = pd.DataFrame({'resource':counted_survey_euCou.index, 'amount':counted_survey_euCou.values})

#euCou_result.name = euCou_result.name.str.lower()

# Left join: answers without a match in the category keep NaN in the name/GCM columns (nothing is dropped here)
df_euCou_survey_gcm = pd.merge(df_euCou_survey_counted,
                 euCou_result[['resource','name', 'degree','pagerank']],
                 on='resource', 
                 how='left')

# assign all answers that were not matchable a PR of 0 (although per definition of PR not possible) to allow plotting. Alternative: drop from df?
#df_euCou_survey_pr.pagerank = df_euCou_survey_pr.pagerank.fillna(0)
#df_euCap_survey_gcm.degree = df_euCap_survey_gcm.degree.fillna(0)

df_euCou_survey_gcm

In [None]:
# Sort the answers ascending by their share of survey mentions
df_euCou_survey_counted_and_sorted = df_euCou_survey_counted.sort_values('amount')

plt.figure(figsize=(10,6))
# bar plot with matplotlib (x: answer URI, y: share of mentions)
plt.bar('resource', 'amount',data=df_euCou_survey_counted_and_sorted)
plt.xlabel("Countries in Europe", size=15)
plt.xticks(rotation=90)
plt.ylabel("Mentions in Survey", size=15)
plt.title("Bar plot with countries in Europe and their mentions in the survey", size=18)
plt.savefig("../plots/plots_euCou/euCountries_barplot_SV_asc.png", bbox_inches="tight")

In [None]:
# Scatterplot between PageRank and survey results for EU countries
sns.scatterplot(data=df_euCou_survey_gcm, x="amount", y="pagerank", size="pagerank", legend=False, hue="pagerank", palette="deep")

#Annotate data points
labels = df_euCou_survey_gcm['name']
# Loop through the data points 
# Only the first nine rows are labelled (loop stops at i == 8); rows are looked up by index label i.
# NOTE(review): 'amount' is a share between 0 and 1 (value_counts(normalize=True)), so the +1.5
# x-offset places the label position far to the right of the data; looks like a leftover from
# absolute counts. Confirm the labels are actually visible.
for i, label in enumerate (labels):
    plt.annotate(label, (df_euCou_survey_gcm.amount[i]+1.5, df_euCou_survey_gcm.pagerank[i]-0.00001))
    if i == 8:
        break
        
#plt.text(df_euCou_survey_pr.amount[df_euCou_survey_pr.name=='germany']-4.5,df_euCou_survey_pr.pagerank[df_euCou_survey_pr.name=='germany'],"GE")
#plt.text(df_euCou_survey_pr.amount[df_euCou_survey_pr.name=='france']+1.5,df_euCou_survey_pr.pagerank[df_euCou_survey_pr.name=='france'],"FR")
#plt.text(df_euCou_survey_pr.amount[df_euCou_survey_pr.name=='italy']+1.5,df_euCou_survey_pr.pagerank[df_euCou_survey_pr.name=='italy'],"IT")
#plt.text(df_euCou_survey_pr.amount[df_euCou_survey_pr.name=='spain']+1.5,df_euCou_survey_pr.pagerank[df_euCou_survey_pr.name=='spain'],"ES")
#plt.text(df_euCou_survey_pr.amount[df_euCou_survey_pr.name=='united kingdom']+1.5,df_euCou_survey_pr.pagerank[df_euCou_survey_pr.name=='united kingdom'],"UK")
#plt.text(df_euCou_survey_pr.amount[df_euCou_survey_pr.name=='poland']+1.5,df_euCou_survey_pr.pagerank[df_euCou_survey_pr.name=='poland'],"PL")
#plt.text(df_euCap_survey_pr.amount[df_euCap_survey_pr.name=='budapest']+1.5,df_euCap_survey_pr.pagerank[df_euCap_survey_pr.name=='budapest'],"Budapest")
#plt.text(df_euCap_survey_pr.amount[df_euCap_survey_pr.name=='stockholm']+1.5,df_euCap_survey_pr.pagerank[df_euCap_survey_pr.name=='stockholm'],"Stockholm")

#plt.title('Scatter plot with EU capital cities and their PageRank', size=18)
plt.xlabel('amount of mentions in survey')
plt.ylabel('PageRank in DBpedia')


plt.savefig("../plots/plots_euCou/euCou_scatter_SurveyXpr_asc.png", bbox_inches="tight")

In [None]:
# Scatterplot between Degree Centrality and survey results for European countries.
# Marker colour and marker size both encode the degree value.
sns.scatterplot(
    data=df_euCou_survey_gcm,
    x="amount",
    y="degree",
    hue="degree",
    size="degree",
    palette="deep",
    legend=False,
)

# Label the first nine rows; the text anchor is shifted slightly away from the marker.
# NOTE(review): the coordinate lookup is label-based, so it assumes the frame still has
# its default 0..n-1 index.
labels = df_euCou_survey_gcm['name']
for pos in range(min(len(labels), 9)):
    anchor = (
        df_euCou_survey_gcm.amount[pos] + 1.5,
        df_euCou_survey_gcm.degree[pos] - 0.00001,
    )
    plt.annotate(labels.iloc[pos], anchor)

plt.xlabel('Amount of mentions in survey')
plt.ylabel('Degree Centrality in DBpedia')

plt.savefig("../plots/plots_euCou/euCou_scatter_SurveyXdc_asc.png", bbox_inches="tight")

In [None]:
# Regression input for the European countries: the survey/GCM frame without the row
# whose index label is 20. `drop` returns a new frame, so df_euCou_survey_gcm is untouched.
# NOTE(review): the reason for excluding row 20 is not visible here - confirm it is still valid.
df_euCou_regression = df_euCou_survey_gcm.drop(index=20)
df_euCou_regression

In [None]:
# Ordinary least squares: PageRank (target) explained by survey mentions (feature).
lm_euCou = linear_model.LinearRegression()

# scikit-learn expects 2-D inputs, hence the reshape into single-column arrays.
pr_array_euCou = df_euCou_regression['pagerank'].to_numpy().reshape(-1, 1)
sv_array_euCou = df_euCou_regression['amount'].to_numpy().reshape(-1, 1)

lm_euCou.fit(X=sv_array_euCou, y=pr_array_euCou)

print("The coefficient is:", lm_euCou.coef_)
print("The intercept is:",lm_euCou.intercept_)

In [None]:
# Scatter of the regression data (PageRank vs. survey mentions) with the fitted line on top.
sns.scatterplot(
    data=df_euCou_regression,
    x="amount",
    y="pagerank",
    hue="pagerank",
    size="pagerank",
    palette="deep",
    legend=False,
)

# Label the first nine rows. Names and coordinates are read from the un-filtered
# survey/GCM frame, looked up by index label.
labels = df_euCou_survey_gcm['name']
for pos in range(min(len(labels), 9)):
    anchor = (
        df_euCou_survey_gcm.amount[pos] + 1.5,
        df_euCou_survey_gcm.pagerank[pos] - 0.00001,
    )
    plt.annotate(labels.iloc[pos], anchor)

# Fitted regression line evaluated at the observed survey values.
y_pred = lm_euCou.predict(sv_array_euCou)
plt.plot(sv_array_euCou, y_pred, color="b")

plt.xlabel('amount of mentions in survey')
plt.ylabel('PageRank in DBpedia')

# Save before show: the figure has to be written while it is still the current figure.
plt.savefig("../plots/plots_euCou/euCou_regression_surveyXpr_asc.png", bbox_inches="tight")
plt.show()

In [None]:
# Linear regression of Degree Centrality (target) on survey mentions (feature)
# for the European countries, followed by a scatter plot with the fitted line.
lm_euCou = linear_model.LinearRegression()

# scikit-learn expects 2-D inputs, hence the reshape into single-column arrays.
dc_array_euCou = df_euCou_regression['degree'].to_numpy()
dc_array_euCou = dc_array_euCou.reshape(-1, 1)
sv_array_euCou = df_euCou_regression['amount'].to_numpy()
sv_array_euCou = sv_array_euCou.reshape(-1, 1)

lm_euCou.fit(sv_array_euCou, dc_array_euCou) # fitting the model

print("The coefficient is:", lm_euCou.coef_)
print("The intercept is:",lm_euCou.intercept_)

# Plot the data. Colour and size encode the plotted measure (degree), matching the
# other Degree Centrality plots in this notebook.
sns.scatterplot(data=df_euCou_regression, x="amount", y="degree", size="degree", legend=False, hue="degree", palette="deep")

# Annotate the first nine rows; names and coordinates come from the un-filtered
# survey/GCM frame and are looked up by index label.
labels = df_euCou_survey_gcm['name']
for i, label in enumerate(labels):
    plt.annotate(label, (df_euCou_survey_gcm.amount[i]+1, df_euCou_survey_gcm.degree[i]))
    if i == 8:
        break

# Fitted regression line evaluated at the observed survey values.
y_pred = lm_euCou.predict(sv_array_euCou)
plt.plot(sv_array_euCou, y_pred, color = "r")

plt.xlabel('amount of mentions in survey')
# The y-axis shows degree, not PageRank (the label was copied from the PageRank plot).
plt.ylabel('Degree Centrality in DBpedia')

# Save before show: the figure has to be written while it is still the current figure.
plt.savefig("../plots/plots_euCou/euCou_regression_surveyXdc_asc.png", bbox_inches="tight")
plt.show()

### POTUS

In [None]:
# Match the GCMs to the queried POTUS category
potus_result = pd.merge(df_potus,
                 #df_pr[['resource_id','resource', 'pagerank']],
                 df_ig_gcm[['resource', 'degree', 'eigenvector', 'pagerank']],
                 on='resource', 
                 how='left')

#potus_result = potus_result.drop(25) # custom for potus to drop value with index 0 as it is no potus entity

potus_result.size

In [None]:
# Preview the first rows of the POTUS frame.
potus_result.head()

In [None]:
# Ascending orderings of the POTUS frame, one per centrality measure, used for the bar plots.
potus_sorted_dg = potus_result.sort_values(by='degree')
potus_sorted_pr = potus_result.sort_values(by='pagerank')

# Bar plot: Degree Centrality per president
plt.figure(figsize=(10, 6))
plt.bar(x='name', height='degree', data=potus_sorted_dg)
plt.xticks(rotation=90)  # vertical tick labels so the names stay readable
plt.xlabel("POTUS", fontsize=15)
plt.ylabel("Degree Centrality", fontsize=15)
plt.title("Bar plot with POTUS and Degree Centrality", fontsize=18)
plt.savefig("../plots/plots_potus/potus_barplot_DC_asc.png", bbox_inches="tight")

# Bar plot: PageRank per president
plt.figure(figsize=(10, 6))
plt.bar(x='name', height='pagerank', data=potus_sorted_pr)
plt.xticks(rotation=90)
plt.xlabel("POTUS", fontsize=15)
plt.ylabel("PageRank", fontsize=15)
plt.title("Bar plot with POTUS and PageRank", fontsize=18)
plt.savefig("../plots/plots_potus/potus_barplot_PR_asc.png", bbox_inches="tight")

In [None]:
# Load the prepared POTUS survey answers (semicolon-separated) and rename the
# 'uri' column to 'resource' so it can be joined with the GCM frames.
df_survey_potus = (
    pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_potus_prep.csv", sep=';')
    .rename(columns={"uri": "resource"})
)

df_survey_potus.head()

In [None]:
# Relative frequency of every distinct survey answer. normalize=True yields shares
# that sum to 1, not absolute counts.
counted_survey_potus = df_survey_potus['resource'].value_counts(normalize=True)

# Turn the frequency series into a two-column frame: resource | amount
df_survey_counted_potus = (
    counted_survey_potus
    .rename_axis('resource')
    .reset_index(name='amount')
)

# Add name, degree and pagerank for every answer. Left join: answers that have no
# match in potus_result keep NaN in those columns.
df_survey_gcm_potus = pd.merge(
    left=df_survey_counted_potus,
    right=potus_result[['resource', 'name', 'degree', 'pagerank']],
    how='left',
    on='resource',
)

df_survey_gcm_potus

In [None]:
# Survey answers for POTUS, ordered from least to most mentioned.
df_survey_counted_and_sorted_potus = df_survey_counted_potus.sort_values(by='amount')

# Bar chart (matplotlib): one bar per resource, bar height = 'amount'.
plt.figure(figsize=(10, 6))
plt.bar(x='resource', height='amount', data=df_survey_counted_and_sorted_potus)
plt.xticks(rotation=90)  # vertical tick labels so the resource URIs stay readable
plt.xlabel("POTUS", fontsize=15)
plt.ylabel("Mentions in Survey", fontsize=15)
plt.title("Bar plot with POTUS and their mentions in the survey", fontsize=18)
plt.savefig("../plots/plots_potus/potus_barplot_SV_asc.png", bbox_inches="tight")

In [None]:
# Scatterplot between PageRank and Survey Results for POTUS
sns.scatterplot(data=df_survey_gcm_potus, x="amount", y="pagerank", size="pagerank", legend=False, hue="pagerank", palette="deep")

# Annotate the first ten rows.
# 'amount' is a normalized share (value_counts(normalize=True)), so shifting the anchor
# by +1.5 in data units put it outside the axes, where matplotlib does not draw
# annotations. The anchor is now the data point itself and the text is shifted by a
# few points on screen instead.
labels = df_survey_gcm_potus['name']
annotated_rows = df_survey_gcm_potus[['name', 'amount', 'pagerank']].head(10)
for label, x, y in annotated_rows.itertuples(index=False, name=None):
    plt.annotate(label, (x, y), xytext=(5, -3), textcoords="offset points")

plt.xlabel('Amount of mentions in survey')
plt.ylabel('PageRank in DBpedia')

plt.savefig("../plots/plots_potus/potus_scatter_surveyXpr_asc.png", bbox_inches="tight")

In [None]:
# Scatterplot between Degree Centrality and Survey Results for POTUS
sns.scatterplot(data=df_survey_gcm_potus, x="amount", y="degree", size="degree", legend=False, hue="degree", palette="deep")

# Annotate the first ten rows.
# 'amount' is a normalized share (value_counts(normalize=True)), so shifting the anchor
# by +1.5 in data units put it outside the axes, where matplotlib does not draw
# annotations. The anchor is now the data point itself and the text is shifted by a
# few points on screen instead.
labels = df_survey_gcm_potus['name']
annotated_rows = df_survey_gcm_potus[['name', 'amount', 'degree']].head(10)
for label, x, y in annotated_rows.itertuples(index=False, name=None):
    plt.annotate(label, (x, y), xytext=(5, -3), textcoords="offset points")

plt.xlabel('Amount of mentions in survey')
plt.ylabel('Degree Centrality in DBpedia')

plt.savefig("../plots/plots_potus/potus_scatter_surveyXdc_asc.png", bbox_inches="tight")

In [None]:
# DC Regression test for POTUS:
# Degree Centrality (target) explained by the survey share (feature).

# Survey answers without a GCM match carry NaN after the left join, and
# LinearRegression.fit rejects NaN input, so those rows are dropped first
# (same treatment as in the Birds and Vegetables sections).
df_potus_regression = df_survey_gcm_potus.dropna(subset=['pagerank', 'degree'])

lm_potus = linear_model.LinearRegression()

# scikit-learn expects 2-D inputs, hence the reshape into single-column arrays.
dc_array_potus = df_potus_regression['degree'].to_numpy()
dc_array_potus = dc_array_potus.reshape(-1, 1)
sv_array_potus = df_potus_regression['amount'].to_numpy()
sv_array_potus = sv_array_potus.reshape(-1, 1)

lm_potus.fit(sv_array_potus, dc_array_potus) # fitting the model

print("The coefficient is:", lm_potus.coef_)
print("The intercept is:",lm_potus.intercept_)

# Scatter with regression function
sns.scatterplot(data=df_potus_regression, x="amount", y="degree", size="degree", legend=False, hue="degree", palette="deep")

# Annotate the first ten rows. Rows are read positionally, so gaps in the index left
# by dropna cannot mismatch labels and points. The text is offset in screen points:
# 'amount' is a share in [0, 1], so a +1.5 shift in data units would fall outside the axes.
labels = df_potus_regression['name']
annotated_rows = df_potus_regression[['name', 'amount', 'degree']].head(10)
for label, x, y in annotated_rows.itertuples(index=False, name=None):
    plt.annotate(label, (x, y), xytext=(5, 0), textcoords="offset points")

# Fitted regression line evaluated at the observed survey values.
y_pred = lm_potus.predict(sv_array_potus)
plt.plot(sv_array_potus, y_pred, color = "r")

plt.xlabel('Amount of mentions in survey')
plt.ylabel('Degree Centrality in DBpedia')

# Save BEFORE show: with the inline backend plt.show() finalizes the figure, so a
# savefig call issued afterwards writes an empty image.
plt.savefig("../plots/plots_potus/potus_regression_surveyXdc_asc.png", bbox_inches="tight")
plt.show()

In [None]:
# PR Regression test for POTUS:
# PageRank (target) explained by the survey share (feature).

# Survey answers without a GCM match carry NaN after the left join, and
# LinearRegression.fit rejects NaN input, so those rows are dropped first
# (same treatment as in the Birds and Vegetables sections).
df_potus_regression = df_survey_gcm_potus.dropna(subset=['pagerank', 'degree'])

lm_potus = linear_model.LinearRegression()

# scikit-learn expects 2-D inputs, hence the reshape into single-column arrays.
pr_array_potus = df_potus_regression['pagerank'].to_numpy()
pr_array_potus = pr_array_potus.reshape(-1, 1)
sv_array_potus = df_potus_regression['amount'].to_numpy()
sv_array_potus = sv_array_potus.reshape(-1, 1)

lm_potus.fit(sv_array_potus, pr_array_potus) # fitting the model

print("The coefficient is:", lm_potus.coef_)
print("The intercept is:",lm_potus.intercept_)

# Scatter with regression function
sns.scatterplot(data=df_potus_regression, x="amount", y="pagerank", size="pagerank", legend=False, hue="pagerank", palette="deep")

# Annotate the first ten rows. Rows are read positionally, so gaps in the index left
# by dropna cannot mismatch labels and points. The text is offset in screen points:
# 'amount' is a share in [0, 1], so a +1.5 shift in data units would fall outside the axes.
labels = df_potus_regression['name']
annotated_rows = df_potus_regression[['name', 'amount', 'pagerank']].head(10)
for label, x, y in annotated_rows.itertuples(index=False, name=None):
    plt.annotate(label, (x, y), xytext=(5, 0), textcoords="offset points")

# Fitted regression line evaluated at the observed survey values.
y_pred = lm_potus.predict(sv_array_potus)
plt.plot(sv_array_potus, y_pred, color = "b")

plt.xlabel('Amount of mentions in survey')
plt.ylabel('PageRank in DBpedia')

# Save BEFORE show: with the inline backend plt.show() finalizes the figure, so a
# savefig call issued afterwards writes an empty image.
plt.savefig("../plots/plots_potus/potus_regression_surveyXpr_asc.png", bbox_inches="tight")
plt.show()

### Birds

In [None]:
# Attach the NetworkX centrality measures to every queried bird resource.
# Left join on 'resource': birds without a match in df_gcm keep NaN in degree / pagerank.
birds_result = pd.merge(
    left=df_bird,
    right=df_gcm[['resource', 'degree', 'pagerank']],
    how='left',
    on='resource',
)

# Total number of cells (rows x columns) in the merged frame.
birds_result.size

In [None]:
# Preview the first rows of the birds frame.
birds_result.head()

In [None]:
# Ascending orderings of the birds frame, one per centrality measure, used for the bar plots.
birds_sorted_dc = birds_result.sort_values(by='degree')
birds_sorted_pr = birds_result.sort_values(by='pagerank')

# Bar plot: Degree Centrality per bird
plt.figure(figsize=(10, 6))
plt.bar(x='name', height='degree', data=birds_sorted_dc)
plt.xticks(rotation=90)  # vertical tick labels so the names stay readable
plt.xlabel("Birds", fontsize=15)
plt.ylabel("Degree Centrality", fontsize=15)
plt.title("Bar plot with Birds and Degree Centrality", fontsize=18)
#plt.savefig("../plots/plots_birds/birds_barplot_DC_asc.png", bbox_inches="tight")

# Bar plot: PageRank per bird
plt.figure(figsize=(10, 6))
plt.bar(x='name', height='pagerank', data=birds_sorted_pr)
plt.xticks(rotation=90)
plt.xlabel("Birds", fontsize=15)
plt.ylabel("PageRank", fontsize=15)
plt.title("Bar plot with Birds and PageRank", fontsize=18)
#plt.savefig("../plots/plots_birds/birds_barplot_PR_asc.png", bbox_inches="tight")

In [None]:
# Load the prepared birds survey answers (semicolon-separated) and rename the
# 'uri' column to 'resource' so it can be joined with the GCM frames.
df_survey_birds = (
    pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_birds_prep.csv", sep=';')
    .rename(columns={"uri": "resource"})
)

df_survey_birds.head()

In [None]:
# Relative frequency of every distinct survey answer. normalize=True yields shares
# that sum to 1, not absolute counts.
counted_survey_birds = df_survey_birds['resource'].value_counts(normalize=True)

# Turn the frequency series into a two-column frame: resource | amount
df_survey_counted_birds = (
    counted_survey_birds
    .rename_axis('resource')
    .reset_index(name='amount')
)

# Add degree and pagerank for every answer. Left join: answers that have no match
# in birds_result keep NaN in those columns.
df_survey_gcm_birds = pd.merge(
    left=df_survey_counted_birds,
    right=birds_result[['resource', 'degree', 'pagerank']],
    how='left',
    on='resource',
)

df_survey_gcm_birds

In [None]:
# Survey answers for birds, ordered from least to most mentioned.
df_survey_counted_and_sorted_birds = df_survey_counted_birds.sort_values(by='amount')

# Bar chart (matplotlib): one bar per resource, bar height = 'amount'.
plt.figure(figsize=(10, 6))
plt.bar(x='resource', height='amount', data=df_survey_counted_and_sorted_birds)
plt.xticks(rotation=90)  # vertical tick labels so the resource URIs stay readable
plt.xlabel("Birds", fontsize=15)
plt.ylabel("Mentions in Survey", fontsize=15)
plt.title("Bar plot with Birds and their mentions in the survey", fontsize=18)
#plt.savefig("../plots/plots_birds/birds_barplot_SV_asc.png", bbox_inches="tight")

In [None]:
# Regression test for Birds:
# PageRank (target) explained by the survey share (feature).

# Rows without GCM values (NaN after the left join) cannot be used for fitting.
df_birds_regression = df_survey_gcm_birds.dropna(subset=['pagerank', 'degree'])

lm_birds = linear_model.LinearRegression()

# scikit-learn expects 2-D inputs, hence the reshape into single-column arrays.
pr_array_birds = df_birds_regression['pagerank'].to_numpy()
pr_array_birds = pr_array_birds.reshape(-1, 1)
sv_array_birds = df_birds_regression['amount'].to_numpy()
sv_array_birds = sv_array_birds.reshape(-1, 1)

lm_birds.fit(sv_array_birds, pr_array_birds) # fitting the model

print("The coefficient is:", lm_birds.coef_)
print("The intercept is:",lm_birds.intercept_)


# Scatter with regression function
sns.scatterplot(data=df_birds_regression, x="amount", y="pagerank", size="pagerank", legend=False, hue="pagerank", palette="deep")

# Annotate the first ten rows with their resource URI.
# Rows are read positionally: after dropna the index has gaps, so looking up
# `amount[i]` with the enumerate counter could raise KeyError or pair a label with
# the wrong point. The text is offset in screen points because 'amount' is a share
# in [0, 1]; a +1.5 shift in data units would fall outside the axes.
labels = df_birds_regression['resource']
annotated_rows = df_birds_regression[['resource', 'amount', 'pagerank']].head(10)
for label, x, y in annotated_rows.itertuples(index=False, name=None):
    plt.annotate(label, (x, y), xytext=(5, 0), textcoords="offset points")

# Fitted regression line evaluated at the observed survey values.
y_pred = lm_birds.predict(sv_array_birds)
plt.plot(sv_array_birds, y_pred, color = "b")

plt.xlabel('Amount of mentions in survey')
plt.ylabel('PageRank in DBpedia')

#plt.savefig("../plots/plots_birds/birds_regression_surveyXpr_asc.png", bbox_inches="tight")
plt.show()

### Vegetables

In [None]:
# Attach the NetworkX centrality measures to every queried vegetable resource.
# Left join on 'resource': vegetables without a match in df_gcm keep NaN in degree / pagerank.
vegetables_result = pd.merge(
    left=df_vegetable,
    right=df_gcm[['resource', 'degree', 'pagerank']],
    how='left',
    on='resource',
)

# Total number of cells (rows x columns) in the merged frame.
vegetables_result.size

In [None]:
# Preview the first rows of the vegetables frame.
vegetables_result.head()

In [None]:
# Create dataframes with sorted DC and PR for plotting
vegetables_sorted_dc = vegetables_result.sort_values('degree')
vegetables_sorted_pr = vegetables_result.sort_values('pagerank')

# bar plot with DC
plt.figure(figsize=(10,6))
plt.bar('name', 'degree',data=vegetables_sorted_dc)
# x-axis shows vegetables (the label was previously "Birds", copied from the birds section)
plt.xlabel("Vegetables", size=15)
plt.xticks(rotation=90)
plt.ylabel("Degree Centrality", size=15)
plt.title("Bar plot with Vegetables and Degree Centrality", size=18)
#plt.savefig("../plots/plots_vegetables/vegetables_barplot_DC_asc.png", bbox_inches="tight")

# bar plot with PR
plt.figure(figsize=(10,6))
plt.bar('name', 'pagerank',data=vegetables_sorted_pr)
plt.xlabel("Vegetables", size=15)
plt.xticks(rotation=90)
plt.ylabel("PageRank", size=15)
plt.title("Bar plot with Vegetables and PageRank", size=18)
#plt.savefig("../plots/plots_vegetables/vegetables_barplot_PR_asc.png", bbox_inches="tight")

In [None]:
# Load the prepared vegetables survey answers (semicolon-separated) and rename the
# 'uri' column to 'resource' so it can be joined with the GCM frames.
df_survey_vegetables = (
    pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_vegetables_prep.csv", sep=';')
    .rename(columns={"uri": "resource"})
)

df_survey_vegetables.head()

In [None]:
# Relative frequency of every distinct survey answer. normalize=True yields shares
# that sum to 1, not absolute counts.
counted_survey_vegetables = df_survey_vegetables['resource'].value_counts(normalize=True)

# Turn the frequency series into a two-column frame: resource | amount
df_survey_counted_vegetables = (
    counted_survey_vegetables
    .rename_axis('resource')
    .reset_index(name='amount')
)

# Add name, degree and pagerank for every answer. Left join: answers that have no
# match in vegetables_result keep NaN in those columns.
df_survey_gcm_vegetables = pd.merge(
    left=df_survey_counted_vegetables,
    right=vegetables_result[['resource', 'name', 'degree', 'pagerank']],
    how='left',
    on='resource',
)

df_survey_gcm_vegetables

In [None]:
# Survey answers for vegetables, ordered from least to most mentioned.
df_survey_counted_and_sorted_vegs = df_survey_counted_vegetables.sort_values(by='amount')

# Bar chart (matplotlib): one bar per resource, bar height = 'amount'.
plt.figure(figsize=(10, 6))
plt.bar(x='resource', height='amount', data=df_survey_counted_and_sorted_vegs)
plt.xticks(rotation=90)  # vertical tick labels so the resource URIs stay readable
plt.xlabel("Vegetables", fontsize=15)
plt.ylabel("Mentions in Survey", fontsize=15)
plt.title("Bar plot with Vegetables and their mentions in the survey", fontsize=18)
#plt.savefig("../plots/plots_vegetables/vegetables_barplot_SV_asc.png", bbox_inches="tight")

In [None]:
# PR Regression test for Vegetables:
# PageRank (target) explained by the survey share (feature).

# Rows without GCM values (NaN after the left join) cannot be used for fitting.
df_vegetables_regression = df_survey_gcm_vegetables.dropna(subset=['pagerank', 'degree'])

lm_vegs = linear_model.LinearRegression()

# scikit-learn expects 2-D inputs, hence the reshape into single-column arrays.
pr_array_vegs = df_vegetables_regression['pagerank'].to_numpy()
pr_array_vegs = pr_array_vegs.reshape(-1, 1)
sv_array_vegs = df_vegetables_regression['amount'].to_numpy()
sv_array_vegs = sv_array_vegs.reshape(-1, 1)

lm_vegs.fit(sv_array_vegs, pr_array_vegs) # fitting the model

print("The coefficient is:", lm_vegs.coef_)
print("The intercept is:",lm_vegs.intercept_)


# Scatter with regression function
sns.scatterplot(data=df_vegetables_regression, x="amount", y="pagerank", size="pagerank", legend=False, hue="pagerank", palette="deep")

# Annotate the first nine rows.
# Rows are read positionally: after dropna the index has gaps, so looking up
# `amount[i]` with the enumerate counter could raise KeyError or pair a label with
# the wrong point. The text is offset in screen points because 'amount' is a share
# in [0, 1]; a +1.5 shift in data units would fall outside the axes.
labels = df_vegetables_regression['name']
annotated_rows = df_vegetables_regression[['name', 'amount', 'pagerank']].head(9)
for label, x, y in annotated_rows.itertuples(index=False, name=None):
    plt.annotate(label, (x, y), xytext=(5, 0), textcoords="offset points")

# Fitted regression line evaluated at the observed survey values.
y_pred = lm_vegs.predict(sv_array_vegs)
plt.plot(sv_array_vegs, y_pred, color = "b")

plt.xlabel('Amount of mentions in survey')
plt.ylabel('PageRank in DBpedia')

# Save before show: the figure has to be written while it is still the current figure.
plt.savefig("../plots/plots_vegetables/vegetables_regression_surveyXpr_asc.png", bbox_inches="tight")
plt.show()

In [None]:
# DC Regression test for Vegetables:
# Degree Centrality (target) explained by the survey share (feature).

# Rows without GCM values (NaN after the left join) cannot be used for fitting.
df_vegetables_regression = df_survey_gcm_vegetables.dropna(subset=['pagerank', 'degree'])

lm_vegs = linear_model.LinearRegression()

# scikit-learn expects 2-D inputs, hence the reshape into single-column arrays.
dc_array_vegs = df_vegetables_regression['degree'].to_numpy()
dc_array_vegs = dc_array_vegs.reshape(-1, 1)
sv_array_vegs = df_vegetables_regression['amount'].to_numpy()
sv_array_vegs = sv_array_vegs.reshape(-1, 1)

lm_vegs.fit(sv_array_vegs, dc_array_vegs) # fitting the model

print("The coefficient is:", lm_vegs.coef_)
print("The intercept is:",lm_vegs.intercept_)


# Scatter with regression function
sns.scatterplot(data=df_vegetables_regression, x="amount", y="degree", size="degree", legend=False, hue="degree", palette="deep")

# Annotate the first nine rows.
# The y coordinate must be 'degree' (the plotted measure); it was previously taken
# from 'pagerank', which put the labels at the wrong height.
# Rows are read positionally so index gaps left by dropna cannot mismatch labels and
# points, and the text is offset in screen points because 'amount' is a share in [0, 1].
labels = df_vegetables_regression['name']
annotated_rows = df_vegetables_regression[['name', 'amount', 'degree']].head(9)
for label, x, y in annotated_rows.itertuples(index=False, name=None):
    plt.annotate(label, (x, y), xytext=(5, 0), textcoords="offset points")

# Fitted regression line evaluated at the observed survey values.
y_pred = lm_vegs.predict(sv_array_vegs)
plt.plot(sv_array_vegs, y_pred, color = "r")

plt.xlabel('Amount of mentions in survey')
plt.ylabel('Degree Centrality in DBpedia')

# Save before show: the figure has to be written while it is still the current figure.
plt.savefig("../plots/plots_vegetables/vegetables_regression_surveyXdc_asc.png", bbox_inches="tight")
plt.show()

### 90s film

In [None]:
%%time

# Merge the DFs based on resource with an Left Merge / Left Outer Join
# (Keep every row in the left dataframe. Missing values get NaN)

# 90s films
film90_result = pd.merge(df_film90,
                 df_gcm[['resource','degree', 'pagerank']],
                 on='resource', 
                 how='left')

#film90_result['resource'] = film90_result.apply(lambda row: node_path(row), axis=1)
film90_result.size

In [None]:
# Bar plot: PageRank
# The frame is sorted ascending, so the 50-row cut holds the films with the
# LOWEST PageRank (NaN values sort last).
film90_PR_sorted = film90_result.sort_values(by='pagerank')
film90_PR_sorted_cut = film90_PR_sorted.head(50)

plt.figure(figsize=(10, 6))
plt.bar(x='name', height='pagerank', data=film90_PR_sorted_cut)
plt.xticks(rotation=90)  # vertical tick labels so the film names stay readable
plt.xlabel("Films from the 90s", fontsize=15)
plt.ylabel("PageRank", fontsize=15)
plt.title("Bar plot with films from the 90s", fontsize=18)
#plt.savefig("../plots/plots_film90/film90_barplot_PR_asc.png", bbox_inches="tight")

# Bar plot: Degree Centrality
# Same construction: ascending sort, then the 50 films with the lowest degree.
film90_DC_sorted = film90_result.sort_values(by='degree')
film90_DC_sorted_cut = film90_DC_sorted.head(50)

plt.figure(figsize=(10, 6))
plt.bar(x='name', height='degree', data=film90_DC_sorted_cut)
plt.xticks(rotation=90)
plt.xlabel("Films from the 90s", fontsize=15)
plt.ylabel("Degree Centrality", fontsize=15)
plt.title("Bar plot with films from the 90s and their Degree Centrality", fontsize=18)
#plt.savefig("../plots/plots_film90/film90_barplot_DC_asc.png", bbox_inches="tight")

In [None]:
film90_PR_sorted_cut.head(10)