# Ordinal Regression

In [1]:
# Importing packages
import pandas as pd
#from SPARQLWrapper import SPARQLWrapper, JSON, N3
#import lxml
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
#import networkx as nx
#from networkx import Graph as NXGraph
#from rdflib import Graph as RDFGraph
#from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
#import statistics
#import collections
from scipy import stats
from scipy.stats import kstest
# https://www.statology.org/normality-test-python/

#from urllib import parse
from sklearn import linear_model
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

import mlnotify 
# https://github.com/aporia-ai/mlnotify
# Use %%notify at beginning of cell

import os.path, time
# Use %%time at beginning of cell

print('------------')
!python --version
print("Pandas " + pd.__version__)
print("Numpy " + np.__version__)
#print("Seaborn " + sns.__version__)
#print("Networkx " + nx.__version__)
print('------------')
print("All packages loaded and ready to roll :-)")

------------
Python 3.7.6
Pandas 1.3.4
Numpy 1.18.1
------------
All packages loaded and ready to roll :-)


In [4]:
# Load GCM data

# NetworkX
#df_gcm = pd.read_csv('../data/gcm_computed.nosync/nx-gcm.csv')

# iGraph
df_ig_gcm = pd.read_csv('../data/gcm_computed/ig-gcm.csv')

df_ig_gcm.head()

Unnamed: 0,resource,degree,eigenvector,pagerank
0,http://dbpedia.org/resource/Billy_Coggins__Car...,3,0.0007845722,5.936257e-08
1,http://dbpedia.org/ontology/CareerStation,1624880,1.0,0.02791082
2,http://dbpedia.org/resource/News_Patrol__A2Z_N...,9,7.198654e-11,1.150764e-07
3,http://dbpedia.org/resource/ZOE_Broadcasting_N...,20,9.183437e-10,2.792414e-07
4,http://dbpedia.org/resource/Yu_Yongfu,6,2.790954e-09,1.047756e-07


In [5]:
print("Data types:")
print(df_ig_gcm.dtypes)
print('----------------------')
print("Memory usage:")
print(df_ig_gcm.memory_usage(deep=True))
print('------------------------')
print("Dataframe shape is {}".format(df_ig_gcm.shape))

Data types:
resource        object
degree           int64
eigenvector    float64
pagerank       float64
dtype: object
----------------------
Memory usage:
Index                128
resource       964958363
degree          65946664
eigenvector     65946664
pagerank        65946664
dtype: int64
------------------------
Dataframe shape is (8243333, 4)


In [6]:
# Load pre-queried (via queried web-interface) DBpedia categories
# https://dbpedia.org/sparql/

# 90s films from the aggregated Categories: 1990_films, 1991_films, 1992_films, ...
df_film90 = pd.read_csv("../data/sparql_cat_queries/sparql_90film.csv")
df_film90.rename(columns={"film90": "resource"}, inplace=True) #rename column to match PageRank dataframe

# actors from the "Occupation": "Actor"@en
df_actor = pd.read_csv("../data/sparql_cat_queries/sparql_actor.csv")
df_actor.rename(columns={"actor": "resource"}, inplace=True)

# Music Genre from the "Type": "dbo:MusicGenre"
df_musicGenre = pd.read_csv("../data/sparql_cat_queries/sparql_musicGenre.csv")
df_musicGenre.rename(columns={"mGenre": "resource"}, inplace=True)

# Books from the "Type":"dbo:book"
df_book = pd.read_csv("../data/sparql_cat_queries/sparql_book.csv")
df_book.rename(columns={"book": "resource"}, inplace=True)

# Book authors from the "Type": author UNION book dbo:author
df_bookAut = pd.read_csv("../data/sparql_cat_queries/sparql_bookAuthor.csv")
df_bookAut.rename(columns={"author": "resource"}, inplace=True)

# Politicians from the "Type": Person/Politician
df_pol = pd.read_csv("../data/sparql_cat_queries/sparql_politician.csv")
df_pol.rename(columns={"pol": "resource"}, inplace=True)

# POTUS from the "Category": Presidents_of_the_United_States
df_potus = pd.read_csv("../data/sparql_cat_queries/sparql_potus.csv")
df_potus.rename(columns={"potus": "resource"}, inplace=True)

# Cities from the "Category": "dbo:City"
df_city = pd.read_csv("../data/sparql_cat_queries/sparql_city.csv")
df_city.rename(columns={"city": "resource"}, inplace=True)

# Lakes from the "Category": "dbo:Lake"
df_lake = pd.read_csv("../data/sparql_cat_queries/sparql_lake.csv")
df_lake.rename(columns={"lake": "resource"}, inplace=True)

# Mountains from the "Category": "dbo:Mountain"
df_mountain = pd.read_csv("../data/sparql_cat_queries/sparql_mountain.csv")
df_mountain.rename(columns={"mount": "resource"}, inplace=True)

# EU capitals from the "Category": Capitals_in_Europe
df_euCap = pd.read_csv("../data/sparql_cat_queries/sparql_euCap.csv")
df_euCap.rename(columns={"euCap": "resource"}, inplace=True)

# EU countries from the "Category": Countries_in_Europe
df_euCou = pd.read_csv("../data/sparql_cat_queries/sparql_euCountry.csv")
df_euCou.rename(columns={"euCountry": "resource"}, inplace=True)

# Asian countries from the "Category": "dbc:Countries_in_Asia", "dbc:East_Asian_countries" and "dbc:Central_Asian_countries"
df_asCou = pd.read_csv("../data/sparql_cat_queries/sparql_asCountry.csv")
df_asCou.rename(columns={"asCountry": "resource"}, inplace=True)

# Continents from the "Category": "dbo:Continents"
df_continent = pd.read_csv("../data/sparql_cat_queries/sparql_continent.csv")
df_continent.rename(columns={"conti": "resource"}, inplace=True)

# Birds from the "Class": "dbr:Bird"
df_bird = pd.read_csv("../data/sparql_cat_queries/sparql_bird.csv")
df_bird.rename(columns={"bird": "resource"}, inplace=True)

# African animals from the "Category": "dbc:Vertebrates_of_Africa"
df_africanAnimal = pd.read_csv("../data/sparql_cat_queries/sparql_africanAnimal.csv")
df_africanAnimal.rename(columns={"afrAnimal": "resource"}, inplace=True)

# Furniture from the "Category": "dbc:Furniture"
df_furniture = pd.read_csv("../data/sparql_cat_queries/sparql_furniture.csv")
df_furniture.rename(columns={"fur": "resource"}, inplace=True)

# Furniture from the "Category": "dbc:Vegetables"
df_vegetable = pd.read_csv("../data/sparql_cat_queries/sparql_vegetable.csv")
df_vegetable.rename(columns={"veg": "resource"}, inplace=True)

# Types of fast food from the "Category": "dbc:Fast_food"
df_fastFood = pd.read_csv("../data/sparql_cat_queries/sparql_fastfood.csv")
df_fastFood.rename(columns={"fastFood": "resource"}, inplace=True)

# European Landmark/Sight from multiple "Category": "dbc:Landmarks_in_x" with x being a country from europe
df_euSight = pd.read_csv("../data/sparql_cat_queries/sparql_euLandmark.csv")
df_euSight.rename(columns={"euroSight": "resource"}, inplace=True)

# DAX company from the "Type": "dbo:Company" and "dbp:tradedAs" -> "dbr:DAX"
df_daxCo = pd.read_csv("../data/sparql_cat_queries/sparql_daxCompany.csv")
df_daxCo.rename(columns={"DAXco": "resource"}, inplace=True)

# US company from ?
df_usCo = pd.read_csv("../data/sparql_cat_queries/sparql_usCompany.csv")
df_usCo.rename(columns={"usCo": "resource"}, inplace=True)

# US tech company from the "Category": "dbc:Technology_companies_of_the_United_States"
df_usTechCo = pd.read_csv("../data/sparql_cat_queries/sparql_usTechCompany.csv")
df_usTechCo.rename(columns={"usTech": "resource"}, inplace=True)

In [7]:
# Function for standardized scaling and normalisation of GCM values
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

def scaler(data_frame):
    helper_df = data_frame.copy()
    to_scale = StandardScaler()

    helper_df[['degree', 'eigenvector', 'pagerank']] = to_scale.fit_transform(helper_df[['degree', 'eigenvector', 'pagerank']])
    return helper_df

#df_ig_gcm = scaler(df_ig_gcm)

In [8]:
# Function for merging the sparql category with GCM data

# Merge the queried category df to the GCM metrics based on <resource> with a left outer join
  # (Keep every row in the category dataframe -> missing values get NaN)
  # Then, normalize GCM metrics

def catMerger(sparql_data, gcm_data):
    df_sparql = sparql_data.copy()
    df_gcm = gcm_data.copy()
    helper_df = pd.merge(df_sparql,
                #df_gcm[['resource','degree','pagerank']],
                df_gcm[['resource','degree', 'eigenvector', 'pagerank']],
                on='resource', 
                how='left')
    
    # scale and normalize the gcm metrics (same as scaler function)
    to_scale = preprocessing.StandardScaler()
    helper_df[['degree', 'eigenvector', 'pagerank']] = to_scale.fit_transform(helper_df[['degree', 'eigenvector', 'pagerank']])
    
    print("Dataframe merged with shape: {}".format(helper_df.shape))
    print('============================')

    #print(helper_df.shape[0] - helper_df.dropna().shape[0]) #return amount of rows containing NaN values
    
    return helper_df

In [9]:
# ALTERNATIVE function for merging the survey category data with the gcm category data

# Merge the survey category df to the GCM metrics per category based on <resource> with a left outer join
  # (Keep every row in the category dataframe -> missing values get NaN)

def ALTgcmSurfer(survey_data, cat_gcm_data):
    df_survey = survey_data.copy()
    df_cat_gcm = cat_gcm_data.copy()
    df_survey.rename(columns={"uri": "resource"}, inplace=True) #rename column to match GCM data frame
    
    # Count how often an answer was given in the survey and save into new data frame
    survey_count = df_survey['resource'].value_counts(normalize=True) #normalize values
    df_survey_counted = pd.DataFrame({'resource':survey_count.index, 'amount':survey_count.values})

    df_survey_gcm = pd.merge(df_survey_counted,
                    df_cat_gcm[['resource','name', 'degree','eigenvector','pagerank']],
                    #df_cat_gcm[['resource', 'degree','eigenvector','pagerank']],
                    on='resource', 
                    how='left')
    
    nan_rows = df_survey_gcm.shape[0] - df_survey_gcm.dropna().shape[0]
    
    print("Rows with NaN values dropped: {}".format(nan_rows) + " from {}".format(df_survey_gcm.shape[0]) + " => {}% ".format(round((nan_rows/df_survey_gcm.shape[0])*100, 2))) #return amount of rows containing NaN values
    df_survey_gcm = df_survey_gcm.dropna()
    print("Dataframe merged with shape: {}".format(df_survey_gcm.shape))
    
    return df_survey_gcm

In [10]:
# Function for merging the survey category data with the gcm category data

# Merge the survey category df to the GCM metrics per category based on <resource> with a left outer join
  # (Keep every row in the category dataframe -> missing values get NaN)
  # Drop NaN rows, then -> normalize survey results

def gcmSurfer(survey_data, cat_gcm_data):
    df_survey = survey_data.copy()
    df_cat_gcm = cat_gcm_data.copy()
    df_survey.rename(columns={"uri": "resource"}, inplace=True) #rename column to match GCM data frame
        
    # Count how often an answer was given in the survey and save into new data frame
    survey_count = df_survey['resource'].value_counts() #normalize values
    df_survey_counted = pd.DataFrame({'resource':survey_count.index, 'amount':survey_count.values})

    df_survey_gcm = pd.merge(df_survey_counted,
                    df_cat_gcm[['resource','name', 'degree','eigenvector','pagerank']],
                    #df_cat_gcm[['resource', 'degree','eigenvector','pagerank']],
                    on='resource', 
                    how='left')
    
    # Drop rows that were not matched with the category
    nan_rows = df_survey_gcm.shape[0] - df_survey_gcm.dropna().shape[0]    
    print("Rows with NaN values dropped: {}".format(nan_rows) + " from {}".format(df_survey_gcm.shape[0]) + " => {}% ".format(round((nan_rows/df_survey_gcm.shape[0])*100, 2))) #return amount of rows containing NaN values
    df_survey_gcm = df_survey_gcm.dropna()
    
    # Count how often an answer was given in the survey and save into new data frame
    #df_survey_gcm['survey_mentions'] = df_survey_gcm['resource'].value_counts(normalize=True) #normalize values
    #df_survey_counted = pd.DataFrame({'resource':survey_count.index, 'amount':survey_count.values})
    
    # scale and normalize the survey data
    min_max_scaler = preprocessing.MinMaxScaler()
    df_survey_gcm[['amount']] = min_max_scaler.fit_transform(df_survey_gcm[['amount']])
    
    print("Dataframe merged with shape: {}".format(df_survey_gcm.shape))
    return df_survey_gcm

In [11]:
# Function for creating "Camps" inside a category that is later used for ordinal regression

# Takes two parameter: dataframe, num_splits
  # dataframe = input data frame containing URI, amount (survey mentions), GCMs (degree, eigenvector, pagerank)
  # num_splits = number for the camp size, e.g. 3 => three camps with 1 = top 33%; 2 = mid 33%; 3 = low 33%

def camper(data_frame, num_splits):
    helper_df = data_frame.copy()
    split_v = [int(len(helper_df.index)/num_splits * i) for i in np.arange(num_splits)]
    
    for i in helper_df.index:
        split_smaller = [split <= i for split in split_v]
        camp = np.sum(split_smaller)
        
        #return dataframe
        helper_df.at[i,'camp'] = camp       
    return helper_df

In [12]:
# Function for ordinal regression
  # df_survey_gcm = input data frame 
  # list_gcm = give a list with the variables that should be used for ordinal regression
    # examples: ['degree', 'eigenvector', 'pagerank'] or ['eigenvector', 'pagerank'] or ['pagerank']
    
    
def ordinalReg(df_survey_gcm, list_gcm, num_splits):
    helper_df = df_survey_gcm.copy()
    camped_df = camper(helper_df, num_splits) #calls camper function to create column with camps

    mod_prob = OrderedModel(camped_df['camp'],
                            camped_df[list_gcm],
                            distr='probit')
    
    print("Ordinal Regression:  {}".format(list_gcm))
    print('-------------------------------------')
    res_prob = mod_prob.fit(method='bfgs')
    return res_prob.summary()

## Music genres

In [13]:
# Music Genre from the "Type": "dbo:MusicGenre"

mGenre_result = catMerger(df_musicGenre, df_ig_gcm)
df_survey_mGenre = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_music_genres_prep.csv", sep=';')

df_survey_mGenre_gcm = gcmSurfer(df_survey_mGenre, mGenre_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 9 from 42 => 21.43% 
Dataframe merged with shape: (33, 6)


In [14]:
df_survey_mGenre_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_music_genres_impr.csv", sep=';')

df_survey_mGenre_gcm_impr = gcmSurfer(df_survey_mGenre_impr, mGenre_result)

Rows with NaN values dropped: 5 from 38 => 13.16% 
Dataframe merged with shape: (33, 6)


In [62]:
ordinalReg(df_survey_mGenre_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.935415
         Iterations: 10
         Function evaluations: 12
         Gradient evaluations: 12


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-30.869
Model:,OrderedModel,AIC:,67.74
Method:,Maximum Likelihood,BIC:,72.23
Date:,"Tue, 26 Jul 2022",,
Time:,14:23:10,,
No. Observations:,33,,
Df Residuals:,30,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-0.1023,0.035,-2.922,0.003,-0.171,-0.034
1.0/2.0,-1.1505,0.340,-3.381,0.001,-1.817,-0.484
2.0/3.0,-0.0240,0.278,-0.086,0.931,-0.570,0.522


In [16]:
ordinalReg(df_survey_mGenre_gcm_impr, ['eigenvector', 'pagerank'], 2)

Ordinal Regression:  ['eigenvector', 'pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.461062
         Iterations: 25
         Function evaluations: 28
         Gradient evaluations: 28


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-15.215
Model:,OrderedModel,AIC:,36.43
Method:,Maximum Likelihood,BIC:,40.92
Date:,"Fri, 29 Jul 2022",,
Time:,16:33:15,,
No. Observations:,33,,
Df Residuals:,30,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,-17.7574,19.045,-0.932,0.351,-55.086,19.571
pagerank,-0.1840,0.080,-2.309,0.021,-0.340,-0.028
1.0/2.0,-0.5754,0.620,-0.927,0.354,-1.791,0.641


In [19]:
ordinalReg(df_survey_mGenre_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.935621
         Iterations: 10
         Function evaluations: 12
         Gradient evaluations: 12


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-30.875
Model:,OrderedModel,AIC:,67.75
Method:,Maximum Likelihood,BIC:,72.24
Date:,"Fri, 29 Jul 2022",,
Time:,16:33:42,,
No. Observations:,33,,
Df Residuals:,30,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-0.1043,0.036,-2.921,0.003,-0.174,-0.034
1.0/2.0,-1.1416,0.339,-3.372,0.001,-1.805,-0.478
2.0/3.0,-0.0228,0.279,-0.082,0.935,-0.569,0.523


### POTUS

In [20]:
# POTUS from the "Category": Presidents_of_the_United_States

potus_result = catMerger(df_potus, df_ig_gcm)
df_survey_potus = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_potus_prep.csv", sep=';')

df_survey_potus_gcm = gcmSurfer(df_survey_potus, potus_result)

Dataframe merged with shape: (48, 5)
Rows with NaN values dropped: 12 from 27 => 44.44% 
Dataframe merged with shape: (15, 6)


In [21]:
# POTUS from the "Category": Presidents_of_the_United_States

potus_result = catMerger(df_potus, df_ig_gcm)
df_survey_potus_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_potus_impr.csv", sep=';')

df_survey_potus_gcm_impr = gcmSurfer(df_survey_potus_impr, potus_result)

Dataframe merged with shape: (48, 5)
Rows with NaN values dropped: 0 from 19 => 0.0% 
Dataframe merged with shape: (19, 6)


In [22]:
ordinalReg(df_survey_potus_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.878801
         Iterations: 9
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-16.697
Model:,OrderedModel,AIC:,39.39
Method:,Maximum Likelihood,BIC:,42.23
Date:,"Fri, 29 Jul 2022",,
Time:,16:34:25,,
No. Observations:,19,,
Df Residuals:,16,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-0.7357,0.296,-2.485,0.013,-1.316,-0.155
1.0/2.0,-1.0442,0.391,-2.671,0.008,-1.810,-0.278
2.0/3.0,0.0740,0.353,0.209,0.834,-0.618,0.766


In [23]:
ordinalReg(df_survey_potus_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.043551
         Iterations: 8
         Function evaluations: 9
         Gradient evaluations: 9


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-19.827
Model:,OrderedModel,AIC:,45.65
Method:,Maximum Likelihood,BIC:,48.49
Date:,"Fri, 29 Jul 2022",,
Time:,16:34:26,,
No. Observations:,19,,
Df Residuals:,16,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,0.3597,0.323,1.113,0.266,-0.274,0.993
1.0/2.0,-0.4462,0.307,-1.454,0.146,-1.048,0.155
2.0/3.0,-0.1455,0.355,-0.409,0.682,-0.842,0.551


In [26]:
ordinalReg(df_survey_potus_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.876059
         Iterations: 9
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-16.645
Model:,OrderedModel,AIC:,39.29
Method:,Maximum Likelihood,BIC:,42.12
Date:,"Fri, 29 Jul 2022",,
Time:,16:34:42,,
No. Observations:,19,,
Df Residuals:,16,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-0.7389,0.297,-2.492,0.013,-1.320,-0.158
1.0/2.0,-1.0416,0.390,-2.668,0.008,-1.807,-0.276
2.0/3.0,0.0778,0.353,0.220,0.826,-0.615,0.770


## Cities

In [None]:
# Cities from the "Category": "dbo:City"

city_result = catMerger(df_city, df_ig_gcm)
df_survey_city = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_cities_prep.csv", sep=';')

df_city_survey_gcm = gcmSurfer(df_survey_city, city_result)

In [None]:
df_survey_city_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_cities_impr.csv", sep=';')

df_city_survey_gcm_impr = gcmSurfer(df_survey_city_impr, city_result)

In [None]:
ordinalReg(df_city_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_city_survey_gcm, ['eigenvector'], 3)

## Lakes

In [23]:
# Lakes from the "Category": "dbo:Lake"

lake_result = catMerger(df_lake, df_ig_gcm)
df_survey_lake = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_lakes_prep.csv", sep=';')

df_lake_survey_gcm = gcmSurfer(df_survey_lake, lake_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 42 from 80 => 52.5% 
Dataframe merged with shape: (38, 6)


In [30]:
df_survey_lake_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_lakes_impr.csv", sep=';')

df_lake_survey_gcm_impr = gcmSurfer(df_survey_lake_impr, lake_result)

Rows with NaN values dropped: 20 from 70 => 28.57% 
Dataframe merged with shape: (50, 6)


In [67]:
ordinalReg(df_lake_survey_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.039805
         Iterations: 9
         Function evaluations: 11
         Gradient evaluations: 11


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-51.99
Model:,OrderedModel,AIC:,110.0
Method:,Maximum Likelihood,BIC:,115.7
Date:,"Tue, 26 Jul 2022",,
Time:,14:23:35,,
No. Observations:,50,,
Df Residuals:,47,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-0.0770,0.053,-1.454,0.146,-0.181,0.027
1.0/2.0,-0.6697,0.214,-3.136,0.002,-1.088,-0.251
2.0/3.0,-0.4388,0.259,-1.691,0.091,-0.947,0.070


In [None]:
ordinalReg(df_city_survey_gcm, ['pagerank'], 3)

In [68]:
ordinalReg(df_lake_survey_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.045514
         Iterations: 9
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-52.276
Model:,OrderedModel,AIC:,110.6
Method:,Maximum Likelihood,BIC:,116.3
Date:,"Tue, 26 Jul 2022",,
Time:,14:23:38,,
No. Observations:,50,,
Df Residuals:,47,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,-0.1432,0.116,-1.234,0.217,-0.371,0.084
1.0/2.0,-0.6322,0.207,-3.055,0.002,-1.038,-0.227
2.0/3.0,-0.4485,0.259,-1.729,0.084,-0.957,0.060


In [69]:
ordinalReg(df_lake_survey_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.043757
         Iterations: 9
         Function evaluations: 11
         Gradient evaluations: 11


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-52.188
Model:,OrderedModel,AIC:,110.4
Method:,Maximum Likelihood,BIC:,116.1
Date:,"Tue, 26 Jul 2022",,
Time:,14:23:42,,
No. Observations:,50,,
Df Residuals:,47,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-0.0610,0.047,-1.310,0.190,-0.152,0.030
1.0/2.0,-0.6497,0.211,-3.077,0.002,-1.064,-0.236
2.0/3.0,-0.4447,0.259,-1.714,0.087,-0.953,0.064


## Mountains

In [34]:
# Mountains from the "Category": "dbo:Mountain"

mountain_result = catMerger(df_mountain, df_ig_gcm)
df_survey_mountain = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_mountains_prep.csv", sep=';')

df_mountain_survey_gcm = gcmSurfer(df_survey_mountain, mountain_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 28 from 56 => 50.0% 
Dataframe merged with shape: (28, 6)


In [35]:
df_survey_mountain_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_mountains_impr.csv", sep=';')

df_mountain_survey_gcm_impr = gcmSurfer(df_survey_mountain_impr, mountain_result)

Rows with NaN values dropped: 17 from 52 => 32.69% 
Dataframe merged with shape: (35, 6)


In [70]:
ordinalReg(df_mountain_survey_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.059722
         Iterations: 8
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-37.09
Model:,OrderedModel,AIC:,80.18
Method:,Maximum Likelihood,BIC:,84.85
Date:,"Tue, 26 Jul 2022",,
Time:,14:23:51,,
No. Observations:,35,,
Df Residuals:,32,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-0.0910,0.085,-1.068,0.286,-0.258,0.076
1.0/2.0,-0.6002,0.248,-2.423,0.015,-1.086,-0.115
2.0/3.0,-0.3863,0.298,-1.298,0.194,-0.970,0.197


In [71]:
ordinalReg(df_mountain_survey_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.072129
         Iterations: 7
         Function evaluations: 8
         Gradient evaluations: 8


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-37.525
Model:,OrderedModel,AIC:,81.05
Method:,Maximum Likelihood,BIC:,85.72
Date:,"Tue, 26 Jul 2022",,
Time:,14:23:55,,
No. Observations:,35,,
Df Residuals:,32,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,-0.1245,0.236,-0.527,0.598,-0.587,0.339
1.0/2.0,-0.5163,0.230,-2.247,0.025,-0.967,-0.066
2.0/3.0,-0.4046,0.298,-1.359,0.174,-0.988,0.179


In [72]:
ordinalReg(df_mountain_survey_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.057636
         Iterations: 8
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-37.017
Model:,OrderedModel,AIC:,80.03
Method:,Maximum Likelihood,BIC:,84.7
Date:,"Tue, 26 Jul 2022",,
Time:,14:23:59,,
No. Observations:,35,,
Df Residuals:,32,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-0.0924,0.082,-1.133,0.257,-0.252,0.067
1.0/2.0,-0.6111,0.249,-2.451,0.014,-1.100,-0.122
2.0/3.0,-0.3832,0.298,-1.287,0.198,-0.967,0.200


## EU capital cities

In [39]:
euCap_result = catMerger(df_euCap, df_ig_gcm)
df_survey_euCap = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_european_capital_cities_prep.csv", sep=';')

df_euCap_survey_gcm = gcmSurfer(df_survey_euCap, euCap_result)

Dataframe merged with shape: (52, 5)
Rows with NaN values dropped: 7 from 29 => 24.14% 
Dataframe merged with shape: (22, 6)


In [40]:
df_survey_euCap_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_european_capital_cities_impr.csv", sep=';')

df_euCap_survey_gcm_impr = gcmSurfer(df_survey_euCap_impr, euCap_result)

Rows with NaN values dropped: 4 from 28 => 14.29% 
Dataframe merged with shape: (24, 6)


In [73]:
ordinalReg(df_euCap_survey_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.997739
         Iterations: 10
         Function evaluations: 11
         Gradient evaluations: 11


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-23.946
Model:,OrderedModel,AIC:,53.89
Method:,Maximum Likelihood,BIC:,57.43
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:05,,
No. Observations:,24,,
Df Residuals:,21,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-0.6354,0.391,-1.625,0.104,-1.402,0.131
1.0/2.0,-0.7235,0.310,-2.333,0.020,-1.331,-0.116
2.0/3.0,-0.0367,0.305,-0.120,0.904,-0.634,0.561


In [74]:
ordinalReg(df_euCap_survey_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.997401
         Iterations: 10
         Function evaluations: 11
         Gradient evaluations: 11


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-23.938
Model:,OrderedModel,AIC:,53.88
Method:,Maximum Likelihood,BIC:,57.41
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:09,,
No. Observations:,24,,
Df Residuals:,21,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-0.6299,0.384,-1.640,0.101,-1.383,0.123
1.0/2.0,-0.7345,0.313,-2.350,0.019,-1.347,-0.122
2.0/3.0,-0.0362,0.305,-0.119,0.905,-0.633,0.561


In [75]:
ordinalReg(df_euCap_survey_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.097723
         Iterations: 7
         Function evaluations: 8
         Gradient evaluations: 8


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-26.345
Model:,OrderedModel,AIC:,58.69
Method:,Maximum Likelihood,BIC:,62.22
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:12,,
No. Observations:,24,,
Df Residuals:,21,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,0.0525,0.254,0.207,0.836,-0.445,0.550
1.0/2.0,-0.4045,0.294,-1.378,0.168,-0.980,0.171
2.0/3.0,-0.1481,0.307,-0.482,0.630,-0.750,0.454


## European countries

In [44]:
euCou_result = catMerger(df_euCou, df_ig_gcm)
df_survey_euCou = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_european_countries_prep.csv", sep=';')

df_euCou_survey_gcm = gcmSurfer(df_survey_euCou, euCou_result)

Dataframe merged with shape: (51, 5)
Rows with NaN values dropped: 1 from 29 => 3.45% 
Dataframe merged with shape: (28, 6)


In [45]:
df_survey_euCou_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_european_countries_impr.csv", sep=';')

df_euCou_survey_gcm_impr = gcmSurfer(df_survey_euCou_impr, euCou_result)

Rows with NaN values dropped: 0 from 29 => 0.0% 
Dataframe merged with shape: (29, 6)


In [76]:
ordinalReg(df_euCou_survey_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.524739
         Iterations: 13
         Function evaluations: 14
         Gradient evaluations: 14


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-15.217
Model:,OrderedModel,AIC:,36.43
Method:,Maximum Likelihood,BIC:,40.54
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:20,,
No. Observations:,29,,
Df Residuals:,26,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-4.4186,1.303,-3.392,0.001,-6.972,-1.866
1.0/2.0,-0.5506,0.430,-1.282,0.200,-1.393,0.291
2.0/3.0,0.7615,0.292,2.607,0.009,0.189,1.334


In [77]:
ordinalReg(df_euCou_survey_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.914592
         Iterations: 8
         Function evaluations: 9
         Gradient evaluations: 9


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-26.523
Model:,OrderedModel,AIC:,59.05
Method:,Maximum Likelihood,BIC:,63.15
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:23,,
No. Observations:,29,,
Df Residuals:,26,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,-0.8600,0.315,-2.733,0.006,-1.477,-0.243
1.0/2.0,-0.7168,0.276,-2.599,0.009,-1.257,-0.176
2.0/3.0,0.0989,0.270,0.367,0.714,-0.430,0.628


In [78]:
ordinalReg(df_euCou_survey_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.510107
         Iterations: 13
         Function evaluations: 14
         Gradient evaluations: 14


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-14.793
Model:,OrderedModel,AIC:,35.59
Method:,Maximum Likelihood,BIC:,39.69
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:26,,
No. Observations:,29,,
Df Residuals:,26,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-4.6758,1.380,-3.387,0.001,-7.381,-1.970
1.0/2.0,-0.5870,0.427,-1.374,0.169,-1.424,0.250
2.0/3.0,0.7977,0.295,2.704,0.007,0.220,1.376


## Asian countries

In [49]:
# Asian countries from the "Category": "dbc:Countries_in_Asia", "dbc:East_Asian_countries" and "dbc:Central_Asian_countries"

asCou_result = catMerger(df_asCou, df_ig_gcm)
df_survey_asCou = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_asian_countries_prep.csv", sep=';') # .csv needed no manual uri correction

df_asCou_survey_gcm = gcmSurfer(df_survey_asCou, asCou_result)

Dataframe merged with shape: (60, 5)
Rows with NaN values dropped: 7 from 33 => 21.21% 
Dataframe merged with shape: (26, 6)


In [50]:
df_survey_asCou_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_asian_countries_impr.csv", sep=';') # .csv needed no manual uri correction

df_asCou_survey_gcm_impr = gcmSurfer(df_survey_asCou_impr, asCou_result)

Rows with NaN values dropped: 7 from 33 => 21.21% 
Dataframe merged with shape: (26, 6)


In [57]:
ordinalReg(df_asCou_survey_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.086538
         Iterations: 6
         Function evaluations: 7
         Gradient evaluations: 7


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-28.25
Model:,OrderedModel,AIC:,62.5
Method:,Maximum Likelihood,BIC:,66.27
Date:,"Tue, 26 Jul 2022",,
Time:,14:21:50,,
No. Observations:,26,,
Df Residuals:,23,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-0.1010,0.177,-0.569,0.569,-0.449,0.247
1.0/2.0,-0.5460,0.269,-2.027,0.043,-1.074,-0.018
2.0/3.0,-0.2204,0.310,-0.711,0.477,-0.828,0.387


In [79]:
ordinalReg(df_asCou_survey_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.063999
         Iterations: 7
         Function evaluations: 8
         Gradient evaluations: 8


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-27.664
Model:,OrderedModel,AIC:,61.33
Method:,Maximum Likelihood,BIC:,65.1
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:33,,
No. Observations:,26,,
Df Residuals:,23,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,-0.2238,0.186,-1.202,0.229,-0.589,0.141
1.0/2.0,-0.5895,0.271,-2.172,0.030,-1.121,-0.058
2.0/3.0,-0.1905,0.310,-0.615,0.539,-0.798,0.417


In [80]:
ordinalReg(df_asCou_survey_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.076496
         Iterations: 7
         Function evaluations: 8
         Gradient evaluations: 8


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-27.989
Model:,OrderedModel,AIC:,61.98
Method:,Maximum Likelihood,BIC:,65.75
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:36,,
No. Observations:,26,,
Df Residuals:,23,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-0.1670,0.184,-0.905,0.365,-0.528,0.194
1.0/2.0,-0.5808,0.273,-2.127,0.033,-1.116,-0.046
2.0/3.0,-0.2077,0.310,-0.670,0.503,-0.815,0.400


## Continents

In [54]:
# Continents from the "Category": "dbo:Continents"

continent_result = catMerger(df_continent, df_ig_gcm)
df_survey_continent = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_continents_prep.csv", sep=';')

df_continent_survey_gcm = gcmSurfer(df_survey_continent, continent_result)

Dataframe merged with shape: (20, 5)
Rows with NaN values dropped: 7 from 10 => 70.0% 
Dataframe merged with shape: (3, 6)


In [55]:
df_survey_continent_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_continents_impr.csv", sep=';') # .csv needed no manual uri correction

df_continent_survey_gcm_impr = gcmSurfer(df_survey_continent_impr, continent_result)

Rows with NaN values dropped: 1 from 8 => 12.5% 
Dataframe merged with shape: (7, 6)


In [81]:
ordinalReg(df_continent_survey_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.025213
         Iterations: 8
         Function evaluations: 9
         Gradient evaluations: 9


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-7.1765
Model:,OrderedModel,AIC:,20.35
Method:,Maximum Likelihood,BIC:,20.19
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:46,,
No. Observations:,7,,
Df Residuals:,4,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,0.3437,0.439,0.783,0.434,-0.517,1.204
1.0/2.0,-0.4831,0.516,-0.935,0.350,-1.495,0.529
2.0/3.0,-0.2370,0.621,-0.382,0.703,-1.454,0.980


In [82]:
ordinalReg(df_continent_survey_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.022176
         Iterations: 8
         Function evaluations: 9
         Gradient evaluations: 9


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-7.1552
Model:,OrderedModel,AIC:,20.31
Method:,Maximum Likelihood,BIC:,20.15
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:54,,
No. Observations:,7,,
Df Residuals:,4,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,0.3434,0.426,0.805,0.421,-0.492,1.179
1.0/2.0,-0.5060,0.514,-0.983,0.325,-1.514,0.502
2.0/3.0,-0.2320,0.621,-0.374,0.709,-1.449,0.985


In [83]:
ordinalReg(df_continent_survey_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.018234
         Iterations: 8
         Function evaluations: 9
         Gradient evaluations: 9


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-7.1276
Model:,OrderedModel,AIC:,20.26
Method:,Maximum Likelihood,BIC:,20.09
Date:,"Tue, 26 Jul 2022",,
Time:,14:24:54,,
No. Observations:,7,,
Df Residuals:,4,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,0.3686,0.452,0.816,0.414,-0.517,1.254
1.0/2.0,-0.4900,0.516,-0.950,0.342,-1.501,0.521
2.0/3.0,-0.2296,0.620,-0.370,0.711,-1.445,0.986


## Birds

In [86]:
# Birds from the "Class": "dbr:Bird"

bird_result = catMerger(df_bird, df_ig_gcm)
df_survey_bird = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_birds_prep.csv", sep=';')

df_birds_survey_gcm = gcmSurfer(df_survey_bird, bird_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 52 from 61 => 85.25% 
Dataframe merged with shape: (9, 6)


In [87]:
df_survey_bird_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_birds_impr.csv", sep=';')

df_birds_survey_gcm_impr = gcmSurfer(df_survey_bird_impr, bird_result)

Rows with NaN values dropped: 27 from 50 => 54.0% 
Dataframe merged with shape: (23, 6)


In [88]:
ordinalReg(df_birds_survey_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.698968
         Iterations: 8
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-16.076
Model:,OrderedModel,AIC:,38.15
Method:,Maximum Likelihood,BIC:,41.56
Date:,"Tue, 26 Jul 2022",,
Time:,14:25:25,,
No. Observations:,23,,
Df Residuals:,20,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-0.1867,0.137,-1.367,0.171,-0.454,0.081
1.0/2.0,-1.4993,0.399,-3.761,0.000,-2.281,-0.718
2.0/3.0,-0.2557,0.457,-0.560,0.576,-1.151,0.640


In [89]:
ordinalReg(df_birds_survey_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.626334
         Iterations: 20
         Function evaluations: 25
         Gradient evaluations: 25


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-14.406
Model:,OrderedModel,AIC:,34.81
Method:,Maximum Likelihood,BIC:,38.22
Date:,"Tue, 26 Jul 2022",,
Time:,14:25:25,,
No. Observations:,23,,
Df Residuals:,20,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,153.1924,153.030,1.001,0.317,-146.741,453.125
1.0/2.0,-6.4446,4.830,-1.334,0.182,-15.911,3.022
2.0/3.0,-0.0657,0.467,-0.141,0.888,-0.980,0.849


In [90]:
ordinalReg(df_birds_survey_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.702905
         Iterations: 8
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-16.167
Model:,OrderedModel,AIC:,38.33
Method:,Maximum Likelihood,BIC:,41.74
Date:,"Tue, 26 Jul 2022",,
Time:,14:25:25,,
No. Observations:,23,,
Df Residuals:,20,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-0.1829,0.141,-1.301,0.193,-0.458,0.093
1.0/2.0,-1.5020,0.399,-3.763,0.000,-2.284,-0.720
2.0/3.0,-0.2641,0.457,-0.578,0.563,-1.159,0.631


## African animals

In [91]:
# African animals from the "Category": "dbc:Vertebrates_of_Africa"

afrAnimal_result = catMerger(df_africanAnimal, df_ig_gcm)
df_survey_afrAnimal = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_african_animals_prep.csv", sep=';')

df_afrAnimal_survey_gcm = gcmSurfer(df_survey_afrAnimal, afrAnimal_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 22 from 29 => 75.86% 
Dataframe merged with shape: (7, 6)


In [92]:
df_survey_afrAnimal_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_african_animals_impr.csv", sep=';')

df_afrAnimal_survey_gcm_impr = gcmSurfer(df_survey_afrAnimal_impr, afrAnimal_result)

Rows with NaN values dropped: 11 from 25 => 44.0% 
Dataframe merged with shape: (14, 6)


In [93]:
ordinalReg(df_afrAnimal_survey_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.838457
         Iterations: 10
         Function evaluations: 11
         Gradient evaluations: 11


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-11.738
Model:,OrderedModel,AIC:,29.48
Method:,Maximum Likelihood,BIC:,31.39
Date:,"Tue, 26 Jul 2022",,
Time:,14:25:36,,
No. Observations:,14,,
Df Residuals:,11,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-0.8070,0.470,-1.718,0.086,-1.728,0.114
1.0/2.0,-0.6815,0.390,-1.748,0.080,-1.446,0.083
2.0/3.0,-0.7587,0.657,-1.154,0.248,-2.047,0.529


In [94]:
ordinalReg(df_afrAnimal_survey_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.811853
         Iterations: 24
         Function evaluations: 26
         Gradient evaluations: 26


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-11.366
Model:,OrderedModel,AIC:,28.73
Method:,Maximum Likelihood,BIC:,30.65
Date:,"Tue, 26 Jul 2022",,
Time:,14:25:37,,
No. Observations:,14,,
Df Residuals:,11,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,-99.1852,91.219,-1.087,0.277,-277.971,79.600
1.0/2.0,-109.3762,99.896,-1.095,0.274,-305.169,86.417
2.0/3.0,-0.7280,0.655,-1.112,0.266,-2.011,0.555


In [95]:
ordinalReg(df_afrAnimal_survey_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.831932
         Iterations: 9
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-11.647
Model:,OrderedModel,AIC:,29.29
Method:,Maximum Likelihood,BIC:,31.21
Date:,"Tue, 26 Jul 2022",,
Time:,14:25:37,,
No. Observations:,14,,
Df Residuals:,11,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-0.5715,0.326,-1.755,0.079,-1.210,0.067
1.0/2.0,-0.8741,0.419,-2.085,0.037,-1.696,-0.053
2.0/3.0,-0.7455,0.657,-1.134,0.257,-2.034,0.543


## Furniture

In [None]:
# Furniture from the "Category": "dbc:Furniture"

furniture_result = catMerger(df_furniture, df_ig_gcm)
df_survey_furniture = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_furniture_prep.csv", sep=';') # .csv needed no manual uri correction

df_furniture_survey_gcm = gcmSurfer(df_survey_furniture, furniture_result)

In [None]:
df_survey_furniture_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_furniture_impr.csv", sep=';')

df_furniture_survey_gcm_impr = gcmSurfer(df_survey_furniture_impr, furniture_result)

In [None]:
ordinalReg(df_vegetable_survey_gcm_impr, ['degree'], 3)

In [None]:
ordinalReg(df_vegetable_survey_gcm_impr, ['eigenvector'], 3)

In [None]:
ordinalReg(df_vegetable_survey_gcm_impr, ['pagerank'], 3)

## Vegetables

In [96]:
# Vegetables from the "Category": "dbc:Vegetables"

vegetable_result = catMerger(df_vegetable, df_ig_gcm)
df_survey_vegetable = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_vegetables_prep.csv", sep=';') # .csv needed no manual uri correction

df_vegetable_survey_gcm = gcmSurfer(df_survey_vegetable, vegetable_result)

Dataframe merged with shape: (3742, 5)
Rows with NaN values dropped: 17 from 42 => 40.48% 
Dataframe merged with shape: (25, 6)


In [97]:
df_survey_vegetable_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_vegetables_impr.csv", sep=';') # .csv needed no manual uri correction

df_vegetable_survey_gcm_impr = gcmSurfer(df_survey_vegetable_impr, vegetable_result)

Rows with NaN values dropped: 8 from 37 => 21.62% 
Dataframe merged with shape: (29, 6)


In [98]:
ordinalReg(df_vegetable_survey_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.057914
         Iterations: 8
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-30.68
Model:,OrderedModel,AIC:,67.36
Method:,Maximum Likelihood,BIC:,71.46
Date:,"Tue, 26 Jul 2022",,
Time:,14:26:19,,
No. Observations:,29,,
Df Residuals:,26,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-0.0468,0.039,-1.208,0.227,-0.123,0.029
1.0/2.0,-0.6594,0.281,-2.348,0.019,-1.210,-0.109
2.0/3.0,-0.3034,0.314,-0.968,0.333,-0.918,0.311


In [99]:
ordinalReg(df_vegetable_survey_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.079849
         Iterations: 8
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-31.316
Model:,OrderedModel,AIC:,68.63
Method:,Maximum Likelihood,BIC:,72.73
Date:,"Tue, 26 Jul 2022",,
Time:,14:26:19,,
No. Observations:,29,,
Df Residuals:,26,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,-0.3849,0.835,-0.461,0.645,-2.022,1.252
1.0/2.0,-0.5661,0.288,-1.962,0.050,-1.131,-0.001
2.0/3.0,-0.3342,0.314,-1.066,0.286,-0.949,0.280


In [100]:
ordinalReg(df_vegetable_survey_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 1.055869
         Iterations: 8
         Function evaluations: 10
         Gradient evaluations: 10


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-30.62
Model:,OrderedModel,AIC:,67.24
Method:,Maximum Likelihood,BIC:,71.34
Date:,"Tue, 26 Jul 2022",,
Time:,14:26:19,,
No. Observations:,29,,
Df Residuals:,26,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-0.0498,0.040,-1.249,0.212,-0.128,0.028
1.0/2.0,-0.6699,0.283,-2.371,0.018,-1.224,-0.116
2.0/3.0,-0.3010,0.313,-0.960,0.337,-0.915,0.313


## Fast Food

In [101]:
# Types of fast food from the "Category": "dbc:Fast_food" 

fastfood_result = catMerger(df_fastFood, df_ig_gcm)
df_survey_fastfood = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_fast_food_prep.csv", sep=';')

df_fastfood_survey_gcm = gcmSurfer(df_survey_fastfood, fastfood_result)

Dataframe merged with shape: (4693, 5)
Rows with NaN values dropped: 17 from 31 => 54.84% 
Dataframe merged with shape: (14, 6)


In [102]:
df_survey_fastfood_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_fast_food_impr.csv", sep=';')

df_fastfood_survey_gcm_impr = gcmSurfer(df_survey_fastfood_impr, fastfood_result)

Rows with NaN values dropped: 9 from 27 => 33.33% 
Dataframe merged with shape: (18, 6)


In [103]:
ordinalReg(df_fastfood_survey_gcm_impr, ['degree'], 3)

Ordinal Regression:  ['degree']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.583724
         Iterations: 14
         Function evaluations: 16
         Gradient evaluations: 16


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-10.507
Model:,OrderedModel,AIC:,27.01
Method:,Maximum Likelihood,BIC:,29.69
Date:,"Tue, 26 Jul 2022",,
Time:,14:26:32,,
No. Observations:,18,,
Df Residuals:,15,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
degree,-0.4547,0.240,-1.893,0.058,-0.926,0.016
1.0/2.0,-1.5054,0.523,-2.878,0.004,-2.530,-0.480
2.0/3.0,-0.5895,0.655,-0.899,0.368,-1.874,0.695


In [104]:
ordinalReg(df_fastfood_survey_gcm_impr, ['eigenvector'], 3)

Ordinal Regression:  ['eigenvector']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.874093
         Iterations: 30
         Function evaluations: 31
         Gradient evaluations: 31


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-15.734
Model:,OrderedModel,AIC:,37.47
Method:,Maximum Likelihood,BIC:,40.14
Date:,"Tue, 26 Jul 2022",,
Time:,14:26:32,,
No. Observations:,18,,
Df Residuals:,15,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
eigenvector,-176.9676,130.372,-1.357,0.175,-432.493,78.558
1.0/2.0,87.1902,64.577,1.350,0.177,-39.378,213.758
2.0/3.0,-1.1415,0.668,-1.710,0.087,-2.450,0.167


In [105]:
ordinalReg(df_fastfood_survey_gcm_impr, ['pagerank'], 3)

Ordinal Regression:  ['pagerank']
-------------------------------------
Optimization terminated successfully.
         Current function value: 0.585697
         Iterations: 14
         Function evaluations: 16
         Gradient evaluations: 16


0,1,2,3
Dep. Variable:,camp,Log-Likelihood:,-10.543
Model:,OrderedModel,AIC:,27.09
Method:,Maximum Likelihood,BIC:,29.76
Date:,"Tue, 26 Jul 2022",,
Time:,14:26:32,,
No. Observations:,18,,
Df Residuals:,15,,
Df Model:,3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
pagerank,-0.4209,0.215,-1.959,0.050,-0.842,0.000
1.0/2.0,-1.4863,0.516,-2.879,0.004,-2.498,-0.474
2.0/3.0,-0.5943,0.655,-0.907,0.365,-1.879,0.690


## European landmarks/sights

In [106]:
# European Landmark/Sight from multiple "Category": "dbc:Landmarks_in_x" with x being a country from europe

euSight_result = catMerger(df_euSight, df_ig_gcm)
df_survey_euSight = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_european_landmarks_prep.csv", sep=';')

df_euSight_survey_gcm = gcmSurfer(df_survey_euSight, euSight_result)

Dataframe merged with shape: (1403, 5)
Rows with NaN values dropped: 59 from 65 => 90.77% 
Dataframe merged with shape: (6, 6)


In [None]:
df_survey_euSight_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_european_landmarks_impr.csv", sep=';')

df_euSight_survey_gcm_impr = gcmSurfer(df_survey_euSight_impr, euSight_result)

In [None]:
ordinalReg(df_euSight_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_euSight_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_euSight_survey_gcm, ['pagerank'], 3)

## DAX companies

In [None]:
# DAX company from the "Type": "dbo:Company" and "dbp:tradedAs" -> "dbr:DAX"

daxCo_result = catMerger(df_daxCo, df_ig_gcm)
df_daxCo = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_dax_companies_prep.csv", sep=';')

df_daxCo_survey_gcm = gcmSurfer(df_daxCo, daxCo_result)

In [None]:
df_daxCo_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_dax_companies_impr.csv", sep=';')

df_daxCo_survey_gcm_impr = gcmSurfer(df_daxCo_impr, daxCo_result)

In [None]:
df_daxCo_survey_gcm

In [None]:
ordinalReg(df_daxCo_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_daxCo_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_daxCo_survey_gcm, ['pagerank'], 3)

## US companies

In [None]:
# US company from "Category": Lists_of_companies_of_the_United_States

usCo_result = catMerger(df_usCo, df_ig_gcm)
df_survey_usCo = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_us_companies_prep.csv", sep=';', encoding="ISO-8859-1")

df_usCo_survey_gcm = gcmSurfer(df_survey_usCo, usCo_result)

In [None]:
usCo_result = catMerger(df_usCo, df_ig_gcm)
df_survey_usCo_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_us_companies_impr.csv", sep=';', encoding="ISO-8859-1") # .csv needed no manual uri correction

df_usCo_survey_gcm_impr = gcmSurfer(df_survey_usCo_impr, usCo_result)

In [None]:
ordinalReg(df_usCo_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_usCo_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_usCo_survey_gcm, ['pagerank'], 3)

## US technology companies

In [None]:
# US tech company from the "Category": "dbc:Technology_companies_of_the_United_States"

usTech_result = catMerger(df_usTechCo, df_ig_gcm)
df_survey_usTech = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_us_tech_companies_prep.csv", sep=';') # .csv needed no manual uri correction

df_usTech_survey_gcm = gcmSurfer(df_survey_usTech, usTech_result)

In [None]:
df_survey_usTech_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_us_tech_companies_impr.csv", sep=';') # .csv needed no manual uri correction

df_usTech_survey_gcm_impr = gcmSurfer(df_survey_usTech_impr, usTech_result)

In [None]:
ordinalReg(df_usTech_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_usTech_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_usTech_survey_gcm, ['pagerank'], 3)