# Ordinal Regression

In [1]:
# Importing packages
import pandas as pd
#from SPARQLWrapper import SPARQLWrapper, JSON, N3
#import lxml
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns
#import networkx as nx
#from networkx import Graph as NXGraph
#from rdflib import Graph as RDFGraph
#from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
#import statistics
#import collections
from scipy import stats
from scipy.stats import kstest
# https://www.statology.org/normality-test-python/

#from urllib import parse
from sklearn import linear_model
#from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel

import mlnotify 
# https://github.com/aporia-ai/mlnotify
# Use %%notify at beginning of cell

import os.path, time
# Use %%time at beginning of cell

print('------------')
!python --version
print("Pandas " + pd.__version__)
print("Numpy " + np.__version__)
#print("Seaborn " + sns.__version__)
#print("Networkx " + nx.__version__)
print('------------')
print("All packages loaded and ready to roll :-)")

------------
Python 3.7.6
Pandas 1.3.4
Numpy 1.18.1
------------
All packages loaded and ready to roll :-)


In [2]:
# Load GCM data

# NetworkX
#df_gcm = pd.read_csv('../data/gcm_computed.nosync/nx-gcm.csv')

# iGraph
df_ig_gcm = pd.read_csv('../data/gcm_computed.nosync/ig-gcm.csv')

df_ig_gcm.head()

Unnamed: 0,resource,degree,eigenvector,pagerank
0,http://dbpedia.org/resource/Billy_Coggins__Car...,3,0.0007845722,5.936257e-08
1,http://dbpedia.org/ontology/CareerStation,1624880,1.0,0.02791082
2,http://dbpedia.org/resource/News_Patrol__A2Z_N...,9,7.198654e-11,1.150764e-07
3,http://dbpedia.org/resource/ZOE_Broadcasting_N...,20,9.183437e-10,2.792414e-07
4,http://dbpedia.org/resource/Yu_Yongfu,6,2.790954e-09,1.047756e-07


In [3]:
print("Data types:")
print(df_ig_gcm.dtypes)
print('----------------------')
print("Memory usage:")
print(df_ig_gcm.memory_usage(deep=True))
print('------------------------')
print("Dataframe shape is {}".format(df_ig_gcm.shape))

Data types:
resource        object
degree           int64
eigenvector    float64
pagerank       float64
dtype: object
----------------------
Memory usage:
Index                128
resource       964958363
degree          65946664
eigenvector     65946664
pagerank        65946664
dtype: int64
------------------------
Dataframe shape is (8243333, 4)


In [4]:
# Load pre-queried (via queried web-interface) DBpedia categories
# https://dbpedia.org/sparql/

# 90s films from the aggregated Categories: 1990_films, 1991_films, 1992_films, ...
df_film90 = pd.read_csv("../data/sparql_cat_queries/sparql_90film.csv")
df_film90.rename(columns={"film90": "resource"}, inplace=True) #rename column to match PageRank dataframe

# actors from the "Occupation": "Actor"@en
df_actor = pd.read_csv("../data/sparql_cat_queries/sparql_actor.csv")
df_actor.rename(columns={"actor": "resource"}, inplace=True)

# Music Genre from the "Type": "dbo:MusicGenre"
df_musicGenre = pd.read_csv("../data/sparql_cat_queries/sparql_musicGenre.csv")
df_musicGenre.rename(columns={"mGenre": "resource"}, inplace=True)

# Books from the "Type":"dbo:book"
df_book = pd.read_csv("../data/sparql_cat_queries/sparql_book.csv")
df_book.rename(columns={"book": "resource"}, inplace=True)

# Book authors from the "Type": author UNION book dbo:author
df_bookAut = pd.read_csv("../data/sparql_cat_queries/sparql_bookAuthor.csv")
df_bookAut.rename(columns={"author": "resource"}, inplace=True)

# Politicians from the "Type": Person/Politician
df_pol = pd.read_csv("../data/sparql_cat_queries/sparql_politician.csv")
df_pol.rename(columns={"pol": "resource"}, inplace=True)

# POTUS from the "Category": Presidents_of_the_United_States
df_potus = pd.read_csv("../data/sparql_cat_queries/sparql_potus.csv")
df_potus.rename(columns={"potus": "resource"}, inplace=True)

# Cities from the "Category": "dbo:City"
df_city = pd.read_csv("../data/sparql_cat_queries/sparql_city.csv")
df_city.rename(columns={"city": "resource"}, inplace=True)

# Lakes from the "Category": "dbo:Lake"
df_lake = pd.read_csv("../data/sparql_cat_queries/sparql_lake.csv")
df_lake.rename(columns={"lake": "resource"}, inplace=True)

# Mountains from the "Category": "dbo:Mountain"
df_mountain = pd.read_csv("../data/sparql_cat_queries/sparql_mountain.csv")
df_mountain.rename(columns={"mount": "resource"}, inplace=True)

# EU capitals from the "Category": Capitals_in_Europe
df_euCap = pd.read_csv("../data/sparql_cat_queries/sparql_euCap.csv")
df_euCap.rename(columns={"euCap": "resource"}, inplace=True)

# EU countries from the "Category": Countries_in_Europe
df_euCou = pd.read_csv("../data/sparql_cat_queries/sparql_euCountry.csv")
df_euCou.rename(columns={"euCountry": "resource"}, inplace=True)

# Asian countries from the "Category": "dbc:Countries_in_Asia", "dbc:East_Asian_countries" and "dbc:Central_Asian_countries"
df_asCou = pd.read_csv("../data/sparql_cat_queries/sparql_asCountry.csv")
df_asCou.rename(columns={"asCountry": "resource"}, inplace=True)

# Continents from the "Category": "dbo:Continents"
df_continent = pd.read_csv("../data/sparql_cat_queries/sparql_continent.csv")
df_continent.rename(columns={"conti": "resource"}, inplace=True)

# Birds from the "Class": "dbr:Bird"
df_bird = pd.read_csv("../data/sparql_cat_queries/sparql_bird.csv")
df_bird.rename(columns={"bird": "resource"}, inplace=True)

# African animals from the "Category": "dbc:Vertebrates_of_Africa"
df_africanAnimal = pd.read_csv("../data/sparql_cat_queries/sparql_africanAnimal.csv")
df_africanAnimal.rename(columns={"afrAnimal": "resource"}, inplace=True)

# Furniture from the "Category": "dbc:Furniture"
df_furniture = pd.read_csv("../data/sparql_cat_queries/sparql_furniture.csv")
df_furniture.rename(columns={"fur": "resource"}, inplace=True)

# Furniture from the "Category": "dbc:Vegetables"
df_vegetable = pd.read_csv("../data/sparql_cat_queries/sparql_vegetable.csv")
df_vegetable.rename(columns={"veg": "resource"}, inplace=True)

# Types of fast food from the "Category": "dbc:Fast_food"
df_fastFood = pd.read_csv("../data/sparql_cat_queries/sparql_fastfood.csv")
df_fastFood.rename(columns={"fastFood": "resource"}, inplace=True)

# European Landmark/Sight from multiple "Category": "dbc:Landmarks_in_x" with x being a country from europe
df_euSight = pd.read_csv("../data/sparql_cat_queries/sparql_euLandmark.csv")
df_euSight.rename(columns={"euroSight": "resource"}, inplace=True)

# DAX company from the "Type": "dbo:Company" and "dbp:tradedAs" -> "dbr:DAX"
df_daxCo = pd.read_csv("../data/sparql_cat_queries/sparql_daxCompany.csv")
df_daxCo.rename(columns={"DAXco": "resource"}, inplace=True)

# US company from ?
df_usCo = pd.read_csv("../data/sparql_cat_queries/sparql_usCompany.csv")
df_usCo.rename(columns={"usCo": "resource"}, inplace=True)

# US tech company from the "Category": "dbc:Technology_companies_of_the_United_States"
df_usTechCo = pd.read_csv("../data/sparql_cat_queries/sparql_usTechCompany.csv")
df_usTechCo.rename(columns={"usTech": "resource"}, inplace=True)

In [5]:
# Function for standardized scaling and normalisation of GCM values
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

def scaler(data_frame):
    helper_df = data_frame.copy()
    to_scale = StandardScaler()

    helper_df[['degree', 'eigenvector', 'pagerank']] = to_scale.fit_transform(helper_df[['degree', 'eigenvector', 'pagerank']])
    return helper_df

#df_ig_gcm = scaler(df_ig_gcm)

In [12]:
# Function for merging the sparql category with GCM data

# Merge the queried category df to the GCM metrics based on <resource> with a left outer join
  # (Keep every row in the category dataframe -> missing values get NaN)
  # Then, normalize GCM metrics

def catMerger(sparql_data, gcm_data):
    df_sparql = sparql_data.copy()
    df_gcm = gcm_data.copy()
    helper_df = pd.merge(df_sparql,
                #df_gcm[['resource','degree','pagerank']],
                df_gcm[['resource','degree', 'eigenvector', 'pagerank']],
                on='resource', 
                how='left')
    
    # scale and normalize the gcm metrics (same as scaler function)
    to_scale = preprocessing.StandardScaler()
    helper_df[['degree', 'eigenvector', 'pagerank']] = to_scale.fit_transform(helper_df[['degree', 'eigenvector', 'pagerank']])
    
    print("Dataframe merged with shape: {}".format(helper_df.shape))
    print('============================')

    #print(helper_df.shape[0] - helper_df.dropna().shape[0]) #return amount of rows containing NaN values
    
    return helper_df

In [13]:
# Function for merging the survey category data with the gcm category data

# Merge the survey category df to the GCM metrics per category based on <resource> with a left outer join
  # (Keep every row in the category dataframe -> missing values get NaN)

def ALTgcmSurfer(survey_data, cat_gcm_data):
    df_survey = survey_data.copy()
    df_cat_gcm = cat_gcm_data.copy()
    df_survey.rename(columns={"uri": "resource"}, inplace=True) #rename column to match GCM dataframe
    
    # Count how often an answer was given in the survey and save into new dataframe
    survey_count = df_survey['resource'].value_counts(normalize=True) #normalize values
    df_survey_counted = pd.DataFrame({'resource':survey_count.index, 'amount':survey_count.values})

    df_survey_gcm = pd.merge(df_survey_counted,
                    df_cat_gcm[['resource','name', 'degree','eigenvector','pagerank']],
                    #df_cat_gcm[['resource', 'degree','eigenvector','pagerank']],
                    on='resource', 
                    how='left')
    
    nan_rows = df_survey_gcm.shape[0] - df_survey_gcm.dropna().shape[0]
    
    print("Rows with NaN values dropped: {}".format(nan_rows) + " from {}".format(df_survey_gcm.shape[0]) + " => {}% ".format(round((nan_rows/df_survey_gcm.shape[0])*100, 2))) #return amount of rows containing NaN values
    df_survey_gcm = df_survey_gcm.dropna()
    print("Dataframe merged with shape: {}".format(df_survey_gcm.shape))
    
    return df_survey_gcm

In [14]:
# Function for merging the survey category data with the gcm category data

# Merge the survey category df to the GCM metrics per category based on <resource> with a left outer join
  # (Keep every row in the category dataframe -> missing values get NaN)
  # Drop NaN rows, then -> normalize survey results

def gcmSurfer(survey_data, cat_gcm_data):
    df_survey = survey_data.copy()
    df_cat_gcm = cat_gcm_data.copy()
    df_survey.rename(columns={"uri": "resource"}, inplace=True) #rename column to match GCM dataframe
        
    # Count how often an answer was given in the survey and save into new dataframe
    survey_count = df_survey['resource'].value_counts() #normalize values
    df_survey_counted = pd.DataFrame({'resource':survey_count.index, 'amount':survey_count.values})

    df_survey_gcm = pd.merge(df_survey_counted,
                    df_cat_gcm[['resource','name', 'degree','eigenvector','pagerank']],
                    #df_cat_gcm[['resource', 'degree','eigenvector','pagerank']],
                    on='resource', 
                    how='left')
    
    # Drop rows that were not matched with the category
    nan_rows = df_survey_gcm.shape[0] - df_survey_gcm.dropna().shape[0]    
    print("Rows with NaN values dropped: {}".format(nan_rows) + " from {}".format(df_survey_gcm.shape[0]) + " => {}% ".format(round((nan_rows/df_survey_gcm.shape[0])*100, 2))) #return amount of rows containing NaN values
    df_survey_gcm = df_survey_gcm.dropna()
    
    # Count how often an answer was given in the survey and save into new dataframe
    #df_survey_gcm['survey_mentions'] = df_survey_gcm['resource'].value_counts(normalize=True) #normalize values
    #df_survey_counted = pd.DataFrame({'resource':survey_count.index, 'amount':survey_count.values})
    
    # scale and normalize the survey data
    min_max_scaler = preprocessing.MinMaxScaler()
    df_survey_gcm[['amount']] = min_max_scaler.fit_transform(df_survey_gcm[['amount']])
    
    print("Dataframe merged with shape: {}".format(df_survey_gcm.shape))
    
    return df_survey_gcm

In [15]:
# Function for creating "Camps" inside a category that is later used for ordinal regression

# Takes two parameter: dataframe, num_splits
  # dataframe = input dataframe containing URI, amount (survey mentions), GCMs (degree, eigenvector, pagerank)
  # num_splits = number for the camp size, e.g. 3 => three camps with 1 = top 33%; 2 = mid 33%; 3 = low 33%

def camper(data_frame, num_splits):
    helper_df = data_frame.copy()
    split_v = [int(len(helper_df.index)/num_splits * i) for i in np.arange(num_splits)]
    
    for i in helper_df.index:
        split_smaller = [split <= i for split in split_v]
        camp = np.sum(split_smaller)
        
        #return dataframe
        helper_df.at[i,'camp'] = camp
        
    return helper_df

In [16]:
def ordinalReg(df_survey_gcm, list_gcm, num_splits): # list_gcm -> ['degree', 'eigenvector', 'pagerank']  
    helper_df = df_survey_gcm.copy()
    camped_df = camper(helper_df, num_splits) #calls camper function to create column with camps

    mod_prob = OrderedModel(camped_df['camp'],
                            camped_df[list_gcm],
                            distr='probit')
    
    print("Ordinal Regression:  {}".format(list_gcm))
    print('-------------------------------------')
    res_prob = mod_prob.fit(method='bfgs')

    return res_prob.summary()

## 90s film

In [17]:
# 90s films from the aggregated Categories: 1990_films, 1991_films, 1992_films, ...

film90_result = catMerger(df_film90, df_ig_gcm)
df_survey_film90 = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_film90_prep.csv", sep=';')

df_survey_film90_gcm = gcmSurfer(df_survey_film90, film90_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 98 from 121 => 80.99% 
Dataframe merged with shape: (23, 6)


In [18]:
# Use the manually improved URIs instead
df_survey_film90impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_film90_impr.csv", sep=';')

df_survey_film90_gcm_impr = gcmSurfer(df_survey_film90impr, film90_result)

Rows with NaN values dropped: 56 from 112 => 50.0% 
Dataframe merged with shape: (56, 6)


In [None]:
df_survey_film90_gcm

In [None]:
ALTdf_survey_film90_gcm = ALTgcmSurfer(df_survey_film90, film90_result)

In [None]:
ALTdf_survey_film90_gcm

In [None]:
ordinalReg(df_survey_film90_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_survey_film90_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_survey_film90_gcm, ['pagerank'], 3)

## Actors

In [19]:
# actors from the "Occupation": "Actor"@en

actor_result = catMerger(df_actor, df_ig_gcm)
df_survey_actor = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_actors_prep.csv", sep=';')

df_survey_actor_gcm = gcmSurfer(df_survey_actor, actor_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 80 from 143 => 55.94% 
Dataframe merged with shape: (63, 6)


In [20]:
df_survey_actor_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_actors_impr.csv", sep=';')

df_survey_actor_gcm_impr = gcmSurfer(df_survey_actor_impr, actor_result)

Rows with NaN values dropped: 69 from 138 => 50.0% 
Dataframe merged with shape: (69, 6)


In [None]:
df_survey_actor_gcm

In [None]:
ordinalReg(df_survey_actor_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_survey_actor_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_survey_actor_gcm, ['pagerank'], 3)

## Music genres

In [21]:
# Music Genre from the "Type": "dbo:MusicGenre"

mGenre_result = catMerger(df_musicGenre, df_ig_gcm)
df_survey_mGenre = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_music_genres_prep.csv", sep=';')

df_survey_mGenre_gcm = gcmSurfer(df_survey_mGenre, mGenre_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 9 from 42 => 21.43% 
Dataframe merged with shape: (33, 6)


In [22]:
df_survey_mGenre_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_music_genres_impr.csv", sep=';')

df_survey_mGenre_gcm_impr = gcmSurfer(df_survey_mGenre_impr, mGenre_result)

Rows with NaN values dropped: 5 from 38 => 13.16% 
Dataframe merged with shape: (33, 6)


In [None]:
ordinalReg(df_survey_mGenre_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_survey_mGenre_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_survey_mGenre_gcm, ['pagerank'], 3)

## Books

In [23]:
# Books from the "Type":"dbo:book"

book_result = catMerger(df_book, df_ig_gcm)
df_survey_book = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_books_prep.csv", sep=';')

df_survey_book_gcm = gcmSurfer(df_survey_book, book_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 172 from 182 => 94.51% 
Dataframe merged with shape: (10, 6)


In [24]:
df_survey_book_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_books_impr.csv", sep=';')

df_survey_book_gcm_impr = gcmSurfer(df_survey_book_impr, book_result)

Rows with NaN values dropped: 139 from 156 => 89.1% 
Dataframe merged with shape: (17, 6)


In [None]:
ordinalReg(df_survey_book_gcm_impr, ['degree'], 3)

In [None]:
ordinalReg(df_survey_book_gcm_impr, ['eigenvector'], 3)

In [None]:
ordinalReg(df_survey_book_gcm_impr, ['pagerank'], 3)

## Book authors

In [25]:
# Book authors from the "Type": author UNION book dbo:author

bookAut_result = catMerger(df_bookAut, df_ig_gcm)
df_survey_bookAut = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_authors_prep.csv", sep=';')

df_survey_bookAut_gcm = gcmSurfer(df_survey_bookAut, bookAut_result)

Dataframe merged with shape: (4498, 5)
Rows with NaN values dropped: 125 from 143 => 87.41% 
Dataframe merged with shape: (18, 6)


In [26]:
df_survey_bookAut_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_authors_impr.csv", sep=';')

df_survey_bookAut_gcm_impr = gcmSurfer(df_survey_bookAut_impr, bookAut_result)

Rows with NaN values dropped: 119 from 138 => 86.23% 
Dataframe merged with shape: (19, 6)


In [None]:
ordinalReg(df_survey_bookAut_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_survey_bookAut_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_survey_bookAut_gcm, ['pagerank'], 3)

## Politician

In [27]:
# Politicians from the "Type": Person/Politician

pol_result = catMerger(df_pol, df_ig_gcm)
df_survey_pol = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_politicians_prep.csv", sep=';')

df_survey_pol_gcm = gcmSurfer(df_survey_pol, pol_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 72 from 75 => 96.0% 
Dataframe merged with shape: (3, 6)


In [28]:
df_survey_pol_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_politicians_impr.csv", sep=';')

df_survey_pol_gcm_impr = gcmSurfer(df_survey_pol_impr, pol_result)

Rows with NaN values dropped: 73 from 76 => 96.05% 
Dataframe merged with shape: (3, 6)


In [None]:
ordinalReg(df_survey_pol_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_survey_pol_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_survey_pol_gcm, ['pagerank'], 3)

### POTUS

In [29]:
# POTUS from the "Category": Presidents_of_the_United_States

potus_result = catMerger(df_potus, df_ig_gcm)
df_survey_potus = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_potus_prep.csv", sep=';')

df_survey_potus_gcm = gcmSurfer(df_survey_potus, potus_result)

Dataframe merged with shape: (48, 5)
Rows with NaN values dropped: 12 from 27 => 44.44% 
Dataframe merged with shape: (15, 6)


In [30]:
# POTUS from the "Category": Presidents_of_the_United_States

potus_result = catMerger(df_potus, df_ig_gcm)
df_survey_potus_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_potus_impr.csv", sep=';')

df_survey_potus_gcm_impr = gcmSurfer(df_survey_potus_impr, potus_result)

Dataframe merged with shape: (48, 5)
Rows with NaN values dropped: 0 from 19 => 0.0% 
Dataframe merged with shape: (19, 6)


In [None]:
ordinalReg(df_survey_potus_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_survey_potus_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_survey_potus_gcm, ['pagerank'], 3)

## Cities

In [31]:
# Cities from the "Category": "dbo:City"

city_result = catMerger(df_city, df_ig_gcm)
df_survey_city = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_cities_prep.csv", sep=';')

df_city_survey_gcm = gcmSurfer(df_survey_city, city_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 85 from 89 => 95.51% 
Dataframe merged with shape: (4, 6)


In [32]:
df_survey_city_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_cities_impr.csv", sep=';')

df_city_survey_gcm_impr = gcmSurfer(df_survey_city_impr, city_result)

Rows with NaN values dropped: 82 from 90 => 91.11% 
Dataframe merged with shape: (8, 6)


In [None]:
ordinalReg(df_city_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_city_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_city_survey_gcm, ['pagerank'], 3)

## Lakes

In [33]:
# Lakes from the "Category": "dbo:Lake"

lake_result = catMerger(df_lake, df_ig_gcm)
df_survey_lake = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_lakes_prep.csv", sep=';')

df_lake_survey_gcm = gcmSurfer(df_survey_lake, lake_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 42 from 80 => 52.5% 
Dataframe merged with shape: (38, 6)


In [34]:
df_survey_lake_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_lakes_impr.csv", sep=';')

df_lake_survey_gcm_impr = gcmSurfer(df_survey_lake_impr, lake_result)

Rows with NaN values dropped: 20 from 70 => 28.57% 
Dataframe merged with shape: (50, 6)


In [None]:
ordinalReg(df_lake_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_lake_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_lake_survey_gcm, ['pagerank'], 3)

### Mountains

In [35]:
# Mountains from the "Category": "dbo:Mountain"

mountain_result = catMerger(df_mountain, df_ig_gcm)
df_survey_mountain = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_mountains_prep.csv", sep=';')

df_mountain_survey_gcm = gcmSurfer(df_survey_mountain, mountain_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 28 from 56 => 50.0% 
Dataframe merged with shape: (28, 6)


In [36]:
df_survey_mountain_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_mountains_impr.csv", sep=';')

df_mountain_survey_gcm_impr = gcmSurfer(df_survey_mountain_impr, mountain_result)

Rows with NaN values dropped: 17 from 52 => 32.69% 
Dataframe merged with shape: (35, 6)


In [None]:
ordinalReg(df_mountain_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_mountain_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_mountain_survey_gcm, ['pagerank'], 3)

### EU capital cities

In [37]:
euCap_result = catMerger(df_euCap, df_ig_gcm)
df_survey_euCap = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_european_capital_cities_prep.csv", sep=';')

df_euCap_survey_gcm = gcmSurfer(df_survey_euCap, euCap_result)

Dataframe merged with shape: (52, 5)
Rows with NaN values dropped: 7 from 29 => 24.14% 
Dataframe merged with shape: (22, 6)


In [38]:
df_survey_euCap_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_european_capital_cities_impr.csv", sep=';')

df_euCap_survey_gcm_impr = gcmSurfer(df_survey_euCap_impr, euCap_result)

Rows with NaN values dropped: 4 from 28 => 14.29% 
Dataframe merged with shape: (24, 6)


In [None]:
ordinalReg(df_euCap_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_euCap_survey_gcm, ['pagerank'], 3)

In [None]:
ordinalReg(df_euCap_survey_gcm, ['eigenvector'], 3)

### European countries

In [39]:
euCou_result = catMerger(df_euCou, df_ig_gcm)
df_survey_euCou = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_european_countries_prep.csv", sep=';')

df_euCou_survey_gcm = gcmSurfer(df_survey_euCou, euCou_result)

Dataframe merged with shape: (51, 5)
Rows with NaN values dropped: 1 from 29 => 3.45% 
Dataframe merged with shape: (28, 6)


In [40]:
df_survey_euCou_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_european_countries_impr.csv", sep=';')

df_euCou_survey_gcm_impr = gcmSurfer(df_survey_euCou_impr, euCou_result)

Rows with NaN values dropped: 0 from 29 => 0.0% 
Dataframe merged with shape: (29, 6)


In [None]:
ordinalReg(df_euCou_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_euCou_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_euCou_survey_gcm, ['pagerank'], 3)

## Asian countries

In [41]:
# Asian countries from the "Category": "dbc:Countries_in_Asia", "dbc:East_Asian_countries" and "dbc:Central_Asian_countries"

asCou_result = catMerger(df_asCou, df_ig_gcm)
df_survey_asCou = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_asian_countries_prep.csv", sep=';') # .csv needed no manual uri correction

df_asCou_survey_gcm = gcmSurfer(df_survey_asCou, asCou_result)

Dataframe merged with shape: (60, 5)
Rows with NaN values dropped: 7 from 33 => 21.21% 
Dataframe merged with shape: (26, 6)


In [42]:
df_survey_asCou_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_asian_countries_impr.csv", sep=';') # .csv needed no manual uri correction

df_asCou_survey_gcm_impr = gcmSurfer(df_survey_asCou_impr, asCou_result)

Rows with NaN values dropped: 7 from 33 => 21.21% 
Dataframe merged with shape: (26, 6)


In [None]:
ordinalReg(df_asCou_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_asCou_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_asCou_survey_gcm, ['pagerank'], 3)

## Continents

In [43]:
# Continents from the "Category": "dbo:Continents"

continent_result = catMerger(df_continent, df_ig_gcm)
df_survey_continent = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_continents_prep.csv", sep=';')

df_continent_survey_gcm = gcmSurfer(df_survey_continent, continent_result)

Dataframe merged with shape: (20, 5)
Rows with NaN values dropped: 7 from 10 => 70.0% 
Dataframe merged with shape: (3, 6)


In [44]:
df_survey_continent_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_continents_impr.csv", sep=';') # .csv needed no manual uri correction

df_continent_survey_gcm_impr = gcmSurfer(df_survey_continent_impr, continent_result)

Rows with NaN values dropped: 1 from 8 => 12.5% 
Dataframe merged with shape: (7, 6)


In [None]:
ordinalReg(df_continent_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_continent_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_continent_survey_gcm, ['pagerank'], 3)

## Birds

In [45]:
# Birds from the "Class": "dbr:Bird"

bird_result = catMerger(df_bird, df_ig_gcm)
df_survey_bird = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_birds_prep.csv", sep=';')

df_birds_survey_gcm = gcmSurfer(df_survey_bird, bird_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 52 from 61 => 85.25% 
Dataframe merged with shape: (9, 6)


In [46]:
df_survey_bird_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_birds_impr.csv", sep=';')

df_birds_survey_gcm_impr = gcmSurfer(df_survey_bird_impr, bird_result)

Rows with NaN values dropped: 27 from 50 => 54.0% 
Dataframe merged with shape: (23, 6)


In [None]:
ordinalReg(df_birds_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_birds_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_birds_survey_gcm, ['pagerank'], 3)

## African animals

In [47]:
# African animals from the "Category": "dbc:Vertebrates_of_Africa"

afrAnimal_result = catMerger(df_africanAnimal, df_ig_gcm)
df_survey_afrAnimal = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_african_animals_prep.csv", sep=';')

df_afrAnimal_survey_gcm = gcmSurfer(df_survey_afrAnimal, afrAnimal_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 22 from 29 => 75.86% 
Dataframe merged with shape: (7, 6)


In [48]:
df_survey_afrAnimal_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_african_animals_impr.csv", sep=';')

df_afrAnimal_survey_gcm_impr = gcmSurfer(df_survey_afrAnimal_impr, afrAnimal_result)

Rows with NaN values dropped: 11 from 25 => 44.0% 
Dataframe merged with shape: (14, 6)


In [None]:
ordinalReg(df_afrAnimal_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_afrAnimal_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_afrAnimal_survey_gcm, ['pagerank'], 3)

## Furniture

In [49]:
# Furniture from the "Category": "dbc:Furniture"

furniture_result = catMerger(df_furniture, df_ig_gcm)
df_survey_furniture = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_furniture_prep.csv", sep=';') # .csv needed no manual uri correction

df_furniture_survey_gcm = gcmSurfer(df_survey_furniture, furniture_result)

Dataframe merged with shape: (193, 5)
Rows with NaN values dropped: 27 from 30 => 90.0% 
Dataframe merged with shape: (3, 6)


In [50]:
df_survey_furniture_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_furniture_impr.csv", sep=';')

df_furniture_survey_gcm_impr = gcmSurfer(df_survey_furniture_impr, furniture_result)

Rows with NaN values dropped: 16 from 27 => 59.26% 
Dataframe merged with shape: (11, 6)


In [None]:
ordinalReg(df_furniture_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_furniture_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_furniture_survey_gcm, ['pagerank'], 3)

## Vegetables

In [51]:
# Vegetables from the "Category": "dbc:Vegetables"

vegetable_result = catMerger(df_vegetable, df_ig_gcm)
df_survey_vegetable = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_vegetables_prep.csv", sep=';') # .csv needed no manual uri correction

df_vegetable_survey_gcm = gcmSurfer(df_survey_vegetable, vegetable_result)

Dataframe merged with shape: (3742, 5)
Rows with NaN values dropped: 17 from 42 => 40.48% 
Dataframe merged with shape: (25, 6)


In [52]:
df_survey_vegetable_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_vegetables_impr.csv", sep=';') # .csv needed no manual uri correction

df_vegetable_survey_gcm_impr = gcmSurfer(df_survey_vegetable_impr, vegetable_result)

Rows with NaN values dropped: 8 from 37 => 21.62% 
Dataframe merged with shape: (29, 6)


In [None]:
ordinalReg(df_vegetable_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_vegetable_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_vegetable_survey_gcm, ['pagerank'], 3)

## Fast Food

In [53]:
# Types of fast food from the "Category": "dbc:Fast_food" 

fastfood_result = catMerger(df_fastFood, df_ig_gcm)
df_survey_fastfood = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_fast_food_prep.csv", sep=';')

df_fastfood_survey_gcm = gcmSurfer(df_survey_fastfood, fastfood_result)

Dataframe merged with shape: (4693, 5)
Rows with NaN values dropped: 17 from 31 => 54.84% 
Dataframe merged with shape: (14, 6)


In [54]:
df_survey_fastfood_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_fast_food_impr.csv", sep=';')

df_fastfood_survey_gcm_impr = gcmSurfer(df_survey_fastfood_impr, fastfood_result)

Rows with NaN values dropped: 9 from 27 => 33.33% 
Dataframe merged with shape: (18, 6)


In [None]:
ordinalReg(df_fastfood_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_fastfood_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_fastfood_survey_gcm, ['pagerank'], 3)

## European landmarks/sights

In [55]:
# European Landmark/Sight from multiple "Category": "dbc:Landmarks_in_x" with x being a country from europe

euSight_result = catMerger(df_euSight, df_ig_gcm)
df_survey_euSight = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_european_landmarks_prep.csv", sep=';')

df_euSight_survey_gcm = gcmSurfer(df_survey_euSight, euSight_result)

Dataframe merged with shape: (1403, 5)
Rows with NaN values dropped: 59 from 65 => 90.77% 
Dataframe merged with shape: (6, 6)


In [57]:
df_survey_euSight_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_european_landmarks_impr.csv", sep=';')

df_euSight_survey_gcm_impr = gcmSurfer(df_survey_euSight_impr, euSight_result)

Rows with NaN values dropped: 55 from 64 => 85.94% 
Dataframe merged with shape: (9, 6)


In [None]:
ordinalReg(df_euSight_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_euSight_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_euSight_survey_gcm, ['pagerank'], 3)

## DAX companies

In [58]:
# DAX company from the "Type": "dbo:Company" and "dbp:tradedAs" -> "dbr:DAX"

daxCo_result = catMerger(df_daxCo, df_ig_gcm)
df_daxCo = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_dax_companies_prep.csv", sep=';')

df_daxCo_survey_gcm = gcmSurfer(df_daxCo, daxCo_result)

Dataframe merged with shape: (28, 5)
Rows with NaN values dropped: 47 from 60 => 78.33% 
Dataframe merged with shape: (13, 6)


In [59]:
df_daxCo_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_dax_companies_impr.csv", sep=';')

df_daxCo_survey_gcm_impr = gcmSurfer(df_daxCo_impr, daxCo_result)

Rows with NaN values dropped: 37 from 53 => 69.81% 
Dataframe merged with shape: (16, 6)


In [None]:
df_daxCo_survey_gcm

In [None]:
ordinalReg(df_daxCo_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_daxCo_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_daxCo_survey_gcm, ['pagerank'], 3)

## US companies

In [60]:
# US company from "Category": Lists_of_companies_of_the_United_States

usCo_result = catMerger(df_usCo, df_ig_gcm)
df_survey_usCo = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_us_companies_prep.csv", sep=';', encoding="ISO-8859-1")

df_usCo_survey_gcm = gcmSurfer(df_survey_usCo, usCo_result)

Dataframe merged with shape: (1079, 5)
Rows with NaN values dropped: 83 from 83 => 100.0% 


ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required by MinMaxScaler.

In [66]:
usCo_result = catMerger(df_usCo, df_ig_gcm)
df_survey_usCo_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_us_companies_impr.csv", sep=';', encoding="ISO-8859-1") # .csv needed no manual uri correction

df_usCo_survey_gcm_impr = gcmSurfer(df_survey_usCo_impr, usCo_result)

Dataframe merged with shape: (1079, 5)
Rows with NaN values dropped: 76 from 76 => 100.0% 


ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required by MinMaxScaler.

In [None]:
ordinalReg(df_usCo_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_usCo_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_usCo_survey_gcm, ['pagerank'], 3)

## US technology companies

In [61]:
# US tech company from the "Category": "dbc:Technology_companies_of_the_United_States"

usTech_result = catMerger(df_usTechCo, df_ig_gcm)
df_survey_usTech = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/sv_us_tech_companies_prep.csv", sep=';') # .csv needed no manual uri correction

df_usTech_survey_gcm = gcmSurfer(df_survey_usTech, usTech_result)

Dataframe merged with shape: (10000, 5)
Rows with NaN values dropped: 35 from 36 => 97.22% 
Dataframe merged with shape: (1, 6)


In [62]:
df_survey_usTech_impr = pd.read_csv("../data/survey_single_cat_files/sv_prep_uri/improved_uri/sv_us_tech_companies_impr.csv", sep=';') # .csv needed no manual uri correction

df_usTech_survey_gcm_impr = gcmSurfer(df_survey_usTech_impr, usTech_result)

Rows with NaN values dropped: 31 from 34 => 91.18% 
Dataframe merged with shape: (3, 6)


In [None]:
ordinalReg(df_usTech_survey_gcm, ['degree'], 3)

In [None]:
ordinalReg(df_usTech_survey_gcm, ['eigenvector'], 3)

In [None]:
ordinalReg(df_usTech_survey_gcm, ['pagerank'], 3)