In [1]:
import csv
import math
import os

# Star import first among the third-party packages so the explicit imports below win on any name clash.
from owlready2 import *
import numpy as np
import pandas as pd
from unidecode import unidecode



# Import Data

In [2]:
# Load the country, region and continent tables from ./data.
# Every file uses its 'Index' column as the row index; only the countries
# file is read as UTF-16, the other two rely on the default encoding.
countries_df_final = pd.read_csv(
    './data/countries_df_final.csv',
    encoding="utf-16",
    index_col='Index',
)
regions_df_final = pd.read_csv(
    './data/regions_df_final.csv',
    index_col='Index',
)
continents_df_final = pd.read_csv(
    './data/continents_df_final.csv',
    index_col='Index',
)

# PreProcess

In [3]:
def edit_prospect_value(text):
    """Convert a population-prospect cell into a single integer.

    Accepted inputs:
      * a digit string with optional thousands separators, e.g. "1,234";
      * several figures separated by "/" and optionally tagged "(N)" / "(S)",
        e.g. "1,000(N)/2,500(S)" -- the figures are added together;
      * a value that is already numeric, which is returned as an int.

    Raises:
        ValueError: if a part of the text cannot be parsed as an integer.
    """
    # Already-converted values must pass through untouched; otherwise re-running
    # the preprocessing cell fails because ints have no .replace().
    if not isinstance(text, str):
        return int(text)

    without_separators = text.replace(",", "")
    if without_separators.isnumeric():
        return int(without_separators)

    # Multi-part cell: drop the (N)/(S) tags and report the total of all parts.
    parts = without_separators.replace("(N)", "").replace("(S)", "").split('/')
    return sum(map(int, parts))


# Collapse each continent's population prospect into one integer.
continents_df_final['Population prospect by 2050'] = (
    continents_df_final['Population prospect by 2050'].apply(edit_prospect_value)
)

# Stringify first, then transliterate to ASCII; str() turns a missing cell
# (NaN) into the literal text 'nan'.
countries_df_final['Capital city'] = (
    countries_df_final['Capital city'].apply(str).apply(unidecode)
)
countries_df_final['UTC time offset'] = (
    countries_df_final['UTC time offset'].apply(str).apply(unidecode)
)

# Remove the space that follows a "1" inside the code text ("1 " -> "1").
countries_df_final['Code'] = (
    countries_df_final['Code'].apply(str).str.replace("1 ", "1", regex=False)
)

# Define Namespaces

In [4]:
# Root ontology: the classes and properties are declared inside it.
geo_ontology = get_ontology("http://ce.sharif.edu/onto")

# One additional ontology object per entity kind, created in the same order
# as before; they are used as the `namespace` of the classes and individuals.
ns_country, ns_region, ns_continent, ns_lang = map(
    get_ontology,
    (
        "http://ce.sharif.edu/onto/country",
        "http://ce.sharif.edu/onto/region",
        "http://ce.sharif.edu/onto/continent",
        "http://ce.sharif.edu/onto/language",
    ),
)

# Define Classes

In [5]:
with geo_ontology:
    # Common superclass of the three geographic entity kinds.
    class Territory(Thing): pass

    # Each concrete class sets `namespace` to its dedicated ontology object.
    class Country(Territory): namespace = ns_country
    class Region(Territory): namespace = ns_region
    class Continent(Territory): namespace = ns_continent

    # Languages are not territories, so the class derives from Thing directly.
    class Language(Thing): namespace = ns_lang

    # Declare the four concrete classes mutually disjoint.
    AllDisjoint([Continent, Country, Region, Language])

# Object Properties


In [6]:
with geo_ontology:

    # ---- Country ----
    # Links a country to a bordering country.
    class neighbour_of(Country >> Country):
        pass

    # Functional: a country belongs to at most one region
    # (so the Python attribute holds a single value, not a list).
    class country_has_region(Country >> Region, FunctionalProperty):
        pass

    class has_language(Country >> Language):
        pass

    # ---- Region ----
    # Inverse of country_has_region; inverse-functional because the inverse is functional.
    class region_has_country(Region >> Country, InverseFunctionalProperty):
        inverse_property = country_has_region

    # parent region -> sub-region; inverse-functional: a sub-region has at most one parent.
    class region_has_subregion(Region >> Region, InverseFunctionalProperty):
        pass

    # sub-region -> parent region.
    # Deliberately NOT InverseFunctionalProperty: that axiom would state that a
    # parent region has at most one sub-region, while the data attaches several
    # sub-regions to the same parent (e.g. 'Sub-Saharan Africa'). "At most one
    # parent per sub-region" already follows from the inverse being
    # inverse-functional. The property stays list-valued on the Python side, so
    # existing `.append(...)` callers keep working.
    class subregion_has_region(Region >> Region):
        inverse_property = region_has_subregion

    # Functional: a region belongs to at most one continent.
    class region_has_continent(Region >> Continent, FunctionalProperty):
        pass

    # ---- Continent ----
    class continent_has_region(Continent >> Region):
        inverse_property = region_has_continent


# Data Type Properties


In [7]:
with geo_ontology:

    # ---- Territory (countries, regions and continents) ----
    class tr_name(Territory >> str, FunctionalProperty): pass
    class tr_population(Territory >> int, FunctionalProperty): pass
    class tr_area(Territory >> int, FunctionalProperty): pass
    # The "pupulation" spelling is kept on purpose: other cells assign to this exact name.
    class tr_pupulation_prospect(Continent >> int, FunctionalProperty): pass

    # ---- Country ----
    # c_code and c_toffset are not functional: the loader appends several values per country.
    class c_code(Country >> int): pass
    class c_capital(Country >> str, FunctionalProperty): pass
    class c_currency(Country >> str, FunctionalProperty): pass
    class c_toffset(Country >> str): pass
    class c_domain(Country >> str, FunctionalProperty): pass
    class c_population_density(Country >> float, FunctionalProperty): pass
    # Range is float, not int: the country loader stores float(...) for this
    # value, so an int range would contradict the literals actually written.
    class c_capital_population(Country >> float, FunctionalProperty): pass
    class c_sex_ratio(Country >> float, FunctionalProperty): pass
    class c_gdp(Country >> int, FunctionalProperty): pass
    class c_gdp_rate(Country >> float, FunctionalProperty): pass

# Insert Data

In [8]:
def convertURI(text):
    """Build an identifier-friendly name from a display name.

    The text is transliterated with ``unidecode``; afterwards every "&" becomes
    "and" and every space or hyphen becomes an underscore.
    """
    # Per-character table; equivalent to the chained replaces because "and"
    # contains neither a space nor a hyphen.
    substitutions = str.maketrans({"&": "and", " ": "_", "-": "_"})
    return unidecode(text).translate(substitutions)

def normalize(text):
    """Return ``text`` transliterated with ``unidecode``, with every "&" spelled out as "and"."""
    ascii_text = unidecode(text)
    return ascii_text.replace("&", "and")

In [9]:
with geo_ontology:
    # Create one Continent individual per row and link it to its regions.
    for index, row in continents_df_final.iterrows():
        continent_name = row['Index.1']
        the_continent = Continent(convertURI(continent_name), ns_continent, tr_name=normalize(continent_name))

        # NaN is truthy, so a bare `if value:` would let a missing cell reach
        # int() and raise. Test for NaN explicitly; the truthiness test is kept
        # so that 0 / empty cells are skipped exactly as before.
        area = row['Surface area(km2)']
        if pd.notna(area) and area:
            the_continent.tr_area = int(area)

        population = row['Population(000, 2020)']
        if pd.notna(population) and population:
            the_continent.tr_population = int(population)

        prospect = row['Population prospect by 2050']
        if pd.notna(prospect) and prospect:
            the_continent.tr_pupulation_prospect = int(prospect)

        # 'Regions' holds a list written as text, e.g. "['A', 'B']": strip the
        # brackets and quotes, then split on ", ".
        regions_cell = row['Regions']
        if pd.notna(regions_cell) and regions_cell:
            for region_name in regions_cell.replace("[", "").replace("]", "").replace("'", "").split(", "):
                the_region = Region(convertURI(region_name), ns_region)
                the_continent.continent_has_region.append(the_region)

In [10]:
with geo_ontology:
    # Create one Region individual per row and link it to its parent region
    # (where applicable) and to its countries.
    for index, row in regions_df_final.iterrows():
        region_name = row['Index.1']
        the_region = Region(convertURI(region_name), ns_region, tr_name=normalize(region_name))

        # NaN is truthy, so a bare `if value:` would let a missing cell reach
        # int() and raise. Test for NaN explicitly; the truthiness test is kept
        # so that 0 / empty cells are skipped exactly as before.
        area = row['Surface area(km2)']
        if pd.notna(area) and area:
            the_region.tr_area = int(area)

        population = row['Population(000, 2020)']
        if pd.notna(population) and population:
            the_region.tr_population = int(population)

        # Only these two values of the 'Region' column are treated as a parent region.
        if row['Region'] in ['Sub-Saharan Africa', 'Latin America & Caribbean']:
            the_super_region = Region(convertURI(row['Region']), ns_region)
            the_region.subregion_has_region.append(the_super_region)

        # 'Countries' holds a list written as text; str() maps a missing cell to 'nan'.
        countries_cell = str(row['Countries'])
        if countries_cell != 'nan':
            for country_name in countries_cell.replace("[", "").replace("]", "").replace("'", "").split(", "):
                the_country = Country(convertURI(country_name), ns_country)
                the_region.region_has_country.append(the_country)

In [11]:
# Entries of the 'Language' list that are not language names and must be skipped.
_NON_LANGUAGE_ENTRIES = ('No offical language in India', '', '37 other languages')


def _has_value(cell):
    """Return False for a missing cell: float NaN, or the text 'nan' that str(NaN) produces."""
    return str(cell) != 'nan'


def _split_list_cell(cell):
    """Split a list stored as text, e.g. "['a', 'b']", into its items.

    Brackets and single quotes are removed before splitting on ", ".
    NOTE(review): this also deletes apostrophes that are part of an item
    itself -- confirm that no name in the data relies on one.
    """
    return str(cell).replace("[", "").replace("]", "").replace("'", "").split(", ")


def _to_number(cell, cast):
    """Apply ``cast`` (int or float) to ``cell``; return None if the cell is not a plain number.

    Only conversion errors are caught, so unrelated failures are no longer
    hidden the way a bare ``except`` would hide them.
    """
    try:
        return cast(cell)
    except (TypeError, ValueError, OverflowError):
        return None


with geo_ontology:
    # Create one Country individual per row and fill in its data and object properties.
    for index, row in countries_df_final.iterrows():
        the_country = Country(convertURI(row['Index.1']), ns_country, tr_name=normalize(row['Index.1']))

        if _has_value(row['Surface area(km2)']):
            # Best effort: unparsable cells are skipped (the original note mentions "~0").
            area = _to_number(row['Surface area(km2)'], int)
            if area is not None:
                the_country.tr_area = area

        if _has_value(row['Code']):
            # Several codes may share one cell, separated by single spaces.
            for code in str(row['Code']).split(" "):
                the_country.c_code.append(int(code))

        if _has_value(row['Capital city']):
            the_country.c_capital = normalize(row['Capital city'])

        if _has_value(row['Currency+ISO-4217']):
            the_country.c_currency = str(row['Currency+ISO-4217'])

        if _has_value(row['UTC time offset']):
            for offset in _split_list_cell(row['UTC time offset']):
                the_country.c_toffset.append(str(offset))

        if _has_value(row['Internet Domain']):
            # Stored without dots (".ir" -> "ir").
            the_country.c_domain = str(row['Internet Domain']).replace(".", "")

        if _has_value(row['Pop. density(per km2, 2020)']):
            the_country.c_population_density = float(row['Pop. density(per km2, 2020)'])

        if _has_value(row['Capital city pop.(000, 2020)']):
            # Best effort: unparsable cells are skipped (the original note mentions date-like values).
            capital_population = _to_number(row['Capital city pop.(000, 2020)'], float)
            if capital_population is not None:
                the_country.c_capital_population = capital_population

        if _has_value(row['Sex ratio(m per 100 f)']):
            the_country.c_sex_ratio = float(row['Sex ratio(m per 100 f)'])

        if _has_value(row['GDP: Gross domestic product(million current US$)']):
            the_country.c_gdp = int(row['GDP: Gross domestic product(million current US$)'])

        if _has_value(row['GDP growth rate(annual %, const. 2015 prices)']):
            # Best effort: unparsable cells are skipped (the original note mentions "~0.0").
            gdp_rate = _to_number(row['GDP growth rate(annual %, const. 2015 prices)'], float)
            if gdp_rate is not None:
                the_country.c_gdp_rate = gdp_rate

        if _has_value(row['Language']):
            for language_name in _split_list_cell(row['Language']):
                if language_name not in _NON_LANGUAGE_ENTRIES:
                    the_lang = Language(convertURI(language_name), ns_lang)
                    the_country.has_language.append(the_lang)

        # Skip both a missing cell and an empty list written as "[]".
        neighbours_cell = str(row['Neighbors'])
        if neighbours_cell != 'nan' and neighbours_cell != '[]':
            for neighbour_name in _split_list_cell(neighbours_cell):
                the_neighbor = Country(convertURI(neighbour_name), ns_country)
                the_country.neighbour_of.append(the_neighbor)

In [12]:
# Write the ontology to disk. The target folder is created first so that the
# final step does not fail when ./output does not exist yet.
ontology_path = "./output/ontology.owl"
os.makedirs(os.path.dirname(ontology_path), exist_ok=True)
geo_ontology.save(file=ontology_path)

In [23]:
#TODO
# Scratch cell: prints a snake_case candidate name for each tab-separated
# column header in `a` (presumably columns not yet mapped to properties -- confirm).
a = "Population growth rate(average annual %)	Urban population(% of total population)	Urban population growth rate(average annual %)	Fertility rate, total(live births per woman)	Life expectancy at birth(females/males, years)	Population age distribution(0-14/60+ years old, %)	International migrant stock(000/% of total pop.)	Refugees and others of concern to UNHCR(000)	Infant mortality rate(per 1 000 live births)	Health: Current expenditure(% of GDP)	Health: Physicians(per 1 000 pop.)	Education: Government expenditure(% of GDP)	Education: Primary gross enrol. ratio(f/m per 100 pop.)	Education: Secondary gross enrol. ratio(f/m per 100 pop.)	Education: Tertiary gross enrol. ratio(f/m per 100 pop.)	Intentional homicide rate(per 100 000 pop.)	Seats held by women in national parliaments(%)	Individuals using the Internet(per 100 inhabitants)	Threatened species(number)	Forested area(% of land area)	Energy production, primary(Petajoules)	Energy supply per capita(Gigajoules)	Important sites for terrestrial biodiversity protected(%)	Net Official Development Assist. received(% of GNI)	Research & Development expenditure(% of GDP)	CO2 emission estimates(million tons/tons per capita)	Tourist/visitor arrivals at national borders(000)	Pop. using safely managed sanitation(urban/rural %)	Pop. using safely managed drinking water(urban/rural, %)	Net Official Development Assist. disbursed(% of GNI)"
# Drop ".", ":" and "/", turn spaces into underscores (same effect as the chained replaces).
cleanup = str.maketrans({".": None, " ": "_", ":": None, "/": None})
for piece in a.split(")"):
    # Keep only the text before the first "(" of each ")"-terminated piece.
    print(piece.split('(')[0].translate(cleanup).lower())

population_growth_rate
	urban_population
	urban_population_growth_rate
	fertility_rate,_total
	life_expectancy_at_birth
	population_age_distribution
	international_migrant_stock
	refugees_and_others_of_concern_to_unhcr
	infant_mortality_rate
	health_current_expenditure
	health_physicians
	education_government_expenditure
	education_primary_gross_enrol_ratio
	education_secondary_gross_enrol_ratio
	education_tertiary_gross_enrol_ratio
	intentional_homicide_rate
	seats_held_by_women_in_national_parliaments
	individuals_using_the_internet
	threatened_species
	forested_area
	energy_production,_primary
	energy_supply_per_capita
	important_sites_for_terrestrial_biodiversity_protected
	net_official_development_assist_received
	research_&_development_expenditure
	co2_emission_estimates
	touristvisitor_arrivals_at_national_borders
	pop_using_safely_managed_sanitation
	pop_using_safely_managed_drinking_water
	net_official_development_assist_disbursed

