In [1]:
!pip install -r requirements.txt

import logging
#logger = logging.getLogger()
#logger.setLevel(logging.INFO)

import pandas as pd
import requests
import zipfile
import os
import shutil
from SPARQLWrapper import SPARQLWrapper, JSON, XML
import logging

from data_imports import *



# Import Data

In [2]:
# Load the three input tables via helpers from the local `data_imports` module.
# NOTE(review): where each helper gets its data (API, SPARQL endpoint, file) is
# not visible in this notebook - confirm in data_imports.

# Population figures; the preview below shows BFS_NR, GEBIET_NAME, date, population.
df_api = import_kantonZH_api()
# Wikidata population statements; merged later on its `date` and `bfs_id` columns.
df_wikidata = import_wikidata_kantonZH()
# Lookup table from BFS number (`bfs`) to Wikidata Q-number (`wikidata_id`).
swisstopowikidata = import_swisstopowikidata_kantonZH()

In [3]:
# Size of the API table, followed by a preview of its first five rows.
print(df_api.shape)
df_api.head(n=5)

(10032, 4)


Unnamed: 0,BFS_NR,GEBIET_NAME,date,population
0,21,Adlikon,2018-12-31,707.0
1,131,Adliswil,2018-12-31,18681.0
2,241,Aesch ZH,2018-12-31,1555.0
3,1,Aeugst a.A.,2018-12-31,1979.0
4,2,Affoltern a.A.,2018-12-31,12201.0


In [4]:
# Size of the BFS-number -> Q-number lookup table, followed by its first five rows.
print(swisstopowikidata.shape)
swisstopowikidata.head(n=5)

(162, 3)


Unnamed: 0,Name,bfs,wikidata_id
0,Hinwil,117,Q69383
1,Ottenbach,11,Q67513
2,Wetzikon (ZH),121,Q68305
3,Adliswil,131,Q68210
4,Richterswil,138,Q68227


# Merge df_api with swisstopowikidata 

## Aim: add wikidata qnumber to df_api 

In [5]:
# Left join: keep every API row and attach the Wikidata Q-number wherever the
# BFS number is present in the lookup table (missing matches become NaN).
df_api_qnumber = df_api.merge(
    swisstopowikidata,
    how='left',
    left_on='BFS_NR',
    right_on='bfs',
)
print(df_api_qnumber.shape)
df_api_qnumber.head(n=5)

(10032, 7)


Unnamed: 0,BFS_NR,GEBIET_NAME,date,population,Name,bfs,wikidata_id
0,21,Adlikon,2018-12-31,707.0,Adlikon,21.0,Q68356
1,131,Adliswil,2018-12-31,18681.0,Adliswil,131.0,Q68210
2,241,Aesch ZH,2018-12-31,1555.0,Aesch (ZH),241.0,Q68342
3,1,Aeugst a.A.,2018-12-31,1979.0,Aeugst am Albis,1.0,Q64945
4,2,Affoltern a.A.,2018-12-31,12201.0,Affoltern am Albis,2.0,Q68290


###### There is no Q-number for some BFS numbers. None of these BFS numbers exist anymore because of municipal mergers ("Gemeindefusionen").
###### Decision: only keep BFS numbers that are in use today.

In [6]:
# Inspect the rows of the merged frame that contain at least one missing value,
# ordered by BFS number.
df_api_qnumber_na = (
    df_api_qnumber
    .loc[df_api_qnumber.isna().any(axis='columns')]
    .sort_values(by='BFS_NR')
)
print(df_api_qnumber_na.shape)
# Open question: why is there no Wikidata Q-number for these BFS numbers?
print(df_api_qnumber_na['BFS_NR'].unique())
df_api_qnumber_na

(798, 7)
[ 36  42  44 132 133 134 140 142 171 174 175 179 217 222]


Unnamed: 0,BFS_NR,GEBIET_NAME,date,population,Name,bfs,wikidata_id
8024,36,Oberstammheim (bis 2018),1973-12-31,760.0,,,
1160,36,Oberstammheim (bis 2018),2012-12-31,1125.0,,,
3624,36,Oberstammheim (bis 2018),1998-12-31,1041.0,,,
9256,36,Oberstammheim (bis 2018),1966-12-31,850.0,,,
7496,36,Oberstammheim (bis 2018),1976-12-31,785.0,,,
3800,36,Oberstammheim (bis 2018),1997-12-31,1042.0,,,
984,36,Oberstammheim (bis 2018),2013-12-31,1142.0,,,
7320,36,Oberstammheim (bis 2018),1977-12-31,788.0,,,
4152,36,Oberstammheim (bis 2018),1995-12-31,1069.0,,,
9432,36,Oberstammheim (bis 2018),1965-12-31,831.0,,,


In [7]:
# All rows for BFS number 297 (shown as "Bauma" in the output).
df_api_qnumber.loc[df_api_qnumber['BFS_NR'].eq(297)]

Unnamed: 0,BFS_NR,GEBIET_NAME,date,population,Name,bfs,wikidata_id
11,297,Bauma,2018-12-31,4961.0,Bauma,297.0,Q67145
187,297,Bauma,2017-12-31,4925.0,Bauma,297.0,Q67145
363,297,Bauma,2016-12-31,4896.0,Bauma,297.0,Q67145
539,297,Bauma,2015-12-31,4837.0,Bauma,297.0,Q67145
715,297,Bauma,2014-12-31,4718.0,Bauma,297.0,Q67145
891,297,Bauma,2013-12-31,4650.0,Bauma,297.0,Q67145
1067,297,Bauma,2012-12-31,4569.0,Bauma,297.0,Q67145
1243,297,Bauma,2011-12-31,4550.0,Bauma,297.0,Q67145
1419,297,Bauma,2010-12-31,4522.0,Bauma,297.0,Q67145
1595,297,Bauma,2009-12-31,4516.0,Bauma,297.0,Q67145


In [8]:
# All rows for BFS number 171 (shown as "Bauma (bis 2014)" in the output).
df_api_qnumber.loc[df_api_qnumber['BFS_NR'].eq(171)]

Unnamed: 0,BFS_NR,GEBIET_NAME,date,population,Name,bfs,wikidata_id
12,171,Bauma (bis 2014),2018-12-31,,,,
188,171,Bauma (bis 2014),2017-12-31,,,,
364,171,Bauma (bis 2014),2016-12-31,,,,
540,171,Bauma (bis 2014),2015-12-31,,,,
716,171,Bauma (bis 2014),2014-12-31,4367.0,,,
892,171,Bauma (bis 2014),2013-12-31,4297.0,,,
1068,171,Bauma (bis 2014),2012-12-31,4218.0,,,
1244,171,Bauma (bis 2014),2011-12-31,4199.0,,,
1420,171,Bauma (bis 2014),2010-12-31,4160.0,,,
1596,171,Bauma (bis 2014),2009-12-31,4150.0,,,


In [9]:
# All rows for BFS number 179 (shown as "Sternenberg (bis 2014)" in the output).
df_api_qnumber.loc[df_api_qnumber['BFS_NR'].eq(179)]

Unnamed: 0,BFS_NR,GEBIET_NAME,date,population,Name,bfs,wikidata_id
140,179,Sternenberg (bis 2014),2018-12-31,,,,
316,179,Sternenberg (bis 2014),2017-12-31,,,,
492,179,Sternenberg (bis 2014),2016-12-31,,,,
668,179,Sternenberg (bis 2014),2015-12-31,,,,
844,179,Sternenberg (bis 2014),2014-12-31,351.0,,,
1020,179,Sternenberg (bis 2014),2013-12-31,353.0,,,
1196,179,Sternenberg (bis 2014),2012-12-31,351.0,,,
1372,179,Sternenberg (bis 2014),2011-12-31,351.0,,,
1548,179,Sternenberg (bis 2014),2010-12-31,362.0,,,
1724,179,Sternenberg (bis 2014),2009-12-31,366.0,,,


In [10]:
# Keep only the complete rows of the merged frame (every column populated).
# Derived from df_api_qnumber instead of repeating the merge, so this frame and
# df_api_qnumber_na are guaranteed to be two halves of the same table.
df_api_qnumber_dropna = df_api_qnumber.dropna()
print(df_api_qnumber_dropna.shape)
df_api_qnumber_dropna.head()

(9234, 7)


Unnamed: 0,BFS_NR,GEBIET_NAME,date,population,Name,bfs,wikidata_id
0,21,Adlikon,2018-12-31,707.0,Adlikon,21.0,Q68356
1,131,Adliswil,2018-12-31,18681.0,Adliswil,131.0,Q68210
2,241,Aesch ZH,2018-12-31,1555.0,Aesch (ZH),241.0,Q68342
3,1,Aeugst a.A.,2018-12-31,1979.0,Aeugst am Albis,1.0,Q64945
4,2,Affoltern a.A.,2018-12-31,12201.0,Affoltern am Albis,2.0,Q68290


In [11]:
# Sanity check: rows with missing values plus complete rows must add up to the
# merged frame. Fail loudly instead of printing nothing when this is violated.
assert len(df_api_qnumber_na) + len(df_api_qnumber_dropna) == len(df_api_qnumber), (
    "NA rows and complete rows do not add up to the merged frame: "
    f"{len(df_api_qnumber_na)} + {len(df_api_qnumber_dropna)} != {len(df_api_qnumber)}"
)
print("check ok")

check ok


# Merge df_api with wikidata


## Aim: Compare the population figures from the API with those on Wikidata (TODO: define any further checks)

In [12]:
# Build a composite "<date>---<BFS number>" key on both tables so that rows can
# be matched on reference date and municipality at once.
# The BFS number is taken from df_api_qnumber_dropna itself; the previous version
# read it from df_api and was only correct through implicit index alignment
# between the two frames.
df_api_qnumber_dropna['check'] = (
    df_api_qnumber_dropna['date'].astype(str)
    + '---'
    + df_api_qnumber_dropna['BFS_NR'].astype(str)
)
df_wikidata['check'] = (
    df_wikidata['date'].astype(str)
    + '---'
    + df_wikidata['bfs_id'].astype(str)
)

In [13]:
# API rows whose date/BFS key does NOT occur in the Wikidata table; these are
# also written to a tab-separated file for inspection.
df_api_sel_f = df_api_qnumber_dropna[
    ~df_api_qnumber_dropna['check'].isin(df_wikidata['check'])
]
df_api_sel_f.to_csv("test.tsv", sep='\t', header=True)
print(df_api_sel_f.columns)
print(df_api_sel_f.shape)
df_api_sel_f.head(n=5)

Index(['BFS_NR', 'GEBIET_NAME', 'date', 'population', 'Name', 'bfs',
       'wikidata_id', 'check'],
      dtype='object')
(8871, 8)


Unnamed: 0,BFS_NR,GEBIET_NAME,date,population,Name,bfs,wikidata_id,check
138,292,Stammheim,2018-12-31,2740.0,Stammheim,292.0,Q60322693,2018-12-31---292
213,294,Elgg,2017-12-31,4903.0,Elgg,294.0,Q67137,2017-12-31---294
314,292,Stammheim,2017-12-31,2741.0,Stammheim,292.0,Q60322693,2017-12-31---292
352,21,Adlikon,2016-12-31,662.0,Adlikon,21.0,Q68356,2016-12-31---21
353,131,Adliswil,2016-12-31,18651.0,Adliswil,131.0,Q68210,2016-12-31---131


In [14]:
# API rows whose date/BFS key DOES occur in the Wikidata table.
df_api_sel_t = df_api_qnumber_dropna[
    df_api_qnumber_dropna['check'].isin(df_wikidata['check'])
]
print(df_api_sel_t.columns)
print(df_api_sel_t.shape)
df_api_sel_t.head(n=5)

Index(['BFS_NR', 'GEBIET_NAME', 'date', 'population', 'Name', 'bfs',
       'wikidata_id', 'check'],
      dtype='object')
(363, 8)


Unnamed: 0,BFS_NR,GEBIET_NAME,date,population,Name,bfs,wikidata_id,check
0,21,Adlikon,2018-12-31,707.0,Adlikon,21.0,Q68356,2018-12-31---21
1,131,Adliswil,2018-12-31,18681.0,Adliswil,131.0,Q68210,2018-12-31---131
2,241,Aesch ZH,2018-12-31,1555.0,Aesch (ZH),241.0,Q68342,2018-12-31---241
3,1,Aeugst a.A.,2018-12-31,1979.0,Aeugst am Albis,1.0,Q64945,2018-12-31---1
4,2,Affoltern a.A.,2018-12-31,12201.0,Affoltern am Albis,2.0,Q68290,2018-12-31---2


In [21]:
# Attach the Wikidata statement to every matched API row via the composite key,
# then show the API (`_x`) and Wikidata (`_y`) values side by side.
ttt = df_api_sel_t.merge(
    df_wikidata,
    how='left',
    left_on='check',
    right_on='check',
)
print(ttt.shape)
print(df_api_sel_t.shape)
ttt[[
    "Name",
    "BFS_NR",
    "wikidata_id_y",
    "date_x",
    "date_y",
    "population_x",
    "population_y",
    "refpublisher",
]]

Unnamed: 0,Name,BFS_NR,wikidata_id_y,date_x,date_y,population_x,population_y,refpublisher
0,Adlikon,21,Q68356,2018-12-31,2018-12-31,707.0,707,http://www.wikidata.org/entity/Q285453
1,Adliswil,131,Q68210,2018-12-31,2018-12-31,18681.0,18765,http://www.wikidata.org/entity/Q285453
2,Aesch (ZH),241,Q68342,2018-12-31,2018-12-31,1555.0,1555,http://www.wikidata.org/entity/Q285453
3,Aeugst am Albis,1,Q64945,2018-12-31,2018-12-31,1979.0,1982,http://www.wikidata.org/entity/Q285453
4,Affoltern am Albis,2,Q68290,2018-12-31,2018-12-31,12201.0,12226,http://www.wikidata.org/entity/Q285453
5,Altikon,211,Q65929,2018-12-31,2018-12-31,705.0,706,http://www.wikidata.org/entity/Q285453
6,Andelfingen,30,Q68423,2018-12-31,2018-12-31,2207.0,2215,http://www.wikidata.org/entity/Q285453
7,Bachenbülach,51,Q69887,2018-12-31,2018-12-31,4159.0,4165,http://www.wikidata.org/entity/Q285453
8,Bachs,81,Q70273,2018-12-31,2018-12-31,580.0,580,http://www.wikidata.org/entity/Q285453
9,Bäretswil,111,Q65868,2018-12-31,2018-12-31,5009.0,5025,http://www.wikidata.org/entity/Q285453


In [16]:
# Dimensions of the merged comparison frame.
# NOTE(review): the recorded output has as many rows as df_api_sel_f, not
# df_api_sel_t, so it presumably stems from an earlier run - re-run to confirm.
ttt.shape

(8871, 15)

In [17]:
# Column names after the merge; `_x` columns come from the API frame and `_y`
# columns from the Wikidata frame (pandas' default suffixes).
ttt.columns

Index(['BFS_NR', 'GEBIET_NAME', 'date_x', 'population_x', 'Name', 'bfs',
       'wikidata_id_x', 'check', 'bfs_id', 'date_y', 'population_y',
       'qualifier', 'refpublisher', 'refurl', 'wikidata_id_y'],
      dtype='object')

In [18]:
#ttt[["check", "wikidata_id_x", "wikidata_id_y","Name", "population_x", "population_y", "qualifier", "refpublisher", "refurl"]]