In [1]:
#!/usr/bin/env python
import json
import pprint
import requests     
import pandas as pd 

In [2]:
# URL of the CKAN `package_list` API action on the Swiss open data portal.
# NOTE: despite its name, this variable holds the endpoint URL string, not the
# package list itself.
packages = 'https://ckan.opendata.swiss/api/3/action/package_list'

In [3]:
# HTTP request for the package list. The timeout keeps the cell from hanging
# forever if the portal does not answer.
response = requests.get(packages, timeout=30)
response.raise_for_status()  # fail early on an HTTP 4xx/5xx reply

# Decode CKAN's JSON response into a dictionary
response_dict = response.json()

# Check the contents of the response: CKAN signals API-level failures through
# the 'success' flag. The failure is raised explicitly so it carries a message.
if response_dict.get('success') is not True:
    raise AssertionError(
        f"CKAN package_list call failed: {response_dict.get('error')}"
    )

# Extract the packages; this list is what get_df_from_dicts() iterates over
result = response_dict['result']

In [4]:
def get_df_from_dicts(packages, session=None, timeout=30):
    """Fetch selected metadata for each package and return it as a DataFrame.

    For every entry of ``packages`` the CKAN ``package_show`` action of
    opendata.swiss is called and five fields of the reply's ``result`` object
    are kept.

    Parameters
    ----------
    packages : iterable of str
        Package identifiers, passed one by one as the ``id`` query parameter.
    session : object, optional
        Any object exposing ``get(url, params=..., timeout=...)`` whose return
        value offers ``raise_for_status()`` and ``json()`` (for example a
        ``requests.Session``). When omitted, the module-level ``requests`` API
        is used.
    timeout : float, optional
        Timeout in seconds handed to every ``get`` call (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per package with the columns ``owner_org``,
        ``metadata_created``, ``metadata_modified``, ``author`` and ``name``.
        A field absent from the reply is stored as a missing value (``None``),
        not as the integer ``0``. An empty ``packages`` yields an empty frame
        with the same columns.

    Raises
    ------
    AssertionError
        If a reply does not carry ``success: true``.
    Exception
        Whatever ``raise_for_status()`` raises for a failed HTTP request.
    """
    package_show_url = 'https://opendata.swiss/api/3/action/package_show'
    columns = ['owner_org', 'metadata_created', 'metadata_modified', 'author', 'name']

    # Fall back to the plain requests API when no session was supplied.
    http = requests if session is None else session

    records = []
    for package in packages:
        # Passing the id through `params` lets the HTTP client URL-encode it.
        reply = http.get(package_show_url, params={'id': package}, timeout=timeout)
        reply.raise_for_status()
        payload = reply.json()

        # Raised explicitly (rather than via `assert`) so that the check is not
        # stripped under `python -O` and the message names the failing package.
        if payload.get('success') is not True:
            raise AssertionError(f"CKAN package_show failed for package {package!r}")

        details = payload['result']
        # Missing fields become None so pandas treats them as missing values
        # rather than mixing the integer 0 into string/timestamp columns.
        records.append({column: details.get(column) for column in columns})

    # Passing `columns` fixes the column order and keeps the header when
    # `records` is empty.
    return pd.DataFrame(records, columns=columns)
   

In [5]:
# Build the metadata table: one package_show request per entry of `result`
df = get_df_from_dicts(result)

In [6]:
# Convert the 'metadata_created' column to pandas datetimes (in place) so the
# `.dt` accessor can be used on it in the next cell
df['metadata_created'] = pd.to_datetime(df['metadata_created'])

In [7]:
# Add a 'year' column holding the calendar year of 'metadata_created'
df['year'] = df['metadata_created'].dt.year

In [8]:
# Preview the first three rows to check the new columns
df.head(3)

Unnamed: 0,owner_org,metadata_created,metadata_modified,author,name,year
0,3d6d9295-0b90-48df-988c-68100e82b8c5,2016-01-29 16:16:11.320742,2021-08-23T00:35:58.262101,Bundesamt für Landwirtschaft,__,2016
1,aa742a0e-7ba4-4f65-ab9f-c587dec73891,2017-03-22 10:14:06.441259,2021-08-23T01:28:12.718340,Office cantonal de la culture et du sport,__1,2017
2,aa742a0e-7ba4-4f65-ab9f-c587dec73891,2017-03-30 07:19:29.752609,2021-08-23T01:40:02.004042,Office de l'urbanisme,__10,2017


In [25]:
# Keep only the datasets whose metadata was created in 2020
df_2020 = df[df['year'] == 2020]

# Number of datasets per author among those created in 2020
ds_2020_authors = df_2020.groupby('author').size()

# The five authors with the most datasets, largest first (computed once and
# reused for both the sentence and the displayed table)
top_2020_authors = ds_2020_authors.nlargest(5)

# Report the leader by name and count. Interpolating the whole Series would
# leak its multi-line repr (index header, "dtype: int64") into the sentence.
if top_2020_authors.empty:
    print("No datasets with author information were created on opendata.swiss in 2020.")
else:
    print(
        f"In 2020 the largest number of datasets on opendata.swiss was added by "
        f"{top_2020_authors.index[0]}: {top_2020_authors.iloc[0]} datasets"
    )

# Show the top 5 authors as the cell output
top_2020_authors


The largest number of datasets uploaded to the opendata.swiss corresponds to the author
BFS/OFS    862
dtype: int64, where 862 is the number of datasets


author
BFS/OFS                                            862
Amt für Raumentwicklung und Geoinformation (SG)     89
Kanton Thurgau, Amt für Geoinformation              62
Geoinformation Stadt Bern                           35
Statistik Stadt Zürich, Präsidialdepartement        31
dtype: int64