## Data sourcing

Source data from various source systems and ingest it using Python code.

1. Parquet files
2. CSV files
3. APIs
4. RDBMS databases
5. HTML

In [3]:
# import modules
import certifi
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import urllib3
from urllib3 import request
import requests
from unicodedata import normalize

### Sourcing Parquet data

Please visit the url https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [None]:
# Read data from the Parquet file. We use pandas read_parquet method for ease and speed.
# The path is relative, so the file must already be present in the current working directory.
df_parquet = pd.read_parquet("yellow_tripdata_2022-01.parquet")
df_parquet.head()

### Sourcing CSV data 

Please visit the url https://data.cityofnewyork.us/resource/h9gi-nx95.csv?$limit=500


In [None]:
# Read data from the CSV file. We use pandas read_csv method for ease and speed.
# The path is relative, so the file must already be present in the current working directory.
df_csv = pd.read_csv("h9gi-nx95.csv")
df_csv.head()

### Sourcing data from APIs

Please make sure the certifi library is installed, for example with `pipenv install certifi`.

In [None]:
# get api data from url
url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500'

# Sometimes we get a certificate error. We should never silence this error as doing so may cause a security threat.
# Create the pool manager BEFORE issuing any request; it verifies server certificates
# against the CA bundle shipped with certifi.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

# Issue the request once and reuse the response for both the status check and the payload.
response = http.request('GET', url)
apt_status = response.status
print(apt_status)
if apt_status == 200:
    data = json.loads(response.data.decode('utf-8'))
    df_api = pd.json_normalize(data)
else:
    # API not available: fall back to an empty frame so the cell still displays something.
    df_api = pd.DataFrame()
df_api.head(10)

In [4]:
import sys
import requests
import json
import logging

# Redirect warnings issued through the `warnings` module into the logging system.
logging.captureWarnings(True)

##
##    function to obtain a new OAuth 2.0 token from the authentication server
##
def get_new_token():
    """Obtain a new OAuth 2.0 access token using the client-credentials grant.

    Posts to the authentication server with HTTP basic auth and returns the
    ``access_token`` field of the JSON response.

    Returns:
        The access token value from the response body.

    Raises:
        SystemExit: (exit code 1) if the server answers with a status other than 200.
    """
    import os  # local import: only needed for the credential lookup below

    auth_server_url = "https://auth-pisa-qa.staging.gcp-eu.taocloud.org/v1/oauth2/tokens"
    # SECURITY: credentials should not live in the notebook. They are read from the
    # environment when set; the literals remain only as a backward-compatible
    # fallback and should be rotated and removed.
    client_id = os.environ.get('OAUTH_CLIENT_ID', 'Q2caBAhwvNF8NgmrEPKrUw')
    client_secret = os.environ.get('OAUTH_CLIENT_SECRET', 'wHtKRe6eA7BWRBNIISiTQk3uM210OJdQ')

    token_req_payload = {'grant_type': 'client_credentials'}

    # NOTE(review): verify=False disables TLS certificate verification; kept as-is
    # because the staging endpoint's certificate setup is not visible here.
    # A timeout is set (matching the API helpers) so an unresponsive server cannot hang the kernel.
    token_response = requests.post(
        auth_server_url,
        data=token_req_payload,
        verify=False,
        allow_redirects=False,
        auth=(client_id, client_secret),
        timeout=60,
    )

    if token_response.status_code != 200:
        print("Failed to obtain token from the OAuth 2.0 server", file=sys.stderr)
        sys.exit(1)

    print("Successfuly obtained a new token")
    tokens = json.loads(token_response.text)
    return tokens['access_token']

##
##   obtain a token before calling the API for the first time;
##   this binds the module-level `token` that the following cells pass to the request helpers
##
token = get_new_token()


Successfuly obtained a new token


In [5]:
# Get request function

def req_get(url,token,params=None,data=None):
    """Call the API with a GET request and return the JSON response as a DataFrame.

    Args:
        url: Endpoint to call.
        token: OAuth 2.0 bearer token sent in the ``Authorization`` header.
        params: Optional query-string parameters.
        data: Optional object sent as the JSON request body.

    Returns:
        ``pd.json_normalize`` applied to the decoded JSON response body.

    If the server answers 401 or 403, a new token is requested and the call is
    retried once. The refreshed token is local to this call; the caller's own
    token variable is not updated.
    """

    def _send(bearer):
        # NOTE(review): verify=False disables TLS certificate verification.
        return requests.get(url, params, headers={'Authorization': 'Bearer ' + bearer},
                            json=data, verify=False, timeout = 60)

    api_call_response = _send(token)

    # `a == 401 | b == 403` would parse as a chained comparison against `401 | b`;
    # a membership test expresses the intended "401 or 403" check.
    if api_call_response.status_code in (401, 403):
        token = get_new_token()
        api_call_response = _send(token)

    payload = json.loads(api_call_response.text)
    df_api = pd.json_normalize(payload)
    return df_api

# Post request function

def req_post(url,token,params=None,data=None):
    """Call the API with a POST request and return the JSON response as a DataFrame.

    Args:
        url: Endpoint to call.
        token: OAuth 2.0 bearer token sent in the ``Authorization`` header.
        params: Optional query-string parameters.
        data: Optional object sent as the JSON request body.

    Returns:
        ``pd.json_normalize`` applied to the decoded JSON response body.

    If the server answers 401 or 403, a new token is requested and the call is
    retried once. The refreshed token is local to this call; the caller's own
    token variable is not updated.
    """

    def _send(bearer):
        # `params` must be passed by keyword: the second positional argument of
        # requests.post is the form body (`data`), not the query string.
        # NOTE(review): verify=False disables TLS certificate verification.
        return requests.post(url, params=params, headers={'Authorization': 'Bearer ' + bearer},
                             json=data, verify=False, timeout = 60)

    api_call_response = _send(token)

    # `a == 401 | b == 403` would parse as a chained comparison against `401 | b`;
    # a membership test expresses the intended "401 or 403" check.
    if api_call_response.status_code in (401, 403):
        token = get_new_token()
        api_call_response = _send(token)

    payload = json.loads(api_call_response.text)
    df_api = pd.json_normalize(payload)
    return df_api


In [6]:
# Call the API's health endpoint; req_get returns the JSON response flattened into a DataFrame.
api_url_health = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/health"
df_health = req_get(url=api_url_health,token=token)
df_health

Unnamed: 0,status
0,ok


In [7]:
# List the entities exposed by the API; the `name` column of the result is reused by later cells.
api_url_entity_list = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/entity/list"
df_entity_list = req_get(url=api_url_entity_list,token=token)
df_entity_list

Unnamed: 0,name,description
0,portalSessions,Holds information on portal sessions
1,portalUserGroups,Holds information on portal user groups
2,portalGroups,Holds information on portal groups
3,portalUserSessions,Holds information on portal user sessions
4,portalUser,Holds information on portal user
5,portalBattery,Holds information on portal battery
6,testRunnerDeliveries,Holds information on test runner deliveries
7,datastoreDeliveries,Holds information on datastore deliveries
8,datastoreDeliveryResults,Holds information on datastore delivery results
9,datastoreItemResults,Holds information on datastore item results


In [8]:
def json_schema_get(entity,token):
    """Fetch the API's description of one entity as a DataFrame.

    Args:
        entity: Entity name, appended to the ``/api/v1/entity/`` endpoint.
        token: OAuth 2.0 bearer token forwarded to ``req_get``.

    Returns:
        Whatever ``req_get`` returns for the entity endpoint.
    """
    # Use the `entity` parameter; the previous code read a global named `ent`,
    # which only existed while a loop in another cell happened to define it.
    api_entity_get = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/entity/%s" % entity
    df_entity_schema = req_get(url=api_entity_get,token=token)
    return df_entity_schema

In [7]:
import pandas as pd

entities = df_entity_list['name']

# `worksheet.set_column` below is an XlsxWriter worksheet method, so request that
# engine explicitly instead of relying on whichever engine pandas picks by default.
with pd.ExcelWriter('entity_specx.xlsx', engine='xlsxwriter') as writer:
    for ent in entities:
        # One sheet per entity, holding what the API returns for that entity.
        df = json_schema_get(entity=ent,token=token)
        df.to_excel(writer,sheet_name = ent, index = False)
        worksheet = writer.sheets[ent]  # pull worksheet object
        for idx, col in enumerate(df):  # loop through all columns
            series = df[col]
            longest_cell = series.astype(str).map(len).max()  # len of largest item
            if pd.isna(longest_cell):
                # A column with no rows has no largest item (max() is NaN); size by header only.
                longest_cell = 0
            header_len = len(str(series.name))  # len of column name/header
            max_len = int(max(longest_cell, header_len)) + 1  # adding a little extra space
            worksheet.set_column(idx, idx, max_len)  # set column width

In [9]:
# Pick the entity to search by its position in the entity list.
# NOTE(review): 8 is a magic index that depends on the order in which the API lists entities
# (the entity list output earlier in this notebook shows `datastoreDeliveryResults` at that position);
# selecting by name would be more robust.
entityId = df_entity_list.iloc[8]['name']
# This bare expression is not the last one in the cell, so it is not displayed.
entityId
api_url_del_results = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/search/%s" % entityId
# JSON body for the search endpoint: a `terms` filter on `testQtiId`.
# NOTE(review): pageNumber 0 is presumably the first page — confirm against the API docs.
data = {
  # 'pageSize': 1,
  'pageNumber': 0,
  'filters': [
    {
      'field': 'testQtiId',
      'type': 'terms',
      'values': ['R432']
    }
  ]
}

df_del_results_all = req_post(url=api_url_del_results,token=token, data = data)


In [10]:
def api_json_extract(df_raw,meta=None):
  """Flatten the first record of an API search response into a DataFrame.

  The record is read from ``df_raw.iloc[0]['data'][0]``.

  Args:
    df_raw: Normalized API response whose first row has a ``data`` entry
      holding a sequence of records.
    meta: Optional key of the record. When given, only the value stored under
      that key is normalized; otherwise the whole record is.

  Returns:
    ``pd.json_normalize`` of the record, or of ``record[meta]``.

  Raises:
    TypeError: if ``meta`` is given but is not a string.
    KeyError: if ``meta`` is not a key of the record.
  """
  record = df_raw.iloc[0]['data'][0]
  if meta is None:
    return pd.json_normalize(record)

  # Raise instead of calling exit(): exit() would terminate the interpreter or
  # kernel without telling the caller what was wrong with the argument.
  if not isinstance(meta, str):
    raise TypeError("meta must be a str, got %s" % type(meta).__name__)
  # The key is looked up in the nested record, so validate it against the
  # record rather than against the columns of the outer frame.
  if meta not in record:
    raise KeyError(meta)

  return pd.json_normalize(record[meta])

In [None]:
# Flatten the `metadataRaw` value of the first search result into its own DataFrame.
df_del_results = api_json_extract(df_del_results_all, meta = "metadataRaw")
df_del_results

In [None]:
# One-step alternative: produce one row per element of each entry's `values` list,
# carrying the entry-level fields named in `meta` alongside each row.
df_del_results_meta_vals = pd.json_normalize(df_del_results_all.iloc[0]['data'][0]['metadataRaw'],record_path='values',meta=['propertyLabel','alias','propertyUri','type'])
df_del_results_meta_vals

In [129]:
# df_del_results_meta_vals = df_del_results.explode('values',ignore_index=False)
# pd.json_normalize(df_del_results,record_path = 'values')

# Expand only the first element (position 0) of each row's `values` list into columns,
# join those columns onto the original rows, then drop the raw `values` column.
df_del_results_meta_vals = df_del_results.join(pd.json_normalize(pd.json_normalize(df_del_results['values'])[0])).drop(columns=['values'])
# `varType` is the part of the `type` string that follows the first '#'.
df_del_results_meta_vals['varType'] = df_del_results_meta_vals['type'].str.split(pat = "#").str[1]
df_del_results_meta_vals


Unnamed: 0,propertyLabel,alias,propertyUri,type,vuri,vlabel,varType
0,PISA25 Languages,,https://pisa2025.eu.premium.taocloud.org/#i63b...,http://www.tao.lu/datatypes/WidgetDefinitions....,https://www.oecd.org/en-ZZ,English (ZZ),ComboBox
1,PISA25 Is Translated Unit?,pisa25_is_translated_unit,https://pisa2025.eu.premium.taocloud.org/#i63b...,http://www.tao.lu/datatypes/WidgetDefinitions....,http://www.tao.lu/Ontologies/generis.rdf#False,False,RadioBox
2,PISA25 Source Unit State,pisa25_sourse_unit_state,https://pisa2025.eu.premium.taocloud.org/#i63b...,http://www.tao.lu/datatypes/WidgetDefinitions....,https://www.oecd.org/translationStateTranslated,Translated,ComboBox
3,PISA25 Domains,pisa25_domains,https://pisa2025.eu.premium.taocloud.org/#i63e...,http://www.tao.lu/datatypes/WidgetDefinitions....,https://www.oecd.org/REA,REA,ComboBox
4,PISA25 Group,pisa25_group,https://pisa2025.eu.premium.taocloud.org/#i641...,http://www.tao.lu/datatypes/WidgetDefinitions....,batch1,,TextBox
5,UI Engine,ui-engine,https://pisa2025.eu.premium.taocloud.org/#i649...,http://www.tao.lu/datatypes/WidgetDefinitions....,testRunnerConfiguration_ui-engine:core1,Core 1,ComboBox
6,Label,,http://www.w3.org/2000/01/rdf-schema#label,http://www.tao.lu/datatypes/WidgetDefinitions....,About A Book (R432),,TextBox


In [89]:
# NOTE(review): `dict_vals` is not defined in any cell visible in this notebook; unless it is
# defined elsewhere, this cell depends on leftover kernel state and raises NameError on a fresh kernel.
pd.json_normalize(dict_vals)

Unnamed: 0,vuri,vlabel
0,https://www.oecd.org/en-ZZ,English (ZZ)


In [37]:
# Build the per-delivery results URL and the paging parameters; the request itself is disabled below.
deliveryId = "4d31793696fc"
api_url_delid_result = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/deliveries/%s/results" % deliveryId
params = (('page',1),('results',10),('deliveryId',deliveryId))
# NOTE(review): the disabled call refers to `api_url_results_all`, which is not defined in the
# visible cells; `api_url_delid_result` above is presumably the intended URL — confirm before re-enabling.
# df_results_delid = req_get(url=api_url_results_all,token=token, params = params)
# df_results_delid

'[["page", 1], ["results", 10], ["deliveryId", "4d31793696fc"]]'

### Sourcing Data from RDBMS tables

In [None]:
# Read sqlite query results into a pandas DataFrame.
# `with sqlite3.connect(...)` only manages the transaction (commit/rollback); it does not
# close the connection, so close it explicitly once the query has been read.
conn = sqlite3.connect("movies.sqlite")
try:
    df = pd.read_sql("SELECT * from movies", conn)
finally:
    conn.close()
df.head()

### Sourcing data from webpages

Please visit the url https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)

In [None]:
# Get data from the URL; `match` keeps only the tables containing text that matches 'by country'.
df_html = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)',match = 'by country')
# Let's see how many tables contain the text 'by country'
print(len(df_html)) # 4 tables when this was written; the live page may have changed since
# Let's see the first table
df_html[0]