## Data sourcing

Source data from various source systems and ingest it using Python code.

1. Parquet files
2. CSV files
3. APIs
4. RDBMS databases
5. HTML

In [1]:
# import modules
import json
import os
import sqlite3
from unicodedata import normalize

import certifi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import urllib3
from urllib3 import request

### Sourcing Parquet data

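Download the January 2022 Yellow Taxi trip records (`yellow_tripdata_2022-01.parquet`) from https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page and save the file next to this notebook.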

In [None]:
# Load the trip records from a local Parquet file with pandas' read_parquet.
# The path is relative, so the file has to be in the notebook's working directory.
df_parquet = pd.read_parquet("yellow_tripdata_2022-01.parquet")
# Preview the first rows as the cell output.
df_parquet.head()

### Sourcing CSV data 

Download the data from https://data.cityofnewyork.us/resource/h9gi-nx95.csv?$limit=500 and save it as `h9gi-nx95.csv` next to this notebook.


In [None]:
# Load the records from a local CSV file with pandas' read_csv.
# The path is relative, so the file has to be in the notebook's working directory.
df_csv = pd.read_csv("h9gi-nx95.csv")
# Preview the first rows as the cell output.
df_csv.head()

### Sourcing data from APIs

Make sure the `certifi` library is installed, e.g. with `pipenv install certifi`.

In [None]:
# get api data from url
url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500'

# Create the pool manager BEFORE the first request (it was previously used
# before it existed). Certificates are verified against certifi's CA bundle:
# certificate errors should never be silenced, as that is a security threat.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

# Fetch once and reuse the response for both the status check and the payload
# instead of hitting the API twice.
response = http.request('GET', url)
apt_status = response.status
print(apt_status)
if apt_status == 200:
    data = json.loads(response.data.decode('utf-8'))
    df_api = pd.json_normalize(data)
else:
    # API not available: fall back to an empty frame so the cell still renders.
    df_api = pd.DataFrame()
df_api.head(10)

### PISA API Testing

In [130]:
import sys
import requests
import json
import logging

# Redirect warnings issued through the `warnings` module to the logging system
# instead of printing them in the cell output.
# NOTE(review): presumably added to hide urllib3's InsecureRequestWarning caused
# by the verify=False requests below - confirm this is intended.
logging.captureWarnings(True)

##
##    function to obtain a new OAuth 2.0 token from the authentication server
##
def get_new_token():
    """Request a client-credentials access token from the PISA QA auth server.

    The client id and secret are taken from the ``PISA_CLIENT_ID`` and
    ``PISA_CLIENT_SECRET`` environment variables when they are set.

    Returns
    -------
    str
        The ``access_token`` field of the token response.

    Raises
    ------
    SystemExit
        Via ``sys.exit(1)`` when the server does not answer with HTTP 200.
    """
    auth_server_url = "https://auth-pisa-qa.staging.gcp-eu.taocloud.org/v1/oauth2/tokens"

    # SECURITY: credentials must not live in a notebook. The literals below are
    # kept only as a backward-compatible fallback; set the environment
    # variables, rotate these values and then delete the fallbacks.
    client_id = os.environ.get('PISA_CLIENT_ID', 'Q2caBAhwvNF8NgmrEPKrUw')
    client_secret = os.environ.get('PISA_CLIENT_SECRET', 'wHtKRe6eA7BWRBNIISiTQk3uM210OJdQ')

    token_req_payload = {'grant_type': 'client_credentials'}

    # NOTE(review): verify=False disables TLS certificate verification while the
    # client secret is being sent - confirm whether the staging host really
    # needs this, and remove it if not.
    token_response = requests.post(
        auth_server_url,
        data=token_req_payload,
        verify=False,
        allow_redirects=False,
        auth=(client_id, client_secret),
        timeout=60,  # never hang the kernel on an unresponsive auth server
    )

    if token_response.status_code != 200:
        print("Failed to obtain token from the OAuth 2.0 server", file=sys.stderr)
        sys.exit(1)

    print("Successfuly obtained a new token")
    tokens = json.loads(token_response.text)
    return tokens['access_token']

##
##    Obtain a token once, before the first API call. The notebook-level
##    `token` is what the request cells below pass to req_get / req_post.
##
token = get_new_token()


Successfuly obtained a new token


In [128]:
# Authenticated request helpers

def _request_frame(method, url, token, params=None, data=None):
    """Send one bearer-authenticated request and return the JSON reply as a DataFrame.

    ``method`` is ``requests.get`` or ``requests.post``. If the server answers
    401 or 403, a fresh token is obtained with ``get_new_token()`` and the
    request is sent once more with it.
    """
    def _send(bearer):
        # `params` is passed by keyword: the second positional argument of
        # requests.post is `data` (the body), not the query string.
        # NOTE(review): verify=False disables TLS certificate verification.
        return method(
            url,
            params=params,
            headers={'Authorization': 'Bearer ' + bearer},
            json=data,
            verify=False,
            timeout=60,
        )

    api_call_response = _send(token)

    # Membership test instead of `a == 401 | b == 403`, which Python parses as
    # the chained comparison `a == (401 | b) == 403` and so never matches 401.
    if api_call_response.status_code in (401, 403):
        api_call_response = _send(get_new_token())

    payload = json.loads(api_call_response.text)
    return pd.json_normalize(payload)


# GET request function

def req_get(url,token,params=None,data=None):
    """Call ``url`` with HTTP GET using the bearer ``token``.

    Parameters
    ----------
    url : str
        Endpoint to call.
    token : str
        OAuth access token sent in the ``Authorization`` header.
    params : optional
        Query-string parameters.
    data : optional
        Object sent as the JSON request body.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of the decoded JSON response.
    """
    return _request_frame(requests.get, url, token, params=params, data=data)


# POST request function

def req_post(url,token,params=None,data=None):
    """Call ``url`` with HTTP POST using the bearer ``token``.

    Parameters
    ----------
    url : str
        Endpoint to call.
    token : str
        OAuth access token sent in the ``Authorization`` header.
    params : optional
        Query-string parameters.
    data : optional
        Object sent as the JSON request body.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of the decoded JSON response.
    """
    return _request_frame(requests.post, url, token, params=params, data=data)


In [131]:
# Query the API's health endpoint with the current token and show the reply.
api_url_health = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/health"
df_health = req_get(url=api_url_health,token=token)
df_health

Unnamed: 0,status
0,ok


In [52]:
# List the entities the API exposes. Later cells read the `name` column of
# this frame to decide which entities to describe and search.
api_url_entity_list = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/entity/list"
df_entity_list = req_get(url=api_url_entity_list,token=token)
df_entity_list

Unnamed: 0,name,description
0,portalSessions,Holds information on portal sessions
1,portalUserGroups,Holds information on portal user groups
2,portalGroups,Holds information on portal groups
3,portalUserSessions,Holds information on portal user sessions
4,portalUser,Holds information on portal user
5,portalBattery,Holds information on portal battery
6,testRunnerDeliveries,Holds information on test runner deliveries
7,datastoreDeliveries,Holds information on datastore deliveries
8,datastoreDeliveryResults,Holds information on datastore delivery results
9,datastoreItemResults,Holds information on datastore item results


In [6]:
def json_schema_get(entity,token):
    """Fetch the API's description of a single entity.

    Builds the ``/api/v1/entity/<entity>`` URL and issues an authenticated GET
    through ``req_get``.

    Parameters
    ----------
    entity : str
        Entity name appended to the endpoint path.
    token : str
        OAuth access token forwarded to ``req_get``.

    Returns
    -------
    The value returned by ``req_get`` for that endpoint.
    """
    endpoint_template = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/entity/%s"
    return req_get(url=endpoint_template % entity, token=token)

In [7]:
import pandas as pd

# Write one worksheet per entity, each holding that entity's description,
# and size every column to fit its widest value.
entities = df_entity_list['name']

# `worksheet.set_column` below is XlsxWriter API, so the engine is pinned
# instead of relying on whichever Excel engine pandas picks by default.
with pd.ExcelWriter('entity_specx.xlsx', engine='xlsxwriter') as writer:
    for ent in entities:
        df = json_schema_get(entity=ent,token=token)
        df.to_excel(writer, sheet_name=ent, index=False)
        worksheet = writer.sheets[ent]  # pull worksheet object
        for idx, col in enumerate(df):  # loop through all columns
            series = df[col]
            header_len = len(str(series.name))  # len of column name/header
            # An empty column has no cells to measure; .max() would give NaN
            # and produce an invalid column width.
            cell_len = int(series.astype(str).map(len).max()) if len(series) else 0
            max_len = max(cell_len, header_len) + 1  # adding a little extra space
            worksheet.set_column(idx, idx, max_len)  # set column width

In [132]:
def df_entity_extr(entity,token,data=None):
  """Run a search against one entity and return the response.

  Builds the ``/api/v1/search/<entity>`` URL and issues an authenticated POST
  through ``req_post``, passing ``data`` along as the request payload.

  Parameters
  ----------
  entity : str
      Entity name appended to the endpoint path.
  token : str
      OAuth access token forwarded to ``req_post``.
  data : optional
      Search payload forwarded to ``req_post``.

  Returns
  -------
  The value returned by ``req_post`` for that endpoint.
  """
  search_template = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/search/%s"
  return req_post(url=search_template % entity, token=token, data=data)

In [133]:
# Search payload: restrict results to one login and one test.
data = {
  # Optional paging controls, currently disabled:
  # 'pageSize': 1,
  # 'pageNumber': 0,
  "filters": [
    {
      "field": "login",
      "type": "terms",
      "values": ["MT0002"]
    },
    {
      "field": "testQtiId",
      "type": "terms",
      "values": ["ZZPNANUL"]
    }
  ]
}

# Select the entities by name rather than by row position: the position
# depends on the order in which the API lists them, while api_json_extract
# dispatches on these exact names.
entity_del = 'datastoreDeliveryResults'
entity_itm = 'datastoreItemResults'
assert {entity_del, entity_itm} <= set(df_entity_list['name']), \
    "expected entities are missing from the API's entity list"

df_del_results_all = df_entity_extr(entity = entity_del, token=token,data=data)

df_itm_results_all = df_entity_extr(entity = entity_itm, token=token,data=data)

In [107]:
def api_json_extract(df_raw,entity,options=None):
  """Flatten the first record of a search response into a DataFrame.

  Only ``df_raw.iloc[0]['data'][0]`` is read, i.e. the first element of the
  ``data`` list in the first row of ``df_raw``.

  Parameters
  ----------
  df_raw : pandas.DataFrame
      Frame with a ``data`` column whose first cell is a list of records.
  entity : str
      ``'datastoreDeliveryResults'`` or ``'datastoreItemResults'``.
  options : dict, optional
      Only consulted for ``'datastoreDeliveryResults'``:

      * ``{'json_var': name}`` - flatten ``record[name]`` and expand the first
        element of each row's ``values`` list into columns; adds ``varType``,
        the part of ``type`` after the first ``#``.
      * ``{'item_resp': True}`` - drop the metadata / ltiParameters /
        rawLtiParameters / outcomes columns and melt the ``items.item-*``
        columns to long form with ``key``, ``val``, ``q_num`` and ``q_lab``.

      With any other (or no) options the record is flattened as is.

  Returns
  -------
  pandas.DataFrame

  Raises
  ------
  ValueError
      If ``entity`` is not one of the two supported names.
  """
  supported = ('datastoreDeliveryResults', 'datastoreItemResults')
  if entity not in supported:
    # Previously this fell through to `return df` with `df` never assigned.
    raise ValueError(
      "Unsupported entity %r; expected one of: %s" % (entity, ', '.join(supported))
    )

  record = df_raw.iloc[0]['data'][0]
  opts = options if options is not None else {}
  is_delivery = (entity == 'datastoreDeliveryResults')

  if is_delivery and 'json_var' in opts:
    tmp = pd.json_normalize(record[opts['json_var']])
    df = (
      tmp
      .join(pd.json_normalize(pd.json_normalize(tmp['values'])[0]))
      .drop(columns=['values'])
    )
    df['varType'] = df['type'].str.split(pat = "#").str[1]
    return df

  # .get() so that options without an 'item_resp' key do not raise KeyError
  # (the former `&` evaluated options['item_resp'] unconditionally).
  if is_delivery and opts.get('item_resp') is True:
    item_prefix = 'items.item-'
    tmp = (
      pd.json_normalize(record)
      .filter(regex = '^(?!metadata)(?!ltiParameters)(?!rawLtiParameters)(?!outcomes)')
    )
    is_item_col = tmp.columns.str.startswith(item_prefix)

    return (
      pd.melt(
        tmp,
        id_vars = list(tmp.columns[~is_item_col]),
        value_vars = list(tmp.columns[is_item_col]),
        var_name = 'key',
        value_name = 'val'
      )
      .assign(
        # Every melted name starts with the prefix, so slice it off rather
        # than running an unanchored regex replace over the whole name.
        key=lambda df: df.key.str[len(item_prefix):]
      )
      .assign(
        q_num = lambda df: df.key.str.split('.',n=1).str[0],
        q_lab = lambda df: df.key.str.split('.',n=1).str[1],
      )
    )

  # Item results, and delivery results without a recognised option.
  return pd.json_normalize(record)

In [174]:
df_del_results = api_json_extract(df_del_results_all, entity = entity_del, options = {'item_resp': True})

# Item to show, defined once and used for both the row filter and the
# column-prefix removal below.
item_number = "3"
item_prefix = item_number + "."

# One row per delivery/login/test with that item's fields as columns.
(
  df_del_results
  .query('q_num == @item_number')
  .pivot_table(
    index=['deliveryId','login','testQtiId'],
    columns = 'key',
    values = 'val',
    aggfunc='first'
  )
  .rename(
    # Remove only the leading "<item>." prefix. str.strip('3.') would strip
    # any run of '3' and '.' characters from BOTH ends of the label.
    columns = lambda label: label[len(item_prefix):] if label.startswith(item_prefix) else label
  )
)



Unnamed: 0_level_0,Unnamed: 1_level_0,key,completionStatus,duration,itemEndTime,itemStartTime,maxScore,numAttempts,outcomes.SCORE,outcomes.completionStatus,qtiId,qtiLabel,qtiTitle,responses.RESPONSE.correct,responses.RESPONSE.value,score,statusCorrect,submissionTime
deliveryId,login,testQtiId,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
d60f257c8d73,MT0002,ZZPNANUL,completed,67,1699421454000,1699421386000,0,1,1,completed,item4,CR560Q03,CR560Q03,False,"{'{ \tMATCH(R560Q03RADIO_3,""1"") }'; 1; 'R560Q0...",1,partially,1699505356000


In [173]:
# Flatten the item-level search results. The entity passed must be the one the
# frame was fetched for (entity_itm), not the delivery-results entity.
df_itm_results = api_json_extract(df_itm_results_all, entity = entity_itm)
# Hide the bulky metadata / LTI / outcomes / _id columns in the preview.
df_itm_results.filter(regex = '^(?!metadata)(?!ltiParameters)(?!rawLtiParameters)(?!outcomes)(?!_id)')

Unnamed: 0,numAttempts,rawDuration,statusCorrect,sessionEndTime,login,testQtiId,testQtiTitle,submissionTime,rawResponses,qtiId,...,maxScore,qtiLabel,itemId,testQtiLabel,publicationTime,tenantId,responses,completionStatus,itemStartTime,isCorrect
0,1,PT1M7S,partially,1699505356000,MT0002,ZZPNANUL,Reading RA1,1699505356000,"[{'identifier': 'numAttempts', 'correct': None...",item4,...,0,CR560Q03,item-3,MSAT Misrouting Test_3 Nov,1698986592000,18,"[{'RESPONSE': {'correct': False, 'value': '{'{...",completed,1699421386000,False


In [37]:
# Build the results URL and query parameters for a single delivery.
deliveryId = "4d31793696fc"
api_url_delid_result = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/deliveries/%s/results" % deliveryId
# The delivery id appears both in the URL path and in the query parameters.
params = (('page',1),('results',10),('deliveryId',deliveryId))
# The request itself is currently disabled. (It previously referred to an
# undefined `api_url_results_all`; the URL built above is the one to use.)
# df_results_delid = req_get(url=api_url_delid_result,token=token, params = params)
# df_results_delid

'[["page", 1], ["results", 10], ["deliveryId", "4d31793696fc"]]'

### Sourcing Data from RDBMS tables

In [None]:
# Read sqlite query results into a pandas DataFrame.
# A sqlite3 connection used as a context manager only commits or rolls back
# the transaction; it does not close the connection. Close it explicitly.
conn = sqlite3.connect("movies.sqlite")
try:
    df = pd.read_sql("SELECT * from movies", conn)
finally:
    conn.close()
df.head()

### Sourcing data from web pages

Please visit the url https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)

In [None]:
# Parse the HTML tables on the page that contain text matching 'by country'.
# pd.read_html returns a list of DataFrames, one per matching table.
df_html = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)',match = 'by country')
# Let's see how many tables matched 'by country'
print(len(df_html)) # 4 when this notebook was written; the live page may change
# Let's see the first table
df_html[0]