## Data sourcing

Source data from various source systems and ingest them using python code.

1. Parquet files
2. CSV files
3. APIs
4. RDBMS databases
5. HTML

In [1]:
# import modules
import certifi
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import urllib3
from urllib3 import request
import requests
from unicodedata import normalize

### Sourcing Parquet data

Please visit the url https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [None]:
# Read data from the Parquet file. We use pandas read_parquet method for ease and speed.
df_parquet = pd.read_parquet("yellow_tripdata_2022-01.parquet")
df_parquet.head()

### Sourcing CSV data 

Please visit the url https://data.cityofnewyork.us/resource/h9gi-nx95.csv?$limit=500


In [None]:
# Read data from the CSV file. We use pandas read_csv method for ease and speed.
df_csv = pd.read_csv("h9gi-nx95.csv")
df_csv.head()

### Sourcing data from APIs

Please make sure to install the certifi library using - pipenv install certifi

In [None]:
# get api data from url
url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500'

# Check if API is available to retrive the data
apt_status = http.request('GET', url).status
print(apt_status)
if apt_status == 200:
    # Sometimes we get certificate error . We shoul never silence this error as this may cause a securirty threat.
    # Create a Pool manager that can be used to read the API response 
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',ca_certs=certifi.where())
    data = json.loads(http.request('GET', url).data.decode('utf-8'))
    df_api = pd.json_normalize(data)
else:
    df_api = pd.Dataframe()
df_api.head(10)

### PISA API Testing

In [1]:
import pyspark,psycopg2
from pyspark.sql import SparkSession,SQLContext

spark = SparkSession.builder \
  .master("local[1]") \
  .appName("chapter6_schemas") \
  .config("spark.executor.memory", '3g') \
  .config("spark.executor.cores", '1') \
  .config("spark.cores.max", '1') \
  .config("spark.jars.packages","org.postgresql:postgresql:42.0.0") \
  .getOrCreate()

:: loading settings :: url = jar:file:/workspace/pisa2025-api-etl/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/rstudio/.ivy2/cache
The jars for the packages stored in: /home/rstudio/.ivy2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0bdc9eab-5a5f-4e5d-ba5e-86564c3c82c2;1.0
	confs: [default]
	found org.postgresql#postgresql;42.0.0 in central
downloading https://repo1.maven.org/maven2/org/postgresql/postgresql/42.0.0/postgresql-42.0.0.jar ...
	[SUCCESSFUL ] org.postgresql#postgresql;42.0.0!postgresql.jar(bundle) (1088ms)
:: resolution report :: resolve 4586ms :: artifacts dl 1091ms
	:: modules in use:
	org.postgresql#postgresql;42.0.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   1   |   1   |   0   ||

In [2]:
import psycopg2
import psycopg2.extras
from etl.create_table import *
from etl.psycopg2_database_helper import *

params = {
  'host': 'localhost',
  'database': 'postgres',
  'user': 'postgres',
  'password': 'postgres',
  'port': 5432
}

conn = get_postgres_connection(
  host=params['host'],
  database=params['database'],
  user = params['user'],
  password = params['password'],
  port = params['port']
)

cursor = conn.cursor()
sql = '''DROP TABLE IF EXISTS "delivery_results";'''
# Executing the query 
cursor.execute(sql) 
sql = '''
        CREATE TABLE delivery_results (
            delivery_execution_id VARCHAR(100) PRIMARY KEY,
            delivery_id character(12) NOT NULL,
            is_deleted BOOL NOT NULL,
            last_update_date BIGINT NOT NULL,
            login VARCHAR(50) NOT NULL,
            test_qti_id VARCHAR(255) NOT NULL,
            test_qti_label VARCHAR(255) NOT NULL,
            test_qti_title VARCHAR(255) NOT NULL,
            raw_data TEXT NOT NULL
        );
        '''
cursor.execute(sql) 
# close communication with the PostgreSQL database server
cursor.close()
# commit the changes
conn.commit()


In [3]:
import etl.postgresqlschemareader as pgsr

pgsr_tables = pgsr.get_tables(conn)
pgsr_cols = pgsr.get_columns(conn,'public','delivery_results')
pgsr_tree = pgsr.get_tree(conn)

pgsr.print_tree(pgsr_tree)
pgsr.print_columns(pgsr_cols)

public.delivery_results
 |-delivery_execution_id (character varying)
 |-delivery_id (character)
 |-is_deleted (boolean)
 |-last_update_date (bigint)
 |-login (character varying)
 |-test_qti_id (character varying)
 |-test_qti_label (character varying)
 |-test_qti_title (character varying)
 |-raw_data (text)
Column Name:              delivery_execution_id
Ordinal Position:         1
Is Nullable:              NO
Data Type:                character varying
Character Maximum Length: 100

Column Name:              delivery_id
Ordinal Position:         2
Is Nullable:              NO
Data Type:                character
Character Maximum Length: 12

Column Name:              is_deleted
Ordinal Position:         3
Is Nullable:              NO
Data Type:                boolean
Character Maximum Length: None

Column Name:              last_update_date
Ordinal Position:         4
Is Nullable:              NO
Data Type:                bigint
Character Maximum Length: None

Column Name:              

In [375]:
import sys
import requests
import json
import pandas as pd
import logging

logging.captureWarnings(True)

##
##    function to obtain a new OAuth 2.0 token from the authentication server
##
def get_new_token(type: str):

    if(type=="staging"):
        auth_server_url = "https://auth-pisa-qa.staging.gcp-eu.taocloud.org/v1/oauth2/tokens"
        client_id = 'Q2caBAhwvNF8NgmrEPKrUw'
        client_secret = 'wHtKRe6eA7BWRBNIISiTQk3uM210OJdQ'
    elif(type=='prod'):
        auth_server_url = "https://auth-pisa-prod.prod.gcp-eu.taocloud.org/v1/oauth2/tokens"
        client_id = 'Amb9_PvLT4I2ui8flocJsA'
        client_secret = 'a8OIzcsHn0z8Z7mtYG9zfjXZotrIJQS1lNuEKEVSFHs'

    token_req_payload = {'grant_type': 'client_credentials'}

    token_response = requests.post(auth_server_url,
    data=token_req_payload, verify=False, allow_redirects=False,
    auth=(client_id, client_secret))
                
    if token_response.status_code !=200:
        print("Failed to obtain token from the OAuth 2.0 server", file=sys.stderr)
        sys.exit(1)

    print("Successfuly obtained a new token")
    tokens = json.loads(token_response.text)
    return tokens['access_token']

## 
## 	obtain a token before calling the API for the first time
##
token = get_new_token(type='prod')


Successfuly obtained a new token


In [376]:
# API request function

def req_func(req_type,url,token,data=None):

##
##   call the API with the token
##
    
    headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer ' + token
    }

    req_req = str.upper(req_type)

    response = requests.request(req_req, url, headers=headers, data=data)

    if	(response.status_code == 401 | response.status_code == 403):
        token = get_new_token()
        response = requests.request(req_req, url, headers=headers, data=data)


    return response


In [377]:
api_url_health = "https://dynamic-query-api-pisa-prod.prod.gcp-eu.taocloud.org/api/v1/health"
df_health = req_func(req_type = "get", url=api_url_health,token=token)
df_health.text

'{"status":"ok"}'

In [378]:
api_url_entity_list = "https://dynamic-query-api-pisa-prod.prod.gcp-eu.taocloud.org/api/v1/entity/list"
response = req_func(req_type = "get", url=api_url_entity_list,token=token)
df_entity_list = pd.json_normalize(json.loads(response.text))
df_entity_list

Unnamed: 0,name,description
0,portalSessions,Holds information on portal sessions
1,portalUserGroups,Holds information on portal user groups
2,portalGroups,Holds information on portal groups
3,portalUserSessions,Holds information on portal user sessions
4,portalUser,Holds information on portal user
5,portalBattery,Holds information on portal battery
6,testRunnerDeliveries,Holds information on test runner deliveries
7,datastoreDeliveries,Holds information on datastore deliveries
8,datastoreDeliveryResults,Holds information on datastore delivery results
9,datastoreItemResults,Holds information on datastore item results


In [379]:
def json_schema_get(entity,token):
    api_entity_get = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/entity/%s" % entity
    df_entity_schema = req_func(req_type = "get",url=api_entity_get,token=token)
    return df_entity_schema

In [380]:
import pandas as pd

entities = df_entity_list['name']

with pd.ExcelWriter('entity_specx.xlsx') as writer:
    for ent in entities:    
        resp = json_schema_get(entity=ent,token=token)
        df = pd.json_normalize(resp.json())
        df.to_excel(writer,sheet_name = ent, index = False)
        worksheet = writer.sheets[ent]  # pull worksheet object
        for idx, col in enumerate(df):  # loop through all columns
            series = df[col]
            max_len = max((
                series.astype(str).map(len).max(),  # len of largest item
                len(str(series.name))  # len of column name/header
                )) + 1  # adding a little extra space
            worksheet.set_column(idx, idx, max_len)  # set column width

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [381]:
def api_json_extract(response,entity,options=None):

  json_raw = response.json()
  df_raw = pd.json_normalize(json_raw)

  if(options is not None):
    if (entity == 'datastoreDeliveryResults'):
      if('json_var' in options):
        json_var = options['json_var']
        tmp = df_raw.iloc[0]['data'][0][json_var]
        df = (
          tmp
          .join(pd.json_normalize(pd.json_normalize(tmp['values'])[0]))
          .drop(columns=['values'])
        )
        df['varType'] = df['type'].str.split(pat = "#").str[1]
      elif(('item_resp' in options) & (options['item_resp'] is True)):
        tmp = (
            pd.json_normalize(df_raw.iloc[0]['data'][0])
            .filter(regex = '^(?!metadata)(?!ltiParameters)(?!rawLtiParameters)(?!outcomes)')
          )
      
        idvars = tmp.columns[~tmp.columns.str.startswith('items.item-')]
        pivotvars = tmp.columns[tmp.columns.str.startswith('items.item-')]
        
        out = (
          pd.melt(
              tmp,
              id_vars = idvars,
              var_name = 'key',
              value_name = 'q_val',
              value_vars = pivotvars
          )
          .assign(
            key=lambda df: df.key.replace('items.item-','',regex=True)
          )
          .assign(
            q_num = lambda df: df.key.str.split('.',n=1).str[0],
            q_lab = lambda df: df.key.str.split('.',n=1).str[1],
          )
          .drop(
            ['key'],
            axis = 1
          )
        )
    else:
      out = (
        pd
        .json_normalize(df_raw.iloc[0]['data'][0])
      )
  else:
    out = json_raw

  return out

In [382]:
# payload = json.dumps({
#   # "pageSize": 1,
#   "filters": [
#     # {
#     #   "field": "deliveryId",
#     #   "type": "terms",
#     #   "values": [
#     #     "d60f257c8d73"
#     #   ]
#     # }
#   ]
# })

# payload = json.dumps({
#   "response": {
#     "fields": [
#       "deliveryId",
#       "login",
#       "deliveryExecutionId",
#       "submissionTime",
#       "testQtiId",
#       "testQtiTitle",
#       "testQtiLabel",
#       "last_update_date",
#       "sessionStartTime",
#       "sessionEndTime",
#       "score",
#       "duration",
#       "isDeleted",
#       # "items"
#     ],
#     "flatResponse": False,
#   }
# })

payload = json.dumps({
  "response": {
    # "fields": [
    #   "deliveryId",
    #   "cutScoreRatio",
    #   "id",
    #   "items",
    #   "last_update_date",
    #   "messageId",
    #   "metadata",
    #   "metadataRaw",
    #   "outcomeDeclarations",
    #   "publicationTime",
    #   "qtiId",
    #   "qtiPackage",
    #   "qtiTitle",
    #   "structure",
    #   "tenantId",
    #   "testQtiId",
    #   "testQtiLabel",
    #   "testQtiTitle",
    # ],
    "flatResponse": False,
  }
})

entities = df_entity_list[7:8]['name']
ent_df = {}

for ent in entities:
  url = "https://dynamic-query-api-pisa-prod.prod.gcp-eu.taocloud.org/api/v1/search/%s" % ent
  df = req_func(req_type='post',url=url,token=token, data=payload)
  ent_df[ent] = df

# entity_del = df_entity_list.iloc[8]['name']

# df_del_results_all = df_entity_search(entity = entity_del, token=token,data=payload)

# entity_itm = df_entity_list.iloc[9]['name']
# df_itm_results_all = df_entity_search(entity = entity_itm, token=token,data=data)

# entity_ui = df_entity_list.iloc[10]['name']
# df_evnts_results_all = df_entity_search(entity = entity_ui, token=token,data=data)

# entity_acs = df_entity_list.iloc[11]['name']
# df_acs_results_all = df_entity_search(entity = entity_itm, token=token,data=data)

In [383]:
ent_keys = list(ent_df)
# output = api_json_extract(ent_df[ent_keys[0]], entity = ent_keys[0], options = {'item_resp': True})
output = api_json_extract(ent_df[ent_keys[0]], entity = ent_keys[0], options = None)
# (
  # output
  # .query('q_lab.str.match("score")')
  # .query('q_num == "3"')
  # .query('val.str.match("CR560Q03")')
  # .assign(
  #   val = lambda df: df.val.astype(int)
  # )
  # .groupby(['deliveryId','login','testQtiId'],as_index = False)
  # .size()
  # .agg({'val': 'sum'})
  # .pivot_table(
  #   index=['deliveryId','login','testQtiId'],
  #   columns = 'key',
  #   values = 'val',
  #   aggfunc='first'
  # )
  # .rename(
  #   columns = lambda x: x.strip('3.')
  # )
# )



In [384]:
from datetime import date

all_deliveries_results = pd.json_normalize(output,record_path=['data'],max_level=0)
# all_deliveries_results['uniqueId'] = all_deliveries_results['deliveryId'] + all_deliveries_results['login']
del_tab = all_deliveries_results[['deliveryId','testQtiLabel','testQtiId']].drop_duplicates()
del_tab.to_excel('all_delivery results '+ date.today().strftime('%Y%m%d') + '.xlsx')
del_tab

Unnamed: 0,deliveryId,testQtiLabel,testQtiId
0,17c4f8525090,,
1,80db7dadd975,,
2,bafb277da874,,
3,9797742b79d7,,
4,8992cf2174bf,,
...,...,...,...
570,325a08c094c5,StQ-development,ZOVLEOIO
571,16f2829ef709,Magnetic Cleaners (CACERS016),CACERS016
572,bacb066f5d37,FTSTQ,FTSTQ
573,249e76c333fa,FTSTQ,FTSTQ


In [361]:
all_deliveries = pd.json_normalize(output,record_path=['data'])
all_deliveries.astype(str).drop_duplicates().to_excel('all_deliveries '+ date.today().strftime('%Y%m%d') + '.xlsx')
all_deliveries['items']

0      [{'taskFormat': None, 'itemId': '09251c89-86cd...
1      [{'taskFormat': None, 'itemId': 'ca3327a9-6a19...
2      [{'taskFormat': None, 'itemId': '111f1624-57ec...
3      [{'taskFormat': None, 'itemId': '0c8bc35d-6955...
4      [{'taskFormat': None, 'itemId': '551cd067-fcba...
                             ...                        
838    [{'taskFormat': None, 'itemId': 'e9c82669-d9c5...
839    [{'taskFormat': None, 'itemId': 'cf8db62a-dabd...
840    [{'taskFormat': None, 'itemId': 'fb2f9d1a-8074...
841    [{'taskFormat': None, 'itemId': 'c0a32bc5-3f04...
842    [{'taskFormat': None, 'itemId': '5a5a92e6-1705...
Name: items, Length: 843, dtype: object

In [368]:
from flatten_json import flatten

jsondata = all_deliveries['items'][0]

dic_flattened = (flatten(d,".") for d in jsondata)

df = pd.DataFrame(dic_flattened)

df

Unnamed: 0,taskFormat,itemId,qtiTitle,stringType,authoringPosition,maxScore,masteryLevel,assessmentItemId,qtiId,qtiLabel
0,,09251c89-86cd-4e04-9fd1-b98b5b5e4075,GeneralOrientationQ01,,,,,item-6,i6490e1eee277711575ae0296ff9eb1b,GeneralOrientationQ01
1,,7ce9ab9e-6bd8-4862-ba52-308ca415fe10,GeneralOrientationQ04,,,,,item-7,i6490e3347112c1036785dc677de779ab6e,GeneralOrientationQ04
2,,13800544-a40f-4ebe-be13-df690331f8d4,GeneralOrientationQ05,,,,,item-8,i6490e44ba3144395427e968658b0192ec2,GeneralOrientationQ05
3,,da41cc4f-56ed-4fb3-8afa-94042a9193e9,Effort Thermometer,,1.0,0.0,,item-5,i6516042d91518654402e463123059bb,Effort Thermometer
4,,a1d9f273-2e35-49e0-972b-a00ca79abbd2,GeneralOrientationQ01,,,,,item-1,i6490e1eee277711575ae0296ff9eb1b,GeneralOrientationQ01
5,,18c2524f-72e4-413a-aabc-e7901f19c533,GeneralOrientationQ04,,,,,item-2,i6490e3347112c1036785dc677de779ab6e,GeneralOrientationQ04
6,,20f236af-1ba5-47ba-a4c2-03e968dee0ab,GeneralOrientationQ05,,,,,item-3,i6490e44ba3144395427e968658b0192ec2,GeneralOrientationQ05
7,,2e804e9f-b0e3-4f95-874a-4a33657e48f4,Effort Thermometer,,2.0,0.0,,item-4,i6516042d91518654402e463123059bb,Effort Thermometer


In [385]:
def get_dat(resp):
  meta_cols = [
    # 'battery_id',
    'delivery_execution_id',
    'delivery_id',
    'isDeleted',
    'last_update_date',
    'login',
    'test_qti_id',
    'test_qti_label',
    'test_qti_title',
    'raw_data'
  ]

  dat = (
    pd.json_normalize(
      resp,record_path = 'data',max_level=0
    )
    .rename(
      columns={
        'batteryId':'battery_id',
        'deliveryExecutionId':'delivery_execution_id',
        'deliveryId':'delivery_id',
        'testQtiId':'test_qti_id',
        'testQtiLabel':'test_qti_label',
        'testQtiTitle':'test_qti_title',
        'items':'raw_data'
      }
    )
    .filter(
      # regex = "|".join(meta_cols[0:-1]) + '|^items\.',
      regex = "|".join(meta_cols),
      axis=1
    )
  )

  return dat

In [386]:
payload = json.dumps({
  "pageSize": 100,
  "pageNumber": 0,
  "response": {
    "fields": [
      "deliveryId",
      "login",
      "deliveryExecutionId",
      "submissionTime",
      "testQtiId",
      "testQtiTitle",
      "testQtiLabel",
      "last_update_date",
      "sessionStartTime",
      "sessionEndTime",
      "score",
      "duration",
      "isDeleted",
      "items"
    ],
    "flatResponse": False
  },
  "filters": [
      {
        "field": "login",
        "type": "terms",
        "values": [
          'FD01'
        ]
      }
    ]
  }
)
url = "https://dynamic-query-api-pisa-prod.prod.gcp-eu.taocloud.org/api/v1/search/datastoreDeliveries"
df = req_func(req_type='post',url=url,token=token, data=payload)


In [387]:
deliveries = ["79b54e16d502","39e9347aa8c8","07016da68d7b","dfdde55aaece","e9fc52080732","a1bdb0a3185e","641b1e05580d","d561bf5ce367","b0b83a7e9969","e29c0cc22dcf",
"87119f1f417d","561c998bf18b","c9fd8364c6b1","f56ba61bd014","7763b8b04da2","1ba0375e10fd","1b10aa34c033","3e9c1de5ef9a","bd9cf1e7c5df","0ce8f8dbe8de",
"d69d568926a0","fad16761c154","02b2852898bd","b436bb866c3b","0e09be904f95","beeebbc54665","c31e53ffed42","285aad37edb4","6f96d46b022f","8f7b6f61d444",
"6a8258024b56","7fab12a01b47","688e7c15f2b5","b308fb967b34","6710b6ec7d43","23abba445a95","e36fccf9ad49","a64a57fb30c1","321a699e9c3e","ae7d3326bfbe",
"162151fce556","d292dc5fbc18","d5ab7c1a2b38","f7ca8dade8c0","d0f9e58197ba","76ef955b2e49","87040b6da779","7b82d089e97e","972841946b52","58e42b858a4b",
"cb09cea2e190","f8a87921b1e5","dc21aed34111","9a30336b0415","244d62db9eaf","667a5500e9dd","47569843a96d","82c00e6ddbee","eecc20e16409","926ed4b705e4",
"5713a09e2097","67343cc95cc8","04971c3ae10f","cd3ec8c1be3c","2d7a9a4ab557","a0ccf30faf32","1623b5d1be9e","f258dd938bf6","6b9191dbb566","b0a98886dd1f",
"98f2c133db90","44aabff58425","6a3a47935d7f","467c0252393e","df4d1751fc33","5a1f678877c1","325a08c094c5","eeaf46c4e024","c6ca376af5ff","118597f0763c",
"26d6f8f40bb4","162849003c3b","0e885a6d7a4b","a502b8492ae9","18cd8f8a19cf","32e69abadbb6","16f2829ef709","bacb066f5d37","249e76c333fa","225475124b6f",
"cc8c011217c2","f6b172c781ac","a82b61f299ec","942e22db7247","64b1ebad6380","27635942534f","e8cb0e780e91","8eb31a0c6ca7","6f4e30702907","27bd69464802",
"e461663b777b","383ad88bc693","c0eb1dfd8720","c877e5b781ad","3bed6792e030","ec632dc90ba5","e74bfd4ec632","d2416cc89c76","3e2c6324e749","25e64287fb73",
"0904e4bfd584","7c5b8a79ef24","0fdfecbddcf5","4bd00025df05","b21f77be722e","5746308f9196","7c2a9db8b179","212e9c147d62","cfb1d779e1d6","99dae0924597",
"9129fedb3f4b","8ea6c7c5f894","7d969f08ef9b","069053efcd52","49fb5cab0341","9d195d02603a","b7217290b1f6","d47ab2c4bcab","7f5275e566c9","996fa3a2eff8",
"a048b08baeed","df0b637de7fc","673a7889d5e2","e16097db4972","62c5f4df516d","990d24f8da8e","b5df0cdc7263","b881cb6097aa","eb557cde0ded","eccc2375cb5d",
"0a198366c32b","4b0b666d1bfb","5e63d89bfb32","828b4305680e","d7fd57a70945","14dd1eb09172","37a8f1d68eb7","fc15902dfcd9","a583160eb6cf","db8f2fa6add8",
"399b960735b9","94d5ac1643c4","9f65c1484930","158e091a9b33","c0d765cdebae","6b63268684c5","dd955fbfe20e","f00480fb0ebb","63105b303385","0b1d0f3eca16",
"c796a3dd3bed","da3388e324ec","c91600ab2b61","ac10b7f2a418","4aec0571e7e2","e37961524ff9","fd001f4d5d8c","f7a6d261ce20","24c506f12b26","d2dcef515b33",
]

In [389]:
# deliveries = ['40c59a1f04ed'] #FLA
# deliveries = ['ca057225ca76'] #LDW
# deliveries = del_tab.deliveryId[-10:]
# deliveries = [
#   'db328a265d14',
#   'e0a006c7d3e9',
#   'ae549e93a118',
#   '62d8fd246de4',
#   '296bcd832a12',
#   'db328a265d14',
#   '5cc5bc3e8b84',
# ] # Science QTI items
# deliveries = [
#   '342033b73c66',
#   'a9855cf1f1e1',
#   'ec779c669407',
# ] # Science PCI items

del_df = {}

for dl in deliveries:
  payload = json.dumps({
    "pageSize": 1000,
    "pageNumber": 0,
    "response": {
      "fields": [
        "deliveryId",
        "login",
        "deliveryExecutionId",
        "submissionTime",
        "testQtiId",
        "testQtiTitle",
        "testQtiLabel",
        "last_update_date",
        "sessionStartTime",
        "sessionEndTime",
        "score",
        "duration",
        "isDeleted",
        "items"
      ],
      "flatResponse": False
    },
    "filters": [
        {
          "field": "deliveryId",
          "type": "terms",
          "values": [
            "%s" % dl
          ]
        }
      ]
    }
  )
  url = "https://dynamic-query-api-pisa-prod.prod.gcp-eu.taocloud.org/api/v1/search/datastoreDeliveryResults"
  df = req_func(req_type='post',url=url,token=token, data=payload)
  del_df[dl] = df

In [30]:
for k,v in del_df.items():
  filename = './data/uiEvents_' + k + '.json'
  json_data = api_json_extract(v, entity = k, options = None)
  with open(filename, 'w') as output_data:
    output_data.write(
      json.dumps(json_data, ensure_ascii=False)
    )

In [17]:
import glob, re
from pyspark.sql.functions import *
from pyspark.sql.types import *

json_files = glob.glob("./data/deliveryResults*.json")
# json_files = [x for x in json_files if 'db328a265d14' in x]

for f in json_files:
  df_json = spark.read.option("multiline","true") \
    .json(f) \
    .select(
      '*',
      explode('data').alias("dataExplode")
    ) \
    .select("dataExplode.*") \
    .selectExpr(
      'deliveryExecutionId as delivery_execution_id',
      'deliveryId as delivery_id',
      'isDeleted as is_deleted',
      'last_update_date',
      'login',
      'testQtiid as test_qti_id',
      'testQtiLabel as test_qti_label',
      'testQtiTitle as test_qti_title',
      'items as raw_data'
    ) \
    .withColumn(
      'raw_data',
      to_json('raw_data')
    )
  
  df_json.show()
  df_json.printSchema()

  upsert_spark_df_to_postgres(
    dataframe_to_upsert=df_json,
    table_name='delivery_results',
    table_unique_key=['delivery_execution_id'],
    database_credentials=params
  )


                                                                                

+---------------------+------------+----------+----------------+------------+-----------+----------------+--------------+--------------------+
|delivery_execution_id| delivery_id|is_deleted|last_update_date|       login|test_qti_id|  test_qti_label|test_qti_title|            raw_data|
+---------------------+------------+----------+----------------+------------+-----------+----------------+--------------+--------------------+
| 80050004551Z#36b7...|36b7bb968e6b|     false|   1705033914095|Z15540005008|   FLA-S-10|FLA-S-10 (en-ZZ)|      FLA-S-10|{"cluster1-FLAS10...|
+---------------------+------------+----------+----------------+------------+-----------+----------------+--------------+--------------------+

root
 |-- delivery_execution_id: string (nullable = true)
 |-- delivery_id: string (nullable = true)
 |-- is_deleted: boolean (nullable = true)
 |-- last_update_date: long (nullable = true)
 |-- login: string (nullable = true)
 |-- test_qti_id: string (nullable = true)
 |-- test_qti_

                                                                                


#################################################
 Total records loaded - 1
 Total records rejected - 0
#################################################

 Started Printing Error Messages ....
[]
 Completed Printing Error Messages ....


24/01/29 23:41:05 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---------------------+------------+----------+----------------+------------+-----------+--------------+--------------+--------------------+
|delivery_execution_id| delivery_id|is_deleted|last_update_date|       login|test_qti_id|test_qti_label|test_qti_title|            raw_data|
+---------------------+------------+----------+----------------+------------+-----------+--------------+--------------+--------------------+
| 70020004211Z#b44f...|b44f2d5980cb|     false|   1705442133409|Z11240002007|       ML19|  ML19 (en-ZZ)|          ML19|{"cluster1-M411-i...|
+---------------------+------------+----------+----------------+------------+-----------+--------------+--------------+--------------------+

root
 |-- delivery_execution_id: string (nullable = true)
 |-- delivery_id: string (nullable = true)
 |-- is_deleted: boolean (nullable = true)
 |-- last_update_date: long (nullable = true)
 |-- login: string (nullable = true)
 |-- test_qti_id: string (nullable = true)
 |-- test_qti_label: str

                                                                                

+---------------------+------------+----------+----------------+-------+--------------+--------------------+--------------+--------------------+
|delivery_execution_id| delivery_id|is_deleted|last_update_date|  login|   test_qti_id|      test_qti_label|test_qti_title|            raw_data|
+---------------------+------------+----------+----------------+-------+--------------+--------------------+--------------+--------------------+
| 7WDLMUD#5391727f2...|5391727f248b|     false|   1705283810621|DUMLDW7|S7_FUO-S15_FUO|S7_FUO-S15_FUO (e...|S7_FUO-S15_FUO|{"cluster1-CACERS...|
| 8WDLMUD#5391727f2...|5391727f248b|     false|   1705284200969|DUMLDW8|S7_FUO-S15_FUO|S7_FUO-S15_FUO (e...|S7_FUO-S15_FUO|{"cluster1-CACERS...|
| 6WDLMUD#5391727f2...|5391727f248b|     false|   1705278535674|DUMLDW6|S7_FUO-S15_FUO|S7_FUO-S15_FUO (e...|S7_FUO-S15_FUO|{"cluster1-CACERS...|
+---------------------+------------+----------+----------------+-------+--------------+--------------------+--------------+-------

                                                                                

+---------------------+------------+----------+----------------+------------+-----------+--------------+--------------+--------------------+
|delivery_execution_id| delivery_id|is_deleted|last_update_date|       login|test_qti_id|test_qti_label|test_qti_title|            raw_data|
+---------------------+------------+----------+----------------+------------+-----------+--------------+--------------+--------------------+
| 30040006301Z#c838...|c83865996c0e|     false|   1705868390405|Z10360004003|        RA5|   RA5 (en-ZZ)|           RA5|{"cluster-id-stag...|
+---------------------+------------+----------+----------------+------------+-----------+--------------+--------------+--------------------+

root
 |-- delivery_execution_id: string (nullable = true)
 |-- delivery_id: string (nullable = true)
 |-- is_deleted: boolean (nullable = true)
 |-- last_update_date: long (nullable = true)
 |-- login: string (nullable = true)
 |-- test_qti_id: string (nullable = true)
 |-- test_qti_label: str

                                                                                

+---------------------+------------+----------+----------------+------------+-----------+--------------+--------------+--------------------+
|delivery_execution_id| delivery_id|is_deleted|last_update_date|       login|test_qti_id|test_qti_label|test_qti_title|            raw_data|
+---------------------+------------+----------+----------------+------------+-----------+--------------+--------------+--------------------+
| 70020004211Z#5689...|56897fe1e8a0|     false|   1705442285965|Z11240002007|        RA3|   RA3 (en-ZZ)|           RA3|{"cluster-id-stag...|
| 10040006301Z#5689...|56897fe1e8a0|     false|   1705409146258|Z10360004001|        RA3|   RA3 (en-ZZ)|           RA3|{"cluster-id-stag...|
+---------------------+------------+----------+----------------+------------+-----------+--------------+--------------+--------------------+

root
 |-- delivery_execution_id: string (nullable = true)
 |-- delivery_id: string (nullable = true)
 |-- is_deleted: boolean (nullable = true)
 |-- last

In [391]:
del_keys = list(del_df)
raw_data = pd.DataFrame()
raw_data_all = []
for f in del_keys:
  dat = get_dat(api_json_extract(del_df[f], entity = None, options = None))
  raw_data_all.append(dat)

raw_data = pd.concat(raw_data_all,axis=0,ignore_index=True)
raw_data.to_excel('all_delivery results_PROD_'+ date.today().strftime('%Y%m%d') + '.xlsx')

# raw_data.head(5)

In [197]:
def validateJSON(jsonData):
  try:
    json.loads(json.dumps(jsonData))
  except ValueError as err:
    return False
  return True

raw_data['valid_json'] = raw_data.apply(
  lambda d: validateJSON(d['raw_data']),
  axis = 1
)

raw_data.head(5)

Unnamed: 0,delivery_execution_id,login,test_qti_id,test_qti_title,delivery_id,isDeleted,test_qti_label,raw_data,last_update_date,valid_json
0,10DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,FD01,USKVXCLI,Data Verification_20231208 1,db328a265d14,False,Data Verification_QTI_Repeat visits,"{'item-4': {'numAttempts': 3, 'statusCorrect':...",1702017161071,True
1,30DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,FD03,USKVXCLI,Data Verification_20231208 1,db328a265d14,False,Data Verification_QTI_Repeat visits,"{'item-4': {'numAttempts': 2, 'statusCorrect':...",1702173701030,True
2,10DF#e0a006c7d3e9#b003c030cae8b72018c249b1c54a...,FD01,HFJCPOPX,Data Verification_20231208 2,e0a006c7d3e9,False,Data Verification_Cut and copy Extended Text a...,"{'item-6': {'numAttempts': 1, 'statusCorrect':...",1702017930917,True
3,30DF#ae549e93a118#54392597f9613fe4b7f904f1b1d4...,FD03,SXSSHDVS,Data Verification_20231208 3,ae549e93a118,False,Data Verification_Int Line draw,"{'item-2': {'numAttempts': 1, 'statusCorrect':...",1702173415761,True
4,10DF#62d8fd246de4#a9a2f9c8249085f81655d6a72efa...,FD01,OVYYYVRW,Data Verification_20231208 4,62d8fd246de4,False,Data Verification_Int D&D,"{'item-5': {'numAttempts': 1, 'statusCorrect':...",1702171420979,True


In [198]:
def json_key_item(jsonData):
  try:
    keys = list(jsonData.keys())
    check = all(('item-' in s) for s in keys)
  except ValueError as err:
    return False
  return check

raw_data['item_keys'] = raw_data.apply(
  lambda d: json_key_item(d['raw_data']),
  axis = 1
)

raw_data.head()

Unnamed: 0,delivery_execution_id,login,test_qti_id,test_qti_title,delivery_id,isDeleted,test_qti_label,raw_data,last_update_date,valid_json,item_keys
0,10DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,FD01,USKVXCLI,Data Verification_20231208 1,db328a265d14,False,Data Verification_QTI_Repeat visits,"{'item-4': {'numAttempts': 3, 'statusCorrect':...",1702017161071,True,True
1,30DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,FD03,USKVXCLI,Data Verification_20231208 1,db328a265d14,False,Data Verification_QTI_Repeat visits,"{'item-4': {'numAttempts': 2, 'statusCorrect':...",1702173701030,True,True
2,10DF#e0a006c7d3e9#b003c030cae8b72018c249b1c54a...,FD01,HFJCPOPX,Data Verification_20231208 2,e0a006c7d3e9,False,Data Verification_Cut and copy Extended Text a...,"{'item-6': {'numAttempts': 1, 'statusCorrect':...",1702017930917,True,True
3,30DF#ae549e93a118#54392597f9613fe4b7f904f1b1d4...,FD03,SXSSHDVS,Data Verification_20231208 3,ae549e93a118,False,Data Verification_Int Line draw,"{'item-2': {'numAttempts': 1, 'statusCorrect':...",1702173415761,True,True
4,10DF#62d8fd246de4#a9a2f9c8249085f81655d6a72efa...,FD01,OVYYYVRW,Data Verification_20231208 4,62d8fd246de4,False,Data Verification_Int D&D,"{'item-5': {'numAttempts': 1, 'statusCorrect':...",1702171420979,True,True


In [49]:
array = raw_data.raw_data[0].values()
keys = raw_data.raw_data[0].keys()
values = [raw_data.raw_data[0][lab]['maxScore'] for lab in keys]
# values = [json.loads(raw_data.raw_data[0][lab]['responses']['RESPONSE']['value'])['ts'] for lab in keys]
res = dict(map(lambda i,j : (i,j) , keys,values))
l = [isinstance(s,str) for k,s in res.items()]
all(l)

True

In [50]:
result_list = []
for k, v in raw_data.raw_data.items():
  qti_label = list(pd.json_normalize(raw_data.raw_data[0],max_level=0).transpose().apply(
    lambda df: df[0]['qtiLabel']
  ))
  result_list.append(qti_label)

In [199]:
from schemadict import schemadict, STANDARD_VALIDATORS

my_validators = STANDARD_VALIDATORS

my_validators['$required_keys'] = [
  'numAttempts',
  'statusCorrect',
  'qtiTitle',
  'maxScore',
  'submissionTime',
  'qtiId',
  'qtiLabel',
  'duration',
  'score',
  'itemEndTime',
  'outcomes',
  'responses',
  'completionStatus',
  'itemStartTime'
]

schema_outcome = schemadict({
  # "SCORE": {"type": str},
  "completionStatus": {"type": str}
})

schema_response = schemadict({
  "correct": {"type": bool},
  "value": {"type": str}
})

schema = schemadict(
  {
    "numAttempts": {"type": int, '>=': 0},
    "statusCorrect": {"type": str},
    'qtiTitle': {"type": str},
    'maxScore': {"type": str},
    'submissionTime': {"type": int},
    'qtiId': {"type": str},
    'qtiLabel': {"type": str},
    'duration': {"type": int},
    'score': {"type": str},
    'itemEndTime': {"type": int},
    'outcomes': {
      "type": dict,
      'item_type': dict,
      'item_schemadict': schema_outcome
    },
    'responses': {
      "type": dict,
      'item_type': dict,
      'item_schemadict': schema_response
    },
    'completionStatus': {"type": str},
    'itemStartTime' : {"type": int}
  },
  validators=my_validators
)

schema.keys()


KeysView(schemadict({'numAttempts': {'type': <class 'int'>, '>=': 0}, 'statusCorrect': {'type': <class 'str'>}, 'qtiTitle': {'type': <class 'str'>}, 'maxScore': {'type': <class 'str'>}, 'submissionTime': {'type': <class 'int'>}, 'qtiId': {'type': <class 'str'>}, 'qtiLabel': {'type': <class 'str'>}, 'duration': {'type': <class 'int'>}, 'score': {'type': <class 'str'>}, 'itemEndTime': {'type': <class 'int'>}, 'outcomes': {'type': <class 'dict'>, 'item_type': <class 'dict'>, 'item_schemadict': schemadict({'completionStatus': {'type': <class 'str'>}})}, 'responses': {'type': <class 'dict'>, 'item_type': <class 'dict'>, 'item_schemadict': schemadict({'correct': {'type': <class 'bool'>}, 'value': {'type': <class 'str'>}})}, 'completionStatus': {'type': <class 'str'>}, 'itemStartTime': {'type': <class 'int'>}}))

In [200]:

# Describe what kind of json you expect.
raw_resp_schema = {
  "type": "object",
  "properties": {
    "numAttempts": {"type": "number"},
    "statusCorrect": {"type": "string"},
    'qtiTitle': {"type": "string"},
    # 'maxScore': {"type": "number"},
    'submissionTime': {"type": "number"},
    'qtiId': {"type": "str"},
    'qtiLabel': {"type": "str"},
    'duration': {"type": "number"},
    # 'score': {"type": "number"},
    'itemEndTime': {"type": "number"},
    'outcomes': {
      "type": "object",
      "properties": {
        #  "SCORE": {"type": "number"},
         "completionStatus": {"type": "string"}
      },
      "required": ['SCORE','completionStatus']
    },
    'responses': {
      "type": "object",
      "properties": {
        "RESPONSE": {
          "type": "object",
          "properties":{
             "correct": {"type": "boolean"},
            #  "value": {"type": "string"}
          },
          "required": ['correct','value']
        },
        "completionStatus": {"type": "string"},
        "itemStartTime": {"type": "number"}
      }
    },
    'completionStatus': {"type": "string"},
    'itemStartTime' : {"type": "number"},
  },
  "required": ['numAttempts',
    'statusCorrect',
    'qtiTitle',
    'maxScore',
    'submissionTime',
    'qtiId',
    'qtiLabel',
    'duration',
    'score',
    'itemEndTime',
    'outcomes',
    'responses',
    'completionStatus',
    'itemStartTime'
  ]
}


In [201]:
import jsonschema
from jsonschema import validate

def validateJsonSchema(jsonData,schema):
    try:
      # validate(instance=jsonData, schema=schema)
      # schema.validate(jsonData)
      schema.validate(list(raw_data.raw_data[0].values())[0])
    except TypeError as err:
      return False
    return True

def json_structure(jsonData,schema):
  try:
    check = all({k: validateJsonSchema(v,schema) for k, v in jsonData.items()}.values())
    # check = validateJsonSchema(jsonData)
  except ValueError as err:
    return False
  return check

raw_data['item_json_schema'] = raw_data.apply(
  lambda d: json_structure(d['raw_data'],schema),
  axis = 1
)

raw_data.head(5)

Unnamed: 0,delivery_execution_id,login,test_qti_id,test_qti_title,delivery_id,isDeleted,test_qti_label,raw_data,last_update_date,valid_json,item_keys,item_json_schema
0,10DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,FD01,USKVXCLI,Data Verification_20231208 1,db328a265d14,False,Data Verification_QTI_Repeat visits,"{'item-4': {'numAttempts': 3, 'statusCorrect':...",1702017161071,True,True,True
1,30DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,FD03,USKVXCLI,Data Verification_20231208 1,db328a265d14,False,Data Verification_QTI_Repeat visits,"{'item-4': {'numAttempts': 2, 'statusCorrect':...",1702173701030,True,True,True
2,10DF#e0a006c7d3e9#b003c030cae8b72018c249b1c54a...,FD01,HFJCPOPX,Data Verification_20231208 2,e0a006c7d3e9,False,Data Verification_Cut and copy Extended Text a...,"{'item-6': {'numAttempts': 1, 'statusCorrect':...",1702017930917,True,True,True
3,30DF#ae549e93a118#54392597f9613fe4b7f904f1b1d4...,FD03,SXSSHDVS,Data Verification_20231208 3,ae549e93a118,False,Data Verification_Int Line draw,"{'item-2': {'numAttempts': 1, 'statusCorrect':...",1702173415761,True,True,True
4,10DF#62d8fd246de4#a9a2f9c8249085f81655d6a72efa...,FD01,OVYYYVRW,Data Verification_20231208 4,62d8fd246de4,False,Data Verification_Int D&D,"{'item-5': {'numAttempts': 1, 'statusCorrect':...",1702171420979,True,True,True


In [202]:
outcome = schema.validate(list(raw_data.raw_data[0].values())[0])
print(outcome)

None


In [205]:
test = pd.DataFrame(raw_data['raw_data'].tolist())

test.columns

Index(['item-4', 'item-3', 'item-2', 'item-1', 'item-6', 'item-5'], dtype='object')

In [219]:
test = pd.concat({k: pd.DataFrame(v).T for k, v in raw_data.raw_data.items()}, axis=0)
test.loc[test['qtiLabel'].isin(['CACERS001Q01'])]

Unnamed: 0,Unnamed: 1,numAttempts,statusCorrect,qtiTitle,maxScore,submissionTime,qtiId,qtiLabel,duration,score,itemEndTime,outcomes,responses,completionStatus,itemStartTime
0,item-1,2,incorrect,CACERS001Q01,1,1702017158000,i63281dac4aa0b2148c008f8120d5aec,CACERS001Q01,56,0,1702016919000,"{'SCORE': '0', 'MAXSCORE': '1', 'completionSta...","{'CACERS001Q01': {'correct': None, 'value': 'B'}}",completed,1702016917000
1,item-1,2,incorrect,CACERS001Q01,1,1702173698000,i63281dac4aa0b2148c008f8120d5aec,CACERS001Q01,83,0,1702173689000,"{'SCORE': '0', 'MAXSCORE': '1', 'completionSta...","{'CACERS001Q01': {'correct': None, 'value': No...",completed,1702173685000


In [72]:
array = raw_data.raw_data[0].values()
keys = list(raw_data.raw_data[0].keys())
values = [raw_data.raw_data[0][lab]['responses'] for lab in keys]
# values = [json.loads(raw_data.raw_data[0][lab]['responses']) for lab in keys]
res = dict(map(lambda i,j : (i,j) , keys,values))
# l = [isinstance(s,str) for k,s in res.items()]
# all(l)

In [88]:
print("Valid JSON: " + str(all(raw_data.valid_json)))
print("Items as keys: " + str(all(raw_data.item_keys)))
print("Correct JSON Schema: " + str(all(raw_data.item_json_schema)))

Valid JSON: True
Items as keys: True
Correct JSON Schema: True


In [206]:
tmp = (
  raw_data
  .join(
    pd.json_normalize(
      raw_data.raw_data,
      max_level = 0
    )
  )
  .drop(
    ['raw_data'],
    axis = 1
  )
)

tmp.head(5)

stub_cols = tmp.columns[tmp.columns.str.startswith("item-")]
id_cols = set(tmp.columns) - set(stub_cols)

raw_data_melt = tmp.melt(
  var_name = "items",
  value_name= "vars",
  id_vars = id_cols,
  value_vars = stub_cols
)

raw_data_melt.head(5)

Unnamed: 0,delivery_id,last_update_date,test_qti_id,valid_json,test_qti_label,item_keys,isDeleted,delivery_execution_id,test_qti_title,login,item_json_schema,items,vars
0,db328a265d14,1702017161071,USKVXCLI,True,Data Verification_QTI_Repeat visits,True,False,10DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,Data Verification_20231208 1,FD01,True,item-4,"{'numAttempts': 3, 'statusCorrect': 'correct',..."
1,db328a265d14,1702173701030,USKVXCLI,True,Data Verification_QTI_Repeat visits,True,False,30DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,Data Verification_20231208 1,FD03,True,item-4,"{'numAttempts': 2, 'statusCorrect': 'correct',..."
2,e0a006c7d3e9,1702017930917,HFJCPOPX,True,Data Verification_Cut and copy Extended Text a...,True,False,10DF#e0a006c7d3e9#b003c030cae8b72018c249b1c54a...,Data Verification_20231208 2,FD01,True,item-4,"{'numAttempts': 1, 'statusCorrect': 'correct',..."
3,ae549e93a118,1702173415761,SXSSHDVS,True,Data Verification_Int Line draw,True,False,30DF#ae549e93a118#54392597f9613fe4b7f904f1b1d4...,Data Verification_20231208 3,FD03,True,item-4,
4,62d8fd246de4,1702171420979,OVYYYVRW,True,Data Verification_Int D&D,True,False,10DF#62d8fd246de4#a9a2f9c8249085f81655d6a72efa...,Data Verification_20231208 4,FD01,True,item-4,"{'numAttempts': 1, 'statusCorrect': 'correct',..."


In [125]:
raw_data_melt['vars']

0     {'numAttempts': 3, 'statusCorrect': 'correct',...
1     {'numAttempts': 2, 'statusCorrect': 'correct',...
2     {'numAttempts': 1, 'statusCorrect': 'correct',...
3                                                   NaN
4     {'numAttempts': 1, 'statusCorrect': 'correct',...
                            ...                        
79                                                  NaN
80                                                  NaN
81    {'numAttempts': 1, 'statusCorrect': 'incorrect...
82    {'numAttempts': 1, 'statusCorrect': 'correct',...
83    {'numAttempts': 1, 'statusCorrect': 'incorrect...
Name: vars, Length: 84, dtype: object

In [94]:
json_dat = pd.json_normalize(raw_data_melt.vars.tolist())

meta_cols = [
  # 'battery_id',
  'delivery_execution_id',
  'delivery_id',
  'isDeleted',
  'last_update_date',
  'login',
  'test_qti_id',
  'test_qti_label',
  'test_qti_title',
  'raw_data'
]

id_cols2 = set(raw_data_melt.columns) - set(['vars'])
extra_cols = ['items','vars','values']
final_cols = meta_cols
final_cols.extend(extra_cols)

dat_long = (
  raw_data_melt
  .join(
    json_dat
  )
  .drop(
    ['vars'],
    axis = 1
  )
  .melt(
    var_name = "vars",
    value_name= "values",
    id_vars = id_cols2,
    value_vars = json_dat.columns
  )
  .reindex(
    columns=final_cols
  )
)

dat_long.head(20)

Unnamed: 0,delivery_execution_id,delivery_id,isDeleted,last_update_date,login,test_qti_id,test_qti_label,test_qti_title,raw_data,items,vars,values
0,10DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,db328a265d14,False,1702017161071,FD01,USKVXCLI,Data Verification_QTI_Repeat visits,Data Verification_20231208 1,,item-4,numAttempts,3.0
1,30DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,db328a265d14,False,1702173701030,FD03,USKVXCLI,Data Verification_QTI_Repeat visits,Data Verification_20231208 1,,item-4,numAttempts,2.0
2,10DF#e0a006c7d3e9#b003c030cae8b72018c249b1c54a...,e0a006c7d3e9,False,1702017930917,FD01,HFJCPOPX,Data Verification_Cut and copy Extended Text a...,Data Verification_20231208 2,,item-4,numAttempts,1.0
3,30DF#ae549e93a118#54392597f9613fe4b7f904f1b1d4...,ae549e93a118,False,1702173415761,FD03,SXSSHDVS,Data Verification_Int Line draw,Data Verification_20231208 3,,item-4,numAttempts,
4,10DF#62d8fd246de4#a9a2f9c8249085f81655d6a72efa...,62d8fd246de4,False,1702171420979,FD01,OVYYYVRW,Data Verification_Int D&D,Data Verification_20231208 4,,item-4,numAttempts,1.0
5,10DF#296bcd832a12#f0e97b921659c1e0a056143bbf92...,296bcd832a12,False,1702171767109,FD01,TKEJDWUW,Data Verification_Int hotspot,Data Verification_20231208 5,,item-4,numAttempts,1.0
6,4osammoT#296bcd832a12#9f748ad6e4f9a5cac03f75c6...,296bcd832a12,False,1705396744682,Tommaso4,TKEJDWUW,Data Verification_Int hotspot,Data Verification_20231208 5,,item-4,numAttempts,1.0
7,3osammoT#296bcd832a12#9f748ad6e4f9a5cac03f75c6...,296bcd832a12,False,1705395805340,Tommaso3,TKEJDWUW,Data Verification_Int hotspot,Data Verification_20231208 5,,item-4,numAttempts,1.0
8,2osammoT#296bcd832a12#9f748ad6e4f9a5cac03f75c6...,296bcd832a12,False,1705393639371,Tommaso2,TKEJDWUW,Data Verification_Int hotspot,Data Verification_20231208 5,,item-4,numAttempts,1.0
9,osammoT#296bcd832a12#9f748ad6e4f9a5cac03f75c6b...,296bcd832a12,False,1705392051528,Tommaso,TKEJDWUW,Data Verification_Int hotspot,Data Verification_20231208 5,,item-4,numAttempts,1.0


In [112]:
dat = dat_long[dat_long['vars'].str.startswith('responses.')]
dat = dat[dat['vars'].str.endswith('value')]
dat

Unnamed: 0,delivery_execution_id,delivery_id,isDeleted,last_update_date,login,test_qti_id,test_qti_label,test_qti_title,raw_data,items,vars,values
1344,10DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,db328a265d14,False,1702017161071,FD01,USKVXCLI,Data Verification_QTI_Repeat visits,Data Verification_20231208 1,,item-4,responses.CACERS001Q04.value,choice_4
1345,30DF#db328a265d14#3cb07e3ef7c177280d25ce6521c1...,db328a265d14,False,1702173701030,FD03,USKVXCLI,Data Verification_QTI_Repeat visits,Data Verification_20231208 1,,item-4,responses.CACERS001Q04.value,choice_4
1346,10DF#e0a006c7d3e9#b003c030cae8b72018c249b1c54a...,e0a006c7d3e9,False,1702017930917,FD01,HFJCPOPX,Data Verification_Cut and copy Extended Text a...,Data Verification_20231208 2,,item-4,responses.CACERS001Q04.value,
1347,30DF#ae549e93a118#54392597f9613fe4b7f904f1b1d4...,ae549e93a118,False,1702173415761,FD03,SXSSHDVS,Data Verification_Int Line draw,Data Verification_20231208 3,,item-4,responses.CACERS001Q04.value,
1348,10DF#62d8fd246de4#a9a2f9c8249085f81655d6a72efa...,62d8fd246de4,False,1702171420979,FD01,OVYYYVRW,Data Verification_Int D&D,Data Verification_20231208 4,,item-4,responses.CACERS001Q04.value,
...,...,...,...,...,...,...,...,...,...,...,...,...
6295,osammoT#296bcd832a12#9f748ad6e4f9a5cac03f75c6b...,296bcd832a12,False,1705392051528,Tommaso,TKEJDWUW,Data Verification_Int hotspot,Data Verification_20231208 5,,item-5,responses.CACERS045Q05AS.value,
6296,5osammoT#296bcd832a12#9f748ad6e4f9a5cac03f75c6...,296bcd832a12,False,1705479015472,Tommaso5,TKEJDWUW,Data Verification_Int hotspot,Data Verification_20231208 5,,item-5,responses.CACERS045Q05AS.value,
6297,20DF#5cc5bc3e8b84#1fd90ed5761864d7ac8c87d6018b...,5cc5bc3e8b84,False,1702172558386,FD02,MGLEEXNU,Data Verification_Responses,Data Verification_20231208 6,,item-5,responses.CACERS045Q05AS.value,
6298,10DF#5cc5bc3e8b84#1fd90ed5761864d7ac8c87d6018b...,5cc5bc3e8b84,False,1702172274949,FD01,MGLEEXNU,Data Verification_Responses,Data Verification_20231208 6,,item-5,responses.CACERS045Q05AS.value,All correct responses


In [None]:
d = raw_data.raw_data[0]
def flatten_dict(nested_dict):
    res = {}
    if isinstance(nested_dict, dict):
        for k in nested_dict:
            flattened_dict = flatten_dict(nested_dict[k])
            for key, val in flattened_dict.items():
                key = list(key)
                key.insert(0, k)
                res[tuple(key)] = val
    else:
        res[()] = nested_dict
    return res


def nested_dict_to_df(values_dict):
    flat_dict = flatten_dict(values_dict)
    df = pd.DataFrame.from_dict(flat_dict, orient="index")
    df.index = pd.MultiIndex.from_tuples(df.index)
    df = df.unstack(level=-1)
    df.columns = df.columns.map("{0[1]}".format)
    return df

df = nested_dict_to_df(d)

In [106]:
import datetime

timestamp = "1705282053574"
your_dt = datetime.datetime.fromtimestamp(int(timestamp)/1000)  # using the local timezone
print(your_dt.strftime("%Y-%m-%d %H:%M:%S"))

2024-01-15 01:27:33


In [61]:
filename = 'datastoreDeliveryResults_d60f257c8d73.json'
string = json.dumps(output)
json_data = json.loads(
  string
    .replace("\\t", "\\\\t")
    .replace("\\n", "\\\\n")
)

# with open(filename, 'w') as output_data:
#   output_data.write(
#     json.dumps(json_data, ensure_ascii=False)
#   )
  

In [65]:
with open(filename) as json_data:
  data = json.load(json_data)
  df = pd.DataFrame(data['data'])
  meta_data = pd.DataFrame(df.metadata.values.tolist()).drop_duplicates()

meta_data


Unnamed: 0,PISA25 Domains,Label,PISA25 Languages,UI Engine
0,https://www.oecd.org/REA,MSAT Misrouting Test_3 Nov,https://www.oecd.org/en-ZZ,testRunnerConfiguration_ui-engine:core1


In [19]:
def flatten_json(nested_json, exclude=['']):
    """Flatten json object with nested keys into a single level.
        Args:
            nested_json: A nested json object.
            exclude: Keys to exclude from output.
        Returns:
            The flattened json object if successful, None otherwise.
    """
    out = {}

    def flatten(x, name='', exclude=exclude):
        if type(x) is dict:
            for a in x:
                if a not in exclude: flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

In [142]:
def flatten(data, new_data):
    """Recursive helper function.

    Args:
        data: nested dictionary.
        new_data: empty dictionary.

    Returns:
        Flattened dictionary.

    """
    for key, value in data.items():
        if isinstance(value, dict):
            flatten(value, new_data)
        if isinstance(value, str) or isinstance(value, int) or isinstance(value, list):
            new_data[key] = value
    return new_data

In [356]:
from functools import reduce

def _explode(df, col):
    df = df.explode(col)

    if isinstance(df.iloc[0][col], list):
        df = _explode(df, col)
    elif isinstance(df.iloc[0][col], object):
        df_child = pd.json_normalize(df[col])
        # To prevent column name collision, add the parent column name as prefix.
        df_child.columns = [f'{col}.{x}' for x in df_child.columns]
        df = pd.concat([df.loc[:, ~df.columns.isin([col])].reset_index(drop=True), df_child], axis=1)
    
    return df

def full_explode_normalize(df):
    # Extract list columns 
    explode_cols = [x for x in df.columns if isinstance(df.iloc[0][x], list)]
    if len(explode_cols) < 1:
        return df
    
    # Explode and normalize the list
    df = reduce(_explode, explode_cols, df)

    return df

df_explode = full_explode_normalize(all_deliveries)
# df
# df_explode.columns
# df_explode.to_csv('df_explode.csv')

KeyboardInterrupt: 

In [317]:
df_explode.head(5)

Unnamed: 0,testParts.id,testParts.sections,items.taskFormat,items.itemId,items.qtiTitle,items.stringType,items.authoringPosition,items.maxScore,items.masteryLevel,items.assessmentItemId,items.qtiId,items.qtiLabel
0,testPart-3,"[{'id': 'assessmentSection-4', 'title': 'Secti...",,09251c89-86cd-4e04-9fd1-b98b5b5e4075,GeneralOrientationQ01,,,,,item-6,i6490e1eee277711575ae0296ff9eb1b,GeneralOrientationQ01
1,testPart-3,"[{'id': 'assessmentSection-4', 'title': 'Secti...",,7ce9ab9e-6bd8-4862-ba52-308ca415fe10,GeneralOrientationQ04,,,,,item-7,i6490e3347112c1036785dc677de779ab6e,GeneralOrientationQ04
2,testPart-3,"[{'id': 'assessmentSection-4', 'title': 'Secti...",,13800544-a40f-4ebe-be13-df690331f8d4,GeneralOrientationQ05,,,,,item-8,i6490e44ba3144395427e968658b0192ec2,GeneralOrientationQ05
3,testPart-3,"[{'id': 'assessmentSection-4', 'title': 'Secti...",,da41cc4f-56ed-4fb3-8afa-94042a9193e9,Effort Thermometer,,1.0,0.0,,item-5,i6516042d91518654402e463123059bb,Effort Thermometer
4,testPart-3,"[{'id': 'assessmentSection-4', 'title': 'Secti...",,a1d9f273-2e35-49e0-972b-a00ca79abbd2,GeneralOrientationQ01,,,,,item-1,i6490e1eee277711575ae0296ff9eb1b,GeneralOrientationQ01


### Sourcing Data from RDBMS tables

In [None]:
# Read sqlite query results into a pandas DataFrame
with sqlite3.connect("movies.sqlite") as conn:
    df = pd.read_sql("SELECT * from movies", conn)
df.head()

# Sourcing data from Webpages

Please visit the url https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)

In [None]:
# get data from url
df_html = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)',match = 'by country')
# Let's see how many tables are there with tage ' by county'
print(len(df_html)) # There are 4 tables
# Let's see the first table
df_html[0]