## Data sourcing

This notebook shows how to ingest data from several kinds of source systems using Python:

1. Parquet files
2. CSV files
3. APIs
4. RDBMS databases
5. HTML

In [1]:
# import modules
import certifi
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import urllib3
from urllib3 import request
import requests
from unicodedata import normalize

### Sourcing Parquet data

Please visit the url https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [None]:
# Read data from the Parquet file. We use the pandas read_parquet method for ease and speed.
# The path is relative, so the file must be present in the current working directory.
df_parquet = pd.read_parquet("yellow_tripdata_2022-01.parquet")
df_parquet.head()

### Sourcing CSV data 

Please visit the url https://data.cityofnewyork.us/resource/h9gi-nx95.csv?$limit=500


In [None]:
# Read data from the CSV file. We use the pandas read_csv method for ease and speed.
# The path is relative, so the file must be present in the current working directory.
df_csv = pd.read_csv("h9gi-nx95.csv")
df_csv.head()

### Sourcing data from APIs

Please make sure the certifi library is installed, for example with `pipenv install certifi`.

In [None]:
# Get API data from the URL
url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500'

# Create the pool manager BEFORE the first request (the original cell used `http`
# before defining it, which fails on a fresh kernel).
# Certificate errors must never be silenced, as that would be a security threat,
# so certificate verification is required and backed by the certifi CA bundle.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

# Issue a single request and reuse the response for both the availability check
# and the payload, instead of calling the API twice.
response = http.request('GET', url)
apt_status = response.status
print(apt_status)
if apt_status == 200:
    data = json.loads(response.data.decode('utf-8'))
    df_api = pd.json_normalize(data)
else:
    # API not available: fall back to an empty frame so the cell still displays.
    df_api = pd.DataFrame()
df_api.head(10)

### PISA API Testing

In [4]:
import sys
import requests
import json
import pandas as pd
import logging

# Redirect messages issued through the `warnings` module to the logging system.
logging.captureWarnings(True)

##
##    function to obtain a new OAuth 2.0 token from the authentication server
##
def get_new_token():
    """Request an access token using the OAuth 2.0 client-credentials grant.

    The client id and secret are read from the ``PISA_CLIENT_ID`` and
    ``PISA_CLIENT_SECRET`` environment variables when they are set; otherwise
    the values historically embedded in this notebook are used.

    Returns:
        The ``access_token`` value of the token endpoint's JSON response.

    Raises:
        SystemExit: if the token endpoint does not answer with HTTP 200.
    """
    import os  # local import: only needed for the credential lookup below

    auth_server_url = "https://auth-pisa-qa.staging.gcp-eu.taocloud.org/v1/oauth2/tokens"
    # SECURITY: credentials should not live in the notebook. Prefer the
    # environment variables; the literals remain only as a backward-compatible
    # fallback and should be rotated and removed.
    client_id = os.environ.get('PISA_CLIENT_ID', 'Q2caBAhwvNF8NgmrEPKrUw')
    client_secret = os.environ.get('PISA_CLIENT_SECRET', 'wHtKRe6eA7BWRBNIISiTQk3uM210OJdQ')

    token_req_payload = {'grant_type': 'client_credentials'}

    # NOTE(review): verify=False disables TLS certificate verification; kept
    # to preserve current behaviour, but it should be enabled if the server
    # certificate allows it.
    token_response = requests.post(
        auth_server_url,
        data=token_req_payload,
        verify=False,
        allow_redirects=False,
        auth=(client_id, client_secret),
        timeout=30,  # do not let a stalled auth server hang the kernel
    )

    if token_response.status_code != 200:
        print("Failed to obtain token from the OAuth 2.0 server", file=sys.stderr)
        sys.exit(1)

    print("Successfuly obtained a new token")
    tokens = json.loads(token_response.text)
    return tokens['access_token']

## 
## 	obtain a token before calling the API for the first time
##
# Module-level token that the request cells below pass to req_func.
token = get_new_token()


Successfuly obtained a new token


In [1]:
# API request function

def req_func(req_type,url,token,data=None):
    """Send an authenticated API request, retrying once after an auth failure.

    Args:
        req_type: HTTP method name in any letter case (e.g. "get", "post").
        url: Full URL of the endpoint to call.
        token: OAuth 2.0 bearer token placed in the Authorization header.
        data: Optional request body, passed through to ``requests.request``.

    Returns:
        The response object of the last attempt.

    Note:
        When the first attempt answers 401 or 403, a fresh token is obtained
        via ``get_new_token`` and used for one retry. The refreshed token is
        local to this call; the caller's own ``token`` variable is not updated.
    """

    def _build_headers(bearer):
        # Headers are rebuilt per attempt so the retry carries the NEW token.
        return {
            'Content-Type': 'application/json',
            'Authorization': 'Bearer ' + bearer
        }

    method = str.upper(req_type)

    response = requests.request(method, url, headers=_build_headers(token), data=data)

    # The original `a == 401 | a == 403` parsed as a chained comparison against
    # `401 | a`, so a 401 never triggered the refresh; test membership instead.
    if response.status_code in (401, 403):
        token = get_new_token()
        response = requests.request(method, url, headers=_build_headers(token), data=data)

    return response


In [5]:
# Health-check endpoint. Despite the `df_` prefix, `df_health` holds the response
# object returned by req_func, not a DataFrame; `.text` shows the raw body.
api_url_health = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/health"
df_health = req_func(req_type = "get", url=api_url_health,token=token)
df_health.text

'{"status":"ok"}'

In [6]:
# List the entities exposed by the API and flatten the JSON body into a DataFrame.
# Later cells read the 'name' column of `df_entity_list`.
api_url_entity_list = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/entity/list"
response = req_func(req_type = "get", url=api_url_entity_list,token=token)
df_entity_list = pd.json_normalize(json.loads(response.text))
df_entity_list

Unnamed: 0,name,description
0,portalSessions,Holds information on portal sessions
1,portalUserGroups,Holds information on portal user groups
2,portalGroups,Holds information on portal groups
3,portalUserSessions,Holds information on portal user sessions
4,portalUser,Holds information on portal user
5,portalBattery,Holds information on portal battery
6,testRunnerDeliveries,Holds information on test runner deliveries
7,datastoreDeliveries,Holds information on datastore deliveries
8,datastoreDeliveryResults,Holds information on datastore delivery results
9,datastoreItemResults,Holds information on datastore item results


In [7]:
def json_schema_get(entity,token):
    """Call the API's per-entity endpoint and return the raw response.

    Args:
        entity: Entity name appended to the ``/api/v1/entity/`` path.
        token: Bearer token forwarded to ``req_func``.

    Returns:
        Whatever ``req_func`` returns for the GET request (a response object,
        not a DataFrame).
    """
    endpoint = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/entity/%s" % entity
    return req_func(req_type="get", url=endpoint, token=token)

In [6]:
import pandas as pd

# Write one worksheet per entity, each holding that entity's flattened description.
entities = df_entity_list['name']

with pd.ExcelWriter('entity_specx.xlsx') as writer:
    for ent in entities:
        resp = json_schema_get(entity=ent, token=token)
        df = pd.json_normalize(resp.json())
        df.to_excel(writer, sheet_name=ent, index=False)

        # Widen every column to fit its longest cell or its header, plus one
        # character of padding.
        # NOTE(review): set_column looks like the XlsxWriter worksheet API -
        # confirm that engine is the one in use.
        worksheet = writer.sheets[ent]
        for idx, col in enumerate(df):
            series = df[col]
            longest_value = series.astype(str).map(len).max()
            header_len = len(str(series.name))
            max_len = max((longest_value, header_len)) + 1
            worksheet.set_column(idx, idx, max_len)

In [8]:
def api_json_extract(response,entity,options=None):
    """Turn an API response into either raw JSON or a flattened DataFrame.

    Args:
        response: Object exposing ``.json()`` that returns the decoded body.
            Every path except ``options=None`` expects the body to hold a
            'data' list and reads its first record.
        entity: Entity name the response belongs to. The special options
            below only apply to 'datastoreDeliveryResults'.
        options: ``None`` to get the decoded JSON back unchanged, or a dict:
            * ``{'json_var': <key>}`` - expand ``<key>`` of the first record;
            * ``{'item_resp': True}`` - long format of the ``items.item-*``
              columns with ``q_num`` / ``q_lab`` / ``q_val``.
            Any other dict (or any other entity) yields the first record
            flattened with ``pd.json_normalize``.

    Returns:
        The decoded JSON (``options is None``) or a ``pandas.DataFrame``.
    """
    json_raw = response.json()

    # No options: hand the payload back untouched.
    if options is None:
        return json_raw

    df_raw = pd.json_normalize(json_raw)
    first_record = df_raw.iloc[0]['data'][0]

    if entity == 'datastoreDeliveryResults':
        if 'json_var' in options:
            json_var = options['json_var']
            # NOTE(review): presumably the selected value is a list of records
            # carrying 'type' and 'values' keys - confirm against the API.
            # It is wrapped in a DataFrame because the raw JSON value has no
            # DataFrame-style .join.
            tmp = pd.DataFrame(first_record[json_var])
            out = (
                tmp
                .join(pd.json_normalize(pd.json_normalize(tmp['values'])[0]))
                .drop(columns=['values'])
            )
            out['varType'] = out['type'].str.split(pat="#").str[1]
            # The original built this frame but never assigned it to `out`,
            # so the function failed at `return`.
            return out

        # `.get` avoids the KeyError the original non-short-circuit `&` raised
        # when 'item_resp' was missing from options.
        if options.get('item_resp') is True:
            tmp = (
                pd.json_normalize(first_record)
                .filter(regex='^(?!metadata)(?!ltiParameters)(?!rawLtiParameters)(?!outcomes)')
            )

            is_item_col = tmp.columns.str.startswith('items.item-')
            idvars = tmp.columns[~is_item_col]
            pivotvars = tmp.columns[is_item_col]

            return (
                pd.melt(
                    tmp,
                    id_vars=idvars,
                    var_name='key',
                    value_name='q_val',
                    value_vars=pivotvars
                )
                .assign(
                    # strip the common prefix, leaving "<number>.<label>"
                    key=lambda df: df.key.replace('items.item-', '', regex=True)
                )
                .assign(
                    q_num=lambda df: df.key.str.split('.', n=1).str[0],
                    q_lab=lambda df: df.key.str.split('.', n=1).str[1],
                )
                .drop(['key'], axis=1)
            )

    # Other entities, or no recognised option: flatten the first record.
    return pd.json_normalize(first_record)

In [9]:
# payload = json.dumps({
#   # "pageSize": 1,
#   "filters": [
#     # {
#     #   "field": "deliveryId",
#     #   "type": "terms",
#     #   "values": [
#     #     "d60f257c8d73"
#     #   ]
#     # }
#   ]
# })

# Search request body: restrict the response to the listed fields and ask for
# a nested (non-flat) response. The "items" field is deliberately left out here.
payload = json.dumps({
  "response": {
    "fields": [
      "deliveryId",
      "login",
      "deliveryExecutionId",
      "submissionTime",
      "testQtiId",
      "testQtiTitle",
      "testQtiLabel",
      "last_update_date",
      "sessionStartTime",
      "sessionEndTime",
      "score",
      "duration",
      "isDeleted",
      # "items"
    ],
    "flatResponse": False,
  }
})

# Positional slice 8:9 keeps a single row of the entity list
# (datastoreDeliveryResults in the listing displayed above).
entities = df_entity_list[8:9]['name']
# Maps entity name -> raw response object from the search endpoint.
ent_df = {}

for ent in entities:
  url = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/search/%s" % ent
  # `df` is the response returned by req_func, not a DataFrame.
  df = req_func(req_type='post',url=url,token=token, data=payload)
  ent_df[ent] = df

# entity_del = df_entity_list.iloc[8]['name']

# df_del_results_all = df_entity_search(entity = entity_del, token=token,data=payload)

# entity_itm = df_entity_list.iloc[9]['name']
# df_itm_results_all = df_entity_search(entity = entity_itm, token=token,data=data)

# entity_ui = df_entity_list.iloc[10]['name']
# df_evnts_results_all = df_entity_search(entity = entity_ui, token=token,data=data)

# entity_acs = df_entity_list.iloc[11]['name']
# df_acs_results_all = df_entity_search(entity = entity_itm, token=token,data=data)

In [10]:
# Keys of `ent_df` are the entity names queried by the search loop.
ent_keys = list(ent_df)
# output = api_json_extract(ent_df[ent_keys[0]], entity = ent_keys[0], options = {'item_resp': True})
# With options=None, api_json_extract returns the decoded JSON payload unchanged.
output = api_json_extract(ent_df[ent_keys[0]], entity = ent_keys[0], options = None)
# Every step of the exploratory chain below is commented out, so the
# parentheses evaluate to an empty tuple.
(
  # output
  # .query('q_lab.str.match("score")')
  # .query('q_num == "3"')
  # .query('val.str.match("CR560Q03")')
  # .assign(
  #   val = lambda df: df.val.astype(int)
  # )
  # .groupby(['deliveryId','login','testQtiId'],as_index = False)
  # .size()
  # .agg({'val': 'sum'})
  # .pivot_table(
  #   index=['deliveryId','login','testQtiId'],
  #   columns = 'key',
  #   values = 'val',
  #   aggfunc='first'
  # )
  # .rename(
  #   columns = lambda x: x.strip('3.')
  # )
)



()

In [11]:
# Expand the 'data' records of the raw payload into one row each; max_level=0
# keeps nested objects as single cells instead of flattening them.
all_deliveries_results = pd.json_normalize(output,record_path=['data'],max_level=0)
# all_deliveries_results['uniqueId'] = all_deliveries_results['deliveryId'] + all_deliveries_results['login']
# Unique delivery/test combinations. reset_index() without drop=True keeps the
# pre-deduplication row labels as an 'index' column.
del_tab = all_deliveries_results[['deliveryId','testQtiLabel','testQtiId']].drop_duplicates().reset_index()
# del_tab.to_excel('all_delivery results 20240118.xlsx')
del_tab

Unnamed: 0,index,deliveryId,testQtiLabel,testQtiId
0,0,e1a2828d2bd3,S1-S2 (en-ZZ),S1-S2
1,1,6ce33cee128a,,MRJIEZHG
2,2,edcc2ca335b4,,EVJOGBLH
3,3,00b6196d410c,MA1 (en-ZZ),MA1
4,4,484294214f16,,EZAWIVMU
...,...,...,...,...
283,3444,70d8a2183711,,RORCVWGO
284,3454,5391727f248b,S7_FUO-S15_FUO (en-ZZ),S7_FUO-S15_FUO
285,3472,ca057225ca76,LDW8-LDW3 (en-ZZ),LDW8-LDW3
286,3566,b44f2d5980cb,ML19 (en-ZZ),ML19


In [12]:
def get_dat(resp):
    """Build the per-delivery metadata table from a decoded search payload.

    The records under ``resp['data']`` become rows (nested objects are kept
    as single cells), the camelCase API field names are renamed to
    snake_case, ``items`` becomes ``raw_data``, and only columns whose name
    matches one of the metadata names are kept.

    Args:
        resp: Decoded JSON payload containing a 'data' list of records.

    Returns:
        A DataFrame restricted to the metadata columns plus ``raw_data``.
    """
    column_renames = {
        'batteryId': 'battery_id',
        'deliveryExecutionId': 'delivery_execution_id',
        'deliveryId': 'delivery_id',
        'testQtiId': 'test_qti_id',
        'testQtiLabel': 'test_qti_label',
        'testQtiTitle': 'test_qti_title',
        'items': 'raw_data',
    }

    # 'battery_id' is renamed above but intentionally not part of the kept set.
    meta_cols = [
        'delivery_execution_id',
        'delivery_id',
        'isDeleted',
        'last_update_date',
        'login',
        'test_qti_id',
        'test_qti_label',
        'test_qti_title',
        'raw_data',
    ]
    # Alternation of the names; DataFrame.filter keeps any column it matches.
    keep_pattern = "|".join(meta_cols)

    records = pd.json_normalize(resp, record_path='data', max_level=0)
    renamed = records.rename(columns=column_renames)
    dat = renamed.filter(regex=keep_pattern, axis=1)

    return dat

In [95]:
# deliveries = ['40c59a1f04ed'] #FLA
deliveries = ['ca057225ca76'] #LDW
# Maps delivery id -> raw response object from the search endpoint.
del_df = {}

for dl in deliveries:
  # One request per delivery. Only page 0 with up to 100 records is requested;
  # this cell does not loop over further pages.
  payload = json.dumps({
    "pageSize": 100,
    "pageNumber": 0,
    "response": {
      "fields": [
        "deliveryId",
        "login",
        "deliveryExecutionId",
        "submissionTime",
        "testQtiId",
        "testQtiTitle",
        "testQtiLabel",
        "last_update_date",
        "sessionStartTime",
        "sessionEndTime",
        "score",
        "duration",
        "isDeleted",
        "items"
      ],
      "flatResponse": False
    },
    # Restrict the search to the current delivery id.
    "filters": [
        {
          "field": "deliveryId",
          "type": "terms",
          "values": [
            "%s" % dl
          ]
        }
      ]
    }
  )
  url = "https://dynamic-query-api-pisa-qa.staging.gcp-eu.taocloud.org/api/v1/search/datastoreDeliveryResults"
  # `df` is the response returned by req_func, not a DataFrame.
  df = req_func(req_type='post',url=url,token=token, data=payload)
  del_df[dl] = df

In [96]:
# Keys of `del_df` are delivery ids. With options=None api_json_extract returns
# the decoded JSON as-is, so the value passed as `entity` has no effect here.
del_keys = list(del_df)
raw_data = get_dat(api_json_extract(del_df[del_keys[0]], entity = del_keys[0], options = None))
raw_data.head(5)

Unnamed: 0,delivery_execution_id,login,test_qti_id,test_qti_title,delivery_id,isDeleted,test_qti_label,raw_data,last_update_date
0,6WDLMUD#ca057225ca76#f171422234ee42dbc8315d521...,DUMLDW6,LDW8-LDW3,LDW8-LDW3,ca057225ca76,False,LDW8-LDW3 (en-ZZ),{'cluster1-CACERL008-item-8': {'numAttempts': ...,1705282053574


In [97]:
def validateJSON(jsonData):
  """Report whether an object survives a JSON serialise/parse round trip.

  Args:
    jsonData: Any Python object.

  Returns:
    True when ``json.dumps`` followed by ``json.loads`` succeeds, False when
    the object cannot be serialised (e.g. it contains a set or a circular
    reference).
  """
  try:
    json.loads(json.dumps(jsonData))
  except (TypeError, ValueError):
    # json.dumps raises TypeError for unserialisable types and ValueError for
    # circular references; both mean "not valid JSON data".
    return False
  return True

# Row-wise flag: can the 'raw_data' payload of each row be round-tripped through JSON?
raw_data['valid_json'] = raw_data.apply(
  lambda d: validateJSON(d['raw_data']),
  axis = 1
)

raw_data.head(5)

Unnamed: 0,delivery_execution_id,login,test_qti_id,test_qti_title,delivery_id,isDeleted,test_qti_label,raw_data,last_update_date,valid_json
0,6WDLMUD#ca057225ca76#f171422234ee42dbc8315d521...,DUMLDW6,LDW8-LDW3,LDW8-LDW3,ca057225ca76,False,LDW8-LDW3 (en-ZZ),{'cluster1-CACERL008-item-8': {'numAttempts': ...,1705282053574,True


In [98]:
def json_key_item(jsonData):
  """Check that every top-level key of a mapping contains the text 'item-'.

  Args:
    jsonData: Expected to be a dict keyed by strings.

  Returns:
    True when all keys contain 'item-' (also for an empty dict), False when
    any key does not, or when ``jsonData`` is not a mapping with string keys.
  """
  try:
    keys = list(jsonData.keys())
    check = all(('item-' in s) for s in keys)
  except (AttributeError, TypeError, ValueError):
    # AttributeError: no .keys() (e.g. None/NaN); TypeError: non-string key.
    return False
  return check

# Row-wise flag: do all top-level keys of the 'raw_data' payload contain 'item-'?
raw_data['item_keys'] = raw_data.apply(
  lambda d: json_key_item(d['raw_data']),
  axis = 1
)

raw_data.head()

Unnamed: 0,delivery_execution_id,login,test_qti_id,test_qti_title,delivery_id,isDeleted,test_qti_label,raw_data,last_update_date,valid_json,item_keys
0,6WDLMUD#ca057225ca76#f171422234ee42dbc8315d521...,DUMLDW6,LDW8-LDW3,LDW8-LDW3,ca057225ca76,False,LDW8-LDW3 (en-ZZ),{'cluster1-CACERL008-item-8': {'numAttempts': ...,1705282053574,True,True


In [99]:
# Collect the 'maxScore' of every item in the first row's payload, keyed by item label.
array = raw_data.raw_data[0].values()
keys = raw_data.raw_data[0].keys()
# Dict views iterate in the same order, so values line up with `keys`.
values = [item['maxScore'] for item in array]
# values = [json.loads(raw_data.raw_data[0][lab]['responses']['RESPONSE']['value'])['ts'] for lab in keys]
res = dict(zip(keys, values))

In [100]:
# Are all collected maxScore values integers?
l = [isinstance(score, int) for score in res.values()]
all(l)

True

In [101]:
# For each row's payload, collect the 'qtiLabel' of its first item.
# NOTE(review): the pipeline only ever yields the first item's label (apply runs
# once, on the single transposed column) - confirm that is the intent.
result_list = []
for k, v in raw_data.raw_data.items():
  # Use the current row's payload `v`; the original re-read row 0 on every
  # iteration, so all entries were identical.
  qti_label = list(
    pd.json_normalize(v, max_level=0)
    .transpose()
    # .iloc[0] replaces the deprecated positional `series[0]` lookup.
    .apply(lambda col: col.iloc[0]['qtiLabel'])
  )
  result_list.append(qti_label)

In [102]:
from schemadict import schemadict, STANDARD_VALIDATORS

# NOTE(review): this binds a second name to the library's STANDARD_VALIDATORS
# object rather than copying it, so the item assignment below modifies that
# shared object.
my_validators = STANDARD_VALIDATORS

# NOTE(review): a list of key names is stored under '$required_keys' in the
# validators mapping; presumably schemadict expects '$required_keys' inside the
# schema itself - confirm against the schemadict documentation.
my_validators['$required_keys'] = [
  'numAttempts',
  'statusCorrect',
  'qtiTitle',
  'maxScore',
  'submissionTime',
  'qtiId',
  'qtiLabel',
  'duration',
  'score',
  'itemEndTime',
  'outcomes',
  'responses',
  'completionStatus',
  'itemStartTime'
]

# Schema applied to each entry of an item's 'outcomes' mapping.
schema_outcome = schemadict({
  # "SCORE": {"type": str},
  "completionStatus": {"type": str}
})

# Schema applied to each entry of an item's 'responses' mapping.
schema_response = schemadict({
  "correct": {"type": bool},
  "value": {"type": str}
})

# Schema for a single item record. Note that 'score' is declared as str while
# 'maxScore' is declared as int.
schema = schemadict(
  {
    "numAttempts": {"type": int, '>=': 0},
    "statusCorrect": {"type": str},
    'qtiTitle': {"type": str},
    'maxScore': {"type": int},
    'submissionTime': {"type": int},
    'qtiId': {"type": str},
    'qtiLabel': {"type": str},
    'duration': {"type": int},
    'score': {"type": str},
    'itemEndTime': {"type": int},
    'outcomes': {
      "type": dict,
      'item_type': dict,
      'item_schemadict': schema_outcome
    },
    'responses': {
      "type": dict,
      'item_type': dict,
      'item_schemadict': schema_response
    },
    'completionStatus': {"type": str},
    'itemStartTime' : {"type": int}
  },
  validators=my_validators
)

schema.keys()


KeysView(schemadict({'numAttempts': {'type': <class 'int'>, '>=': 0}, 'statusCorrect': {'type': <class 'str'>}, 'qtiTitle': {'type': <class 'str'>}, 'maxScore': {'type': <class 'int'>}, 'submissionTime': {'type': <class 'int'>}, 'qtiId': {'type': <class 'str'>}, 'qtiLabel': {'type': <class 'str'>}, 'duration': {'type': <class 'int'>}, 'score': {'type': <class 'str'>}, 'itemEndTime': {'type': <class 'int'>}, 'outcomes': {'type': <class 'dict'>, 'item_type': <class 'dict'>, 'item_schemadict': schemadict({'completionStatus': {'type': <class 'str'>}})}, 'responses': {'type': <class 'dict'>, 'item_type': <class 'dict'>, 'item_schemadict': schemadict({'correct': {'type': <class 'bool'>}, 'value': {'type': <class 'str'>}})}, 'completionStatus': {'type': <class 'str'>}, 'itemStartTime': {'type': <class 'int'>}}))

In [103]:

# Describe what kind of JSON you expect (JSON Schema draft-style dictionary).
# Type names must be JSON Schema types: "string", not the Python name "str".
raw_resp_schema = {
  "type": "object",
  "properties": {
    "numAttempts": {"type": "number"},
    "statusCorrect": {"type": "string"},
    'qtiTitle': {"type": "string"},
    'maxScore': {"type": "number"},
    'submissionTime': {"type": "number"},
    'qtiId': {"type": "string"},
    'qtiLabel': {"type": "string"},
    'duration': {"type": "number"},
    # 'score': {"type": "number"},
    'itemEndTime': {"type": "number"},
    'outcomes': {
      "type": "object",
      "properties": {
        #  "SCORE": {"type": "number"},
         "completionStatus": {"type": "string"}
      },
      # 'SCORE' is required but its type is left unconstrained.
      "required": ['SCORE','completionStatus']
    },
    'responses': {
      "type": "object",
      "properties": {
        "RESPONSE": {
          "type": "object",
          "properties":{
             "correct": {"type": "boolean"},
            #  "value": {"type": "string"}
          },
          # 'value' is required but its type is left unconstrained.
          "required": ['correct','value']
        },
        "completionStatus": {"type": "string"},
        "itemStartTime": {"type": "number"}
      }
    },
    'completionStatus': {"type": "string"},
    'itemStartTime' : {"type": "number"},
  },
  "required": ['numAttempts',
    'statusCorrect',
    'qtiTitle',
    'maxScore',
    'submissionTime',
    'qtiId',
    'qtiLabel',
    'duration',
    'score',
    'itemEndTime',
    'outcomes',
    'responses',
    'completionStatus',
    'itemStartTime'
  ]
}


In [104]:
import jsonschema
from jsonschema import validate

def validateJsonSchema(jsonData,schema):
    """Validate one record against a schema object and report success as a bool.

    Args:
        jsonData: The record to validate.
        schema: Object exposing ``validate(record)`` that raises on failure.

    Returns:
        True when ``schema.validate(jsonData)`` raises nothing, False when it
        raises TypeError, ValueError or KeyError.
    """
    try:
        # validate(instance=jsonData, schema=schema)
        # Validate ONLY the record passed in; the original also re-validated a
        # fixed record from the global `raw_data`, a leftover from debugging.
        schema.validate(jsonData)
    except (TypeError, ValueError, KeyError):
        # NOTE(review): presumably the schema library reports failures with
        # these built-in exception types - confirm against its documentation.
        return False
    return True

def json_structure(jsonData,schema):
  """Check every value of a mapping against the schema.

  Args:
    jsonData: Mapping whose values are the records to validate.
    schema: Schema object forwarded to ``validateJsonSchema``.

  Returns:
    True when every value validates (also for an empty mapping); False when
    any value fails or a ValueError escapes from the validation.
  """
  try:
    # Evaluate every record first (no short-circuit), then combine the verdicts.
    verdicts = [validateJsonSchema(record, schema) for record in jsonData.values()]
    # check = validateJsonSchema(jsonData)
  except ValueError:
    return False
  return all(verdicts)

# Row-wise flag: does every item record in the row's 'raw_data' payload pass `schema`?
raw_data['item_json_schema'] = raw_data.apply(
  lambda d: json_structure(d['raw_data'],schema),
  axis = 1
)

raw_data.head(5)

Unnamed: 0,delivery_execution_id,login,test_qti_id,test_qti_title,delivery_id,isDeleted,test_qti_label,raw_data,last_update_date,valid_json,item_keys,item_json_schema
0,6WDLMUD#ca057225ca76#f171422234ee42dbc8315d521...,DUMLDW6,LDW8-LDW3,LDW8-LDW3,ca057225ca76,False,LDW8-LDW3 (en-ZZ),{'cluster1-CACERL008-item-8': {'numAttempts': ...,1705282053574,True,True,True


In [105]:
# Validate the first item record of the first row directly. The printed value
# is None in the recorded output; a failing record would raise instead.
outcome = schema.validate(list(raw_data.raw_data[0].values())[0])
print(outcome)

None


In [23]:
# One line per check: True only when every row passed it.
print(f"Valid JSON: {all(raw_data.valid_json)}")
print(f"Items as keys: {all(raw_data.item_keys)}")
print(f"Correct JSON Schema: {all(raw_data.item_json_schema)}")

Valid JSON: True
Items as keys: True
Correct JSON Schema: True


In [26]:
# Spread each row's item payload into one column per item key (cells stay
# nested because max_level=0), then drop the original payload column.
tmp = (
  raw_data
  .join(
    pd.json_normalize(
      raw_data.raw_data,
      max_level = 0
    )
  )
  .drop(
    ['raw_data'],
    axis = 1
  )
)

# Item columns are those whose name CONTAINS 'item-', the same rule
# json_key_item applies; a prefix test would miss keys such as
# 'cluster1-CACERL008-item-8'.
stub_cols = tmp.columns[tmp.columns.str.contains("item-", regex=False)]
# Ordered list (not a set) so the melted frame has a deterministic column order.
id_cols = [col for col in tmp.columns if col not in stub_cols]

# Long format: one row per (delivery row, item).
# NOTE: this rebinds `raw_data`, so the cell cannot be re-run without first
# re-running the cells that build the original frame.
raw_data = tmp.melt(
  var_name = "items",
  value_name= "vars",
  id_vars = id_cols,
  value_vars = stub_cols
)

raw_data.head(5)

In [127]:
# Flatten every item record (the 'vars' cells) into its own columns.
json_dat = pd.json_normalize(raw_data.vars.tolist())

# Ordered list (not a set) so melt receives the id columns deterministically.
id_cols2 = [col for col in raw_data.columns if col != 'vars']
extra_cols = ['items','vars','values']
# Metadata columns, defined here because `meta_cols` otherwise exists only as a
# local variable inside get_dat and is undefined on a fresh kernel.
meta_cols = [
  'delivery_execution_id',
  'delivery_id',
  'isDeleted',
  'last_update_date',
  'login',
  'test_qti_id',
  'test_qti_label',
  'test_qti_title',
  'raw_data'
]
# Build a NEW list; extending an alias of meta_cols would grow it on every run.
final_cols = meta_cols + extra_cols

# Long format: one row per (delivery row, item, variable).
dat_long = (
  raw_data
  .join(
    json_dat
  )
  .drop(
    ['vars'],
    axis = 1
  )
  .melt(
    var_name = "vars",
    value_name= "values",
    id_vars = id_cols2,
    value_vars = json_dat.columns
  )
  # reindex fixes the column order; names absent from the frame become empty columns.
  .reindex(
    columns=final_cols
  )
)

dat_long.head(20)

Unnamed: 0,delivery_execution_id,delivery_id,isDeleted,last_update_date,login,test_qti_id,test_qti_label,test_qti_title,raw_data,items,vars,values
0,2000TM#d60f257c8d73#91cd8518a3cf186444b9c2e97a...,d60f257c8d73,False,1699505361940,MT0002,ZZPNANUL,MSAT Misrouting Test_3 Nov,Reading RA1,,item-18,numAttempts,1
1,3000TM#d60f257c8d73#91cd8518a3cf186444b9c2e97a...,d60f257c8d73,False,1699507274827,MT0003,ZZPNANUL,MSAT Misrouting Test_3 Nov,Reading RA1,,item-18,numAttempts,1
2,7000TM#d60f257c8d73#91cd8518a3cf186444b9c2e97a...,d60f257c8d73,False,1699586529605,MT0007,ZZPNANUL,MSAT Misrouting Test_3 Nov,Reading RA1,,item-18,numAttempts,1
3,4000TM#d60f257c8d73#91cd8518a3cf186444b9c2e97a...,d60f257c8d73,False,1699571858189,MT0004,ZZPNANUL,MSAT Misrouting Test_3 Nov,Reading RA1,,item-18,numAttempts,1
4,6000TM#d60f257c8d73#91cd8518a3cf186444b9c2e97a...,d60f257c8d73,False,1699579189765,MT0006,ZZPNANUL,MSAT Misrouting Test_3 Nov,Reading RA1,,item-18,numAttempts,1
5,5000TM#d60f257c8d73#91cd8518a3cf186444b9c2e97a...,d60f257c8d73,False,1699577071086,MT0005,ZZPNANUL,MSAT Misrouting Test_3 Nov,Reading RA1,,item-18,numAttempts,1
6,9000TM#d60f257c8d73#91cd8518a3cf186444b9c2e97a...,d60f257c8d73,False,1699933312113,MT0009,ZZPNANUL,MSAT Misrouting Test_3 Nov,Reading RA1,,item-18,numAttempts,1
7,0100TM#d60f257c8d73#91cd8518a3cf186444b9c2e97a...,d60f257c8d73,False,1699933788924,MT0010,ZZPNANUL,MSAT Misrouting Test_3 Nov,Reading RA1,,item-18,numAttempts,1
8,4100TM#d60f257c8d73#91cd8518a3cf186444b9c2e97a...,d60f257c8d73,False,1699939099586,MT0014,ZZPNANUL,MSAT Misrouting Test_3 Nov,Reading RA1,,item-18,numAttempts,1
9,5100TM#d60f257c8d73#91cd8518a3cf186444b9c2e97a...,d60f257c8d73,False,1699939339914,MT0015,ZZPNANUL,MSAT Misrouting Test_3 Nov,Reading RA1,,item-18,numAttempts,1


In [None]:
# Alias of the raw payload, passed to nested_dict_to_df at the end of this cell.
d = output
def flatten_dict(nested_dict):
    """Flatten nested dicts into a single dict keyed by key-path tuples.

    Args:
        nested_dict: A dict (possibly containing dicts) or any other value.

    Returns:
        A dict mapping each tuple of keys leading to a non-dict value onto
        that value. A non-dict argument yields ``{(): value}``; an empty dict
        contributes nothing.
    """
    # Leaf: the empty path identifies the value itself.
    if not isinstance(nested_dict, dict):
        return {(): nested_dict}

    res = {}
    for outer_key, child in nested_dict.items():
        # Prefix every path found below with the key that leads to it.
        for path, leaf in flatten_dict(child).items():
            res[(outer_key, *path)] = leaf
    return res


def nested_dict_to_df(values_dict):
    """Convert a nested dict into a DataFrame via its flattened key paths.

    The dict is flattened with ``flatten_dict``, the key-path tuples become a
    MultiIndex, and the innermost level is pivoted into columns that are
    named after that innermost key.

    Args:
        values_dict: Nested dictionary to convert.

    Returns:
        The unstacked DataFrame.
    """
    flat = flatten_dict(values_dict)
    frame = pd.DataFrame.from_dict(flat, orient="index")
    frame.index = pd.MultiIndex.from_tuples(frame.index)
    # Move the innermost key level from the rows to the columns.
    frame = frame.unstack(level=-1)
    # Column labels are tuples after unstack; keep only their second element.
    frame.columns = frame.columns.map("{0[1]}".format)
    return frame

df = nested_dict_to_df(d)

In [106]:
import datetime

# The value is divided by 1000 before conversion (presumably it is an epoch
# timestamp in milliseconds - confirm against the API).
timestamp = "1705282053574"
# fromtimestamp without a tz argument gives local time, so the printed value
# depends on the machine's timezone.
your_dt = datetime.datetime.fromtimestamp(int(timestamp)/1000)  # using the local timezone
print(your_dt.strftime("%Y-%m-%d %H:%M:%S"))

2024-01-15 01:27:33


In [61]:
filename = 'datastoreDeliveryResults_d60f257c8d73.json'
# Serialise the payload, double the backslash in front of every \t and \n
# escape in the text, then parse it back: the escapes that stood for tab and
# newline characters come back as a literal backslash followed by 't' / 'n'.
string = json.dumps(output)
json_data = json.loads(
  string
    .replace("\\t", "\\\\t")
    .replace("\\n", "\\\\n")
)

# Writing the result to `filename` is currently disabled.
# with open(filename, 'w') as output_data:
#   output_data.write(
#     json.dumps(json_data, ensure_ascii=False)
#   )
  

In [65]:
# Load the saved payload from disk; `filename` must already exist.
# Note: `json_data` is rebound here to the open file handle, replacing the
# parsed object assigned to that name in the previous cell.
with open(filename) as json_data:
  data = json.load(json_data)
  df = pd.DataFrame(data['data'])
  # Expand each record's 'metadata' value into columns and keep the distinct rows.
  meta_data = pd.DataFrame(df.metadata.values.tolist()).drop_duplicates()

meta_data


Unnamed: 0,PISA25 Domains,Label,PISA25 Languages,UI Engine
0,https://www.oecd.org/REA,MSAT Misrouting Test_3 Nov,https://www.oecd.org/en-ZZ,testRunnerConfiguration_ui-engine:core1


In [19]:
def flatten_json(nested_json, exclude=['']):
    """Flatten a nested JSON-like object into a single-level dict.

    Key paths are joined with '_'; list elements contribute their position
    as a path component.

        Args:
            nested_json: Nested combination of dicts, lists and scalars.
            exclude: Dict keys to skip, at any depth. The default list is
                only read, never modified.
        Returns:
            A dict mapping each joined key path to its scalar value. Empty
            dicts and lists contribute no entries.
    """
    out = {}

    def flatten(x, name='', exclude=exclude):
        # Exact type checks: only plain dict / list objects are descended into.
        if type(x) is dict:
            for key, child in x.items():
                if key in exclude:
                    continue
                flatten(child, name + key + '_')
        elif type(x) is list:
            for position, element in enumerate(x):
                flatten(element, name + str(position) + '_')
        else:
            # Drop the trailing '_' added while descending.
            out[name[:-1]] = x

    flatten(nested_json)
    return out

In [163]:
def flatten(data, new_data):
    """Recursively copy every non-dict value of a nested dict into ``new_data``.

    Nesting is discarded: each leaf is stored under its own (innermost) key,
    so a key that occurs at several places keeps only the value visited last.

    Args:
        data: nested dictionary.
        new_data: dictionary that receives the leaves (normally empty); it is
            modified in place.

    Returns:
        ``new_data``, now holding the flattened entries.

    """
    for key, value in data.items():
        if isinstance(value, dict):
            flatten(value, new_data)
        else:
            # Keep every leaf. The original stored only str/int/list values
            # and silently dropped floats, None and other types.
            new_data[key] = value
    return new_data

In [None]:
from functools import reduce

def _explode(df, col):
    df = df.explode(col)

    if isinstance(df.iloc[0][col], list):
        df = _explode(df, col)
    elif isinstance(df.iloc[0][col], object):
        df_child = pd.json_normalize(df[col])
        # To prevent column name collision, add the parent column name as prefix.
        df_child.columns = [f'{col}.{x}' for x in df_child.columns]
        df = pd.concat([df.loc[:, ~df.columns.isin([col])].reset_index(drop=True), df_child], axis=1)
    
    return df

def full_explode_normalize(df):
    """Explode and normalise every column whose first-row value is a list.

    Args:
        df: Input DataFrame. Only the first row is inspected to decide which
            columns hold lists.

    Returns:
        ``df`` itself when no column qualifies; otherwise the result of
        applying ``_explode`` to each qualifying column in turn.
    """
    # Columns are selected by looking at the first row only.
    list_columns = [name for name in df.columns if isinstance(df.iloc[0][name], list)]
    if not list_columns:
        return df

    # Feed the frame through _explode once per list column, left to right.
    return reduce(_explode, list_columns, df)

# Uses whatever `df` currently refers to in the kernel; that name is rebound by
# several earlier cells, so the result depends on which of them ran last.
df_explode = full_explode_normalize(df)
# df
# df_explode.columns
# df_explode.to_csv('df_explode.csv')

### Sourcing Data from RDBMS tables

In [None]:
# Read sqlite query results into a pandas DataFrame.
# A sqlite3 connection used as a context manager only manages the transaction
# and does NOT close the connection, so close it explicitly.
conn = sqlite3.connect("movies.sqlite")
try:
    df = pd.read_sql("SELECT * from movies", conn)
finally:
    conn.close()
df.head()

### Sourcing data from web pages

Please visit the url https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)

In [None]:
# Get data from the URL; `match` restricts the result to tables containing the text 'by country'.
df_html = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)',match = 'by country')
# read_html returns a list of DataFrames; let's see how many tables matched 'by country'.
print(len(df_html)) # There are 4 tables
# Let's see the first table
df_html[0]