## Prepare / Setup

### Export Maple Data and Environment Variables

In [1]:
import yaml
import os
from etl.prepare import *

with open('./config.yaml', 'r') as file:
    config_data = yaml.safe_load(file)

if not(os.path.exists('./data/maple-s3')):
    !git clone https://github.com/standardSetting/maple-s3.git ./data/maple-s3
    !python -m venv ./data/maple-s3/venv
    !touch ./data/maple-s3/.env

student_participants = extract_student_participants(filepath = './data/maple-s3/participants.xlsx')
nc_dat = extract_country_codes(filepath = './data/maple-s3/ISOT_table.xlsx')

domain_all = ["FLA"]

Student participant data exported
Country code data exported


In [2]:
# Re-read the country code table, then keep only the row(s) whose ISO alpha-3
# code is "BEL".
country_codes_all = extract_country_codes(filepath = './data/maple-s3/ISOT_table.xlsx')
is_selected_country = country_codes_all['isoalpha3'] == "BEL"
nc_dat = country_codes_all.loc[is_selected_country, :]
nc_dat.head()

Country code data exported


Unnamed: 0,isocntcd,isoalpha3,isoname
24,56,BEL,Belgium


## Extract required data

In [33]:
import sys, importlib
# Import-then-reload so this cell also works on a fresh kernel: indexing
# sys.modules directly raises KeyError when the module has not been imported
# yet. import_module returns the already-loaded module when there is one.
importlib.reload(importlib.import_module('test.data_quality.DataQuality'))

<module 'test.data_quality.DataQuality' from 'd:\\Users\\leon.head\\Documents\\pisa2025-api-etl\\test\\data_quality\\DataQuality.py'>

In [3]:
from etl.extract import *
from etl.transform import *


def _print_step(step, frame):
    """Print the row and column count of `frame` after pipeline step `step`."""
    print(f"Step {step}: rows = {frame.shape[0]} & columns = {frame.shape[1]}")


# Run the extract/transform pipeline for every domain and every country row.
# NOTE(review): df9 is overwritten on each iteration, so after the loop it only
# holds the result for the last domain/country processed.
for domain in domain_all:
    cbk = create_codebook(domain = domain)

    # Assigned on every iteration so a non-FLA domain neither hits a NameError
    # nor inherits the value left over from a previous domain.
    gap_vars = domain == 'FLA'

    for idr, row in nc_dat.iterrows():
        country_print = str(row['isoalpha3'])
        print(f"Processing data for: {country_print}")

        extract_json(domain = domain, nc_dat = row ,overwrite = False, con = postgresql_conn(params = config_data['postgresql']))
        # presumably the file written by extract_json above - confirm in etl.extract
        filepath = f"./data/db/{domain.lower()}/{domain}_{country_print}.json"

        df = read_json_file(filepath)
        _print_step(1, df)

        df1 = explode_raw_data(df = df, nc_dat = nc_dat)
        _print_step(2, df1)

        df3 = explode_items(df1)
        _print_step(3, df3)

        df4 = explode_values(df3)
        df4 = rename_variables(df4, domain = domain)
        df4 = check_duplicates(df4)
        df4 = replace_blank_json(df4)
        _print_step(4, df4)

        df6 = explode_responses(df4, domain = domain)
        if domain == 'FLA':
            df6 = fla_recode_FLALDTB1002(df6)
        if gap_vars:
            df6 = gap_recode(df6, cbk)
        _print_step(5, df6)

        # Pass the loop's domain through instead of a hard-coded 'FLA' so the
        # cell stays correct when more domains are added to domain_all.
        df8 = merge_cbk_status(df6, cbk, domain = domain)
        df8 = time_var_recode(df8)
        df8 = score_resp_recode(df8, domain = domain)
        df8 = trailing_missing(df8, cbk = cbk)
        df8 = cmc_item_create(df8, cbk = cbk, domain = domain)
        df9 = merge_participant_info(df8, nc_dat = nc_dat, student_participants = student_participants)
        _print_step(6, df9)

Extracting codebook data from sheet FLA_Reading_CQ
Extracting codebook data from sheet FLA_Listening_CQ
Codebook created for FLA
Processing data for: BEL
Connection to DB established, searching for new records...
Connection to DB closed
Step 1: rows = 1061 & columns = 10
Step 4: rows = 24744 & columns = 23
Step 5: rows = 72650 & columns = 22
Step 6: rows = 73535 & columns = 48


In [4]:
from test.data_quality.DataQuality import DataQuality
from test.utils.utils import create_df_from_dq_results

# Keep only the rows whose in_cq flag is the string '1'. The explicit copy makes
# df_check an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning or depend on whether pandas returned a view of df9.
df_check = df9.loc[df9['in_cq'] == '1', :].copy()
df_check.head()

Unnamed: 0,index,login,last_update_date,testQtiLabel,sessionStartTime,sessionEndTime,language,unit_id,itemId,score,...,isoname,grade,gender,dob_mm,dob_yy,sen,mpop1,ppart1,testAttendance,questionnaireAttendance
1,1.0,10560022052,1718053000000.0,FLA-L-3,2024-05-02 17:30:50,2024-05-02 17:52:57,en-ZZ,FLALDGA1001,cluster1-FLAL03-item-1,1,...,Belgium,4,2,5,2008,0,1.0,1,1.0,1.0
5,5.0,10560031028,1718064000000.0,FLA-L-3,2024-05-02 21:43:39,2024-05-02 22:04:49,en-ZZ,FLALDGA1001,cluster1-FLAL03-item-1,1,...,Belgium,4,1,2,2008,0,1.0,1,1.0,1.0
9,9.0,10560058023,1718078000000.0,FLA-L-3,2024-05-03 17:49:27,2024-05-03 18:12:44,en-ZZ,FLALDGA1001,cluster1-FLAL03-item-1,1,...,Belgium,4,1,7,2008,0,1.0,1,1.0,1.0
13,13.0,10560004066,1718081000000.0,FLA-L-3,2024-04-19 17:36:27,2024-04-19 17:57:10,en-ZZ,FLALDGA1001,cluster1-FLAL03-item-1,1,...,Belgium,4,2,10,2008,1,1.0,1,1.0,1.0
17,17.0,10560004001,1718081000000.0,FLA-L-3,2024-04-19 17:36:49,2024-04-19 18:03:21,en-ZZ,FLALDGA1001,cluster1-FLAL03-item-1,1,...,Belgium,3,1,9,2008,0,1.0,1,1.0,1.0


In [5]:
from test.data_quality.DataQuality import DataQuality
from test.utils.utils import create_df_from_dq_results

# numpy/pandas are imported explicitly so this cell does not depend on the
# names leaking in through the `from etl... import *` statements.
import numpy as np
import pandas as pd

# Map the database score code to a numeric check value:
# '1' -> 1, '0' -> 0, '9' -> 0; any other code stays None.
conditions = [
    df_check['db_score_code'].eq('1'),
    df_check['db_score_code'].eq('0'),
    df_check['db_score_code'].eq('9'),
]
codes = [
    1,0,0
]

# assign() returns a new frame rather than writing a column into what may be a
# slice of df9, which avoids SettingWithCopyWarning.
df_check = df_check.assign(score_check = np.select(conditions, codes, None))

# Units whose codebook response category mentions 'gap' are excluded from the
# comparison below.
gap_unit_ids = cbk.loc[cbk['resp_cat'].str.contains('gap', na=False), 'unit_id'].unique().tolist()

# Sum the per-row check values for each login / unit / score combination.
# The string 'sum' selects the pandas groupby sum directly; passing the builtin
# `sum` is deprecated in pandas.
df_check_sum_score = (
    df_check
    .groupby(['login', 'unit_id', 'score'])
    .agg({'score_check': 'sum'})
    .reset_index()
)

# .copy() keeps the column assignment below from writing into a filtered view.
df_check_sum_score = df_check_sum_score.loc[~df_check_sum_score['unit_id'].isin(gap_unit_ids)].copy()
df_check_sum_score[['score','score_check']] = df_check_sum_score[['score','score_check']].apply(pd.to_numeric)

In [34]:
from test.data_quality.DataQuality import DataQuality

# Build the checker from the aggregated score frame and the JSON config file,
# then run it and keep the results for the reporting cell.
dq = DataQuality(
    df_check_sum_score,
    config_path = './test/config/config.json',
)
dq_results = dq.run_test()

In [35]:
from test.utils.utils import create_df_from_dq_results

# Convert the data-quality results for display; the bare name on the last line
# makes the notebook render the value as the cell output.
dq_results_table = create_df_from_dq_results(dq_results = dq_results)
dq_results_table

Unnamed: 0,column,dimension,status,expectation_type,unexpected_count,element_count,unexpected_percent,percent
0,score; score_check,Validity,PASSED,expect_column_pair_values_to_be_equal,0,16453,0.0,100.0
1,login,Completeness,PASSED,expect_column_values_to_not_be_null,0,16453,0.0,100.0
2,login,Completeness,PASSED,expect_column_value_lengths_to_equal,0,16453,0.0,100.0
3,score,Completeness,PASSED,expect_column_values_to_not_be_null,0,16453,0.0,100.0
