## Prepare / Setup

### Export Maple Data and Environment Variables

In [1]:
import yaml
import os
from etl.prepare import *

# Load the pipeline settings (connection parameters such as host, database,
# username and password) that the later cells read from `config_data`.
# The encoding is pinned because open() otherwise falls back to the platform
# default, which is not UTF-8 on every system (e.g. Windows) and would
# mis-decode any non-ASCII value in the YAML file.
with open('./config.yaml', 'r', encoding='utf-8') as config_file:
    config_data = yaml.safe_load(config_file)


In [None]:
import redshift_connector
from etl.postgresqlschemareader import *

# Connection settings for the Redshift database configured under
# config_data['ams']; timeout and port are fixed values set here.
ams_cfg = config_data['ams']
redshift_params = dict(
    host=ams_cfg['host'],
    database=ams_cfg['database'],
    user=ams_cfg['username'],
    password=ams_cfg['password'],
    timeout=999999,
    port=5439,
)

# List every base table that is not in pg_catalog / information_schema,
# ordered by schema and table name, and print the raw result.
with redshift_connector.connect(**redshift_params) as conn, conn.cursor() as cur:
    cur.execute("""SELECT table_schema, table_name
                      FROM information_schema.tables
                      WHERE table_schema != 'pg_catalog'
                      AND table_schema != 'information_schema'
                      AND table_type='BASE TABLE'
                      ORDER BY table_schema, table_name""")

    tables = cur.fetchall()
    print(tables)

## Extract required data

In [2]:
from etl.extract import *
from etl.transform import *

In [3]:
# Read the ISO country table and the student participant table from the
# production PostgreSQL database. These queries replace the earlier
# Excel-based extract_student_participants / extract_country_codes calls.
# NOTE(review): the by-name column access below assumes the cursor returns
# dict-like rows (so the DataFrames get named columns) - confirm in
# postgresql_conn.
with postgresql_conn(params=config_data['postgresql_prod']) as conn, conn.cursor() as cur:
    cur.execute("SELECT * FROM maple.isot_table")
    isot_table = pd.DataFrame(cur.fetchall())

    cur.execute("SELECT * FROM maple.maple_student_post_val")
    student_participants = pd.DataFrame(cur.fetchall())

    # Countries that have at least one participant with batch == '1'
    # (batch is compared as a string).
    batch_one_mask = student_participants['batch'] == '1'
    countries = list(student_participants.loc[batch_one_mask, 'isoalpha3'].unique())

    # Country rows restricted to those batch-1 countries.
    nc_dat = isot_table.loc[isot_table['isoalpha3'].isin(countries)]

In [32]:
import sys, importlib
# Pick up edits made to etl/load.py without restarting the kernel.
# import_module() guarantees the module is loaded before reload() is called,
# so this cell also works on a fresh kernel where 'etl.load' is not yet in
# sys.modules (a direct sys.modules lookup would raise KeyError there).
importlib.reload(importlib.import_module('etl.load'))

<module 'etl.load' from 'd:\\Users\\leon.head\\Documents\\pisa2025-api-etl\\etl\\load.py'>

In [4]:
# Open a connection to the production PostgreSQL database and keep it, plus a
# cursor, in the notebook namespace as `con` / `cur`.
# NOTE(review): nothing in the visible cells closes `con` or uses `cur` -
# confirm they are still needed, or close them once the run is finished.
con = postgresql_conn(params = config_data['postgresql_prod'])
# Presumably a DB-API style connection where autocommit makes every statement
# commit immediately - verify against postgresql_conn.
con.autocommit = True
cur = con.cursor()

In [6]:
import glob, time
from etl.extract import *
from etl.transform import *

# Run the extract/transform pipeline once per domain and country.
# Each country's processed frame is collected in `country_frames` and the
# frames are stacked into `df_long` in a single concat after the loops
# (concatenating inside the loop re-copies all earlier rows on every pass).
count = 0                # number of country files processed so far
country_frames = []      # one processed DataFrame per country file found
domain_all = ["FLA"]

for domain in domain_all:
    cbk = create_codebook(domain = domain)

    # Bound for every domain, so a non-FLA domain neither raises NameError nor
    # inherits the value left over from a previous domain.
    gap_vars = (domain == 'FLA')

    for idr, row in nc_dat.iterrows():
        start_time = time.time()
        country_print = str(row['isoalpha3'])
        print(f"Processing data for: {country_print}")

        # A new connection is opened for each country and handed to extract_json.
        extract_json(domain = domain, nc_dat = row, overwrite = True, con = postgresql_conn(params = config_data['postgresql_prod']))

        filepath = f"./data/db/{domain.lower()}/{domain}_{country_print}.json"

        # Countries for which no JSON file exists are skipped.
        if not os.path.isfile(filepath):
            continue

        df = read_json_file(filepath)
        print("Step 1: rows = " + str(df.shape[0]) + ' & columns = ' + str(df.shape[1]))

        df1 = explode_raw_data(df = df, nc_dat = nc_dat)
        print("Step 2: rows = " + str(df1.shape[0]) + ' & columns = ' + str(df1.shape[1]))

        df3 = explode_items(df1)
        print("Step 3: rows = " + str(df3.shape[0]) + ' & columns = ' + str(df3.shape[1]))

        df4 = explode_values(df3)
        df4 = rename_variables(df4, domain = domain)
        df4 = check_duplicates(df4)
        df4 = replace_blank_json(df4)
        print("Step 4: rows = " + str(df4.shape[0]) + ' & columns = ' + str(df4.shape[1]))

        df6 = explode_responses(df4, domain = domain)
        if domain == 'FLA':
            df6 = fla_recode_FLALDTB1002(df6)
        if gap_vars:
            df6 = gap_recode(df6, cbk)
        print("Step 5: rows = " + str(df6.shape[0]) + ' & columns = ' + str(df6.shape[1]))

        # The loop's own domain is passed on (instead of a hard-coded 'FLA') so
        # the codebook merge and recodes stay in step with `cbk`.
        df8 = merge_cbk_status(df6, cbk, domain = domain)
        df8 = time_var_recode(df8)
        df8 = score_resp_recode(df8, domain = domain)
        df8 = trailing_missing(df8, cbk = cbk)
        df8 = cmc_item_create(df8, cbk = cbk, domain = domain)
        df9 = merge_participant_info(df8, nc_dat = nc_dat, student_participants = student_participants)
        print("Step 6: rows = " + str(df9.shape[0]) + ' & columns = ' + str(df9.shape[1]))

        country_frames.append(df9)
        count += 1

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Time taken for {filepath}: {elapsed_time:.2f} seconds")

        # TODO: df9.export_to_postgresql()

# Fail loudly rather than leaving df_long undefined (or stale from an earlier
# run) when no country produced a JSON file.
if not country_frames:
    raise RuntimeError("No country JSON files were processed; df_long cannot be built.")

df_long = pd.concat(country_frames, axis = 0)

Extracting codebook data from sheet FLA_Reading_CQ
Extracting codebook data from sheet FLA_Listening_CQ
Codebook created for FLA
Processing data for: AUT
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: BRN
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: QCY
Connection to DB established, searching for new records...
JSON file created for QCY
Connection to DB closed
Step 1: rows = 897 & columns = 9
Step 2: rows = 897 & columns = 33
Step 3: rows = 20945 & columns = 8
Step 4: rows = 20945 & columns = 23
Step 5: rows = 61431 & columns = 22
Step 6: rows = 62153 & columns = 49
Time taken for ./data/db/fla/FLA_QCY.json: 98.59 seconds
Processing data for: DEU
Connection to DB established, searching for new records...
JSON file created for DEU
Connection to DB closed
Step 1: rows = 1239 & columns = 9
Step 2: rows = 1239 & columns = 33
Step 3: rows = 28930 & columns = 8
Step 4: rows = 2

In [8]:
from test.data_quality.DataQuality import DataQuality
from test.utils.utils import create_df_from_dq_results

# Restrict the combined data to rows whose in_cq flag equals the string '1';
# this subset feeds the score checks. The last line previews it.
in_cq_mask = df_long['in_cq'].eq('1')
df_check = df_long.loc[in_cq_mask, :]
df_check.head()

Unnamed: 0,index,login,last_update_date,testQtiLabel,sessionStartTime,sessionEndTime,language,unit_id,itemId,score,...,username,grade,gender,dob_mm,dob_yy,sen,mpop1,ppart1,test_attendance,questionnaire_attendance
1,1.0,11960023044,1720221000000.0,FLA-L-4,2024-04-09 17:45:48,2024-04-09 18:06:45,en-ZZ,FLALDGA1008,cluster1-FLAL04-item-1,1,...,11960023044,10,2,11,2008,0,1,1,1,1
5,5.0,11960037034,1711444000000.0,FLA-L-4,2024-03-26 19:44:51,2024-03-26 20:10:00,en-ZZ,FLALDGA1008,cluster1-FLAL04-item-1,0,...,11960037034,10,1,6,2008,0,1,1,1,1
9,9.0,11960042007,1711530000000.0,FLA-L-4,2024-03-27 19:34:16,2024-03-27 20:07:17,en-ZZ,FLALDGA1008,cluster1-FLAL04-item-1,1,...,11960042007,10,1,4,2008,0,1,1,1,1
13,13.0,11960011051,1711531000000.0,FLA-L-4,2024-03-27 19:51:40,2024-03-27 20:15:42,en-ZZ,FLALDGA1008,cluster1-FLAL04-item-1,1,...,11960011051,10,2,6,2008,0,1,1,1,1
17,17.0,11960048016,1711613000000.0,FLA-L-4,2024-03-28 18:45:08,2024-03-28 19:11:19,en-ZZ,FLALDGA1008,cluster1-FLAL04-item-1,1,...,11960048016,10,1,7,2008,0,1,1,1,1


In [9]:
df_summ = {}

# Map the stored score code to the score it should contribute:
# '1' -> 1, '0' -> 0, '9' -> 0; any other code is left as None.
db_score_code = df_check['db_score_code']
conditions = [
    db_score_code.eq('1'),
    db_score_code.eq('0'),
    db_score_code.eq('9'),
]
codes = [1, 0, 0]

df_check_sum_score = df_check.copy(deep = True)
df_check_sum_score['score_check'] = np.select(conditions, codes, None)

# Total the recoded values per (username, unit_id, score) group.
# The string alias 'sum' is used because pandas deprecated passing the builtin
# `sum` as an aggregation callable; the alias keeps today's behaviour.
df_check_sum_score = (
    df_check_sum_score
    .groupby(['username', 'unit_id', 'score'])
    .agg({'score_check': 'sum'})
    .reset_index()
)

# Exclude units whose codebook resp_cat contains 'gap'. The explicit copy lets
# the column assignment below write to an independent frame instead of a
# filtered view (avoids SettingWithCopyWarning).
gap_unit_ids = cbk.loc[cbk['resp_cat'].str.contains('gap', na=False), 'unit_id'].unique().tolist()
df_check_sum_score = df_check_sum_score[~df_check_sum_score['unit_id'].isin(gap_unit_ids)].copy()

# Both columns must be numeric for the score / score_check comparison.
df_check_sum_score[['score', 'score_check']] = df_check_sum_score[['score', 'score_check']].apply(pd.to_numeric)

# Maps the name of each frame to check to its data-quality config name.
df_summ_config = {
    "df_check_sum_score": "config_check_sum_score"
}

In [10]:
from test.data_quality.DataQuality import DataQuality
from test.utils.utils import create_df_from_dq_results

# Run the data-quality suite on every frame named in df_summ_config and keep
# each result table in df_summ_tab under the frame's name.
df_summ_tab = {}

for frame_name, config_name in df_summ_config.items():
    # NOTE(review): `config_name` is not used - every frame is checked against
    # the same ./test/config/config.json. Confirm whether a per-frame config
    # file was intended.
    dq = DataQuality(globals()[frame_name], config_path="./test/config/config.json")
    dq_results = dq.run_test()
    dq_table = create_df_from_dq_results(dq_results=dq_results)

    # Keep the result per frame; resetting the dict here would discard it.
    # `dq_table` itself stays bound to the last frame's results.
    df_summ_tab[frame_name] = dq_table

In [11]:
# Preview the first rows of the data-quality result table.
dq_table.head()

Unnamed: 0,column,dimension,status,expectation_type,unexpected_count,element_count,unexpected_percent,percent
0,score; score_check,Validity,PASSED,expect_column_pair_values_to_be_equal,0,164941,0.0,100.0
1,username,Completeness,PASSED,expect_column_values_to_not_be_null,0,164941,0.0,100.0
2,username,Completeness,PASSED,expect_column_value_lengths_to_equal,0,164941,0.0,100.0
3,score,Completeness,PASSED,expect_column_values_to_not_be_null,0,164941,0.0,100.0


In [12]:
# Frequency tables for the rows flagged in_cq == '1'.
# The filter is computed once and the four exports are driven from one table
# of (grouping columns, output path); the last grouping column becomes the
# spreadsheet's columns via unstack(), with missing combinations filled as 0.
cq_rows = df_long.loc[df_long['in_cq'] == '1', :]

freq_exports = [
    (['qtiLabel', 'score_code', 'isoalpha3'], './data/FLA_freq_Score_byCnt.xlsx'),
    (['qtiLabel', 'score_code'], './data/FLA_freq_Score_Overall.xlsx'),
    (['qtiLabel', 'cq_cat', 'isoalpha3'], './data/FLA_freq_Resp_byCnt.xlsx'),
    (['qtiLabel', 'cq_cat'], './data/FLA_freq_Resp_Overall.xlsx'),
]

for group_cols, out_path in freq_exports:
    cq_rows.groupby(group_cols).size().unstack(fill_value=0).to_excel(out_path)

In [40]:
from etl.load import *

# Hand the combined FLA data to the output helpers.
# NOTE(review): where and in which format the files are written is decided
# inside make_long_file / make_wide_file and is not visible here - presumably
# a long-format and a wide-format export; confirm in etl.load.
make_long_file(df_long, domain = 'FLA')
# The wide file additionally receives the codebook `cbk`.
make_wide_file(df_long,cbk = cbk)