## Prepare / Setup

### Export Maple Data and Environment Variables

In [1]:
import yaml
import os
from etl.prepare import *

with open('./config.yaml', 'r') as file:
    config_data = yaml.safe_load(file)

In [2]:
import psycopg2
import psycopg2.extras
from psycopg2.extras import RealDictCursor
import itertools

params = config_data['ams']

with psycopg2.connect(
    f"dbname={params['database']} user={params['username']} host={params['host']} port = {params['port']} password = {params['password']}",
    cursor_factory=RealDictCursor
) as con:

    with con.cursor() as cur:
        cur.execute(
            f"""
            SELECT student_login as login, item_code as unit_id,criteria_1 as db_resp, right(item_code,1) as item_order
                FROM "mv_pisa_single_results"
                WHERE item_code ~ '^FLA';
            """
        )

        ams_data = pd.DataFrame(cur.fetchall()).rename(columns={'unit_id': 'unit_id'})
        ams_data = ams_data.astype('string')
        ams_data['db_resp'] = ams_data['db_resp'].apply(lambda x: x.rstrip('0').rstrip('.'))

        dat = pd.DataFrame(itertools.product(list(ams_data.login.unique()),["FLA25SP" + str(x + 1) for x in range(0,4)]),columns = ['login','unit_id']).sort_values('login').assign(db_resp = '9')
        dat['item_order'] = dat['unit_id'].apply(lambda x: x[-1:])

        ams_data = pd.merge(dat, ams_data, on=['login','unit_id','item_order'], how='outer', suffixes=('_dat', '_ams'))
        ams_data['db_resp'] = ams_data['db_resp_ams'].combine_first(ams_data['db_resp_dat'])
        ams_data.drop(columns=['db_resp_dat','db_resp_ams'],inplace=True)

        recode_tab = pd.DataFrame(
            {
                'original': list(range(1,14,1),),
                'new': [x * 0.5 for x in range(0,13,1)]
            }
        )

ams_data.head()

Unnamed: 0,login,unit_id,item_order,db_resp
0,10560004001,FLA25SP1,1,4.0
1,10560004001,FLA25SP2,2,4.5
2,10560004001,FLA25SP3,3,4.5
3,10560004001,FLA25SP4,4,5.0
4,10560004005,FLA25SP2,2,4.0


## Extract required data

In [3]:
from etl.extract import *
from etl.transform import *

In [5]:
batch_num = '2'

with postgresql_conn(params = config_data['postgresql_prod']) as conn:
    conn.autocommit = True
    with conn.cursor() as cur:
        # student_participants = extract_student_participants(filepath = '../maple-s3/participants_post.xlsx')
        cur.execute("SELECT * FROM maple.isot_table")
        isot_table = pd.DataFrame(cur.fetchall()).drop_duplicates('isocntcd',keep = 'last')
        isot_table.loc[isot_table['isoalpha3'] == 'QBR',['isoalpha3']] = 'BEL'
        isot_table.loc[isot_table['isoalpha3'] == 'TWN',['isoalpha3']] = 'TAP'
        cur.execute("SELECT * FROM maple.maple_student_post_val")
        student_participants_post = pd.DataFrame(cur.fetchall())
        student_participants_post['username'] = student_participants_post['username'].astype(str)
        student_participants_post['isocntcd'] = student_participants_post['username'].str.slice(1,4)
        student_participants_post.loc[student_participants_post['isoalpha3'].isin(['QBL','QBR']),['isoalpha3']] = 'BEL'

        countries_all = isot_table.isoalpha3.unique()
        countries_now = list(student_participants_post.loc[student_participants_post['batch'] == batch_num,:].isoalpha3.unique())
        countries_post = list(student_participants_post.loc[student_participants_post['batch'] != batch_num,:].isoalpha3.unique())
        countries_pre_init = list(set(countries_all) - set(countries_now) - set(countries_post))

        student_participants_pre = pd.read_excel('../maple-s3/participants_with_entity.xlsx').drop_duplicates(['username'],keep = 'last')
        student_participants_pre['username'] = student_participants_pre['username'].astype(str)
        student_participants_pre['isocntcd'] = student_participants_pre['username'].str.slice(1,4)
        student_participants_pre = student_participants_pre.loc[student_participants_pre['isoalpha3'].isin(countries_pre_init)].drop_duplicates(subset = ['username'],keep = 'last')
        student_participants_pre['login'] = student_participants_pre['username']
        student_participants_pre = student_participants_pre.rename({'testAttendance':'test_attendance','questionnaireAttendance': 'questionnaire_attendance'},axis = 1)
        countries_pre = list(set(student_participants_pre.isoalpha3.unique()) - set(['GBR']))

        student_participants = pd.concat(
            [
                student_participants_pre,
                student_participants_post
            ],
            axis = 0
        )

        student_participants['schid'] = student_participants['username'].str.slice(4,8).str.lstrip('0').astype(int)

        conditions = [
            (student_participants['schid'] <= 118) & (student_participants['isoalpha3'] == 'BEL'),
            (student_participants['schid'] >= 119) & (student_participants['isoalpha3'] == 'BEL')
        ]

        codes_cnt = [
            'QBL',
            'QBR'
        ]

        codes_name = [
            'Belgium (Flemish)',
            'Belgium (French)'
        ]

        student_participants['isoalpha3_new'] = np.select(conditions,codes_cnt,student_participants['isoalpha3'])
        student_participants = student_participants.drop(columns = ['isoalpha3']).rename({'isoalpha3_new':'isoalpha3'},axis = 1)

        student_participants['isoname_new'] = np.select(conditions,codes_name,student_participants['isoname'])
        student_participants = student_participants.drop(columns = ['isoname']).rename({'isoname_new':'isoname'},axis = 1)
        
        nc_dat_now = isot_table.loc[isot_table['isoalpha3'].isin(countries_now)].assign(process='now')
        nc_dat_post = isot_table.loc[isot_table['isoalpha3'].isin(countries_post)].assign(process='post')
        nc_dat_pre = isot_table.loc[isot_table['isoalpha3'].isin(countries_pre)].assign(process='pre')  
        nc_dat = pd.concat(
            [
                nc_dat_post,
                nc_dat_now,
                nc_dat_pre
            ],
            axis = 0
        )
        # nc_dat = extract_country_codes(filepath = './data/maple-s3/ISOT_table.xlsx')

In [6]:
fla_s_dat = pd.merge(
    student_participants.loc[:,['login','isoalpha3']],
    ams_data,
    on = 'login',
    how = 'inner'
).astype('string').assign(in_cq = '1')

fla_s_dat.loc[fla_s_dat['in_cq']=='1',:].groupby(['unit_id','db_resp','isoalpha3']).size().unstack(fill_value=0).to_excel(f'./data/FLASpeaking_freq_Resp_byCnt_{datetime.date.today().strftime('%Y%m%d')}.xlsx')


In [34]:
import sys, importlib
importlib.reload(sys.modules['etl.load'])

KeyError: 'etl.load'

In [7]:
nc_dat2 = nc_dat

In [8]:
test_form_dat = pd.read_csv(
    "./data/delivery_results_test_form.csv"
).rename(columns={"testqtilabel": "testQtiLabel"})

In [9]:
import time

count = 0
domain_all = ["FLA"]

for domain in domain_all:
    cbk = create_codebook(domain = domain)
    
    if(domain == 'FLA'):
        gap_vars = True

    for idr,row in nc_dat2.iterrows():
        start_time = time.time()
        country_print = str(row['isoalpha3'])
        print(f"Processing data for: {country_print}")

        extract_json(domain = domain, nc_dat = row ,overwrite = False, con = postgresql_conn(params = config_data['postgresql_prod']))

        sub_fold = 'pre' if row['process'] == 'pre' else 'post'

        filepath = f"./data/db/{domain.lower()}/{sub_fold}/{domain}_{country_print}.json"

        if(os.path.isfile(filepath)):

            filepath_csv = f"./data/db/{domain.lower()}/{sub_fold}/{domain}_{country_print}.csv"

            # df = read_json_file(filepath)
            # print("Step 1: rows = " + str(df.shape[0]) + ' & columns = ' + str(df.shape[1]))
            
            # df1 = explode_raw_data(df = df)
            # print("Step 2: rows = " + str(df1.shape[0]) + ' & columns = ' + str(df1.shape[1]))

            # df3 = explode_items(df1)
            # print("Step 3: rows = " + str(df3.shape[0]) + ' & columns = ' + str(df3.shape[1]))

            # df4 = explode_values(df3)
            # df4 = rename_variables(df4, domain = domain)
            # df4 = check_duplicates(df4)
            # df4 = replace_blank_json(df4)
            # print("Step 4: rows = " + str(df4.shape[0]) + ' & columns = ' + str(df4.shape[1]))

            # df6 = explode_responses(df4, domain = domain)
            # if(domain == 'FLA'):
            #     df6 = fla_recode_FLALDTB1002(df6)
            #     df6 = gap_recode(df6,cbk)
            #     df6 = rmmb_recode(df6,cbk)
            # print("Step 5: rows = " + str(df6.shape[0]) + ' & columns = ' + str(df6.shape[1]))

            # df8 = merge_cbk_status(df6,cbk,domain,ams_data)
            # df8 = time_var_recode(df8)
            # df8 = score_resp_recode(df8,domain = 'FLA') 
            # df8 = trailing_missing(df8)
            # df8 = cmc_item_create(df8,cbk=cbk, domain = 'FLA')
            # df8.to_csv(filepath_csv)
            df8 = pd.read_csv(filepath_csv)
            df9 = merge_participant_info(df8,student_participants=student_participants)
            print("Rows before merge test form: ",str(df9.shape[0]))
            df9 = pd.merge(
                df9,
                test_form_dat,
                how = 'left',
                on = ['login','testQtiLabel']
            )
            print("Step 6: rows = " + str(df9.shape[0]) + ' & columns = ' + str(df9.shape[1]))

            df_resp_check = sql_query_ge(nc_dat = row,cbk = cbk,con = postgresql_conn(params = config_data['postgresql_prod']))

            if(count == 0):
                df_long = df9
                df_long_check = df_resp_check
            else:
                df_long = pd.concat([df_long,df9],axis = 0)
                df_long_check = pd.concat([df_long_check,df_resp_check],axis = 0)

            if(row['isoalpha3'] != 'PER'):
                df_long_check = df_long_check.loc[df_long_check['unit_id'] != 'FLARMMB2001']
            
            count =+ 1

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f"Time taken for {filepath}: {elapsed_time:.2f} seconds")

        # df9.export_to_postgresql()

Extracting codebook data from sheet FLA_Reading_CQ
Extracting codebook data from sheet FLA_Listening_CQ
Codebook created for FLA
Processing data for: AUT
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: BRN
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: QCY
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 45001
Actual rows: 45001
Rows before merge test form:  45001
Step 6: rows = 45001 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_QCY.json: 18.56 seconds
Processing data for: DEU
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 62245
Actual rows: 62245
Rows before merge test form:  62245
Step 6: rows = 62245 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_DEU.json: 64.60 seconds
Processing data for: DNK
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 44865
Actual rows: 44865
Rows before merge test form:  44865
Step 6: rows = 44865 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_DNK.json: 18.01 seconds
Processing data for: QUK
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: QSC
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: HRV
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 78649
Actual rows: 78649
Rows before merge test form:  78649
Step 6: rows = 78649 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_HRV.json: 47.16 seconds
Processing data for: HUN
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: IDN
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: IRL
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: ITA
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 64804
Actual rows: 64804
Rows before merge test form:  64804
Step 6: rows = 64804 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_ITA.json: 38.61 seconds
Processing data for: LTU
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: MAC
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: MNE
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: MNG
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: NLD
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 32158
Actual rows: 32158
Rows before merge test form:  32158
Step 6: rows = 32158 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_NLD.json: 16.96 seconds
Processing data for: POL
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: PRT
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 72845
Actual rows: 72845
Rows before merge test form:  72845
Step 6: rows = 72845 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_PRT.json: 81.32 seconds
Processing data for: QAT
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 67161
Actual rows: 67161
Rows before merge test form:  67161
Step 6: rows = 67161 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_QAT.json: 35.13 seconds
Processing data for: SAU
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: SGP
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: SRB
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: SVN
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: SWE
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 65454
Actual rows: 65454
Rows before merge test form:  65454
Step 6: rows = 65454 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_SWE.json: 31.78 seconds
Processing data for: TUR
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: URY
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: ARM
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: AUS
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: BEL
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 53429
Actual rows: 53429
Rows before merge test form:  53429
Step 6: rows = 53429 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_BEL.json: 69.54 seconds
Processing data for: BGR
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 81947
Actual rows: 81947
Rows before merge test form:  81947
Step 6: rows = 81947 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_BGR.json: 28.23 seconds
Processing data for: CAN
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: CHE
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: COL
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 62377
Actual rows: 62377
Rows before merge test form:  62377
Step 6: rows = 62377 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_COL.json: 107.54 seconds
Processing data for: CZE
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 65351
Actual rows: 65351
Rows before merge test form:  65351
Step 6: rows = 65351 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_CZE.json: 100.63 seconds
Processing data for: ECU
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: ESP
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 160250
Actual rows: 160250
Rows before merge test form:  160250
Step 6: rows = 160250 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_ESP.json: 187.54 seconds
Processing data for: EST
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: FIN
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 61802
Actual rows: 61802
Rows before merge test form:  61802
Step 6: rows = 61802 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_FIN.json: 65.18 seconds
Processing data for: FRA
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 59799
Actual rows: 59799
Rows before merge test form:  59799
Step 6: rows = 59799 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_FRA.json: 105.91 seconds
Processing data for: GEO
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: GRC
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 60729
Actual rows: 60729
Rows before merge test form:  60729
Step 6: rows = 60729 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_GRC.json: 23.00 seconds
Processing data for: ISL
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: ISR
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 41092
Actual rows: 41092
Rows before merge test form:  41092
Step 6: rows = 41092 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_ISR.json: 67.05 seconds
Processing data for: JOR
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: JPN
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: KAZ
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: KEN
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: KHM
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: KOR
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: LUX
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: LVA
Connection to 

  df8 = pd.read_csv(filepath_csv)


Expected rows: 69152
Actual rows: 69152
Rows before merge test form:  69152
Step 6: rows = 69152 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_ROU.json: 129.02 seconds
Processing data for: RWA
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: TAP
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 63839
Actual rows: 63839
Rows before merge test form:  63839
Step 6: rows = 63839 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_TAP.json: 88.51 seconds
Processing data for: UKR
Connection to DB established, searching for new records...
Connection to DB closed


  df8 = pd.read_csv(filepath_csv)


Expected rows: 59196
Actual rows: 59196
Rows before merge test form:  59196
Step 6: rows = 59196 & columns = 52
Extracting SQL query checks...
Time taken for ./data/db/fla/post/FLA_UKR.json: 19.49 seconds
Processing data for: USA
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: UZB
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: VNM
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: ALB
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: ARG
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: BRA
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: CHL
Connection to DB established, searching for new records...
Connection to DB closed
Processing data for: CRI
Connection to 

In [10]:
df_check = df_long.copy(deep = True)
df_check = df_check.astype('string')
df_check = df_check.loc[df_check['in_cq'] == '1',:]
df_check.head()

Unnamed: 0.1,Unnamed: 0,index,login,last_update_date,testQtiLabel,sessionStartTime,sessionEndTime,language,unit_id,itemId,...,mpop1,ppart1,isoalpha3,isoname,isocntcd,test_attendance,questionnaire_attendance,batch,test_form,level_0
0,0,0.0,11960037071,1711439509263.0,FLA-R-M6-FLA-R-H3-FLA-R-L2,2024-03-26 18:51:21,2024-03-26 18:51:47,en-ZZ,FLARDGSA2006,cluster1-FLAR09-item-1,...,1,1,QCY,Cyprus,196,1,1,1,B087,
1,1,1.0,11960011035,1711526279566.0,FLA-R-M6-FLA-R-H3-FLA-R-L2,2024-03-27 18:44:46,2024-03-27 18:57:57,en-ZZ,FLARDGSA2006,cluster1-FLAR09-item-1,...,1,1,QCY,Cyprus,196,1,1,1,B087,
2,2,2.0,11960040018,1721739530332.0,FLA-R-M6-FLA-R-H3-FLA-R-L2,2024-04-03 17:33:27,2024-04-03 17:57:30,en-ZZ,FLARDGSA2006,cluster1-FLAR09-item-1,...,1,1,QCY,Cyprus,196,1,1,1,B087,
3,3,3.0,11960044002,1721739606878.0,FLA-R-M6-FLA-R-H3-FLA-R-L2,2024-04-03 16:38:52,2024-04-03 16:45:40,en-ZZ,FLARDGSA2006,cluster1-FLAR09-item-1,...,1,1,QCY,Cyprus,196,1,1,1,B087,
4,4,4.0,11960013042,1721741991024.0,FLA-R-M6-FLA-R-H3-FLA-R-L2,2024-04-12 16:34:10,2024-04-12 16:37:54,en-ZZ,FLARDGSA2006,cluster1-FLAR09-item-1,...,1,1,QCY,Cyprus,196,1,1,1,B087,


In [11]:
df_summ_config = {}

conditions = [
    df_check['score_code'].eq('1').astype(bool),
    df_check['score_code'].eq('0').astype(bool),
    df_check['score_code'].eq('9').astype(bool),
]
codes = [
    1,0,0
]

df_check_sum_score = df_check.copy(deep = True)
df_check_sum_score
df_check_sum_score['score_check'] = np.select(conditions,codes,None)

df_check_sum_score = df_check_sum_score.groupby(['username','unit_id','score']).agg({'score_check':sum}).reset_index(inplace=False)
df_check_sum_score = df_check_sum_score[~df_check_sum_score['unit_id'].isin(cbk.loc[cbk['resp_cat'].str.contains('gap',na=False)].unit_id.unique().tolist())]
df_check_sum_score[['score','score_check']] = df_check_sum_score[['score','score_check']].apply(pd.to_numeric)

df_summ_config["df_check_sum_score"] = "config_check_sum_score"

df_check_sum_score.head()

  df_check_sum_score = df_check_sum_score.groupby(['username','unit_id','score']).agg({'score_check':sum}).reset_index(inplace=False)


Unnamed: 0,username,unit_id,score,score_check
0,10560004001,FLAL5IMCB1001,4.0,4
1,10560004001,FLALDGA1001,1.0,1
2,10560004001,FLALDGA1002,1.0,1
3,10560004001,FLALDGA1003,1.0,1
4,10560004001,FLALDGA1004,1.0,1


In [12]:
df_check_sum_score.loc[df_check_sum_score['score'].astype(int) != df_check_sum_score['score_check'].astype(int),:]

Unnamed: 0,username,unit_id,score,score_check
101,10560004012,FLALDGA2014,0.0,1
152,10560004017,FLALDGA2014,0.0,1
206,10560004024,FLALDGA2014,0.0,1
335,10560004043,FLALDGA2014,0.0,1
584,10560004071,FLALDGA2014,0.0,1
...,...,...,...,...
437711,18040182012,FLALDGA2014,0.0,1
437873,18040182029,FLALDGA2014,0.0,1
437915,18040182036,FLALDGA2014,0.0,1
437998,18040182047,FLALDGA2014,0.0,1


In [13]:
df_summ_config["df_check"] = "config_df_check"

In [14]:
from test.data_quality.DataQuality import DataQuality
from test.utils.utils import create_df_from_dq_results

df_summ_tab = []

for k, v in df_summ_config.items():
    dq = DataQuality(globals()[k],config_path=f"./test/config/{v}.json")
    dq_results = dq.run_test()
    dq_table = create_df_from_dq_results(dq_results=dq_results).assign(table=k)
    cols = dq_table.columns.to_list()
    cols = cols[-1:] + cols[:-1]

    df_summ_tab.append(dq_table[cols])

dq_table_all = pd.concat(df_summ_tab)
dq_table_all

Unnamed: 0,table,column,dimension,status,expectation_type,unexpected_count,element_count,unexpected_percent,percent
0,df_check_sum_score,score; score_check,Validity,FAILED,expect_column_pair_values_to_be_equal,6752,424948,1.5889,98.4111
1,df_check_sum_score,username,Completeness,PASSED,expect_column_values_to_not_be_null,0,424948,0.0,100.0
2,df_check_sum_score,username,Completeness,PASSED,expect_column_value_lengths_to_equal,0,424948,0.0,100.0
3,df_check_sum_score,score,Completeness,PASSED,expect_column_values_to_not_be_null,0,424948,0.0,100.0
0,df_check,username,Completeness,PASSED,expect_column_values_to_not_be_null,0,694081,0.0,100.0
1,df_check,username,Completeness,PASSED,expect_column_value_lengths_to_equal,0,694081,0.0,100.0
2,df_check,qtiLabel,Completeness,PASSED,expect_column_values_to_be_in_set,0,694081,0.0,100.0


In [55]:
df_S = df_long.loc[df_long['domain'] == 'FLA-S',:]
login_test = df_S.loc[df_S['score_code'] == 'r','login'].drop_duplicates(keep = 'first').iloc[5]
# login_test = '15280016015'

df_x = df_long.loc[(df_long['login'] == login_test) & (df_long['domain'] == 'FLA-S'),['login','testQtiLabel','qtiLabel','db_resp','score_code']]
df_x.head()

Unnamed: 0,login,testQtiLabel,qtiLabel,db_resp,score_code
43314,11960015030,FLA-S-5,FLAS304,9,r
43315,11960015030,FLA-S-5,FLAS103,9,r
43316,11960015030,FLA-S-5,FLAS203,9,r
43317,11960015030,FLA-S-5,FLAS404,9,r


In [75]:
log_list = df_S.login.unique()
count = 0
for log in log_list:
    four_rows = df_S.loc[df_S['login'] == log,:].shape[0] == 4
    r_score = any(df_S.loc[df_S['login'] == log,:].score_code == 'r')
    if(four_rows and r_score):
        count +=1

print(count)

2238


In [76]:
len(df_S.loc[df_S['score_code'] == 'r','login'].unique())

2238

In [15]:
val_vars = cbk.loc[~cbk['resp_cat'].str.startswith('gap',na=False),:].qtiLabel2.to_list()

df_sql_check = df_long.loc[
    (~pd.isnull(df_long['qtiLabel'])) & (df_long['qtiLabel'].isin(val_vars)) & (~df_long['qtiLabel'].str.endswith('T',na = False)),
    ['login','unit_id','itemId','qtiLabel']
].assign(dat='1').sort_values(['login','qtiLabel']).merge(
    df_long_check[['login','qtiLabel','source']].assign(sql='1'),
    how = 'outer',
    on = ['login','qtiLabel']
)

conditions = [
    df_sql_check['dat'].eq('1') & df_sql_check['sql'].eq('1'),
    df_sql_check['dat'].eq('1') & ~df_sql_check['sql'].eq('1'),
    ~df_sql_check['dat'].eq('1') & df_sql_check['sql'].eq('1'),
]

codes = [
    'match',
    'dat',
    'sql'
]

df_sql_check['source'] = np.select(conditions,codes,'')
df_sql_check.drop(columns = ['sql','dat'],inplace=True)

df_sql_check.loc[df_sql_check['source'] != 'match',:].login.unique()

array(['12500013045', '12500015031', '12500074013', '16040013008',
       '16040013023', '16040013027', '16040013035', '16040019077',
       '16040036050', '16040036065', '16040036069', '16040036077',
       '16040036081', '16040036088', '16040037059', '16040037065',
       '16040040026', '16040040054', '16040040061', '16040061041',
       '16040061072', '16040070004', '16040070016', 'A16040100086',
       'A16040013046', '16040007006', '16040007012', '16040004037',
       '16040004049', '16040004033', '16040058021', '16040058009',
       '16040058033', '16040058063', '16040058014', '16040058068',
       '16040058002', '16040058026', '16040058080', '16040058052',
       '16040058091', '16040058056', '16040058044', '16040058037',
       '16040058087', '16040058075', '16040079020', '16040079027',
       '16040079001', '16040079039', '16040079051', '16040079032',
       '16040079055', '16040079043', '16040031057', '16040028025',
       '16040028008', '16040028001', '16040028074', '1604002

In [78]:
df_long = df_long.astype('string')
df_long.loc[df_long['in_cq']=='1',:].groupby(['qtiLabel','score_code','isoalpha3']).size().unstack(fill_value=0).to_excel(f'./data/FLA_freq_Score_byCnt_{datetime.date.today().strftime('%Y%m%d')}.xlsx')
df_long.loc[df_long['in_cq']=='1',:].groupby(['qtiLabel','score_code']).size().unstack(fill_value=0).to_excel(f'./data/FLA_freq_Score_Overall_{datetime.date.today().strftime('%Y%m%d')}.xlsx')
df_long.loc[df_long['in_cq']=='1',:].groupby(['qtiLabel','cq_cat','isoalpha3']).size().unstack(fill_value=0).to_excel(f'./data/FLA_freq_Resp_byCnt_{datetime.date.today().strftime('%Y%m%d')}.xlsx')
df_long.loc[df_long['in_cq']=='1',:].groupby(['qtiLabel','cq_cat']).size().unstack(fill_value=0).to_excel(f'./data/FLA_freq_Resp_Overall_{datetime.date.today().strftime('%Y%m%d')}.xlsx')

In [16]:
from etl.load import *

make_long_file(df_long, domain = 'FLA')
make_wide_file(df_long, cbk = cbk, domain = 'FLA')