- PY 2023, Q4: January 1, 2022, and June 30, 2024
- PY 2021, Q4: January 1, 2020 and June 30, 2022x
- PY 2019, Q4: January 1, 2018 and June 30, 2020
- PY 2017, Q4: July 1, 2016 and June 30, 2018
- PY 2015:  January 1, 2014 and June 30, 2016
- PY 2013: January 1, 2012 and June 30, 2014

In [1]:
import polars as pl
import numpy as np

In [2]:
lf2013 = pl.scan_csv("../wioa_performance_records/PublicWIASRD2013q4.csv")
lf2015 = pl.scan_csv("../wioa_performance_records/PublicWIASRD2015Q4.csv")
lf2017 = pl.scan_csv("../wioa_performance_records/WIOAPerformanceRecords_PY2017Q4_Public_csv")
lf2019 = pl.scan_csv("../wioa_performance_records/WIOAPerformanceRecords_PY2019Q4_Public_csv")
lf2021 = pl.scan_csv("../wioa_performance_records/WIOAPerformanceRecords_PY2021Q4_PUBLIC_csv")
lf2023 = pl.scan_csv("../wioa_performance_records/WIOAPerformanceRecords_PY2023Q4_PUBLIC.csv")
lf2024 = pl.scan_csv("../wioa_performance_records/WIOAPerformanceRecords_PY2024Q3_PUBLIC.csv")

In [26]:

sex_map = {
    1: "Male",
    2: "Female",
    9: "Participant did not self-identify"
}

race_map = {
    1: "Hispanic",
    2: "Asian (not Hispanic)",
    3: "Black (not Hispanic)",
    4: "Native Hawaiian or Pacific Islander (not Hispanic)",
    5: "American Indian or Alaska Native (not Hispanic)",
    6: "White (not Hispanic)",
    7: "Multiple Race (not Hispanic)",
}

highest_educational_level_map = {
    1: "Attained secondary school diploma",
    2: "Attained a secondary school equivalency",
    3: "The participant with a disability receives a certificate of attendance/completion as a result of successfully completing an Individualized Education Program (IEP)",
    4: "Completed one of more years of postsecondary education",
    5: "Attained a postsecondary technical or vocational certificate (non-degree)",
    6: "Attained an Associate's degree",
    7: "Attained a Bachelor's degree",
    8: "Attained a degree beyond a Bachelor's degree",
    0: "No Educational Level Completed"
}

In [29]:
df2024 = (
    lf2024.select(
        pl.col("PIRL100").alias("unique_id"),

        # Demographics Information
        pl.col("PIRL201").alias("sex"),
        pl.col("CALC4020").alias("race"),
        pl.col("CALC4039").alias("age"),
        pl.col("PIRL408").alias("highest_educational_level"),
        pl.col("PIRL802").alias("low_income_status"),
        pl.col("PIRL400").alias("employment_status"),

        # Pre-Program Employment
        pl.col("PIRL403").alias("occupational_code_pre"),
        pl.col("PIRL404").alias("industry_code_q1_pre"),
        pl.col("PIRL404").alias("industry_code_q2_pre"),
        pl.col("PIRL406").alias("industry_code_q3_pre"),
        pl.col("PIRL1700").alias("wages_3q_pre"),
        pl.col("PIRL1701").alias("wages_2q_pre"),
        pl.col("PIRL1702").alias("wages_1q_pre"),

        # Post-Program Employment
        pl.col("PIRL1610").alias("occupational_code_post"),
        pl.col("PIRL1614").alias("industry_code_q1_post"),
        pl.col("PIRL1615").alias("industry_code_q2_post"),
        pl.col("PIRL1616").alias("industry_code_q3_post"),
        pl.col("PIRL1617").alias("industry_code_q4_post"),
        pl.col("PIRL1703").alias("wages_1q_post"),
        pl.col("PIRL1704").alias("wages_2q_post"),
        pl.col("PIRL1705").alias("wages_3q_post"),
        pl.col("PIRL1706").alias("wages_4q_post"),
    
        # Program Information
        (pl.col("CALC4001") == 1).alias("is_adult"),
        ((pl.col("CALC4002") == 1) | (pl.col("CALC4004") == 1)).alias("is_dislocated_worker"),
        (pl.col("CALC4003") == 1).alias("is_youth"),
        (pl.col("CALC4005") == 1).alias("is_wagner_peyser"),
        ((pl.col("CALC4006") == 1)).alias("is_reportable_individual"),
        (pl.col("PIRL1300") == 1).alias("received_training"),
        pl.col("PIRL900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("entry_date"),
        pl.col("PIRL901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter"),
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [28]:
df2024

unique_id,sex,race,age,highest_educational_level,low_income_status,employment_status,occupational_code_pre,industry_code_q1_pre,industry_code_q2_pre,industry_code_q3_pre,wages_3q_pre,wages_2q_pre,wages_1q_pre,occupational_code_post,industry_code_q1_post,industry_code_q2_post,industry_code_q3_post,industry_code_q4_post,wages_1q_post,wages_2q_post,wages_3q_post,wages_4q_post,is_adult,is_dislocated_worker,is_youth,is_wagner_peyser,is_reportable_individual,received_training,entry_date,exit_date,entry_year,entry_quarter,exit_year,exit_quarter
str,str,str,i64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,bool,bool,bool,bool,bool,bool,date,date,i32,i8,i32,i8
"""000FDcUacdAd""","""Male""",,33,"""Attained secondary school dipl…",1,1,,531190,531190,531190,14563,15497,16697,499071,531100,,,,14686,,,,false,false,false,true,false,false,2024-06-18,2024-06-18,2024,2,2024,2
"""000f954d4DfA""","""Male""","""White (not Hispanic)""",40,"""Attained a secondary school eq…",0,1,518000,457210,457210,622110,20485,30141,20751,518000,457210,424710,,,25780,27876,,,false,false,false,true,false,false,2024-03-15,2024-03-15,2024,1,2024,1
"""004A9FbAAa0F""","""Male""","""White (not Hispanic)""",23,"""Attained secondary school dipl…",1,0,,,,,,,,,,,,,,,,,false,false,true,false,false,true,2024-04-03,2024-09-30,2024,2,2024,3
"""004ASD4Se5fb""","""Male""","""Asian (not Hispanic)""",22,"""Attained a secondary school eq…",0,1,353031,722511,722511,722511,1613,4170,8704,,,,,,,,,,false,false,false,true,false,false,2024-12-27,2025-01-13,2024,4,2025,1
"""005Sf0abd50b""","""Male""","""White (not Hispanic)""",27,"""Completed one of more years of…",0,1,537000,213112,213112,441340,5476,11540,17212,533032,424810,424810,424810,424810,17168,23084,19993,19852,false,true,false,true,false,true,2022-10-28,2023-03-07,2022,4,2023,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""ffef5c0044Db""","""Male""","""White (not Hispanic)""",23,"""Attained secondary school dipl…",1,3,512099,722513,722513,722513,2999,10065,7674,,623990,623990,,,2337,3895,,,false,false,false,true,false,false,2024-03-01,2024-03-01,2024,1,2024,1
"""fffSdDLUcDDf""","""Male""","""American Indian or Alaska Nati…",20,"""Attained secondary school dipl…",0,0,,,,,,,,,455211,455211,455211,,7728,7793,6958,0,false,false,false,true,false,false,2023-08-03,2023-08-03,2023,3,2023,3
"""fffUS4004bFe""","""Male""","""White (not Hispanic)""",70,"""Attained a postsecondary techn…",1,1,339032,561612,561612,561612,7231,5548,6211,,,,,,,,,,false,false,false,true,false,false,2024-08-07,2024-08-08,2024,3,2024,3
"""fffe0fLUfUc5""","""Female""","""White (not Hispanic)""",26,"""Attained secondary school dipl…",0,0,435032,921140,921140,921140,13844,13669,11743,,,,,,0,,,,false,false,false,true,false,false,2024-04-08,2024-05-24,2024,2,2024,2


In [11]:
df2023 = (
    lf2023.select(
        pl.col("PIRL100").alias("unique_id"),

        # Demographics Information
        pl.col("PIRL201").alias("sex"),
        pl.col("CALC4020").alias("race"),
        pl.col("CALC4039").alias("age"),
        pl.col("PIRL408").alias("highest_educational_level"),
        pl.col("PIRL802").alias("low_income_status"),
        pl.col("PIRL400").alias("employment_status"),

        # Pre-Program Employment
        pl.col("PIRL403").alias("occupational_code_pre"),
        pl.col("PIRL404").alias("industry_code_q1_pre"),
        pl.col("PIRL404").alias("industry_code_q2_pre"),
        pl.col("PIRL406").alias("industry_code_q3_pre"),
        pl.col("PIRL1700").alias("wages_3q_pre"),
        pl.col("PIRL1701").alias("wages_2q_pre"),
        pl.col("PIRL1702").alias("wages_1q_pre"),

        # Post-Program Employment
        pl.col("PIRL1610").alias("occupational_code_post"),
        pl.col("PIRL1614").alias("industry_code_q1_post"),
        pl.col("PIRL1615").alias("industry_code_q2_post"),
        pl.col("PIRL1616").alias("industry_code_q3_post"),
        pl.col("PIRL1617").alias("industry_code_q4_post"),
        pl.col("PIRL1703").alias("wages_1q_post"),
        pl.col("PIRL1704").alias("wages_2q_post"),
        pl.col("PIRL1705").alias("wages_3q_post"),
        pl.col("PIRL1706").alias("wages_4q_post"),
        
        # Program Information
        (pl.col("CALC4001") == 1).alias("is_adult"),
        ((pl.col("CALC4002") == 1) | (pl.col("CALC4004") == 1)).alias("is_dislocated_worker"),
        (pl.col("CALC4003") == 1).alias("is_youth"),
        (pl.col("CALC4005") == 1).alias("is_wagner_peyser"),
        ((pl.col("CALC4006") == 1)).alias("is_reportable_individual"),
        (pl.col("PIRL1300") == 1).alias("received_training"),
        pl.col("PIRL900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("entry_date"),
        pl.col("PIRL901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [15]:
df2021 = (
    lf2021.select(
        pl.col("PIRL100").alias("unique_id"),

        # Demographics Information
        pl.col("PIRL201").alias("sex"),
        pl.col("PIRL4020").alias("race"),
        pl.col("PIRL4039").alias("age"),
        pl.col("PIRL408").alias("highest_educational_level"),
        pl.col("PIRL802").alias("low_income_status"),
        pl.col("PIRL400").alias("employment_status"),

        # Pre-Separation Employment
        pl.col("PIRL403").alias("occupational_code_pre"),
        pl.col("PIRL404").alias("industry_code_q1_pre"),
        pl.col("PIRL404").alias("industry_code_q2_pre"),
        pl.col("PIRL406").alias("industry_code_q3_pre"),
        pl.col("PIRL1700").alias("wages_3q_pre"),
        pl.col("PIRL1701").alias("wages_2q_pre"),
        pl.col("PIRL1702").alias("wages_1q_pre"),

        # Post-Separation Employment
        pl.col("PIRL1610").alias("occupational_code_post"),
        pl.col("PIRL1614").alias("industry_code_q1_post"),
        pl.col("PIRL1615").alias("industry_code_q2_post"),
        pl.col("PIRL1616").alias("industry_code_q3_post"),
        pl.col("PIRL1617").alias("industry_code_q4_post"),
        pl.col("PIRL1703").alias("wages_1q_post"),
        pl.col("PIRL1704").alias("wages_2q_post"),
        pl.col("PIRL1705").alias("wages_3q_post"),
        pl.col("PIRL1706").alias("wages_4q_post"),
        
        # Program Information
        (pl.col("PIRL4001") == 1).alias("is_adult"),
        ((pl.col("PIRL4002") == 1) | (pl.col("PIRL4004") == 1)).alias("is_dislocated_worker"),
        (pl.col("PIRL4003") == 1).alias("is_youth"),
        (pl.col("PIRL4005") == 1).alias("is_wagner_peyser"),
        ((pl.col("PIRL4006") == 1)).alias("is_reportable_individual"),
        (pl.col("PIRL1300") == 1).alias("received_training"),
        pl.col("PIRL900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("entry_date"),
        pl.col("PIRL901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [17]:
df2019 = (
    lf2019.select(
        pl.col("PIRL100").alias("unique_id"),

        # Demographics Information
        pl.col("PIRL201").alias("sex"),
        pl.col("PIRL 3023").alias("race"),
        pl.col("PIRL 3042").alias("age"),
        pl.col("PIRL408").alias("highest_educational_level"),
        pl.col("PIRL802").alias("low_income_status"),
        pl.col("PIRL400").alias("employment_status"),

        # Pre-Separation Employment
        pl.col("PIRL403").alias("occupational_code_pre").cast(pl.Int64),
        pl.col("PIRL404").alias("industry_code_q1_pre"),
        pl.col("PIRL404").alias("industry_code_q2_pre"),
        pl.col("PIRL406").alias("industry_code_q3_pre"),
        pl.col("PIRL1700").alias("wages_3q_pre"),
        pl.col("PIRL1701").alias("wages_2q_pre"),
        pl.col("PIRL1702").alias("wages_1q_pre"),

        # Post-Separation Employment
        pl.col("PIRL1610").alias("occupational_code_post").cast(pl.Int64),
        pl.col("PIRL1614").alias("industry_code_q1_post"),
        pl.col("PIRL1615").alias("industry_code_q2_post"),
        pl.col("PIRL1616").alias("industry_code_q3_post"),
        pl.col("PIRL1617").alias("industry_code_q4_post"),
        pl.col("PIRL1703").alias("wages_1q_post"),
        pl.col("PIRL1704").alias("wages_2q_post"),
        pl.col("PIRL1705").alias("wages_3q_post"),
        pl.col("PIRL1706").alias("wages_4q_post"),
        
        # Program Information
        (pl.col("PIRL 3001") == 1).alias("is_adult"),
        ((pl.col("PIRL 3002") == 1) | (pl.col("PIRL 3004") == 1)).alias("is_dislocated_worker"),
        (pl.col("PIRL 3003") == 1).alias("is_youth"),
        (pl.col("PIRL 3005") == 1).alias("is_wagner_peyser"),
        ((pl.col("PIRL 3006") == 1)).alias("is_reportable_individual"),
        (pl.col("PIRL1300") == 1).alias("received_training"),
        pl.col("PIRL900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("entry_date"),
        pl.col("PIRL901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [22]:
df2017 = (
    lf2017.select(
        pl.col("PIRL 100").alias("unique_id"),

        # Demographics Information
        pl.col("PIRL 201").alias("sex"),
        pl.col("PIRL 3023").alias("race"),
        pl.col("PIRL 3042").alias("age"),
        pl.col("PIRL 408").alias("highest_educational_level"),
        pl.col("PIRL 802").alias("low_income_status"),
        pl.col("PIRL 400").alias("employment_status"),

        # Pre-Separation Employment
        pl.col("PIRL 403").alias("occupational_code_pre").cast(pl.Int64),
        pl.col("PIRL 404").alias("industry_code_q1_pre"),
        pl.col("PIRL 404").alias("industry_code_q2_pre"),
        pl.col("PIRL 406").alias("industry_code_q3_pre"),
        pl.col("PIRL 1700").alias("wages_3q_pre"),
        pl.col("PIRL 1701").alias("wages_2q_pre"),
        pl.col("PIRL 1702").alias("wages_1q_pre"),

        # Post-Separation Employment
        pl.col("PIRL 1610").alias("occupational_code_post").cast(pl.Int64),
        pl.col("PIRL 1614").alias("industry_code_q1_post"),
        pl.col("PIRL 1615").alias("industry_code_q2_post"),
        pl.col("PIRL 1616").alias("industry_code_q3_post"),
        pl.col("PIRL 1617").alias("industry_code_q4_post"),
        pl.col("PIRL 1703").alias("wages_1q_post"),
        pl.col("PIRL 1704").alias("wages_2q_post"),
        pl.col("PIRL 1705").alias("wages_3q_post"),
        pl.col("PIRL 1706").alias("wages_4q_post"),
        
        # Program Information
        (pl.col("PIRL 3001") == 1).alias("is_adult"),
        ((pl.col("PIRL 3002") == 1) | (pl.col("PIRL 3004") == 1)).alias("is_dislocated_worker"),
        (pl.col("PIRL 3003") == 1).alias("is_youth"),
        (pl.col("PIRL 3005") == 1).alias("is_wagner_peyser"),
        ((pl.col("PIRL 3006") == 1)).alias("is_reportable_individual"),
        (pl.col("PIRL 1300") == 1).alias("received_training"),
        pl.col("PIRL 900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("entry_date"),
        pl.col("PIRL 901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [31]:
df2015 = (
    lf2015.select(
        pl.col("Item_100").alias("unique_id"),

        # Demographics Information
        pl.col("Item_201").alias("sex"),
        pl.col("Item_3006").alias("race"),
        pl.col("Item_3004").alias("age"),
        pl.col("Item_410").alias("highest_educational_level"), # TODO(jcanedy27): To add appropriate mapping.
        pl.col("Item_702").alias("low_income_status").cast(pl.Int64, strict=False),
        pl.col("Item_400").alias("employment_status"),


        # Pre-Separation Employment
        pl.col("Item_402").alias("occupational_code_pre").cast(pl.Int64),
        pl.col("Item_403").alias("industry_code_q1_pre").cast(pl.Int64),
        pl.col("Item_404").alias("industry_code_q2_pre").cast(pl.Int64),
        pl.col("Item_405").alias("industry_code_q3_pre").cast(pl.Int64),
        pl.col("Item_1600").alias("wages_3q_pre").cast(pl.Int64),
        pl.col("Item_1601").alias("wages_2q_pre").cast(pl.Int64),
        pl.col("Item_1602").alias("wages_1q_pre").cast(pl.Int64),

        # Post-Separation Employment
        pl.col("Item_1502").alias("occupational_code_post").cast(pl.Int64),
        pl.col("Item_1514").alias("industry_code_q1_post").cast(pl.Int64),
        pl.col("Item_1516").alias("industry_code_q2_post").cast(pl.Int64),
        pl.col("Item_1517").alias("industry_code_q3_post").cast(pl.Int64),
        pl.col("Item_1518").alias("industry_code_q4_post").cast(pl.Int64),
        pl.col("Item_1603").alias("wages_1q_post").cast(pl.Int64),
        pl.col("Item_1604").alias("wages_2q_post").cast(pl.Int64),
        pl.col("Item_1605").alias("wages_3q_post").cast(pl.Int64),
        pl.col("Item_1606").alias("wages_4q_post").cast(pl.Int64),

        # Program Information
        (pl.col("Item_3007") == 1).alias("is_adult"),
        ((pl.col("Item_3008") == 1) | (pl.col("Item_3009") == 1) | (pl.col("Item_3010") == 1)).alias("is_dislocated_worker"),
        ((pl.col("Item_3011") == 1) | (pl.col("Item_3012") == 1)).alias("is_youth"),
        (pl.col("Item_951") == 1).alias("is_wagner_peyser"),
        (pl.col("Item_3013") == 0).alias("is_reportable_individual"),
        (pl.col("Item_3014") == 1).alias("received_training"),
        pl.col("Item_900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%m/%d/%Y")
            .alias("entry_date"),
        pl.col("Item_901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%m/%d/%Y")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [30]:
df2013 = (
    lf2013.select(
        pl.col("Item_100").alias("unique_id"),
        
        # Demographics Information
        pl.col("Item_201").alias("sex"),
        pl.col("Item_3006").alias("race").cast(pl.Int64, strict=False),
        pl.col("Item_3004").alias("age"),
        pl.col("Item_410").alias("highest_educational_level"), # TODO(jcanedy27): To add appropriate mapping.
        pl.col("Item_702").alias("low_income_status").cast(pl.Int64, strict=False),
        pl.col("Item_400").alias("employment_status"),

        # Pre-Separation Employment
        pl.col("Item_402").alias("occupational_code_pre").cast(pl.Int64, strict=False),
        pl.col("Item_403").alias("industry_code_q1_pre").cast(pl.Int64, strict=False),
        pl.col("Item_404").alias("industry_code_q2_pre").cast(pl.Int64, strict=False),
        pl.col("Item_405").alias("industry_code_q3_pre").cast(pl.Int64, strict=False),
        pl.col("Item_1600").alias("wages_3q_pre").cast(pl.Int64, strict=False),
        pl.col("Item_1601").alias("wages_2q_pre").cast(pl.Int64, strict=False),
        pl.col("Item_1602").alias("wages_1q_pre").cast(pl.Int64, strict=False),

        # Post-Separation Employment
        pl.col("Item_1502").alias("occupational_code_post").cast(pl.Int64, strict=False),
        pl.col("Item_1514").alias("industry_code_q1_post").cast(pl.Int64, strict=False),
        pl.col("Item_1516").alias("industry_code_q2_post").cast(pl.Int64, strict=False),
        pl.col("Item_1517").alias("industry_code_q3_post").cast(pl.Int64, strict=False),
        pl.col("Item_1518").alias("industry_code_q4_post").cast(pl.Int64, strict=False),
        pl.col("Item_1603").alias("wages_1q_post").cast(pl.Int64, strict=False),
        pl.col("Item_1604").alias("wages_2q_post").cast(pl.Int64, strict=False),
        pl.col("Item_1605").alias("wages_3q_post").cast(pl.Int64, strict=False),
        pl.col("Item_1606").alias("wages_4q_post").cast(pl.Int64, strict=False),

        # Program Information
        (pl.col("Item_3007") == 1).alias("is_adult"),
        ((pl.col("Item_3008") == 1) | (pl.col("Item_3009") == 1) | (pl.col("Item_3010") == 1)).alias("is_dislocated_worker"),
        ((pl.col("Item_3011") == 1) | (pl.col("Item_3012") == 1)).alias("is_youth"),
        (pl.col("Item_951") == 1).alias("is_wagner_peyser"),
        (pl.col("Item_3013") == "0").alias("is_reportable_individual"),
        (pl.col("Item_3014") == "1").alias("received_training"),
        pl.col("Item_900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%m/%d/%Y", strict=False)
            .alias("entry_date"),
        pl.col("Item_901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%m/%d/%Y", strict=False)
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [None]:
df_all = (
    pl.concat([df2013, df2015, df2017, df2019, df2021, df2023, df2024])
    .unique(subset=["unique_id", "entry_date", "exit_date"])
)

In [42]:
df_all.filter(
    (pl.col("is_adult") | pl.col("is_dislocated_worker") | pl.col("is_youth")),
    pl.col("received_training"),
    (pl.col("exit_date") >= pl.lit("2012-01-01").str.strptime(pl.Date)) & (pl.col("exit_date") < pl.lit("2024-01-01").str.strptime(pl.Date))
)

unique_id,sex,race,age,highest_educational_level,low_income_status,employment_status,occupational_code_pre,industry_code_q1_pre,industry_code_q2_pre,industry_code_q3_pre,wages_3q_pre,wages_2q_pre,wages_1q_pre,occupational_code_post,industry_code_q1_post,industry_code_q2_post,industry_code_q3_post,industry_code_q4_post,wages_1q_post,wages_2q_post,wages_3q_post,wages_4q_post,is_adult,is_dislocated_worker,is_youth,is_wagner_peyser,is_reportable_individual,received_training,entry_date,exit_date,entry_year,entry_quarter,exit_year,exit_quarter
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,bool,bool,bool,bool,bool,bool,date,date,i32,i8,i32,i8
"""94Fs90DDD9""",1,6,27,14,1,0,,,,,0,0,0,,,,441110,,0,0,,,true,false,false,true,false,true,2014-11-06,2015-06-05,2014,4,2015,2
"""FA9D93CB0F1B368105841526A0496D…",2,3,28,13,1,1,,999999,999999,999999,584,4435,1698,,999999,999999,999999,999999,0,0,1152,5928,true,false,false,true,false,true,2011-10-26,2014-04-28,2011,4,2014,2
"""2DCA6B06BB0F76BD39E9C2529727DE…",1,6,30,87,0,0,,326140,326140,326140,6680,6023,7047,,,,,,0,0,0,0,true,true,false,true,false,true,2010-12-07,2014-08-08,2010,4,2014,3
"""2252A7C315FC246D4AF8438F8964B4…",2,6,55,16,0,0,,,,,6459,7624,6542,0,,,,,6588,7708,6610,7846,true,false,false,true,false,true,2012-11-05,2012-12-14,2012,4,2012,4
"""4590faaAA9be""",2,3,51,1,1,1,,,,,0,0,0,,492110,492110,492110,492110,14168,33872,17333,13987,true,false,false,true,false,true,2020-07-24,2022-07-01,2020,3,2022,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""UeD4a9ALaSUf""",1,6,21,1,1,1,,455211,455211,238350,9191,5535,8991,,484121,484121,,,4851,8495,,,true,false,false,false,false,true,2022-09-02,2023-05-16,2022,3,2023,2
"""SLcSULbADfLA""",2,6,24,0,0,0,,,,,2351,1603,3708,,722513,722513,,,5291,71,,,false,false,true,true,false,true,2018-03-08,2019-05-20,2018,1,2019,2
"""DD9EC48D10BFC819256483DA2BA8DE…",1,1,30,91,0,0,,999999,999999,999999,0,0,0,,,,,999999,0,0,0,1436,true,true,false,true,false,true,2013-05-29,2014-05-16,2013,2,2014,2
"""14B11269F4F3CF477D0D68270DF509…",1,2,53,16,0,0,,999999,999999,999999,,,,,999999,,,,,,,,false,true,false,true,false,true,2013-02-21,2014-02-11,2013,1,2014,1


In [44]:
(
    df_all.filter(
        ~pl.any_horizontal(pl.all().is_null()),
        pl.col("is_adult") | pl.col("is_dislocated_worker") | pl.col("is_youth"),
        pl.col("received_training")
    )
)

unique_id,sex,race,age,highest_educational_level,low_income_status,employment_status,occupational_code_pre,industry_code_q1_pre,industry_code_q2_pre,industry_code_q3_pre,wages_3q_pre,wages_2q_pre,wages_1q_pre,occupational_code_post,industry_code_q1_post,industry_code_q2_post,industry_code_q3_post,industry_code_q4_post,wages_1q_post,wages_2q_post,wages_3q_post,wages_4q_post,is_adult,is_dislocated_worker,is_youth,is_wagner_peyser,is_reportable_individual,received_training,entry_date,exit_date,entry_year,entry_quarter,exit_year,exit_quarter
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,bool,bool,bool,bool,bool,bool,date,date,i32,i8,i32,i8
"""b9UfSfAaUU0U""",1,3,55,1,0,1,533054,485310,485310,485310,14668,12655,8208,533032,484121,484121,485310,485310,8775,8662,10818,1511,true,false,false,false,false,true,2022-06-08,2022-12-07,2022,2,2022,4
"""SSbbf5D5eFf4""",2,3,28,7,1,0,439061,999999,999999,999999,6568,5620,6684,292061,999999,999999,999999,999999,10507,3873,4171,9261,true,false,false,true,false,true,2015-02-03,2016-09-12,2015,1,2016,3
"""BDADF7B2D3C2664B20390838EE7294…",1,6,54,91,0,0,51407100,336350,336350,336350,7971,8434,7252,51919900,336320,336370,336370,336370,7600,6922,7826,7646,false,true,false,true,false,true,2013-11-08,2014-06-20,2013,4,2014,2
"""DfUS59fDL4cD""",2,6,24,1,1,1,311014,522110,522110,522110,6612,9579,6572,311014,623110,623110,623110,445110,5848,5697,5237,3048,true,false,false,false,false,true,2016-09-23,2016-12-16,2016,3,2016,4
"""ffb5Ff5Acb4F""",1,6,41,1,1,0,519199,333415,333415,333415,8026,6877,10568,492093,445110,445110,332420,330000,7686,6667,8636,11647,false,true,false,true,false,true,2019-01-22,2020-09-28,2019,1,2020,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""eDff00DD9Dab""",2,6,30,4,0,0,434171,622110,622110,622110,7088,6036,2527,291141,622110,622110,622110,622110,11725,13806,14990,20457,true,false,false,false,false,true,2015-02-13,2016-09-30,2015,1,2016,3
"""47B76A9815DC90E55D649295F497E4…",2,6,42,90,1,0,43904101,0,0,0,9696,1248,622,43601300,999999,999999,999999,999999,4609,5698,7099,6814,false,true,false,false,false,true,2012-05-03,2012-12-17,2012,2,2012,4
"""c94U04LAcfab""",2,3,22,2,0,0,390000,423920,423920,423920,9272,13438,9000,399011,561320,561320,492110,561320,4548,5231,4539,6664,false,false,true,true,false,true,2018-04-27,2018-11-16,2018,2,2018,4
"""AUdSDcacDA0c""",1,6,46,7,0,1,151232,561311,561311,541512,11029,10899,9558,151000,423610,423610,423610,330000,13523,15644,11852,16316,false,true,false,false,false,true,2021-12-30,2022-09-08,2021,4,2022,3


In [49]:
(
    df_all.filter(
        pl.any_horizontal(pl.col(["occupational_code_pre", "occupational_code_post"]).is_not_null()),
        pl.col("is_adult") | pl.col("is_dislocated_worker") | pl.col("is_youth"),
        pl.col("received_training")
    )
)

unique_id,sex,race,age,highest_educational_level,low_income_status,employment_status,occupational_code_pre,industry_code_q1_pre,industry_code_q2_pre,industry_code_q3_pre,wages_3q_pre,wages_2q_pre,wages_1q_pre,occupational_code_post,industry_code_q1_post,industry_code_q2_post,industry_code_q3_post,industry_code_q4_post,wages_1q_post,wages_2q_post,wages_3q_post,wages_4q_post,is_adult,is_dislocated_worker,is_youth,is_wagner_peyser,is_reportable_individual,received_training,entry_date,exit_date,entry_year,entry_quarter,exit_year,exit_quarter
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,bool,bool,bool,bool,bool,bool,date,date,i32,i8,i32,i8
"""2252A7C315FC246D4AF8438F8964B4…",2,6,55,16,0,0,,,,,6459,7624,6542,0,,,,,6588,7708,6610,7846,true,false,false,true,false,true,2012-11-05,2012-12-14,2012,4,2012,4
"""49aDDee9bDd9""",1,2,53,7,1,0,151199,,,561320,23511,21521,0,519000,518210,518210,518210,510000,25479,21858,25167,20254,true,false,false,false,false,true,2018-07-05,2018-09-10,2018,3,2018,3
"""744E0F3BC301BDF9F60B247EEF51B3…",1,6,46,14,0,0,47206100,,,238292,,0,0,,,,,,,,,,false,true,false,true,false,true,2011-09-08,2013-12-07,2011,3,2013,4
"""480E4977BCF9A1C8F6C3BB74BF6490…",1,6,44,13,0,0,,,,,,,0,15114200,54171,,,561320,2875,10844,13725,,false,true,false,true,false,true,2011-06-07,2012-06-30,2011,2,2012,2
"""FS95a04U05e9""",2,1,50,8,1,1,419011,999999,999999,999999,648,6350,11028,,999999,999999,999999,999999,2650,10699,9346,7329,false,true,false,true,false,true,2016-08-12,2016-12-20,2016,3,2016,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""9545DD4829200E3E1565D0A35922AF…",1,6,26,87,0,0,51401200,332994,332994,332994,,,,51401100,332994,332994,332994,332994,,,,,false,true,false,true,false,true,2010-09-13,2012-11-05,2010,3,2012,4
"""95A64D299E50EDEB807D9C5E88C0B1…",2,1,34,87,0,0,,999999,999999,999999,0,0,0,29201103,999999,999999,999999,999999,12183,9716,3158,10825,true,false,false,true,false,true,2011-07-15,2014-05-12,2011,3,2014,2
"""38D40FDC21867F4702875DA65F0F8D…",2,3,21,87,0,1,41203100,447110,447110,447110,6057,5878,3256,43405100,445110,922140,922140,,3745,8693,10975,,true,false,false,true,false,true,2014-08-11,2015-02-28,2014,3,2015,1
"""30C43E268C8409AF33B36165894080…",2,1,46,15,1,2,51919801,311612,311612,311612,,,,,,,,,,,,,true,false,false,false,false,true,2013-11-01,2014-06-30,2013,4,2014,2


In [43]:
(
    df_all.filter(
        pl.col("is_adult") | pl.col("is_dislocated_worker") | pl.col("is_youth"),
        pl.col("received_training")
    )
    .group_by(
        [
            pl.col("is_adult"),
            pl.col("is_dislocated_worker"),
            pl.col("is_youth"),
            pl.col("is_wagner_peyser")
        ]
    ).len()
)


is_adult,is_dislocated_worker,is_youth,is_wagner_peyser,len
bool,bool,bool,bool,u32
true,false,false,true,564058
false,false,true,false,131084
false,true,false,false,176062
true,false,true,false,8792
true,true,true,true,732
…,…,…,…,…
true,false,false,false,424012
true,true,false,false,22291
true,true,false,true,92696
false,false,true,true,79260


In [149]:
df_all.filter(
    pl.col("is_wagner_peyser"),
    ~pl.col("received_training"),
    (pl.col("exit_date") > pl.lit("2012-01-01").str.strptime(pl.Date)) & (pl.col("exit_date") < pl.lit("2024-01-01").str.strptime(pl.Date))
)

unique_id,is_adult,is_dislocated_worker,is_youth,is_wagner_peyser,is_reportable_individual,received_training,entry_date,exit_date,entry_year,entry_quarter,exit_year,exit_quarter
str,bool,bool,bool,bool,bool,bool,date,date,i32,i8,i32,i8
"""DaFAeSc5dAe4""",false,false,false,true,false,false,2016-12-01,2016-12-01,2016,4,2016,4
"""ALbS0dc0cFLA""",false,true,false,true,false,false,2020-02-26,2020-06-02,2020,1,2020,2
"""5cfFD9A5cDd0""",false,false,false,true,false,false,2017-07-28,2017-07-28,2017,3,2017,3
"""7C91D21DC902194EE07103FCB819FB…",true,false,false,true,false,false,2013-07-31,2013-12-18,2013,3,2013,4
"""9c40cb50DFFU""",false,false,false,true,false,false,2023-09-27,2023-10-16,2023,3,2023,4
…,…,…,…,…,…,…,…,…,…,…,…,…
"""fSSDALUDU9LF""",false,false,false,true,false,false,2017-07-18,2017-07-31,2017,3,2017,3
"""AF9FbcfLdebf""",false,false,false,true,false,false,2022-01-10,2022-01-24,2022,1,2022,1
"""ac0fSD9aF9df""",false,false,false,true,false,false,2017-06-22,2017-06-22,2017,2,2017,2
"""Ldc4U0DceaLL""",false,false,true,true,false,false,2017-08-04,2018-04-29,2017,3,2018,2


In [91]:
participation_periods_2013 = (
    lf2013.filter(
        ((pl.col("Item_3007") == 1) | # Adult Funding
        (pl.col("Item_3008") == 1) | # Dislocated Worker Funding
        (pl.col("Item_3009") == 1) | # Dislocated worker excluding NEG only 
        (pl.col("Item_3010") == 1) | # National Emergency Grant
        (pl.col("Item_3011") == 1) | # Younger Youth 
        (pl.col("Item_3012") == 1)) & # Older Youth
        (pl.col("Item_3013") == "1") & # Received more than self services and informational activities
        (pl.col("Item_3014") == "1") # Received Training
        & (pl.col("Item_3003") != 2999) # Not non exiter
    )
    .group_by("Item_3003") # Program Exit Year
    .agg(pl.count("Item_100").alias("participation_periods"))
    .collect()
)

In [92]:
participation_periods_2013

Item_3003,participation_periods
i64,u32
2013,130220
2011,108824
2014,997
2012,194662


In [96]:
participation_periods_2015 = (
    lf2015.filter(
        ((pl.col("Item_3007") == 1) | # Adult Funding
        (pl.col("Item_3008") == 1) | # Dislocated Worker Funding
        (pl.col("Item_3009") == 1) | # Dislocated worker excluding NEG only 
        (pl.col("Item_3010") == 1) | # National Emergency Grant
        (pl.col("Item_3011") == 1) | # Younger Youth 
        (pl.col("Item_3012") == 1)) & # Older Youth
        (pl.col("Item_3013") == 1) & # Received more than self services and informational activities
        (pl.col("Item_3014") == 1) # Received Training
        & (pl.col("Item_3003") != 2999) # Not non exiter
    )
    .group_by("Item_3003") # Program Exit Year
    .agg(pl.count("Item_100").alias("participation_periods"))
    .collect()
)

In [97]:
participation_periods_2015

Item_3003,participation_periods
i64,u32
2013,83583
2014,162626
2015,116648


In [30]:
participation_periods_2017 = (
    lf2017.filter(
        ((pl.col("PIRL 3001") == 1) | # Adult Funding
        (pl.col("PIRL 3002") == 1) | # Dislocated Worker Funding
        (pl.col("PIRL 3003") == 1) | # Youth Funding
        (pl.col("PIRL 3004") == 1)) & # Dislocated Worker Grant
        (pl.col("PIRL 3006") == 0) & # Not Reportable Individual
        (pl.col("PIRL 1300") == 1) # Received Training (WIOA)
        & (pl.col("PIRL 3043") != 9999) # Not non exiter
    )
    .group_by("PIRL 3043") # Program Exit Year
    .agg(pl.count("PIRL 100").alias("participation_periods"))
    .collect()
)