- PY 2023, Q4: January 1, 2022, and June 30, 2024
- PY 2021, Q4: January 1, 2020 and June 30, 2022x
- PY 2019, Q4: January 1, 2018 and June 30, 2020
- PY 2017, Q4: July 1, 2016 and June 30, 2018
- PY 2015:  January 1, 2014 and June 30, 2016
- PY 2013: January 1, 2012 and June 30, 2014

In [1]:
import polars as pl
import numpy as np

In [2]:
lf2013 = pl.scan_csv("../wioa_performance_records/PublicWIASRD2013q4.csv")
lf2015 = pl.scan_csv("../wioa_performance_records/PublicWIASRD2015Q4.csv")
lf2017 = pl.scan_csv("../wioa_performance_records/WIOAPerformanceRecords_PY2017Q4_Public_csv")
lf2019 = pl.scan_csv("../wioa_performance_records/WIOAPerformanceRecords_PY2019Q4_Public_csv")
lf2021 = pl.scan_csv("../wioa_performance_records/WIOAPerformanceRecords_PY2021Q4_PUBLIC_csv")
lf2023 = pl.scan_csv("../wioa_performance_records/WIOAPerformanceRecords_PY2023Q4_PUBLIC.csv")
lf2024 = pl.scan_csv("../wioa_performance_records/WIOAPerformanceRecords_PY2024Q3_PUBLIC.csv")

In [3]:

sex_map = {
    1: "Male",
    2: "Female",
    9: "Participant did not self-identify"
}

race_map = {
    1: "Hispanic",
    2: "Asian (not Hispanic)",
    3: "Black (not Hispanic)",
    4: "Native Hawaiian or Pacific Islander (not Hispanic)",
    5: "American Indian or Alaska Native (not Hispanic)",
    6: "White (not Hispanic)",
    7: "Multiple Race (not Hispanic)",
}

highest_educational_level_map = {
    1: "Attained secondary school diploma",
    2: "Attained a secondary school equivalency",
    3: "The participant with a disability receives a certificate of attendance/completion as a result of successfully completing an Individualized Education Program (IEP)",
    4: "Completed one of more years of postsecondary education",
    5: "Attained a postsecondary technical or vocational certificate (non-degree)",
    6: "Attained an Associate's degree",
    7: "Attained a Bachelor's degree",
    8: "Attained a degree beyond a Bachelor's degree",
    0: "No Educational Level Completed"
}

In [5]:
df2024 = (
    lf2024.select(
        pl.col("PIRL100").alias("unique_id"),

        # Demographics Information
        pl.col("PIRL201").alias("sex"),
        pl.col("CALC4020").alias("race"),
        pl.col("CALC4039").alias("age"),
        pl.col("PIRL408").alias("highest_educational_level"),
        pl.col("PIRL802").alias("low_income_status"),
        pl.col("PIRL400").alias("employment_status"),

        # Pre-Program Employment
        pl.col("PIRL403").alias("occupational_code_pre"),
        pl.col("PIRL404").alias("industry_code_q1_pre"),
        pl.col("PIRL404").alias("industry_code_q2_pre"),
        pl.col("PIRL406").alias("industry_code_q3_pre"),
        pl.col("PIRL1700").alias("wages_3q_pre"),
        pl.col("PIRL1701").alias("wages_2q_pre"),
        pl.col("PIRL1702").alias("wages_1q_pre"),

        # Post-Program Employment
        pl.col("PIRL1610").alias("occupational_code_post"),
        pl.col("PIRL1614").alias("industry_code_q1_post"),
        pl.col("PIRL1615").alias("industry_code_q2_post"),
        pl.col("PIRL1616").alias("industry_code_q3_post"),
        pl.col("PIRL1617").alias("industry_code_q4_post"),
        pl.col("PIRL1703").alias("wages_1q_post"),
        pl.col("PIRL1704").alias("wages_2q_post"),
        pl.col("PIRL1705").alias("wages_3q_post"),
        pl.col("PIRL1706").alias("wages_4q_post"),
    
        # Program Information
        (pl.col("CALC4001") == 1).alias("is_adult"),
        ((pl.col("CALC4002") == 1) | (pl.col("CALC4004") == 1)).alias("is_dislocated_worker"),
        (pl.col("CALC4003") == 1).alias("is_youth"),
        (pl.col("CALC4005") == 1).alias("is_wagner_peyser"),
        ((pl.col("CALC4006") == 1)).alias("is_reportable_individual"),
        (pl.col("PIRL1300") == 1).alias("received_training"),
        pl.col("PIRL900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("entry_date"),
        pl.col("PIRL901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter"),
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [7]:
df2023 = (
    lf2023.select(
        pl.col("PIRL100").alias("unique_id"),

        # Demographics Information
        pl.col("PIRL201").alias("sex"),
        pl.col("CALC4020").alias("race"),
        pl.col("CALC4039").alias("age"),
        pl.col("PIRL408").alias("highest_educational_level"),
        pl.col("PIRL802").alias("low_income_status"),
        pl.col("PIRL400").alias("employment_status"),

        # Pre-Program Employment
        pl.col("PIRL403").alias("occupational_code_pre"),
        pl.col("PIRL404").alias("industry_code_q1_pre"),
        pl.col("PIRL404").alias("industry_code_q2_pre"),
        pl.col("PIRL406").alias("industry_code_q3_pre"),
        pl.col("PIRL1700").alias("wages_3q_pre"),
        pl.col("PIRL1701").alias("wages_2q_pre"),
        pl.col("PIRL1702").alias("wages_1q_pre"),

        # Post-Program Employment
        pl.col("PIRL1610").alias("occupational_code_post"),
        pl.col("PIRL1614").alias("industry_code_q1_post"),
        pl.col("PIRL1615").alias("industry_code_q2_post"),
        pl.col("PIRL1616").alias("industry_code_q3_post"),
        pl.col("PIRL1617").alias("industry_code_q4_post"),
        pl.col("PIRL1703").alias("wages_1q_post"),
        pl.col("PIRL1704").alias("wages_2q_post"),
        pl.col("PIRL1705").alias("wages_3q_post"),
        pl.col("PIRL1706").alias("wages_4q_post"),
        
        # Program Information
        (pl.col("CALC4001") == 1).alias("is_adult"),
        ((pl.col("CALC4002") == 1) | (pl.col("CALC4004") == 1)).alias("is_dislocated_worker"),
        (pl.col("CALC4003") == 1).alias("is_youth"),
        (pl.col("CALC4005") == 1).alias("is_wagner_peyser"),
        ((pl.col("CALC4006") == 1)).alias("is_reportable_individual"),
        (pl.col("PIRL1300") == 1).alias("received_training"),
        pl.col("PIRL900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("entry_date"),
        pl.col("PIRL901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [9]:
df2021 = (
    lf2021.select(
        pl.col("PIRL100").alias("unique_id"),

        # Demographics Information
        pl.col("PIRL201").alias("sex"),
        pl.col("PIRL4020").alias("race"),
        pl.col("PIRL4039").alias("age"),
        pl.col("PIRL408").alias("highest_educational_level"),
        pl.col("PIRL802").alias("low_income_status"),
        pl.col("PIRL400").alias("employment_status"),

        # Pre-Separation Employment
        pl.col("PIRL403").alias("occupational_code_pre"),
        pl.col("PIRL404").alias("industry_code_q1_pre"),
        pl.col("PIRL404").alias("industry_code_q2_pre"),
        pl.col("PIRL406").alias("industry_code_q3_pre"),
        pl.col("PIRL1700").alias("wages_3q_pre"),
        pl.col("PIRL1701").alias("wages_2q_pre"),
        pl.col("PIRL1702").alias("wages_1q_pre"),

        # Post-Separation Employment
        pl.col("PIRL1610").alias("occupational_code_post"),
        pl.col("PIRL1614").alias("industry_code_q1_post"),
        pl.col("PIRL1615").alias("industry_code_q2_post"),
        pl.col("PIRL1616").alias("industry_code_q3_post"),
        pl.col("PIRL1617").alias("industry_code_q4_post"),
        pl.col("PIRL1703").alias("wages_1q_post"),
        pl.col("PIRL1704").alias("wages_2q_post"),
        pl.col("PIRL1705").alias("wages_3q_post"),
        pl.col("PIRL1706").alias("wages_4q_post"),
        
        # Program Information
        (pl.col("PIRL4001") == 1).alias("is_adult"),
        ((pl.col("PIRL4002") == 1) | (pl.col("PIRL4004") == 1)).alias("is_dislocated_worker"),
        (pl.col("PIRL4003") == 1).alias("is_youth"),
        (pl.col("PIRL4005") == 1).alias("is_wagner_peyser"),
        ((pl.col("PIRL4006") == 1)).alias("is_reportable_individual"),
        (pl.col("PIRL1300") == 1).alias("received_training"),
        pl.col("PIRL900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("entry_date"),
        pl.col("PIRL901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [None]:
SEX
1 = Male
2 = Female
9 = Participant did not self-identify

RACE
1 = Hispanic
2 = Asian (not Hispanic)
3 = Black (not Hispanic)
4 = Native Hawaiian or Pacific Islander (not
Hispanic)
5 = American Indian or Alaska Native (not
Hispanic)
6 = White (not Hispanic)
7 = Multiple Race (not Hispanic)



LOW INCOME STATUS
1 = Yes
0 = No

EMPLOYMENT STATUS
1 = Employed
2 = Employed, but Received Notice of
Termination of Employment or Military
Separation is pending
3 = Not in labor force
0 = Unemployed

In [10]:
df2019 = (
    lf2019.select(
        pl.col("PIRL100").alias("unique_id"),

        # Demographics Information
        pl.col("PIRL201").alias("sex"),
        pl.col("PIRL 3023").alias("race"),
        pl.col("PIRL 3042").alias("age"),
        pl.col("PIRL408").alias("highest_educational_level"),
        pl.col("PIRL802").alias("low_income_status"),
        pl.col("PIRL400").alias("employment_status"),

        # Pre-Separation Employment
        pl.col("PIRL403").alias("occupational_code_pre").cast(pl.Int64),
        pl.col("PIRL404").alias("industry_code_q1_pre"),
        pl.col("PIRL404").alias("industry_code_q2_pre"),
        pl.col("PIRL406").alias("industry_code_q3_pre"),
        pl.col("PIRL1700").alias("wages_3q_pre"),
        pl.col("PIRL1701").alias("wages_2q_pre"),
        pl.col("PIRL1702").alias("wages_1q_pre"),

        # Post-Separation Employment
        pl.col("PIRL1610").alias("occupational_code_post").cast(pl.Int64),
        pl.col("PIRL1614").alias("industry_code_q1_post"),
        pl.col("PIRL1615").alias("industry_code_q2_post"),
        pl.col("PIRL1616").alias("industry_code_q3_post"),
        pl.col("PIRL1617").alias("industry_code_q4_post"),
        pl.col("PIRL1703").alias("wages_1q_post"),
        pl.col("PIRL1704").alias("wages_2q_post"),
        pl.col("PIRL1705").alias("wages_3q_post"),
        pl.col("PIRL1706").alias("wages_4q_post"),
        
        # Program Information
        (pl.col("PIRL 3001") == 1).alias("is_adult"),
        ((pl.col("PIRL 3002") == 1) | (pl.col("PIRL 3004") == 1)).alias("is_dislocated_worker"),
        (pl.col("PIRL 3003") == 1).alias("is_youth"),
        (pl.col("PIRL 3005") == 1).alias("is_wagner_peyser"),
        ((pl.col("PIRL 3006") == 1)).alias("is_reportable_individual"),
        (pl.col("PIRL1300") == 1).alias("received_training"),
        pl.col("PIRL900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("entry_date"),
        pl.col("PIRL901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [11]:
df2017 = (
    lf2017.select(
        pl.col("PIRL 100").alias("unique_id"),

        # Demographics Information
        pl.col("PIRL 201").alias("sex"),
        pl.col("PIRL 3023").alias("race"),
        pl.col("PIRL 3042").alias("age"),
        pl.col("PIRL 408").alias("highest_educational_level"),
        pl.col("PIRL 802").alias("low_income_status"),
        pl.col("PIRL 400").alias("employment_status"),

        # Pre-Separation Employment
        pl.col("PIRL 403").alias("occupational_code_pre").cast(pl.Int64),
        pl.col("PIRL 404").alias("industry_code_q1_pre"),
        pl.col("PIRL 404").alias("industry_code_q2_pre"),
        pl.col("PIRL 406").alias("industry_code_q3_pre"),
        pl.col("PIRL 1700").alias("wages_3q_pre"),
        pl.col("PIRL 1701").alias("wages_2q_pre"),
        pl.col("PIRL 1702").alias("wages_1q_pre"),

        # Post-Separation Employment
        pl.col("PIRL 1610").alias("occupational_code_post").cast(pl.Int64),
        pl.col("PIRL 1614").alias("industry_code_q1_post"),
        pl.col("PIRL 1615").alias("industry_code_q2_post"),
        pl.col("PIRL 1616").alias("industry_code_q3_post"),
        pl.col("PIRL 1617").alias("industry_code_q4_post"),
        pl.col("PIRL 1703").alias("wages_1q_post"),
        pl.col("PIRL 1704").alias("wages_2q_post"),
        pl.col("PIRL 1705").alias("wages_3q_post"),
        pl.col("PIRL 1706").alias("wages_4q_post"),
        
        # Program Information
        (pl.col("PIRL 3001") == 1).alias("is_adult"),
        ((pl.col("PIRL 3002") == 1) | (pl.col("PIRL 3004") == 1)).alias("is_dislocated_worker"),
        (pl.col("PIRL 3003") == 1).alias("is_youth"),
        (pl.col("PIRL 3005") == 1).alias("is_wagner_peyser"),
        ((pl.col("PIRL 3006") == 1)).alias("is_reportable_individual"),
        (pl.col("PIRL 1300") == 1).alias("received_training"),
        pl.col("PIRL 900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("entry_date"),
        pl.col("PIRL 901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%Y%m%d")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [19]:
def highest_educational_level_map(value):
    match value:
        case v if v <= 12:
            return 0
        case v if (v >=13) & (v <= 15):
            return 4
        case 16:
            return 7
        case 17:
            return 8
        case 87:
            return 1
        case 88:
            return 2
        case 89:
            return 5
        case 90:
            return 5
        case 91:
            return 6
        case 0:
            return 0
        case _:
            return None


In [None]:
df2015 = (
    lf2015.select(
        pl.col("Item_100").alias("unique_id"),

        # Demographics Information
        pl.col("Item_201").alias("sex"),
        pl.col("Item_3006").alias("race"),
        pl.col("Item_3004").alias("age"),
        pl.col("Item_410").map_elements(highest_educational_level_map, return_dtype=pl.Int64).alias("highest_educational_level"),
        pl.col("Item_702").alias("low_income_status").cast(pl.Int64, strict=False),
        pl.col("Item_400").alias("employment_status"),


        # Pre-Separation Employment
        pl.col("Item_402").alias("occupational_code_pre").cast(pl.Int64),
        pl.col("Item_403").alias("industry_code_q1_pre").cast(pl.Int64),
        pl.col("Item_404").alias("industry_code_q2_pre").cast(pl.Int64),
        pl.col("Item_405").alias("industry_code_q3_pre").cast(pl.Int64),
        pl.col("Item_1600").alias("wages_3q_pre").cast(pl.Int64),
        pl.col("Item_1601").alias("wages_2q_pre").cast(pl.Int64),
        pl.col("Item_1602").alias("wages_1q_pre").cast(pl.Int64),

        # Post-Separation Employment
        pl.col("Item_1502").alias("occupational_code_post").cast(pl.Int64),
        pl.col("Item_1514").alias("industry_code_q1_post").cast(pl.Int64),
        pl.col("Item_1516").alias("industry_code_q2_post").cast(pl.Int64),
        pl.col("Item_1517").alias("industry_code_q3_post").cast(pl.Int64),
        pl.col("Item_1518").alias("industry_code_q4_post").cast(pl.Int64),
        pl.col("Item_1603").alias("wages_1q_post").cast(pl.Int64),
        pl.col("Item_1604").alias("wages_2q_post").cast(pl.Int64),
        pl.col("Item_1605").alias("wages_3q_post").cast(pl.Int64),
        pl.col("Item_1606").alias("wages_4q_post").cast(pl.Int64),

        # Program Information
        (pl.col("Item_3007") == 1).alias("is_adult"),
        ((pl.col("Item_3008") == 1) | (pl.col("Item_3009") == 1) | (pl.col("Item_3010") == 1)).alias("is_dislocated_worker"),
        ((pl.col("Item_3011") == 1) | (pl.col("Item_3012") == 1)).alias("is_youth"),
        (pl.col("Item_951") == 1).alias("is_wagner_peyser"),
        (pl.col("Item_3013") == 0).alias("is_reportable_individual"),
        (pl.col("Item_3014") == 1).alias("received_training"),
        pl.col("Item_900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%m/%d/%Y")
            .alias("entry_date"),
        pl.col("Item_901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%m/%d/%Y")
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [24]:
df2013 = (
    lf2013.select(
        pl.col("Item_100").alias("unique_id"),
        
        # Demographics Information
        pl.col("Item_201").alias("sex"),
        pl.col("Item_3006").alias("race").cast(pl.Int64, strict=False),
        pl.col("Item_3004").alias("age"),
        pl.col("Item_410").map_elements(highest_educational_level_map, return_dtype=pl.Int64).alias("highest_educational_level"), # TODO(jcanedy27): To add appropriate mapping.
        pl.col("Item_702").alias("low_income_status").cast(pl.Int64, strict=False),
        pl.col("Item_400").alias("employment_status"),

        # Pre-Separation Employment
        pl.col("Item_402").alias("occupational_code_pre").cast(pl.Int64, strict=False),
        pl.col("Item_403").alias("industry_code_q1_pre").cast(pl.Int64, strict=False),
        pl.col("Item_404").alias("industry_code_q2_pre").cast(pl.Int64, strict=False),
        pl.col("Item_405").alias("industry_code_q3_pre").cast(pl.Int64, strict=False),
        pl.col("Item_1600").alias("wages_3q_pre").cast(pl.Int64, strict=False),
        pl.col("Item_1601").alias("wages_2q_pre").cast(pl.Int64, strict=False),
        pl.col("Item_1602").alias("wages_1q_pre").cast(pl.Int64, strict=False),

        # Post-Separation Employment
        pl.col("Item_1502").alias("occupational_code_post").cast(pl.Int64, strict=False),
        pl.col("Item_1514").alias("industry_code_q1_post").cast(pl.Int64, strict=False),
        pl.col("Item_1516").alias("industry_code_q2_post").cast(pl.Int64, strict=False),
        pl.col("Item_1517").alias("industry_code_q3_post").cast(pl.Int64, strict=False),
        pl.col("Item_1518").alias("industry_code_q4_post").cast(pl.Int64, strict=False),
        pl.col("Item_1603").alias("wages_1q_post").cast(pl.Int64, strict=False),
        pl.col("Item_1604").alias("wages_2q_post").cast(pl.Int64, strict=False),
        pl.col("Item_1605").alias("wages_3q_post").cast(pl.Int64, strict=False),
        pl.col("Item_1606").alias("wages_4q_post").cast(pl.Int64, strict=False),

        # Program Information
        (pl.col("Item_3007") == 1).alias("is_adult"),
        ((pl.col("Item_3008") == 1) | (pl.col("Item_3009") == 1) | (pl.col("Item_3010") == 1)).alias("is_dislocated_worker"),
        ((pl.col("Item_3011") == 1) | (pl.col("Item_3012") == 1)).alias("is_youth"),
        (pl.col("Item_951") == 1).alias("is_wagner_peyser"),
        (pl.col("Item_3013") == "0").alias("is_reportable_individual"),
        (pl.col("Item_3014") == "1").alias("received_training"),
        pl.col("Item_900")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%m/%d/%Y", strict=False)
            .alias("entry_date"),
        pl.col("Item_901")
            .cast(pl.Utf8)
            .str.strptime(pl.Date, "%m/%d/%Y", strict=False)
            .alias("exit_date"),
    )
    .with_columns(
        pl.col("entry_date").dt.year().alias("entry_year"),
        pl.col("entry_date").dt.quarter().alias("entry_quarter"),
        pl.col("exit_date").dt.year().alias("exit_year"),
        pl.col("exit_date").dt.quarter().alias("exit_quarter")
    )
    .filter(
        ~pl.col("is_reportable_individual"),
        pl.col("exit_date").is_not_null()
    )
    .collect()
)

In [35]:
df_all = (
    pl.concat([df2013, df2015, df2017, df2019, df2021, df2023, df2024])
    .unique(subset=["unique_id", "entry_date", "exit_date"])
)

df_all = df_all.with_columns(
    pl.coalesce(
        pl.col("industry_code_q3_pre"),
        pl.col("industry_code_q2_pre"),
        pl.col("industry_code_q1_pre")
    ).alias("industry_code_pre"),
    pl.coalesce(
        pl.col("industry_code_q1_post"),
        pl.col("industry_code_q2_post"),
        pl.col("industry_code_q3_post"),
        pl.col("industry_code_q4_post")
    ).alias("industry_code_post")
)

In [36]:
df_all.filter(
    (pl.col("is_adult") | pl.col("is_dislocated_worker") | pl.col("is_youth")),
    pl.col("received_training"),
    (pl.col("exit_date") >= pl.lit("2012-01-01").str.strptime(pl.Date)) & (pl.col("exit_date") < pl.lit("2024-01-01").str.strptime(pl.Date))
)

unique_id,sex,race,age,highest_educational_level,low_income_status,employment_status,occupational_code_pre,industry_code_q1_pre,industry_code_q2_pre,industry_code_q3_pre,wages_3q_pre,wages_2q_pre,wages_1q_pre,occupational_code_post,industry_code_q1_post,industry_code_q2_post,industry_code_q3_post,industry_code_q4_post,wages_1q_post,wages_2q_post,wages_3q_post,wages_4q_post,is_adult,is_dislocated_worker,is_youth,is_wagner_peyser,is_reportable_individual,received_training,entry_date,exit_date,entry_year,entry_quarter,exit_year,exit_quarter,industry_code_pre,industry_code_post
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,bool,bool,bool,bool,bool,bool,date,date,i32,i8,i32,i8,i64,i64
"""c09aLUFdd0Fe""",1,6,28,1,0,0,,924120,924120,924120,11179,12926,9137,,,,,,13287,18275,25955,28041,false,true,false,false,false,true,2020-02-21,2020-09-30,2020,1,2020,3,924120,
"""671C0704C06287D13106571D326FBC…",1,6,41,1,0,1,,,922160,713990,6873,2215,0,,,,921150,921150,0,0,6621,13260,true,true,false,true,false,true,2013-03-21,2014-09-30,2013,1,2014,3,713990,921150
"""aU455eUALAeA""",1,3,25,1,1,0,513022,311615,311615,455211,8652,8274,9679,,,561320,,,0,1147,,,true,false,false,true,false,true,2023-04-20,2023-05-09,2023,2,2023,2,455211,561320
"""b0dUDLS5SU4e""",1,6,60,7,1,0,,,,,0,0,0,,,,,,,,,,false,true,false,false,false,true,2021-09-15,2021-12-27,2021,3,2021,4,,
"""Le9LAdf4aLde""",1,3,23,2,1,0,,561320,561320,,0,0,2711,537051,722511,,,,5316,,,,true,false,false,false,false,true,2019-02-15,2019-07-03,2019,1,2019,3,561320,722511
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""fSd00ee5baaS""",1,1,18,0,0,0,,,,999999,1251,3446,0,,999999,,,,283,0,,,false,false,true,true,false,true,2023-01-19,2023-06-23,2023,1,2023,2,999999,999999
"""009u454""",1,6,66,4,,0,,561710,561710,561710,,,,,,,,,0,0,,,false,true,false,true,false,true,2011-07-15,2013-05-08,2011,3,2013,2,561710,
"""aaDaa94LfU5A""",2,6,28,4,1,0,319096,541940,541940,541940,9584,7896,3067,,,,,,,,,,true,false,false,true,false,true,2023-07-05,2023-12-05,2023,3,2023,4,541940,
"""C461C919E315DAFACE45ED638CF311…",1,1,33,0,0,0,47206100,999999,999999,999999,,,,,,,,,,,,,false,true,false,true,false,true,2015-11-01,2016-05-03,2015,4,2016,2,999999,


In [37]:
(
    df_all.filter(
        ~pl.any_horizontal(pl.all().is_null()),
        pl.col("is_adult") | pl.col("is_dislocated_worker") | pl.col("is_youth"),
        pl.col("received_training")
    )
)

unique_id,sex,race,age,highest_educational_level,low_income_status,employment_status,occupational_code_pre,industry_code_q1_pre,industry_code_q2_pre,industry_code_q3_pre,wages_3q_pre,wages_2q_pre,wages_1q_pre,occupational_code_post,industry_code_q1_post,industry_code_q2_post,industry_code_q3_post,industry_code_q4_post,wages_1q_post,wages_2q_post,wages_3q_post,wages_4q_post,is_adult,is_dislocated_worker,is_youth,is_wagner_peyser,is_reportable_individual,received_training,entry_date,exit_date,entry_year,entry_quarter,exit_year,exit_quarter,industry_code_pre,industry_code_post
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,bool,bool,bool,bool,bool,bool,date,date,i32,i8,i32,i8,i64,i64
"""4dUDbLLdd9Ab""",2,3,23,6,0,0,292061,623110,623110,623110,9304,11455,9378,292061,623110,623110,623110,623110,6788,7412,10206,7266,true,false,false,true,false,true,2018-07-16,2020-12-04,2018,3,2020,4,623110,623110
"""a9a44DfL0dSd""",2,3,44,7,0,0,113021,551114,551114,551114,19778,16488,23299,436011,561320,561320,561320,561320,7069,6953,7815,3840,true,true,false,true,false,true,2017-07-11,2018-08-29,2017,3,2018,3,551114,561320
"""86BE6EFA5408C7DD42DE12B48DA659…",2,6,45,1,0,0,13116100,423830,423830,423830,6156,11538,13043,43405100,561320,335921,335921,335921,9488,11661,10778,10718,false,true,false,true,false,true,2014-01-27,2014-06-19,2014,1,2014,2,423830,561320
"""85C8C0DB3FC33D8E2326236980588C…",2,6,20,1,1,0,35202100,446110,446110,446110,2982,3709,4156,41201100,446110,446110,446110,446110,2502,5220,4212,4857,true,false,false,true,false,true,2014-01-02,2014-06-19,2014,1,2014,2,446110,446110
"""5fAADAFS554S""",1,3,38,7,1,0,435071,423120,423120,423120,10647,0,741,512011,336411,336411,336411,336411,7224,8535,12055,12755,true,false,false,false,false,true,2016-05-31,2016-12-31,2016,2,2016,4,423120,336411
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""0bbeLU5dA9LD""",2,1,56,1,0,0,537064,335131,335131,335131,12126,11857,15292,319099,623110,623110,623110,623110,9323,10424,6922,8555,false,true,false,false,false,true,2019-06-17,2022-11-18,2019,2,2022,4,335131,623110
"""bLU9DS9A4Sc5""",1,3,37,0,1,0,533033,561320,561320,561320,2672,5446,5663,533032,424440,424440,424440,424440,17930,16949,18523,19084,true,false,false,true,false,true,2018-01-09,2018-08-13,2018,1,2018,3,561320,424440
"""4SdSeAce049b""",2,6,19,2,1,1,434171,624120,624120,624120,4051,5815,4152,399021,624120,624120,624120,624120,3976,4925,4699,3983,false,false,true,true,false,true,2016-05-19,2016-12-31,2016,2,2016,4,624120,624120
"""Da4fbfDbcUAF""",1,5,43,7,0,0,435111,213112,213112,213112,25680,26645,29636,533032,221100,221100,562000,440000,18068,21960,13062,5101,false,true,false,false,false,true,2020-04-20,2020-07-16,2020,2,2020,3,213112,221100


In [38]:
(
    df_all.filter(
        ~pl.any_horizontal(pl.exclude(["occupational_code_pre", "occupational_code_post", "industry_code_q1_pre", "industry_code_q2_pre", "industry_code_q3_pre", "industry_code_q1_post", "industry_code_q2_post", "industry_code_q3_post", "industry_code_q4_post"]).is_null()),
        pl.col("is_adult") | pl.col("is_dislocated_worker") | pl.col("is_youth"),
        pl.col("received_training")
    )
)

unique_id,sex,race,age,highest_educational_level,low_income_status,employment_status,occupational_code_pre,industry_code_q1_pre,industry_code_q2_pre,industry_code_q3_pre,wages_3q_pre,wages_2q_pre,wages_1q_pre,occupational_code_post,industry_code_q1_post,industry_code_q2_post,industry_code_q3_post,industry_code_q4_post,wages_1q_post,wages_2q_post,wages_3q_post,wages_4q_post,is_adult,is_dislocated_worker,is_youth,is_wagner_peyser,is_reportable_individual,received_training,entry_date,exit_date,entry_year,entry_quarter,exit_year,exit_quarter,industry_code_pre,industry_code_post
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,bool,bool,bool,bool,bool,bool,date,date,i32,i8,i32,i8,i64,i64
"""671C0704C06287D13106571D326FBC…",1,6,41,1,0,1,,,922160,713990,6873,2215,0,,,,921150,921150,0,0,6621,13260,true,true,false,true,false,true,2013-03-21,2014-09-30,2013,1,2014,3,713990,921150
"""73C867D9528423CD8A343E704B9213…",1,6,34,1,0,0,,999999,999999,999999,2378,3880,4073,,999999,999999,999999,999999,8367,9065,7232,10167,true,true,false,true,false,true,2013-11-25,2014-04-11,2013,4,2014,2,999999,999999
"""9000D00DD4655F8BEF878F08AC408B…",1,6,19,1,0,0,,999999,999999,999999,0,0,0,37301100,999999,999999,999999,999999,0,0,0,0,true,false,false,false,false,true,2014-01-02,2014-09-30,2014,1,2014,3,999999,999999
"""826D0E3DB8E6237571A9BD097143A5…",2,1,57,1,1,0,,999999,999999,999999,3110,3400,1508,39901100,999999,999999,999999,999999,1758,2964,1092,0,false,true,false,false,false,true,2009-11-05,2012-02-22,2009,4,2012,1,999999,999999
"""aAD4545DfAD4""",2,6,22,1,0,1,433071,522120,522120,522120,6082,6721,5927,,522120,522120,522120,522120,6314,2502,7209,7474,true,false,false,true,false,true,2018-02-01,2018-03-18,2018,1,2018,1,522120,522120
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""4b5Sacd5aL4b""",2,3,52,1,0,0,,561110,561110,561110,10468,12800,1532,,,561320,445110,445110,0,4907,8747,11243,true,false,false,true,false,true,2021-12-02,2022-05-05,2021,4,2022,2,561110,561320
"""B36BB9210E945CEB86DCEDD0C2E9B7…",2,3,34,4,1,0,,999999,999999,999999,0,0,0,,999999,999999,999999,999999,0,0,0,0,true,false,false,false,false,true,2012-03-13,2014-06-30,2012,1,2014,2,999999,999999
"""abA5SAfa95dS""",2,1,28,7,0,0,,621400,621400,622110,21389,26864,20754,291141,622110,622110,622110,622110,26202,22400,25067,22186,true,false,false,false,false,true,2016-08-15,2016-12-09,2016,3,2016,4,622110,622110
"""0E1D46D4190483F018669DE869DCFD…",1,6,37,6,1,1,,336212,336212,561320,5999,4209,4997,,561320,,561320,561320,1185,0,6525,9526,true,false,false,true,false,true,2013-11-19,2014-09-22,2013,4,2014,3,561320,561320
