# Extracting the 2023 Enchantments Lottery Data

I reached out to the USFS asking if they could provide me with the 2023 Enchantments Lottery data in `csv` or `xlsx`, which are the formats available for previous years. After about two weeks I hadn't heard back, so I decided to extract the data myself using Python.

The resulting `csv` is included in this repository folder titled `2023_results.csv`. The script I used to convert the `pdf` is below.

```python
import csv
import os
import re

from pypdf import PdfReader

# Name of the USFS results PDF; it is expected in the current working directory
PDF_FILE_NAME = "fseprd1162873.pdf"

# Name of the CSV file this script writes
OUTPUT_FILE_NAME = "2023_results_w_pdf_totals.csv"

TITLE_TEXT = "Enchantments Lottery 2023 Application Data"

# Text to remove from every page of the PDF
TEXT_TO_REMOVE = [TITLE_TEXT]

# Regex pattern that matches a single digit followed immediately by letters,
# e.g. the awarded group size fused to the state code ("2WA").
# NOTE(review): the negative lookahead is meant to skip text that runs into a
# forward slash, but the letter group can give back one character, so it only
# appears to exclude a lone letter directly before a slash - confirm.
PATTERN = r"(\d)([A-Za-z]+)(?![/])"

# A complete row has 19 columns, i.e. 18 commas
EXPECTED_COMMAS = 18

# Zone names as they look once every space has been turned into a comma
CSV_ZONE_NAMES = [
    "Core,Enchantment,Zone",
    "Colchuck,Zone",
    "Stuart,Zone",
    "Snow,Zone",
    "Stuart,,Zone",
    "Eightmile/Caroline,Zone",
    "Eightmile/Caroline Zone,(stock)",
    "Stuart Zone,(stock)",
]

# Column headers as they look once every space has been turned into a comma
CSV_COLUMN_NAMES = [
    "Preferred,Entry,Date,1",
    "Preferred,Zone,1",
    "Minimum,Acceptable,Group,Size,1",
    "Maximum,Requested,Group,Size,1",
    "Preferred,Entry,Date,2",
    "Preferred,Zone,2",
    "Minimum,Acceptable,Group,Size,2",
    "Maximum,Requested,Group,Size,2",
    "Preferred,Entry,Date,3",
    "Preferred,Zone,3",
    "Minimum,Acceptable,Group,Size,3",
    "Maximum,Requested,Group,Size,3",
    "Processing,Sequence",
    "Results,Status",
    "Awarded,Preference",
    "Awarded,Entry,Date",
    "Awarded,Entrance,Code/Name",
    "Awarded,Group,Size",
]

# Combine the zone names and column names into a single list; each entry gets
# its commas turned back into spaces, in this order
CORRECTION_NAMES = CSV_ZONE_NAMES + CSV_COLUMN_NAMES

# Patterns are compiled once because they are applied to every page / line
_DIGIT_LETTER_RE = re.compile(PATTERN)
_DIGIT_STATUS_RE = re.compile(r"(\d)(Cancelled|Awarded)")
_EMPTY_AWARD_RE = re.compile(r"(Unsuccessful|Cancelled)")
_SEQUENCE_STATUS_RE = re.compile(r"(\d+),((Unsuccessful|Cancelled|Awarded))")
_LEADING_FIELDS_RE = re.compile(r"^(?:.*?,){%d}" % EXPECTED_COMMAS)


def clean_page_text(page_text):
    """Return the text of one PDF page with the title removed and fused tokens split.

    Args:
        page_text: Raw text extracted from a single page.

    Returns:
        The page text with every entry of ``TEXT_TO_REMOVE`` deleted and a
        space inserted between a digit and the letters that directly follow it.
    """
    for item in TEXT_TO_REMOVE:
        page_text = page_text.replace(item, "")

    # Split the awarded group size and the state code with a space
    page_text = _DIGIT_LETTER_RE.sub(r"\1 \2", page_text)

    # Separate 0Cancelled and 0Awarded with a space
    return _DIGIT_STATUS_RE.sub(r"\1 \2", page_text)


def extract_text(pdf_path):
    """Return the cleaned text of every page of the PDF, one page after another.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string in which each cleaned page is followed by a newline.
    """
    reader = PdfReader(pdf_path)
    return "".join(
        clean_page_text(page.extract_text()) + "\n" for page in reader.pages
    )


def normalize_line(line):
    """Turn one space-separated line of PDF text into a comma-separated CSV row.

    Args:
        line: A single line of cleaned PDF text, normally ending in a newline.

    Returns:
        The line with spaces replaced by commas, multi-word zone and column
        names restored, empty award fields added after Unsuccessful or
        Cancelled, missing preference fields padded in front of the processing
        sequence, and everything after the 18th comma kept as a single field.
    """
    # Replace spaces with commas
    line = line.replace(" ", ",")

    # Multi-word names were split by the step above; put their spaces back
    for name in CORRECTION_NAMES:
        line = line.replace(name, name.replace(",", " "))

    # Add four commas after Unsuccessful or Cancelled (the empty award fields)
    line = _EMPTY_AWARD_RE.sub(r"\1,,,,", line)

    # Rows with fewer than three preferences are short: pad with empty fields
    # directly in front of the processing sequence number
    missing_commas = EXPECTED_COMMAS - line.count(",")
    if missing_commas > 0:
        line = _SEQUENCE_STATUS_RE.sub("," * missing_commas + r"\1,\2", line)

    # Keep everything up to and including the 18th comma untouched and turn any
    # later commas into spaces so the remainder stays in the last column.
    # The line is sliced at the end of the match: rebuilding it from the
    # repeated capture group would repeat the 18th field, because such a group
    # only holds its last repetition.
    leading_fields = _LEADING_FIELDS_RE.match(line)
    if leading_fields:
        remainder = line[leading_fields.end():]
        line = leading_fields.group() + remainder.replace(",", " ")

    return line


def main():
    """Read the PDF from the current working directory and write the CSV."""
    pdf_path = os.path.join(os.getcwd(), PDF_FILE_NAME)
    text = extract_text(pdf_path)

    # The rows are processed in memory, so no temporary CSV file is needed
    with open(OUTPUT_FILE_NAME, "w") as output_file:
        output_file.writelines(
            normalize_line(line + "\n") for line in text.splitlines()
        )


if __name__ == "__main__":
    main()
```

The resulting `csv` wasn't perfect, although it was very close. It included the awarded totals for each zone as part of the _State_ cells in the first ten-ish rows. I had to go in after generating the `csv` and delete the 6 or 7 rows that included the zone data. If you use the script, you'll undoubtedly see those rows in the resulting `csv`.

For the curious, or for the future, the script may be interesting or helpful. However, for anyone interested in the 2023 results, they should download the `2023_results.csv` that is all ready to go.


# Clean


## Import


In [65]:
import pandas as pd

# Import 2023 Enchantments Lottery Data
# NOTE(review): the dates in this file appear as 6/18/2023 (slashes), which
# does not match the "%m-%d-%Y" date_format below, so the date columns load as
# plain objects (see the dtypes output) and are converted with pd.to_datetime
# in a later cell.
raw_df = pd.read_csv(
    "./2023_results.csv",
    header=0,
    parse_dates=[
        "Preferred Entry Date 1",
        "Preferred Entry Date 2",
        "Preferred Entry Date 3",
        "Awarded Entry Date",
    ],
    date_format="%m-%d-%Y",
    low_memory=False,
)

# Take a quick look at the data
raw_df.head()

Unnamed: 0,Preferred Entry Date 1,Preferred Zone 1,Minimum Acceptable Group Size 1,Maximum Requested Group Size 1,Preferred Entry Date 2,Preferred Zone 2,Minimum Acceptable Group Size 2,Maximum Requested Group Size 2,Preferred Entry Date 3,Preferred Zone 3,Minimum Acceptable Group Size 3,Maximum Requested Group Size 3,Processing Sequence,Results Status,Awarded Preference,Awarded Entry Date,Awarded Entrance Code/Name,Awarded Group Size,State
0,6/18/2023,Core Enchantment Zone,2,2,6/11/2023,Core Enchantment Zone,2.0,2.0,6/4/2023,Core Enchantment Zone,2.0,2.0,438,Awarded,1.0,6/18/2023,Core Enchantment Zone,2.0,OR
1,8/21/2023,Core Enchantment Zone,8,8,8/23/2023,Core Enchantment Zone,8.0,8.0,8/24/2023,Core Enchantment Zone,8.0,8.0,16219,Unsuccessful,,,,,WA
2,6/9/2023,Core Enchantment Zone,4,4,7/21/2023,Colchuck Zone,4.0,4.0,8/4/2023,Colchuck Zone,4.0,4.0,35433,Unsuccessful,,,,,WA
3,7/6/2023,Core Enchantment Zone,2,2,7/20/2023,Core Enchantment Zone,2.0,2.0,8/3/2023,Core Enchantment Zone,2.0,2.0,22536,Unsuccessful,,,,,WA
4,9/9/2023,Core Enchantment Zone,3,3,8/22/2023,Core Enchantment Zone,3.0,3.0,8/23/2023,Eightmile/Caroline Zone,3.0,3.0,31307,Unsuccessful,,,,,WA


## Create Cleaned Full Application Dataframe


In [66]:
raw_df.dtypes

Preferred Entry Date 1              object
Preferred Zone 1                    object
Minimum Acceptable Group Size 1      int64
Maximum Requested Group Size 1       int64
Preferred Entry Date 2              object
Preferred Zone 2                    object
Minimum Acceptable Group Size 2    float64
Maximum Requested Group Size 2     float64
Preferred Entry Date 3              object
Preferred Zone 3                    object
Minimum Acceptable Group Size 3    float64
Maximum Requested Group Size 3     float64
Processing Sequence                  int64
Results Status                      object
Awarded Preference                  object
Awarded Entry Date                  object
Awarded Entrance Code/Name          object
Awarded Group Size                 float64
State                               object
dtype: object

In [67]:
# Columns that hold entry dates
date_columns = [
    "Preferred Entry Date 1",
    "Preferred Entry Date 2",
    "Preferred Entry Date 3",
    "Awarded Entry Date",
]

# Parse all of the date columns to datetime in a single column-wise pass
raw_df[date_columns] = raw_df[date_columns].apply(pd.to_datetime)

# Confirm the resulting data types
raw_df.dtypes

Preferred Entry Date 1             datetime64[ns]
Preferred Zone 1                           object
Minimum Acceptable Group Size 1             int64
Maximum Requested Group Size 1              int64
Preferred Entry Date 2             datetime64[ns]
Preferred Zone 2                           object
Minimum Acceptable Group Size 2           float64
Maximum Requested Group Size 2            float64
Preferred Entry Date 3             datetime64[ns]
Preferred Zone 3                           object
Minimum Acceptable Group Size 3           float64
Maximum Requested Group Size 3            float64
Processing Sequence                         int64
Results Status                             object
Awarded Preference                         object
Awarded Entry Date                 datetime64[ns]
Awarded Entrance Code/Name                 object
Awarded Group Size                        float64
State                                      object
dtype: object

In [68]:
# Numeric columns whose missing values should be replaced with 0
number_columns = [
    "Minimum Acceptable Group Size 2",
    "Maximum Requested Group Size 2",
    "Minimum Acceptable Group Size 3",
    "Maximum Requested Group Size 3",
    "Awarded Preference",
    "Awarded Group Size",
    "Processing Sequence",  # new column
]

# Replace NaN with 0, column by column
raw_df = raw_df.fillna({name: 0 for name in number_columns})

# Cast every column that is still float64 to int
float_columns = raw_df.select_dtypes(include="float64").columns
raw_df = raw_df.astype({name: int for name in float_columns})

# Confirm the resulting data types
raw_df.dtypes

Preferred Entry Date 1             datetime64[ns]
Preferred Zone 1                           object
Minimum Acceptable Group Size 1             int64
Maximum Requested Group Size 1              int64
Preferred Entry Date 2             datetime64[ns]
Preferred Zone 2                           object
Minimum Acceptable Group Size 2             int64
Maximum Requested Group Size 2              int64
Preferred Entry Date 3             datetime64[ns]
Preferred Zone 3                           object
Minimum Acceptable Group Size 3             int64
Maximum Requested Group Size 3              int64
Processing Sequence                         int64
Results Status                             object
Awarded Preference                         object
Awarded Entry Date                 datetime64[ns]
Awarded Entrance Code/Name                 object
Awarded Group Size                          int64
State                                      object
dtype: object

In [69]:
# Text columns: replace missing values with "N/A" and store everything as str
columns_to_convert = [
    "Preferred Zone 1",
    "Preferred Zone 2",
    "Preferred Zone 3",
    "Results Status",
    "Awarded Entrance Code/Name",
]

# The cast to str may be unnecessary, but it keeps the columns uniform
raw_df[columns_to_convert] = (
    raw_df[columns_to_convert].fillna("N/A").astype(str)
)

# Confirm the resulting data types
raw_df.dtypes

Preferred Entry Date 1             datetime64[ns]
Preferred Zone 1                           object
Minimum Acceptable Group Size 1             int64
Maximum Requested Group Size 1              int64
Preferred Entry Date 2             datetime64[ns]
Preferred Zone 2                           object
Minimum Acceptable Group Size 2             int64
Maximum Requested Group Size 2              int64
Preferred Entry Date 3             datetime64[ns]
Preferred Zone 3                           object
Minimum Acceptable Group Size 3             int64
Maximum Requested Group Size 3              int64
Processing Sequence                         int64
Results Status                             object
Awarded Preference                         object
Awarded Entry Date                 datetime64[ns]
Awarded Entrance Code/Name                 object
Awarded Group Size                          int64
State                                      object
dtype: object

In [70]:
# Check for NaN values
raw_df.isna().sum()

Preferred Entry Date 1                 0
Preferred Zone 1                       0
Minimum Acceptable Group Size 1        0
Maximum Requested Group Size 1         0
Preferred Entry Date 2               508
Preferred Zone 2                       0
Minimum Acceptable Group Size 2        0
Maximum Requested Group Size 2         0
Preferred Entry Date 3              1154
Preferred Zone 3                       0
Minimum Acceptable Group Size 3        0
Maximum Requested Group Size 3         0
Processing Sequence                    0
Results Status                         0
Awarded Preference                     0
Awarded Entry Date                 37474
Awarded Entrance Code/Name             0
Awarded Group Size                     0
State                                  2
dtype: int64

In [71]:
# Replace missing dates with the Unix epoch (1970-01-01) instead of leaving NaT.
# It is an unusual choice, but it keeps the columns as datetimes; anyone
# analyzing the data must treat epoch dates as "no value".
# date_columns was defined in an earlier cell.
raw_df[date_columns] = raw_df[date_columns].fillna(pd.Timestamp(0))

In [72]:
# Check for NaN values
raw_df.isna().sum()

Preferred Entry Date 1             0
Preferred Zone 1                   0
Minimum Acceptable Group Size 1    0
Maximum Requested Group Size 1     0
Preferred Entry Date 2             0
Preferred Zone 2                   0
Minimum Acceptable Group Size 2    0
Maximum Requested Group Size 2     0
Preferred Entry Date 3             0
Preferred Zone 3                   0
Minimum Acceptable Group Size 3    0
Maximum Requested Group Size 3     0
Processing Sequence                0
Results Status                     0
Awarded Preference                 0
Awarded Entry Date                 0
Awarded Entrance Code/Name         0
Awarded Group Size                 0
State                              2
dtype: int64

In [73]:
# Check data types
raw_df.dtypes

Preferred Entry Date 1             datetime64[ns]
Preferred Zone 1                           object
Minimum Acceptable Group Size 1             int64
Maximum Requested Group Size 1              int64
Preferred Entry Date 2             datetime64[ns]
Preferred Zone 2                           object
Minimum Acceptable Group Size 2             int64
Maximum Requested Group Size 2              int64
Preferred Entry Date 3             datetime64[ns]
Preferred Zone 3                           object
Minimum Acceptable Group Size 3             int64
Maximum Requested Group Size 3              int64
Processing Sequence                         int64
Results Status                             object
Awarded Preference                         object
Awarded Entry Date                 datetime64[ns]
Awarded Entrance Code/Name                 object
Awarded Group Size                          int64
State                                      object
dtype: object

In [74]:
# Check values for each column
for col in raw_df.columns:
    print(f"{col}: {raw_df[col].unique()}\n\n\n")

Preferred Entry Date 1: <DatetimeArray>
['2023-06-18 00:00:00', '2023-08-21 00:00:00', '2023-06-09 00:00:00',
 '2023-07-06 00:00:00', '2023-09-09 00:00:00', '2023-09-13 00:00:00',
 '2023-09-02 00:00:00', '2023-07-16 00:00:00', '2023-08-01 00:00:00',
 '2023-08-06 00:00:00',
 ...
 '2023-06-07 00:00:00', '2023-05-30 00:00:00', '2023-05-17 00:00:00',
 '2023-10-23 00:00:00', '2023-10-28 00:00:00', '2023-10-17 00:00:00',
 '2023-10-16 00:00:00', '2023-10-31 00:00:00', '2023-10-22 00:00:00',
 '2023-10-24 00:00:00']
Length: 168, dtype: datetime64[ns]



Preferred Zone 1: ['Core Enchantment Zone' 'Snow Zone' 'Colchuck Zone' 'Stuart  Zone'
 'Eightmile/Caroline Zone' 'Stuart Zone (stock)'
 'Eightmile/Caroline Zone (stock)']



Minimum Acceptable Group Size 1: [2 8 4 3 6 5 7 1]



Maximum Requested Group Size 1: [2 8 4 3 6 5 7 1]



Preferred Entry Date 2: <DatetimeArray>
['2023-06-11 00:00:00', '2023-08-23 00:00:00', '2023-07-21 00:00:00',
 '2023-07-20 00:00:00', '2023-08-22 00:00:00', '2023-09-20

In [75]:
# Examine the first 20 rows
raw_df.head(20)

Unnamed: 0,Preferred Entry Date 1,Preferred Zone 1,Minimum Acceptable Group Size 1,Maximum Requested Group Size 1,Preferred Entry Date 2,Preferred Zone 2,Minimum Acceptable Group Size 2,Maximum Requested Group Size 2,Preferred Entry Date 3,Preferred Zone 3,Minimum Acceptable Group Size 3,Maximum Requested Group Size 3,Processing Sequence,Results Status,Awarded Preference,Awarded Entry Date,Awarded Entrance Code/Name,Awarded Group Size,State
0,2023-06-18,Core Enchantment Zone,2,2,2023-06-11,Core Enchantment Zone,2,2,2023-06-04,Core Enchantment Zone,2,2,438,Awarded,1,2023-06-18,Core Enchantment Zone,2,OR
1,2023-08-21,Core Enchantment Zone,8,8,2023-08-23,Core Enchantment Zone,8,8,2023-08-24,Core Enchantment Zone,8,8,16219,Unsuccessful,0,1970-01-01,,0,WA
2,2023-06-09,Core Enchantment Zone,4,4,2023-07-21,Colchuck Zone,4,4,2023-08-04,Colchuck Zone,4,4,35433,Unsuccessful,0,1970-01-01,,0,WA
3,2023-07-06,Core Enchantment Zone,2,2,2023-07-20,Core Enchantment Zone,2,2,2023-08-03,Core Enchantment Zone,2,2,22536,Unsuccessful,0,1970-01-01,,0,WA
4,2023-09-09,Core Enchantment Zone,3,3,2023-08-22,Core Enchantment Zone,3,3,2023-08-23,Eightmile/Caroline Zone,3,3,31307,Unsuccessful,0,1970-01-01,,0,WA
5,2023-09-13,Core Enchantment Zone,2,2,2023-09-20,Core Enchantment Zone,2,2,2023-09-28,Core Enchantment Zone,2,2,14673,Unsuccessful,0,1970-01-01,,0,WA
6,2023-09-02,Core Enchantment Zone,2,2,2023-09-09,Core Enchantment Zone,2,2,2023-09-16,Core Enchantment Zone,2,2,21409,Unsuccessful,0,1970-01-01,,0,WA
7,2023-07-16,Snow Zone,8,8,2023-08-13,Snow Zone,8,8,2023-08-27,Snow Zone,8,8,21481,Unsuccessful,0,1970-01-01,,0,WA
8,2023-08-01,Colchuck Zone,4,4,2023-08-02,Colchuck Zone,4,4,2023-07-31,Snow Zone,4,4,17193,Unsuccessful,0,1970-01-01,,0,OR
9,2023-08-06,Core Enchantment Zone,2,2,2023-08-11,Core Enchantment Zone,2,2,2023-08-20,Core Enchantment Zone,2,2,6119,Unsuccessful,0,1970-01-01,,0,CA


In [76]:
# For every preference, print the distinct differences between the maximum and
# minimum group size. Only 0 showing up means the two columns carry the same
# values, so the maximum group size columns are redundant and can be dropped.
group_size_pairs = [
    ("Maximum Requested Group Size 1", "Minimum Acceptable Group Size 1"),
    ("Maximum Requested Group Size 2", "Minimum Acceptable Group Size 2"),
    ("Maximum Requested Group Size 3", "Minimum Acceptable Group Size 3"),
]
for maximum_column, minimum_column in group_size_pairs:
    size_spread = raw_df[maximum_column] - raw_df[minimum_column]
    print(size_spread.unique())

[0]
[0]
[0]


In [77]:
# The maximum group size columns never differ from the minimum group size
# columns, so they add nothing and are removed
redundant_columns = [
    "Maximum Requested Group Size 1",
    "Maximum Requested Group Size 2",
    "Maximum Requested Group Size 3",
]
raw_df = raw_df.drop(columns=redundant_columns)

# Look at the first 20 rows
raw_df.head(20)

Unnamed: 0,Preferred Entry Date 1,Preferred Zone 1,Minimum Acceptable Group Size 1,Preferred Entry Date 2,Preferred Zone 2,Minimum Acceptable Group Size 2,Preferred Entry Date 3,Preferred Zone 3,Minimum Acceptable Group Size 3,Processing Sequence,Results Status,Awarded Preference,Awarded Entry Date,Awarded Entrance Code/Name,Awarded Group Size,State
0,2023-06-18,Core Enchantment Zone,2,2023-06-11,Core Enchantment Zone,2,2023-06-04,Core Enchantment Zone,2,438,Awarded,1,2023-06-18,Core Enchantment Zone,2,OR
1,2023-08-21,Core Enchantment Zone,8,2023-08-23,Core Enchantment Zone,8,2023-08-24,Core Enchantment Zone,8,16219,Unsuccessful,0,1970-01-01,,0,WA
2,2023-06-09,Core Enchantment Zone,4,2023-07-21,Colchuck Zone,4,2023-08-04,Colchuck Zone,4,35433,Unsuccessful,0,1970-01-01,,0,WA
3,2023-07-06,Core Enchantment Zone,2,2023-07-20,Core Enchantment Zone,2,2023-08-03,Core Enchantment Zone,2,22536,Unsuccessful,0,1970-01-01,,0,WA
4,2023-09-09,Core Enchantment Zone,3,2023-08-22,Core Enchantment Zone,3,2023-08-23,Eightmile/Caroline Zone,3,31307,Unsuccessful,0,1970-01-01,,0,WA
5,2023-09-13,Core Enchantment Zone,2,2023-09-20,Core Enchantment Zone,2,2023-09-28,Core Enchantment Zone,2,14673,Unsuccessful,0,1970-01-01,,0,WA
6,2023-09-02,Core Enchantment Zone,2,2023-09-09,Core Enchantment Zone,2,2023-09-16,Core Enchantment Zone,2,21409,Unsuccessful,0,1970-01-01,,0,WA
7,2023-07-16,Snow Zone,8,2023-08-13,Snow Zone,8,2023-08-27,Snow Zone,8,21481,Unsuccessful,0,1970-01-01,,0,WA
8,2023-08-01,Colchuck Zone,4,2023-08-02,Colchuck Zone,4,2023-07-31,Snow Zone,4,17193,Unsuccessful,0,1970-01-01,,0,OR
9,2023-08-06,Core Enchantment Zone,2,2023-08-11,Core Enchantment Zone,2,2023-08-20,Core Enchantment Zone,2,6119,Unsuccessful,0,1970-01-01,,0,CA


In [78]:
def _to_snake_case(name):
    """Lower-case a column name and replace spaces and slashes with underscores."""
    return name.lower().replace(" ", "_").replace("/", "_")


# Rename every column to lower case with underscores
raw_df.columns = list(map(_to_snake_case, raw_df.columns))

# Check the names
raw_df.columns

Index(['preferred_entry_date_1', 'preferred_zone_1',
       'minimum_acceptable_group_size_1', 'preferred_entry_date_2',
       'preferred_zone_2', 'minimum_acceptable_group_size_2',
       'preferred_entry_date_3', 'preferred_zone_3',
       'minimum_acceptable_group_size_3', 'processing_sequence',
       'results_status', 'awarded_preference', 'awarded_entry_date',
       'awarded_entrance_code_name', 'awarded_group_size', 'state'],
      dtype='object')

In [79]:
# Check the data
raw_df.head(20)

Unnamed: 0,preferred_entry_date_1,preferred_zone_1,minimum_acceptable_group_size_1,preferred_entry_date_2,preferred_zone_2,minimum_acceptable_group_size_2,preferred_entry_date_3,preferred_zone_3,minimum_acceptable_group_size_3,processing_sequence,results_status,awarded_preference,awarded_entry_date,awarded_entrance_code_name,awarded_group_size,state
0,2023-06-18,Core Enchantment Zone,2,2023-06-11,Core Enchantment Zone,2,2023-06-04,Core Enchantment Zone,2,438,Awarded,1,2023-06-18,Core Enchantment Zone,2,OR
1,2023-08-21,Core Enchantment Zone,8,2023-08-23,Core Enchantment Zone,8,2023-08-24,Core Enchantment Zone,8,16219,Unsuccessful,0,1970-01-01,,0,WA
2,2023-06-09,Core Enchantment Zone,4,2023-07-21,Colchuck Zone,4,2023-08-04,Colchuck Zone,4,35433,Unsuccessful,0,1970-01-01,,0,WA
3,2023-07-06,Core Enchantment Zone,2,2023-07-20,Core Enchantment Zone,2,2023-08-03,Core Enchantment Zone,2,22536,Unsuccessful,0,1970-01-01,,0,WA
4,2023-09-09,Core Enchantment Zone,3,2023-08-22,Core Enchantment Zone,3,2023-08-23,Eightmile/Caroline Zone,3,31307,Unsuccessful,0,1970-01-01,,0,WA
5,2023-09-13,Core Enchantment Zone,2,2023-09-20,Core Enchantment Zone,2,2023-09-28,Core Enchantment Zone,2,14673,Unsuccessful,0,1970-01-01,,0,WA
6,2023-09-02,Core Enchantment Zone,2,2023-09-09,Core Enchantment Zone,2,2023-09-16,Core Enchantment Zone,2,21409,Unsuccessful,0,1970-01-01,,0,WA
7,2023-07-16,Snow Zone,8,2023-08-13,Snow Zone,8,2023-08-27,Snow Zone,8,21481,Unsuccessful,0,1970-01-01,,0,WA
8,2023-08-01,Colchuck Zone,4,2023-08-02,Colchuck Zone,4,2023-07-31,Snow Zone,4,17193,Unsuccessful,0,1970-01-01,,0,OR
9,2023-08-06,Core Enchantment Zone,2,2023-08-11,Core Enchantment Zone,2,2023-08-20,Core Enchantment Zone,2,6119,Unsuccessful,0,1970-01-01,,0,CA


In [80]:
# One row has the status 'Applied' with a processing sequence of 0. It looks
# like a mistake and is awkward to work with, so only the rows with any other
# status are kept.
is_not_applied = raw_df["results_status"] != "Applied"
raw_df = raw_df[is_not_applied]

# Look at the first 20 rows
raw_df.head(20)

Unnamed: 0,preferred_entry_date_1,preferred_zone_1,minimum_acceptable_group_size_1,preferred_entry_date_2,preferred_zone_2,minimum_acceptable_group_size_2,preferred_entry_date_3,preferred_zone_3,minimum_acceptable_group_size_3,processing_sequence,results_status,awarded_preference,awarded_entry_date,awarded_entrance_code_name,awarded_group_size,state
0,2023-06-18,Core Enchantment Zone,2,2023-06-11,Core Enchantment Zone,2,2023-06-04,Core Enchantment Zone,2,438,Awarded,1,2023-06-18,Core Enchantment Zone,2,OR
1,2023-08-21,Core Enchantment Zone,8,2023-08-23,Core Enchantment Zone,8,2023-08-24,Core Enchantment Zone,8,16219,Unsuccessful,0,1970-01-01,,0,WA
2,2023-06-09,Core Enchantment Zone,4,2023-07-21,Colchuck Zone,4,2023-08-04,Colchuck Zone,4,35433,Unsuccessful,0,1970-01-01,,0,WA
3,2023-07-06,Core Enchantment Zone,2,2023-07-20,Core Enchantment Zone,2,2023-08-03,Core Enchantment Zone,2,22536,Unsuccessful,0,1970-01-01,,0,WA
4,2023-09-09,Core Enchantment Zone,3,2023-08-22,Core Enchantment Zone,3,2023-08-23,Eightmile/Caroline Zone,3,31307,Unsuccessful,0,1970-01-01,,0,WA
5,2023-09-13,Core Enchantment Zone,2,2023-09-20,Core Enchantment Zone,2,2023-09-28,Core Enchantment Zone,2,14673,Unsuccessful,0,1970-01-01,,0,WA
6,2023-09-02,Core Enchantment Zone,2,2023-09-09,Core Enchantment Zone,2,2023-09-16,Core Enchantment Zone,2,21409,Unsuccessful,0,1970-01-01,,0,WA
7,2023-07-16,Snow Zone,8,2023-08-13,Snow Zone,8,2023-08-27,Snow Zone,8,21481,Unsuccessful,0,1970-01-01,,0,WA
8,2023-08-01,Colchuck Zone,4,2023-08-02,Colchuck Zone,4,2023-07-31,Snow Zone,4,17193,Unsuccessful,0,1970-01-01,,0,OR
9,2023-08-06,Core Enchantment Zone,2,2023-08-11,Core Enchantment Zone,2,2023-08-20,Core Enchantment Zone,2,6119,Unsuccessful,0,1970-01-01,,0,CA


In [81]:
# Export cleaned data to csv
raw_df.to_csv("./2023_results_cleaned.csv", index=False, date_format="%m-%d-%Y")

In [82]:
# Read the cleaned file back in to check that it round-trips.
# The date columns are not parsed unless they are listed explicitly.
cleaned_date_columns = [
    "preferred_entry_date_1",
    "preferred_entry_date_2",
    "preferred_entry_date_3",
    "awarded_entry_date",
]
cleaned_raw_df = pd.read_csv(
    "./2023_results_cleaned.csv",
    parse_dates=cleaned_date_columns,
    date_format="%m-%d-%Y",  # Same format that was used for the export
    na_filter=False,  # Keep 'N/A' as text instead of turning it into NaN
)

# Check the datatypes
cleaned_raw_df.dtypes

preferred_entry_date_1             datetime64[ns]
preferred_zone_1                           object
minimum_acceptable_group_size_1             int64
preferred_entry_date_2             datetime64[ns]
preferred_zone_2                           object
minimum_acceptable_group_size_2             int64
preferred_entry_date_3             datetime64[ns]
preferred_zone_3                           object
minimum_acceptable_group_size_3             int64
processing_sequence                         int64
results_status                             object
awarded_preference                          int64
awarded_entry_date                 datetime64[ns]
awarded_entrance_code_name                 object
awarded_group_size                          int64
state                                      object
dtype: object

## Create Cleaned Split Dataframe


Unlike the 2021 and 2022 data, the 2023 data erases awarded information for permits that were cancelled. For example, in 2022, if a permit was cancelled we still knew which preference was awarded. The 2023 data shows the _processing_sequence_ as 0 and shows all awarded data as blank. Therefore, those entries aren't helpful in the split entry dataframes, so I am going to remove them.


In [83]:
# Break each application into one row per preference (option 1, 2 and 3) so
# that the individual entries can be analyzed more easily.
preferred_options = [1, 2, 3]

# Columns that every dataframe will have
shared_columns = [
    "results_status",
    "awarded_preference",
    "awarded_entry_date",
    "awarded_entrance_code_name",
    "awarded_group_size",
    "processing_sequence",
    "state",
]
new_dataframes = []

# Build one dataframe per option number
for option in preferred_options:
    # Map this option's numbered columns onto the shared, un-numbered names
    option_columns = {
        f"preferred_zone_{option}": "preferred_zone",
        f"preferred_entry_date_{option}": "preferred_entry_date",
        f"minimum_acceptable_group_size_{option}": "minimum_acceptable_group_size",
    }
    # Selecting with a list and renaming both return new dataframes, so the
    # columns added below never touch cleaned_raw_df
    df_option = cleaned_raw_df[list(option_columns) + shared_columns].rename(
        columns=option_columns
    )
    # Flag whether the permit was awarded for the current option
    df_option["awarded"] = df_option["awarded_preference"] == option
    df_option["preferred_option"] = option

    new_dataframes.append(df_option)

# Stack the per-option dataframes. ignore_index=True gives every row its own
# label; without it each application's label would appear once per option.
df_split = pd.concat(new_dataframes, ignore_index=True)

# Drop rows where the preferred zone is N/A (the option was not filled in)
df_split = df_split[df_split["preferred_zone"] != "N/A"]

# Check the new dataframe
df_split.head()

Unnamed: 0,preferred_zone,preferred_entry_date,minimum_acceptable_group_size,results_status,awarded_preference,awarded_entry_date,awarded_entrance_code_name,awarded_group_size,processing_sequence,state,awarded,preferred_option
0,Core Enchantment Zone,2023-06-18,2,Awarded,1,2023-06-18,Core Enchantment Zone,2,438,OR,True,1
1,Core Enchantment Zone,2023-08-21,8,Unsuccessful,0,1970-01-01,,0,16219,WA,False,1
2,Core Enchantment Zone,2023-06-09,4,Unsuccessful,0,1970-01-01,,0,35433,WA,False,1
3,Core Enchantment Zone,2023-07-06,2,Unsuccessful,0,1970-01-01,,0,22536,WA,False,1
4,Core Enchantment Zone,2023-09-09,3,Unsuccessful,0,1970-01-01,,0,31307,WA,False,1


In [84]:
# Drop entries where the results_status is 'Cancelled'
df_split = df_split[df_split["results_status"] != "Cancelled"]

In [85]:
# Add the name of the month of the preferred entry date to the dataframe
import calendar

# Month numbers (1-12) taken from the entry date
month_numbers = df_split["preferred_entry_date"].dt.month
# Look up the month name for every month number
df_split["preferred_entry_date_month"] = month_numbers.apply(
    lambda month: calendar.month_name[month]
)

# Check the data
df_split.head()

Unnamed: 0,preferred_zone,preferred_entry_date,minimum_acceptable_group_size,results_status,awarded_preference,awarded_entry_date,awarded_entrance_code_name,awarded_group_size,processing_sequence,state,awarded,preferred_option,preferred_entry_date_month
0,Core Enchantment Zone,2023-06-18,2,Awarded,1,2023-06-18,Core Enchantment Zone,2,438,OR,True,1,June
1,Core Enchantment Zone,2023-08-21,8,Unsuccessful,0,1970-01-01,,0,16219,WA,False,1,August
2,Core Enchantment Zone,2023-06-09,4,Unsuccessful,0,1970-01-01,,0,35433,WA,False,1,June
3,Core Enchantment Zone,2023-07-06,2,Unsuccessful,0,1970-01-01,,0,22536,WA,False,1,July
4,Core Enchantment Zone,2023-09-09,3,Unsuccessful,0,1970-01-01,,0,31307,WA,False,1,September


In [86]:
# Add the weekday name of the preferred entry date as its own column
entry_dates = df_split["preferred_entry_date"]
df_split["preferred_entry_date_day"] = entry_dates.dt.day_name()

# Check the data
df_split.head()

Unnamed: 0,preferred_zone,preferred_entry_date,minimum_acceptable_group_size,results_status,awarded_preference,awarded_entry_date,awarded_entrance_code_name,awarded_group_size,processing_sequence,state,awarded,preferred_option,preferred_entry_date_month,preferred_entry_date_day
0,Core Enchantment Zone,2023-06-18,2,Awarded,1,2023-06-18,Core Enchantment Zone,2,438,OR,True,1,June,Sunday
1,Core Enchantment Zone,2023-08-21,8,Unsuccessful,0,1970-01-01,,0,16219,WA,False,1,August,Monday
2,Core Enchantment Zone,2023-06-09,4,Unsuccessful,0,1970-01-01,,0,35433,WA,False,1,June,Friday
3,Core Enchantment Zone,2023-07-06,2,Unsuccessful,0,1970-01-01,,0,22536,WA,False,1,July,Thursday
4,Core Enchantment Zone,2023-09-09,3,Unsuccessful,0,1970-01-01,,0,31307,WA,False,1,September,Saturday


In [87]:
# Export the split data to a csv file
df_split.to_csv("./2023_results_split.csv", index=False, date_format="%m-%d-%Y")

In [88]:
# Create a dataframe of skipped entries: every entry that is not the one the
# application was awarded.

# The application was awarded one of its preferences
awarded_preference_greater_than_zero = df_split["awarded_preference"].gt(0)

# This row's option is the preference that was awarded
preferred_option_equals_awarded_preference = (
    df_split["preferred_option"] == df_split["awarded_preference"]
)

# An entry counts as awarded only when both conditions hold
awarded_entries_filter = (
    awarded_preference_greater_than_zero
    & preferred_option_equals_awarded_preference
)

# Everything else is a skipped entry
df_split_skipped = df_split.loc[~awarded_entries_filter].copy()

# Report the number of awarded entries and the number of skipped entries
print(
    f"Number of awarded entries: {len(df_split[awarded_entries_filter])}\nNumber of skipped entries: {len(df_split_skipped)}"
)

Number of awarded entries: 2558
Number of skipped entries: 115367


In [89]:
# Export the split skipped data to a csv file
df_split_skipped.to_csv(
    "./2023_results_split_skipped.csv", index=False, date_format="%m-%d-%Y"
)