# Clean


## Import


In [26]:
import pandas as pd

# Import 2021 Enchantments Lottery Data
raw_df = pd.read_csv(
    "./2022_results.csv",
    header=0,
    parse_dates=[
        "Preferred Entry Date 1",
        "Preferred Entry Date 2",
        "Preferred Entry Date 3",
        "Awarded Entry Date",
    ],
    date_format="%m-%d-%Y",
)

# Take a quick look at the data
raw_df.head()

Unnamed: 0,Preferred Entry Date 1,Preferred Zone 1,Minimum Acceptable Group Size 1,Maximum Requested Group Size 1,Preferred Entry Date 2,Preferred Zone 2,Minimum Acceptable Group Size 2,Maximum Requested Group Size 2,Preferred Entry Date 3,Preferred Zone 3,Minimum Acceptable Group Size 3,Maximum Requested Group Size 3,Results Status,Awarded Preference,Awarded Entry Date,Awarded Entrance,Awarded Group Size
0,9/2/2022,Core Enchantment Zone,8,8,8/26/2022,Colchuck Zone,8.0,8.0,9/16/2022,Core Enchantment Zone,8.0,8.0,Unsuccessful,,,,
1,8/15/2022,Colchuck Zone,2,2,8/24/2022,Colchuck Zone,2.0,2.0,8/29/2022,Colchuck Zone,2.0,2.0,Unsuccessful,,,,
2,8/12/2022,Snow Zone,8,8,8/19/2022,Snow Zone,8.0,8.0,8/3/2022,Snow Zone,8.0,8.0,Unsuccessful,,,,
3,7/12/2022,Core Enchantment Zone,2,2,7/20/2022,Core Enchantment Zone,2.0,2.0,7/13/2022,Snow Zone,2.0,2.0,Unsuccessful,,,,
4,9/3/2022,Stuart Zone,4,4,8/28/2022,Stuart Zone,4.0,4.0,8/21/2022,Stuart Zone,4.0,4.0,Unsuccessful,,,,


## Create Cleaned Full Application Dataframe


In [27]:
# Examine column data types
raw_df.dtypes

Preferred Entry Date 1              object
Preferred Zone 1                    object
Minimum Acceptable Group Size 1      int64
Maximum Requested Group Size 1       int64
Preferred Entry Date 2              object
Preferred Zone 2                    object
Minimum Acceptable Group Size 2    float64
Maximum Requested Group Size 2     float64
Preferred Entry Date 3              object
Preferred Zone 3                    object
Minimum Acceptable Group Size 3    float64
Maximum Requested Group Size 3     float64
Results Status                      object
Awarded Preference                 float64
Awarded Entry Date                  object
Awarded Entrance                    object
Awarded Group Size                 float64
dtype: object

In [28]:
# Identify columns with date data
date_columns = [
    "Preferred Entry Date 1",
    "Preferred Entry Date 2",
    "Preferred Entry Date 3",
    "Awarded Entry Date",
]

# Convert date columns to datetime
for col in date_columns:
    raw_df[col] = pd.to_datetime(raw_df[col])

# Check column data types
raw_df.dtypes

Preferred Entry Date 1             datetime64[ns]
Preferred Zone 1                           object
Minimum Acceptable Group Size 1             int64
Maximum Requested Group Size 1              int64
Preferred Entry Date 2             datetime64[ns]
Preferred Zone 2                           object
Minimum Acceptable Group Size 2           float64
Maximum Requested Group Size 2            float64
Preferred Entry Date 3             datetime64[ns]
Preferred Zone 3                           object
Minimum Acceptable Group Size 3           float64
Maximum Requested Group Size 3            float64
Results Status                             object
Awarded Preference                        float64
Awarded Entry Date                 datetime64[ns]
Awarded Entrance                           object
Awarded Group Size                        float64
dtype: object

In [29]:
# Number columns to convert NaN values to 0
number_columns = [
    "Minimum Acceptable Group Size 2",
    "Maximum Requested Group Size 2",
    "Minimum Acceptable Group Size 3",
    "Maximum Requested Group Size 3",
    "Awarded Preference",
    "Awarded Group Size",
]

# Convert NaN values to 0
for col in number_columns:
    raw_df[col] = raw_df[col].fillna(0)

# Convert float to int
for col in raw_df.columns:
    if raw_df[col].dtype == "float64":
        raw_df[col] = raw_df[col].astype(int)

# Check column data types
raw_df.dtypes

Preferred Entry Date 1             datetime64[ns]
Preferred Zone 1                           object
Minimum Acceptable Group Size 1             int64
Maximum Requested Group Size 1              int64
Preferred Entry Date 2             datetime64[ns]
Preferred Zone 2                           object
Minimum Acceptable Group Size 2             int64
Maximum Requested Group Size 2              int64
Preferred Entry Date 3             datetime64[ns]
Preferred Zone 3                           object
Minimum Acceptable Group Size 3             int64
Maximum Requested Group Size 3              int64
Results Status                             object
Awarded Preference                          int64
Awarded Entry Date                 datetime64[ns]
Awarded Entrance                           object
Awarded Group Size                          int64
dtype: object

In [30]:
# Fill NaN values in string columns and convert to string
columns_to_convert = [
    "Preferred Zone 1",
    "Preferred Zone 2",
    "Preferred Zone 3",
    "Results Status",
    "Awarded Entrance",
]
for col in columns_to_convert:
    # Converting to string may be unneccessary here
    raw_df[col] = raw_df[col].fillna("N/A").astype(str)

# Check column data types
raw_df.dtypes

Preferred Entry Date 1             datetime64[ns]
Preferred Zone 1                           object
Minimum Acceptable Group Size 1             int64
Maximum Requested Group Size 1              int64
Preferred Entry Date 2             datetime64[ns]
Preferred Zone 2                           object
Minimum Acceptable Group Size 2             int64
Maximum Requested Group Size 2              int64
Preferred Entry Date 3             datetime64[ns]
Preferred Zone 3                           object
Minimum Acceptable Group Size 3             int64
Maximum Requested Group Size 3              int64
Results Status                             object
Awarded Preference                          int64
Awarded Entry Date                 datetime64[ns]
Awarded Entrance                           object
Awarded Group Size                          int64
dtype: object

In [31]:
# Examine NaN values
raw_df.isnull().sum()

Preferred Entry Date 1                 0
Preferred Zone 1                       0
Minimum Acceptable Group Size 1        0
Maximum Requested Group Size 1         0
Preferred Entry Date 2               527
Preferred Zone 2                       0
Minimum Acceptable Group Size 2        0
Maximum Requested Group Size 2         0
Preferred Entry Date 3              1144
Preferred Zone 3                       0
Minimum Acceptable Group Size 3        0
Maximum Requested Group Size 3         0
Results Status                         0
Awarded Preference                     0
Awarded Entry Date                 34299
Awarded Entrance                       0
Awarded Group Size                     0
dtype: int64

In [32]:
# Convert NaN values in date columns to 0
# This feels like an odd approach, but I want to maintain the date data type.
# The analyst will need to understand that zero epoch dates are actually NaN values.
for col in date_columns:  # Date columns defined in previous cell
    raw_df[col] = raw_df[col].fillna(pd.Timestamp(0))

# Examine NaN values
raw_df.isnull().sum()

Preferred Entry Date 1             0
Preferred Zone 1                   0
Minimum Acceptable Group Size 1    0
Maximum Requested Group Size 1     0
Preferred Entry Date 2             0
Preferred Zone 2                   0
Minimum Acceptable Group Size 2    0
Maximum Requested Group Size 2     0
Preferred Entry Date 3             0
Preferred Zone 3                   0
Minimum Acceptable Group Size 3    0
Maximum Requested Group Size 3     0
Results Status                     0
Awarded Preference                 0
Awarded Entry Date                 0
Awarded Entrance                   0
Awarded Group Size                 0
dtype: int64

In [33]:
# Examine data types again
raw_df.dtypes

Preferred Entry Date 1             datetime64[ns]
Preferred Zone 1                           object
Minimum Acceptable Group Size 1             int64
Maximum Requested Group Size 1              int64
Preferred Entry Date 2             datetime64[ns]
Preferred Zone 2                           object
Minimum Acceptable Group Size 2             int64
Maximum Requested Group Size 2              int64
Preferred Entry Date 3             datetime64[ns]
Preferred Zone 3                           object
Minimum Acceptable Group Size 3             int64
Maximum Requested Group Size 3              int64
Results Status                             object
Awarded Preference                          int64
Awarded Entry Date                 datetime64[ns]
Awarded Entrance                           object
Awarded Group Size                          int64
dtype: object

In [34]:
# Examine values for each column
for col in raw_df.columns:
    print(f"Column: {col}")
    print(raw_df[col].value_counts())
    print("\n")

Column: Preferred Entry Date 1
Preferred Entry Date 1
2022-08-05    760
2022-08-04    688
2022-08-12    626
2022-07-15    613
2022-07-22    601
             ... 
2022-10-25      2
2022-10-30      2
2022-10-29      2
2022-10-26      1
2022-10-23      1
Name: count, Length: 169, dtype: int64


Column: Preferred Zone 1
Preferred Zone 1
Core Enchantment Zone              26988
Colchuck Zone                       4606
Snow Zone                           2881
Stuart  Zone                        1713
Eightmile/Caroline Zone              548
Eightmile/Caroline Zone (stock)       50
Stuart Zone (stock)                   41
Name: count, dtype: int64


Column: Minimum Acceptable Group Size 1
Minimum Acceptable Group Size 1
4    10824
8     7621
2     6539
6     6265
5     2459
3     2252
7      462
1      405
Name: count, dtype: int64


Column: Maximum Requested Group Size 1
Maximum Requested Group Size 1
4    10824
8     7621
2     6539
6     6265
5     2459
3     2252
7      462
1      405
Name

In [35]:
# Examine the first 20 rows
raw_df.head(20)

Unnamed: 0,Preferred Entry Date 1,Preferred Zone 1,Minimum Acceptable Group Size 1,Maximum Requested Group Size 1,Preferred Entry Date 2,Preferred Zone 2,Minimum Acceptable Group Size 2,Maximum Requested Group Size 2,Preferred Entry Date 3,Preferred Zone 3,Minimum Acceptable Group Size 3,Maximum Requested Group Size 3,Results Status,Awarded Preference,Awarded Entry Date,Awarded Entrance,Awarded Group Size
0,2022-09-02,Core Enchantment Zone,8,8,2022-08-26,Colchuck Zone,8,8,2022-09-16,Core Enchantment Zone,8,8,Unsuccessful,0,1970-01-01,,0
1,2022-08-15,Colchuck Zone,2,2,2022-08-24,Colchuck Zone,2,2,2022-08-29,Colchuck Zone,2,2,Unsuccessful,0,1970-01-01,,0
2,2022-08-12,Snow Zone,8,8,2022-08-19,Snow Zone,8,8,2022-08-03,Snow Zone,8,8,Unsuccessful,0,1970-01-01,,0
3,2022-07-12,Core Enchantment Zone,2,2,2022-07-20,Core Enchantment Zone,2,2,2022-07-13,Snow Zone,2,2,Unsuccessful,0,1970-01-01,,0
4,2022-09-03,Stuart Zone,4,4,2022-08-28,Stuart Zone,4,4,2022-08-21,Stuart Zone,4,4,Unsuccessful,0,1970-01-01,,0
5,2022-09-10,Core Enchantment Zone,8,8,2022-09-17,Core Enchantment Zone,8,8,2022-07-20,Core Enchantment Zone,8,8,Unsuccessful,0,1970-01-01,,0
6,2022-09-01,Core Enchantment Zone,2,2,2022-09-09,Core Enchantment Zone,2,2,2022-09-16,Core Enchantment Zone,2,2,Unsuccessful,0,1970-01-01,,0
7,2022-06-24,Core Enchantment Zone,2,2,2022-07-01,Core Enchantment Zone,2,2,2022-08-19,Core Enchantment Zone,2,2,Unsuccessful,0,1970-01-01,,0
8,2022-09-09,Core Enchantment Zone,6,6,2022-09-01,Snow Zone,6,6,2022-08-04,Stuart Zone,6,6,Unsuccessful,0,1970-01-01,,0
9,2022-10-03,Core Enchantment Zone,2,2,2022-08-07,Core Enchantment Zone,2,2,2022-10-10,Core Enchantment Zone,2,2,Unsuccessful,0,1970-01-01,,0


In [36]:
# The group size columns have the same values for the minimum and maximum group size
# This is redundant and we can drop the maximum group size columns
print(
    (
        raw_df["Maximum Requested Group Size 1"]
        - raw_df["Minimum Acceptable Group Size 1"]
    ).unique()
)
print(
    (
        raw_df["Maximum Requested Group Size 2"]
        - raw_df["Minimum Acceptable Group Size 2"]
    ).unique()
)
print(
    (
        raw_df["Maximum Requested Group Size 3"]
        - raw_df["Minimum Acceptable Group Size 3"]
    ).unique()
)

[0]
[0]
[0]


In [37]:
# Drop the maximum group size columns because there is no variation from the minimum group size columns
raw_df = raw_df.drop(
    columns=[
        "Maximum Requested Group Size 1",
        "Maximum Requested Group Size 2",
        "Maximum Requested Group Size 3",
    ]
)

# Check the data
raw_df.head(20)

Unnamed: 0,Preferred Entry Date 1,Preferred Zone 1,Minimum Acceptable Group Size 1,Preferred Entry Date 2,Preferred Zone 2,Minimum Acceptable Group Size 2,Preferred Entry Date 3,Preferred Zone 3,Minimum Acceptable Group Size 3,Results Status,Awarded Preference,Awarded Entry Date,Awarded Entrance,Awarded Group Size
0,2022-09-02,Core Enchantment Zone,8,2022-08-26,Colchuck Zone,8,2022-09-16,Core Enchantment Zone,8,Unsuccessful,0,1970-01-01,,0
1,2022-08-15,Colchuck Zone,2,2022-08-24,Colchuck Zone,2,2022-08-29,Colchuck Zone,2,Unsuccessful,0,1970-01-01,,0
2,2022-08-12,Snow Zone,8,2022-08-19,Snow Zone,8,2022-08-03,Snow Zone,8,Unsuccessful,0,1970-01-01,,0
3,2022-07-12,Core Enchantment Zone,2,2022-07-20,Core Enchantment Zone,2,2022-07-13,Snow Zone,2,Unsuccessful,0,1970-01-01,,0
4,2022-09-03,Stuart Zone,4,2022-08-28,Stuart Zone,4,2022-08-21,Stuart Zone,4,Unsuccessful,0,1970-01-01,,0
5,2022-09-10,Core Enchantment Zone,8,2022-09-17,Core Enchantment Zone,8,2022-07-20,Core Enchantment Zone,8,Unsuccessful,0,1970-01-01,,0
6,2022-09-01,Core Enchantment Zone,2,2022-09-09,Core Enchantment Zone,2,2022-09-16,Core Enchantment Zone,2,Unsuccessful,0,1970-01-01,,0
7,2022-06-24,Core Enchantment Zone,2,2022-07-01,Core Enchantment Zone,2,2022-08-19,Core Enchantment Zone,2,Unsuccessful,0,1970-01-01,,0
8,2022-09-09,Core Enchantment Zone,6,2022-09-01,Snow Zone,6,2022-08-04,Stuart Zone,6,Unsuccessful,0,1970-01-01,,0
9,2022-10-03,Core Enchantment Zone,2,2022-08-07,Core Enchantment Zone,2,2022-10-10,Core Enchantment Zone,2,Unsuccessful,0,1970-01-01,,0


In [38]:
# Change columns names to lower case with underscores for spaces
raw_df.columns = [
    col.lower().replace(" ", "_").replace("/", "_") for col in raw_df.columns
]

# Check the names
raw_df.columns

Index(['preferred_entry_date_1', 'preferred_zone_1',
       'minimum_acceptable_group_size_1', 'preferred_entry_date_2',
       'preferred_zone_2', 'minimum_acceptable_group_size_2',
       'preferred_entry_date_3', 'preferred_zone_3',
       'minimum_acceptable_group_size_3', 'results_status',
       'awarded_preference', 'awarded_entry_date', 'awarded_entrance',
       'awarded_group_size'],
      dtype='object')

In [39]:
# Export cleaned data to csv
raw_df.to_csv("./2022_results_cleaned.csv", index=False, date_format="%m-%d-%Y")

In [40]:
# Check import of cleaned data
cleaned_raw_df = pd.read_csv(
    "./2022_results_cleaned.csv",
    # Import was failing to parse date columns, so I
    # had to pass in the column names
    parse_dates=[
        "preferred_entry_date_1",
        "preferred_entry_date_2",
        "preferred_entry_date_3",
        "awarded_entry_date",
    ],
    date_format="%m-%d-%Y",  # Align format with export format
    na_filter=False,  # Do not convert 'N/A' to NaN
)

# Check the datatypes
cleaned_raw_df.dtypes

preferred_entry_date_1             datetime64[ns]
preferred_zone_1                           object
minimum_acceptable_group_size_1             int64
preferred_entry_date_2             datetime64[ns]
preferred_zone_2                           object
minimum_acceptable_group_size_2             int64
preferred_entry_date_3             datetime64[ns]
preferred_zone_3                           object
minimum_acceptable_group_size_3             int64
results_status                             object
awarded_preference                          int64
awarded_entry_date                 datetime64[ns]
awarded_entrance                           object
awarded_group_size                          int64
dtype: object

In [41]:
# Check cleaned data frame for NaN values
cleaned_raw_df.isnull().sum()

preferred_entry_date_1             0
preferred_zone_1                   0
minimum_acceptable_group_size_1    0
preferred_entry_date_2             0
preferred_zone_2                   0
minimum_acceptable_group_size_2    0
preferred_entry_date_3             0
preferred_zone_3                   0
minimum_acceptable_group_size_3    0
results_status                     0
awarded_preference                 0
awarded_entry_date                 0
awarded_entrance                   0
awarded_group_size                 0
dtype: int64

## Create Cleaned Split Dataframe


In [42]:
# It may be better to break up each individual entry into its own row, so that the data can be analyzed more easily.
preferred_options = [1, 2, 3]

# Columns that every dataframe will have
shared_columns = [
    "results_status",
    "awarded_preference",
    "awarded_entry_date",
    "awarded_entrance",
    "awarded_group_size",
]
new_dataframes = []

# Iterate over each option number creating a new dataframe for each
for option in preferred_options:
    # Get the columns for the current option
    columns = [
        f"preferred_zone_{option}",
        f"preferred_entry_date_{option}",
        f"minimum_acceptable_group_size_{option}",
    ]
    # Create a new dataframe for the current option
    df_option = cleaned_raw_df[columns + shared_columns].copy()
    # Rename the columns to remove the option number
    df_option.columns = [
        "preferred_zone",
        "preferred_entry_date",
        "minimum_acceptable_group_size",
    ] + shared_columns
    # Add a column to indicate if the permit was awarded for the current option
    df_option["awarded"] = df_option["awarded_preference"] == option
    df_option["preferred_option"] = option

    # Append the new dataframe to the list of dataframes
    new_dataframes.append(df_option)

# Concatenate the list of dataframes into a single dataframe
df_split = pd.concat(new_dataframes)

# Drop rows where the preferred division is N/A
df_split = df_split[df_split["preferred_zone"] != "N/A"]

# Check the new dataframe
df_split.head()

Unnamed: 0,preferred_zone,preferred_entry_date,minimum_acceptable_group_size,results_status,awarded_preference,awarded_entry_date,awarded_entrance,awarded_group_size,awarded,preferred_option
0,Core Enchantment Zone,2022-09-02,8,Unsuccessful,0,1970-01-01,,0,False,1
1,Colchuck Zone,2022-08-15,2,Unsuccessful,0,1970-01-01,,0,False,1
2,Snow Zone,2022-08-12,8,Unsuccessful,0,1970-01-01,,0,False,1
3,Core Enchantment Zone,2022-07-12,2,Unsuccessful,0,1970-01-01,,0,False,1
4,Stuart Zone,2022-09-03,4,Unsuccessful,0,1970-01-01,,0,False,1


In [43]:
# Add the month of the preferred entry date to the dataframe
import calendar

# Get the month as an integer
df_split["preferred_entry_date" + "_month"] = df_split["preferred_entry_date"].dt.month
# Get the month as a string
df_split["preferred_entry_date" + "_month"] = df_split[
    "preferred_entry_date" + "_month"
].apply(lambda x: calendar.month_name[x])

# Check the data
df_split.head()

Unnamed: 0,preferred_zone,preferred_entry_date,minimum_acceptable_group_size,results_status,awarded_preference,awarded_entry_date,awarded_entrance,awarded_group_size,awarded,preferred_option,preferred_entry_date_month
0,Core Enchantment Zone,2022-09-02,8,Unsuccessful,0,1970-01-01,,0,False,1,September
1,Colchuck Zone,2022-08-15,2,Unsuccessful,0,1970-01-01,,0,False,1,August
2,Snow Zone,2022-08-12,8,Unsuccessful,0,1970-01-01,,0,False,1,August
3,Core Enchantment Zone,2022-07-12,2,Unsuccessful,0,1970-01-01,,0,False,1,July
4,Stuart Zone,2022-09-03,4,Unsuccessful,0,1970-01-01,,0,False,1,September


In [44]:
# Add the day of the week columns based on preferred entry date
df_split["preferred_entry_date" + "_day"] = df_split[
    "preferred_entry_date"
].dt.day_name()

# Check the data
df_split.head()

Unnamed: 0,preferred_zone,preferred_entry_date,minimum_acceptable_group_size,results_status,awarded_preference,awarded_entry_date,awarded_entrance,awarded_group_size,awarded,preferred_option,preferred_entry_date_month,preferred_entry_date_day
0,Core Enchantment Zone,2022-09-02,8,Unsuccessful,0,1970-01-01,,0,False,1,September,Friday
1,Colchuck Zone,2022-08-15,2,Unsuccessful,0,1970-01-01,,0,False,1,August,Monday
2,Snow Zone,2022-08-12,8,Unsuccessful,0,1970-01-01,,0,False,1,August,Friday
3,Core Enchantment Zone,2022-07-12,2,Unsuccessful,0,1970-01-01,,0,False,1,July,Tuesday
4,Stuart Zone,2022-09-03,4,Unsuccessful,0,1970-01-01,,0,False,1,September,Saturday


In [45]:
# Export the split data to a csv file
df_split.to_csv("./2022_results_split.csv", index=False, date_format="%m-%d-%Y")

In [46]:
# Create a datatframe of the values we KNOW were skipped during
# the lottery process because the preference could not be accomodated.

awarded_preference_greater_than_0 = df_split["awarded_preference"] > 0

# Find where the preferred option was less than the awarded preference
preferred_option_less_than_awarded_option = (
    df_split["preferred_option"] < df_split["awarded_preference"]
)

df_split_skipped = df_split[
    awarded_preference_greater_than_0 & preferred_option_less_than_awarded_option
].copy()

df_split_skipped.head(5)

Unnamed: 0,preferred_zone,preferred_entry_date,minimum_acceptable_group_size,results_status,awarded_preference,awarded_entry_date,awarded_entrance,awarded_group_size,awarded,preferred_option,preferred_entry_date_month,preferred_entry_date_day
19,Core Enchantment Zone,2022-06-24,4,Awarded,2,2022-07-01,Snow Zone,4,False,1,June,Friday
113,Core Enchantment Zone,2022-09-26,6,Awarded,3,2022-10-04,Stuart Zone,6,False,1,September,Monday
127,Core Enchantment Zone,2022-06-24,2,Awarded,3,2022-10-21,Colchuck Zone,2,False,1,June,Friday
133,Core Enchantment Zone,2022-08-30,3,Awarded,2,2022-08-30,Snow Zone,3,False,1,August,Tuesday
154,Core Enchantment Zone,2022-09-24,4,Awarded,3,2022-09-25,Core Enchantment Zone,4,False,1,September,Saturday


In [47]:
# Export the split skipped data to a csv file
df_split_skipped.to_csv(
    "./2022_results_split_skipped.csv", index=False, date_format="%m-%d-%Y"
)