In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# Start a Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("Disneyland Ride Wait Times")
    .getOrCreate()
)



In [None]:
# Location of the raw ride wait-time CSV in the project's GitHub repository
url_github = 'https://github.com/kileykarecki/Project-4/raw/refs/heads/main/Data/disney_california_adv-ride_wait_times-2024-(thrill-data)-v2.csv'

# Download the CSV and parse it with pandas
pd_df = pd.read_csv(filepath_or_buffer=url_github)

# Build a Spark DataFrame from the pandas frame
spark_df = spark.createDataFrame(pd_df)

# Preview the leading rows, then print the schema of the Spark DataFrame
spark_df.show()
spark_df.printSchema()

+------------------+--------------+---------+
|              Ride|     Date/Time|Wait Time|
+------------------+--------------+---------+
|Goofy's Sky School| 4/1/2024 8:05|        1|
|Goofy's Sky School| 4/1/2024 8:10|        1|
|Goofy's Sky School| 4/1/2024 8:40|        5|
|Goofy's Sky School| 4/1/2024 8:45|        5|
|Goofy's Sky School| 4/1/2024 9:00|       10|
|Goofy's Sky School| 4/1/2024 9:05|       30|
|Goofy's Sky School| 4/1/2024 9:10|       30|
|Goofy's Sky School| 4/1/2024 9:15|       30|
|Goofy's Sky School| 4/1/2024 9:20|       30|
|Goofy's Sky School| 4/1/2024 9:25|       30|
|Goofy's Sky School| 4/1/2024 9:30|       30|
|Goofy's Sky School| 4/1/2024 9:35|       30|
|Goofy's Sky School| 4/1/2024 9:40|       30|
|Goofy's Sky School| 4/1/2024 9:45|       30|
|Goofy's Sky School| 4/1/2024 9:50|       30|
|Goofy's Sky School| 4/1/2024 9:55|       30|
|Goofy's Sky School|4/1/2024 10:00|       30|
|Goofy's Sky School|4/1/2024 10:05|       30|
|Goofy's Sky School|4/1/2024 10:10

In [None]:
# Parse the "Date/Time" strings (month/day/year hour:minute) into datetime values
pd_df['Date/Time'] = pd.to_datetime(
    pd_df['Date/Time'],
    format='%m/%d/%Y %H:%M',
)

In [None]:
# Derive numeric calendar features from the parsed "Date/Time" column
pd_df = pd_df.assign(
    Hour=pd_df['Date/Time'].dt.hour,
    Day=pd_df['Date/Time'].dt.day,      # day of the month
    Month=pd_df['Date/Time'].dt.month,
)



In [None]:
# Categorize wait times (placeholder: this cell has no code yet; the binary High_Traffic label is built in a later cell)



In [None]:
# Cast "Wait Time" to an integer dtype (this converts the column rather than only checking it)
pd_df = pd_df.astype({'Wait Time': int})

In [None]:
# Binary target column: 1 when "Wait Time" exceeds 30, otherwise 0.
# (An earlier draft used a cutoff of 60 instead of 30.)
pd_df['High_Traffic'] = pd_df['Wait Time'].gt(30).astype(int)

In [None]:
# Remove the columns that are not numeric ("Date/Time" and "Ride")
pd_df = pd_df.drop(['Date/Time', 'Ride'], axis='columns')

In [None]:
# Preview the first five rows of the processed dataset
print(pd_df.head(n=5))

   Wait Time  Hour  Day  Month  High_Traffic
0          1     8    1      4             0
1          1     8    1      4             0
2          5     8    1      4             0
3          5     8    1      4             0
4         10     9    1      4             0


In [None]:
# Split data into features (X) and target (y).
# "High_Traffic" is computed directly from "Wait Time" (Wait Time > 30), so
# leaving "Wait Time" in X leaks the label: the tree only has to learn that one
# threshold and scores a perfect 1.00 on every metric. Exclude both the target
# and its source column so the model must predict from the remaining features.
X = pd_df.drop(columns=['High_Traffic', 'Wait Time'])
y = pd_df['High_Traffic']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# Author's note: this split came from a Google search and was suspected of causing overfitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)




In [None]:
# Decision tree with pre-pruning limits. An earlier version passed only
# random_state=42 and was suspected of overfitting; the limits below were
# added with reference to the Xpert student learning assistant.
dt_classifier = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=50,
    min_samples_leaf=20,
    random_state=42,
)

# Fit the tree on the training split
dt_classifier.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows
y_pred = dt_classifier.predict(X=X_test)

In [None]:
# Report test-set performance: confusion matrix first, then the per-class metrics
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

[[488087      0]
 [     0  55046]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    488087
           1       1.00      1.00      1.00     55046

    accuracy                           1.00    543133
   macro avg       1.00      1.00      1.00    543133
weighted avg       1.00      1.00      1.00    543133



In [None]:
# Class balance of the target: number of rows for each High_Traffic value
print(pd_df.loc[:, 'High_Traffic'].value_counts())

High_Traffic
0    2440790
1     274872
Name: count, dtype: int64
