In [17]:
# Import necessary libraries
from pyspark.sql import SparkSession
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# Create a Spark session
spark = SparkSession.builder \
    .appName("Disneyland Ride Wait Times") \
    .getOrCreate()



In [18]:
# URL of the CSV file
url_github = 'https://github.com/kileykarecki/Project-4/raw/refs/heads/main/Data/disneyland-ride_wait_times-2024-(thrill-data)-v1.csv'

# Read the CSV file into a Pandas DataFrame
pd_df = pd.read_csv(url_github)

# Convert the Pandas DataFrame to a Spark DataFrame
spark_df = spark.createDataFrame(pd_df)

# Show the Spark DataFrame
spark_df.show()
spark_df.printSchema()

+--------------------+-------------+---------+
|                Ride|    Date/Time|Wait Time|
+--------------------+-------------+---------+
|Big Thunder Mount...|4/1/2024 8:05|        5|
|Big Thunder Mount...|4/1/2024 8:10|        5|
|Big Thunder Mount...|4/1/2024 8:15|       45|
|Big Thunder Mount...|4/1/2024 8:20|       45|
|Big Thunder Mount...|4/1/2024 8:25|       45|
|Big Thunder Mount...|4/1/2024 8:30|       45|
|Big Thunder Mount...|4/1/2024 8:35|       45|
|Big Thunder Mount...|4/1/2024 8:40|       45|
|Big Thunder Mount...|4/1/2024 8:45|       45|
|Big Thunder Mount...|4/1/2024 8:50|       30|
|Big Thunder Mount...|4/1/2024 8:55|       30|
|Big Thunder Mount...|4/1/2024 9:00|       30|
|Big Thunder Mount...|4/1/2024 9:05|       30|
|Big Thunder Mount...|4/1/2024 9:10|       30|
|Big Thunder Mount...|4/1/2024 9:15|       30|
|Big Thunder Mount...|4/1/2024 9:20|       30|
|Big Thunder Mount...|4/1/2024 9:25|       30|
|Big Thunder Mount...|4/1/2024 9:30|       30|
|Big Thunder 

In [19]:
#Convert "Date/Time" Column into datetime
pd_df['Date/Time'] = pd.to_datetime(pd_df['Date/Time'], format='%m/%d/%Y %H:%M')

In [20]:
# Extract features from 'Date/Time'
pd_df['Hour'] = pd_df['Date/Time'].dt.hour
pd_df['Day'] = pd_df['Date/Time'].dt.day
pd_df['Month'] = pd_df['Date/Time'].dt.month



In [21]:
#Categorize Wait times



In [22]:
#Verify that "wait time" is integer type
pd_df["Wait Time"] = pd_df['Wait Time'].astype(int)

In [23]:
#Create a Binary classification column
#pd_df['High_Traffic'] = (pd_df['Wait Time'] > 60).astype(int)

pd_df['High_Traffic'] = (pd_df['Wait Time'] > 30).astype(int)

In [24]:
#Drop non numerical columns
pd_df = pd_df.drop(columns=['Date/Time', 'Ride'])

In [25]:
#This is where we will show the processed dataset
print(pd_df.head())

   Wait Time  Hour  Day  Month  High_Traffic
0          5     8    1      4             0
1          5     8    1      4             0
2         45     8    1      4             1
3         45     8    1      4             1
4         45     8    1      4             1


In [26]:
#Split data into X and y
X = pd_df.drop(columns=['High_Traffic'])
y = pd_df['High_Traffic']

In [27]:
#Split into training and testing sets / generated by google created overfitting of model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




In [28]:
#Train the decision tree / Caused over fitting?
dt_classifier = DecisionTreeClassifier(random_state=42)

#Refactored with parameters / tree pruning - refrenced Xpert student learning assistant
#dt_classifier = DecisionTreeClassifier(random_state=42, max_depth=10,min_samples_split=50, min_samples_leaf=20)

dt_classifier.fit(X_train, y_train)

In [29]:
#Evaluate the model
y_pred = dt_classifier.predict(X_test)

In [30]:
#Print Evaluations
print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

[[376244      0]
 [     0 247116]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    376244
           1       1.00      1.00      1.00    247116

    accuracy                           1.00    623360
   macro avg       1.00      1.00      1.00    623360
weighted avg       1.00      1.00      1.00    623360



In [31]:
print(pd_df['High_Traffic'].value_counts())

High_Traffic
0    1882014
1    1234782
Name: count, dtype: int64
