In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime as dt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [2]:
# Read in csv
df_temps = pd.read_csv('Resources/combined_weather_f.csv')
df_temps.head()

Unnamed: 0,date,tavg_f,tmin_f,tmax_f,Location
0,736695,56.12,46.04,69.98,Anaheim
1,736696,60.8,51.08,78.98,Anaheim
2,736697,61.34,51.08,73.94,Anaheim
3,736698,62.06,50.0,77.0,Anaheim
4,736699,62.24,51.98,75.02,Anaheim


In [3]:
df_temps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8758 entries, 0 to 8757
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      8758 non-null   int64  
 1   tavg_f    8758 non-null   float64
 2   tmin_f    8758 non-null   float64
 3   tmax_f    8758 non-null   float64
 4   Location  8758 non-null   object 
dtypes: float64(3), int64(1), object(1)
memory usage: 342.2+ KB


In [4]:
# Separate out categories for One Hot Encoding
cats = df_temps.dtypes[df_temps.dtypes == 'object'].index.tolist()
cats

['Location']

In [5]:
# One Hot Encoding for binary classification of locations

enc = OneHotEncoder(sparse_output=False)

encode_df = pd.DataFrame(enc.fit_transform(df_temps[cats]))

encode_df.columns = enc.get_feature_names_out(['Location'])
encode_df

Unnamed: 0,Location_Anaheim,Location_Epcot,Location_Hong Kong,Location_Paris
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0
...,...,...,...,...
8753,0.0,0.0,1.0,0.0
8754,0.0,0.0,1.0,0.0
8755,0.0,0.0,1.0,0.0
8756,0.0,0.0,1.0,0.0


In [6]:
encode_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8758 entries, 0 to 8757
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Location_Anaheim    8758 non-null   float64
 1   Location_Epcot      8758 non-null   float64
 2   Location_Hong Kong  8758 non-null   float64
 3   Location_Paris      8758 non-null   float64
dtypes: float64(4)
memory usage: 273.8 KB


In [7]:
# Concatenate dataframes
merged = pd.merge(df_temps, encode_df, left_index=True, right_index=True)
merged.head()

Unnamed: 0,date,tavg_f,tmin_f,tmax_f,Location,Location_Anaheim,Location_Epcot,Location_Hong Kong,Location_Paris
0,736695,56.12,46.04,69.98,Anaheim,1.0,0.0,0.0,0.0
1,736696,60.8,51.08,78.98,Anaheim,1.0,0.0,0.0,0.0
2,736697,61.34,51.08,73.94,Anaheim,1.0,0.0,0.0,0.0
3,736698,62.06,50.0,77.0,Anaheim,1.0,0.0,0.0,0.0
4,736699,62.24,51.98,75.02,Anaheim,1.0,0.0,0.0,0.0


In [8]:

merged.columns

Index(['date', 'tavg_f', 'tmin_f', 'tmax_f', 'Location', 'Location_Anaheim',
       'Location_Epcot', 'Location_Hong Kong', 'Location_Paris'],
      dtype='object')

In [9]:
copy_df = merged.copy()

In [22]:
# Split into Features and Target
X = copy_df[['date',  'tmin_f', 'tmax_f', 'Location_Anaheim', 'Location_Epcot',
       'Location_Hong Kong', 'Location_Paris']]
y = copy_df['tavg_f'].to_numpy()
y = y.flatten()

In [23]:
# Split into Testing and Training Data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [24]:
# Scale Data
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [25]:
# Preview scaled data
X_train_scaled

array([[ 1.13992743, -0.73037565, -0.72561829, ...,  1.75846861,
        -0.57875673, -0.57852232],
       [-1.2091707 ,  0.32779599, -0.17975556, ..., -0.56867663,
         1.72784169, -0.57852232],
       [ 1.37373141,  0.00632612, -0.28626536, ..., -0.56867663,
        -0.57875673, -0.57852232],
       ...,
       [-0.29291185,  0.3813743 ,  0.31285227, ..., -0.56867663,
        -0.57875673, -0.57852232],
       [-0.44140897, -1.5474449 , -1.76408885, ..., -0.56867663,
        -0.57875673,  1.72854179],
       [-1.3513488 ,  0.67605501,  0.52587187, ..., -0.56867663,
        -0.57875673, -0.57852232]])

In [34]:
# Create model
rf_model = RandomForestRegressor(n_estimators=500, random_state=78)

In [35]:
# Fit model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [36]:
# Predict from model
predictions = rf_model.predict(X_test_scaled)

In [40]:
# Preview prediction results
predictions

array([71.9654 , 51.64412, 74.2946 , ..., 81.27356, 80.16008, 61.1996 ])

In [41]:
# Preview predictions and actual results
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
0,71.96540,71.60
1,51.64412,50.36
2,74.29460,74.48
3,55.55264,56.12
4,72.11264,72.32
...,...,...
2185,64.96448,64.76
2186,64.54688,62.24
2187,81.27356,80.96
2188,80.16008,79.16


In [42]:
# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')

Mean Squared Error: 0.8948767650411005
R-squared: 0.9942516420419406


In [43]:
# User input
location_input = input("Enter the location (Anaheim, Epcot, Hong Kong, Paris): ")
date_input = input("Enter the date (YYYY-MM-DD): ")
tmin_input = float(input("Enter the expected minimum temperature (°F)"))
tmax_input = float(input("Enter the expected maximum temperature (°F)"))

In [44]:
# View user input
print(location_input, date_input, tmin_input, tmax_input)

Epcot 2025-02-04 68.0 82.0


In [45]:
# Use input with model to predict temperature
locations = ['Anaheim', 'Epcot', 'Hong Kong', 'Paris']
binary_features = [1 if loc == location_input else 0 for loc in locations]
date_num = pd.to_datetime(date_input).date()
date_ord = date_num.toordinal()
input_features = np.array([[date_ord] + binary_features + [tmin_input, tmax_input]])
input_features

array([[7.39286e+05, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        6.80000e+01, 8.20000e+01]])

In [47]:
# Predict using User Input
predicted_temperature = rf_model.predict(input_features)
print(f"The predicted average temperature in {location_input} on {date_input} is: {round(predicted_temperature[0], 2)} °F")

The predicted average temperature in Epcot on 2025-02-04 is: 75.64 °F
