### This notebook uses a Random Forest regressor to predict temperature from the other variables in the Philly weather CSV file

In [44]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

### Load data set

In [45]:
# Read the Philly weather CSV into a DataFrame and preview the first rows
file_path = Path("Philly_weather.csv")
philly_df = pd.read_csv(filepath_or_buffer=file_path)
philly_df.head(5)

Unnamed: 0,datetime,humidity,pressure,temperature,wind-direction,wind-speed,weather-description
0,2012-10-01 12:00:00,,,,,,
1,2012-10-01 13:00:00,71.0,1014.0,285.63,270.0,4.0,broken clouds
2,2012-10-01 14:00:00,70.0,1014.0,285.663208,270.0,4.0,broken clouds
3,2012-10-01 15:00:00,70.0,1014.0,285.756824,271.0,3.0,broken clouds
4,2012-10-01 16:00:00,69.0,1014.0,285.85044,272.0,3.0,broken clouds


In [46]:
# Drop every row that contains at least one missing value.
# Rebinding the name to the returned frame (instead of inplace=True) keeps
# the step a plain expression that can be chained or inspected.
philly_df = philly_df.dropna()

In [47]:
# Sanity check: number of missing values remaining in each column
philly_df.isna().sum()

datetime               0
humidity               0
pressure               0
temperature            0
wind-direction         0
wind-speed             0
weather-description    0
dtype: int64

In [48]:
# Remove the timestamp column so it is not carried into the feature set.
# errors="ignore" keeps this cell safe to re-run once the column is gone
# (the previous in-place drop raised KeyError on a second execution).
philly_df = philly_df.drop(columns="datetime", errors="ignore")
philly_df.head()

Unnamed: 0,humidity,pressure,temperature,wind-direction,wind-speed,weather-description
1,71.0,1014.0,285.63,270.0,4.0,broken clouds
2,70.0,1014.0,285.663208,270.0,4.0,broken clouds
3,70.0,1014.0,285.756824,271.0,3.0,broken clouds
4,69.0,1014.0,285.85044,272.0,3.0,broken clouds
5,69.0,1014.0,285.944057,274.0,3.0,broken clouds


In [49]:
# Optional: uncomment to plot the temperature series
#philly_df.temperature.plot(figsize=(20,15))

In [50]:
# One-hot encode the categorical "weather-description" column:
# each distinct description becomes its own indicator column
philly_binary_encoded = pd.get_dummies(data=philly_df, columns=["weather-description"])
philly_binary_encoded.head(5)

Unnamed: 0,humidity,pressure,temperature,wind-direction,wind-speed,weather-description_broken clouds,weather-description_drizzle,weather-description_few clouds,weather-description_fog,weather-description_freezing rain,...,weather-description_sky is clear,weather-description_smoke,weather-description_snow,weather-description_squalls,weather-description_thunderstorm,weather-description_thunderstorm with heavy rain,weather-description_thunderstorm with light rain,weather-description_thunderstorm with rain,weather-description_very heavy rain,weather-description_volcanic ash
1,71.0,1014.0,285.63,270.0,4.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,70.0,1014.0,285.663208,270.0,4.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,70.0,1014.0,285.756824,271.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,69.0,1014.0,285.85044,272.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,69.0,1014.0,285.944057,274.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
### Process data and split into training and testing data sets

In [52]:
# Define the feature set: every column except the regression target.
# drop() returns a new frame, so no explicit copy or in-place edit is needed,
# and `columns=` already selects the axis (the extra axis=1 was redundant).
X = philly_binary_encoded.drop(columns="temperature")
X.head()


Unnamed: 0,humidity,pressure,wind-direction,wind-speed,weather-description_broken clouds,weather-description_drizzle,weather-description_few clouds,weather-description_fog,weather-description_freezing rain,weather-description_haze,...,weather-description_sky is clear,weather-description_smoke,weather-description_snow,weather-description_squalls,weather-description_thunderstorm,weather-description_thunderstorm with heavy rain,weather-description_thunderstorm with light rain,weather-description_thunderstorm with rain,weather-description_very heavy rain,weather-description_volcanic ash
1,71.0,1014.0,270.0,4.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,70.0,1014.0,270.0,4.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,70.0,1014.0,271.0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,69.0,1014.0,272.0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,69.0,1014.0,274.0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Target vector: the temperature column, kept as a 1-D pandas Series
y = philly_binary_encoded["temperature"]
y.head(5)


1    285.630000
2    285.663208
3    285.756824
4    285.850440
5    285.944057
Name: temperature, dtype: float64

In [57]:
# Hold out a test set (default train_test_split proportions, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Learn the scaling statistics from the training rows only;
# fit() returns the scaler itself, so X_scaler and scaler are the same object
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both partitions
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)



In [64]:
# Build a 500-tree random forest regressor (seeded for reproducibility)
# and train it on the scaled training data in a single step
rf_model = RandomForestRegressor(n_estimators=500, random_state=78).fit(
    X_train_scaled, y_train
)



In [65]:
# Predict temperatures for the held-out test rows
predictions = rf_model.predict(X=X_test_scaled)


In [66]:
# Put predicted and actual values side by side, then replace the
# shuffled test-set index with a fresh 0..n-1 index
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,278.781777,273.94
1,267.578462,263.979333
2,273.118519,267.94
3,289.345724,273.4995
4,283.672874,273.911
5,277.555106,274.748
6,288.027892,283.129138
7,276.443914,274.54
8,293.002282,295.54
9,279.589377,292.06


In [67]:
# R^2 of the fitted model on the data it was trained on
rf_model.score(X=X_train_scaled, y=y_train)

0.9280371344020457

In [68]:
# R^2 of the fitted model on the held-out test data
rf_model.score(X=X_test_scaled, y=y_test)

0.49332515374852715