<a href="https://colab.research.google.com/github/kmikk/solar_machine_learning/blob/solar_branch/random_forest_solar_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# import dependencies (standard library first, then third-party)
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [0]:
# Read the Minneapolis solar/weather CSV directly from the project's GitHub repository.
mpls = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/kmikk/solar_machine_learning/master/resources/mpls_solar_weather.csv'
)


In [0]:
# Collect the column labels of the loaded frame into a plain list and display it.
mpls_columns = mpls.columns.tolist()
mpls_columns

['date_time',
 'power_delivered',
 'energy_delivered',
 'cumulative_energy',
 'weather_description',
 'clouds_all',
 'temp_f',
 'pressure',
 'humidity',
 'wind_speed',
 'wind_deg',
 'rain_1h',
 'snow_1h',
 'weather_main',
 'hour',
 'day_of_year',
 'month',
 'sin_day',
 'cos_day',
 'sin_hour',
 'cos_hour',
 'sin_month',
 'cos_month',
 'daylength']

In [0]:
# Model inputs: the weather readings plus the cyclical (sin/cos) day and hour encodings.
features = mpls.loc[:, [
    'clouds_all', 'temp_f', 'pressure', 'humidity', 'wind_speed',
    'sin_day', 'cos_day', 'sin_hour', 'cos_hour', 'rain_1h',
]]
# Keep the column labels so importance scores can be matched back to names later.
feature_names = features.columns.tolist()

In [0]:
# Discretise power_delivered into 12 ordinal classes (0-11) used as classifier targets.
#
#   class 0      : power <= 0.0001 (effectively no output)
#   classes 1-10 : successive 500-wide bands up to 5000
#   class 11     : everything above 5000
#
# pd.cut uses right-closed intervals, so with a lowest edge of exactly 0 a
# reading of 0 falls outside every bin, and so does anything above the highest
# edge. Open-ended outer edges put zeros directly in class 0 and keep unusually
# high readings in the top class instead of letting them fall through to the
# NaN fallback below, where they would be relabelled as class 0.
power_bin_edges = [-np.inf, 0.0001, 500, 1000, 1500, 2000, 2500, 3000, 3500,
                   4000, 4500, 5000, np.inf]

# One integer label per interval: range(0, 12).
labels = range(len(power_bin_edges) - 1)

# Missing power readings stay NaN after the cut; they are still mapped to
# class 0 so the target keeps one label per feature row.
# fillna returns a new Series, so re-running this cell is idempotent.
mpls_power_bins = pd.cut(mpls.power_delivered,
                         power_bin_edges,
                         labels=labels).fillna(0)

In [0]:
# Split features and binned targets into train and test sets.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state is fixed so the same rows land in each set on every run.
# train_test_split is already imported in the dependencies cell at the top,
# so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    features, mpls_power_bins, random_state=42
)

In [0]:
# Baseline model: a single decision tree fitted on the training split.
# random_state is fixed because tree construction is randomised; without a
# seed the reported accuracy changes from run to run.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
# Mean accuracy on the held-out split (displayed as the cell output).
clf.score(X_test, y_test)

0.7560926485397784

In [0]:
# Random forest of 200 trees fitted on the same training split.
# RandomForestClassifier is imported in the dependencies cell at the top.
# random_state is fixed so the bootstrap samples and feature sub-sampling, and
# therefore the score and the feature importances read later, are reproducible.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
# Mean accuracy on the held-out split (displayed as the cell output).
rf.score(X_test, y_test)

0.7963746223564955

In [0]:
# Pair each importance score with its feature name and rank the pairs,
# highest score first; the resulting list is the cell output.
sorted(
    zip(rf.feature_importances_, feature_names),
    reverse=True,
)

[(0.32188530739881704, 'cos_hour'),
 (0.11791757381578544, 'temp_f'),
 (0.11077967024795876, 'cos_day'),
 (0.10618544925611338, 'sin_hour'),
 (0.09855527771413235, 'humidity'),
 (0.08814683899278623, 'sin_day'),
 (0.05712799084475325, 'pressure'),
 (0.046057772673057135, 'clouds_all'),
 (0.04007575427846076, 'wind_speed'),
 (0.013268364778135604, 'rain_1h')]

In [0]:
# Copy the forest's importance scores into a plain list of Python floats.
feature_importances = [float(score) for score in rf.feature_importances_]

In [0]:
# (feature name, importance) pairs ranked by importance, largest first.
# The tuples start with the name, so a plain sorted(..., reverse=True) would
# order them reverse-alphabetically by name; the key selects the score instead.
importances = sorted(
    zip(feature_names, feature_importances),
    key=lambda pair: pair[1],
    reverse=True,
)

In [0]:
# Tabulate the (score, name) pairs in ascending order of score.
# NOTE(review): the column labels look swapped relative to their contents.
# Each tuple is (score, name), so 'Parameter' receives the numeric scores and
# 'Importance' receives the feature names. The arrangement is kept unchanged
# here because any code that reads these columns by name depends on it.
df = pd.DataFrame(
    sorted(zip(feature_importances, feature_names)),
    columns=['Parameter', 'Importance'],
)

In [0]:
# Display the importance table built in the previous cell.
df

Unnamed: 0,Parameter,Importance
0,0.013268,rain_1h
1,0.040076,wind_speed
2,0.046058,clouds_all
3,0.057128,pressure
4,0.088147,sin_day
5,0.098555,humidity
6,0.106185,sin_hour
7,0.11078,cos_day
8,0.117918,temp_f
9,0.321885,cos_hour


In [0]:
# Horizontal bar chart of the random-forest feature importances.
# plotly.express (px) is imported in the dependencies cell at the top.
#
# The plotting frame is built here with column labels that match their
# contents ('Parameter' = feature name, 'Importance' = score), so the axis
# titles describe what is actually drawn. Rows are sorted ascending by score,
# which places the most important feature at the top of a horizontal chart.
importance_df = pd.DataFrame(
    {'Parameter': feature_names, 'Importance': feature_importances}
).sort_values('Importance')

fig = px.bar(
    importance_df,
    x='Importance',    # bar length: importance score
    y='Parameter',     # one bar per feature
    orientation='h',
    title='Random forest feature importance',
)
fig.show()