In [55]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [56]:
df = pd.read_csv('station_data_dataverse.csv')

In [57]:
df['created'] = df['created'].astype(str)

In [58]:
df['created'] = df['created'].apply(lambda x: '20' + x[2:] if x[:2].isdigit() and int(x[:2]) < 100 else x)

In [59]:
df['created'] = pd.to_datetime(df['created'], errors='coerce')

In [60]:
df.dropna(subset=['created'], inplace=True)

In [61]:
df['date'] = df['created'].dt.date
df['hour'] = df['created'].dt.hour
df['day'] = df['created'].dt.day
df['month'] = df['created'].dt.month

In [62]:
hourly_usage = df.groupby(['day', 'month', 'hour', 'date']).agg({'kwhTotal': 'sum'}).reset_index()

In [63]:
hourly_usage['kwhTotal'].fillna(hourly_usage['kwhTotal'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hourly_usage['kwhTotal'].fillna(hourly_usage['kwhTotal'].median(), inplace=True)


In [64]:
threshold = np.percentile(hourly_usage['kwhTotal'], 75) if not hourly_usage['kwhTotal'].empty else 0
hourly_usage['peak'] = (hourly_usage['kwhTotal'] >= threshold).astype(int)

In [65]:
hourly_usage['next_day'] = hourly_usage['date'].shift(-24)
hourly_usage['next_day_peak'] = hourly_usage['peak'].shift(-24)

In [66]:
hourly_usage.dropna(inplace=True)

In [67]:
X = hourly_usage[['hour', 'day', 'month', 'kwhTotal']]
y = hourly_usage['next_day_peak']

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [70]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.68


In [71]:
def predict_peak_hours(date_input):
    try:
        # Convert input date (DD-MM) to integers
        day, month = map(int, date_input.split('-'))

        # Generate next day's hour-wise data
        next_day_data = pd.DataFrame({
            'hour': list(range(24)),  # 0 to 23 hours
            'day': [day + 1] * 24,  # Next day's date
            'month': [month] * 24,
            'kwhTotal': [hourly_usage['kwhTotal'].median()] * 24  # Using median value
        })

        # Predict peak hours
        predictions = model.predict(next_day_data)

        # Extract peak hours
        peak_hours = next_day_data.loc[predictions == 1, 'hour'].tolist()
        
        return peak_hours

    except Exception as e:
        return f"Error in prediction: {str(e)}"

In [75]:
next_day_peaks = predict_peak_hours("15-04")
print(f"Predicted Peak Hours for Tomorrow: {next_day_peaks}")

Predicted Peak Hours for Tomorrow: [10, 20, 22, 23]


In [76]:
import pickle

# Save the trained model
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [77]:
# Load the saved model
with open('random_forest_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)


In [78]:
def predict_peak_hours(date_input):
    try:
        # Convert input date (DD-MM) to integers
        day, month = map(int, date_input.split('-'))

        # Generate next day's hour-wise data
        next_day_data = pd.DataFrame({
            'hour': list(range(24)),  # 0 to 23 hours
            'day': [day + 1] * 24,  # Next day's date
            'month': [month] * 24,
            'kwhTotal': [hourly_usage['kwhTotal'].median()] * 24  # Using median value
        })

        # Ensure column order is the same as during training
        next_day_data = next_day_data[['hour', 'day', 'month', 'kwhTotal']]

        # Predict peak hours
        predictions = loaded_model.predict(next_day_data)

        # Convert predictions to list of peak hours
        peak_hours = next_day_data.loc[predictions == 1, 'hour'].tolist()
        
        return peak_hours

    except Exception as e:
        return f"Error in prediction: {str(e)}"

In [79]:
next_day_peaks = predict_peak_hours("15-04")
print(f"Predicted Peak Hours for Tomorrow: {next_day_peaks}")

Predicted Peak Hours for Tomorrow: [10, 20, 22, 23]
