In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.ensemble import RandomForestRegressor
import time
import warnings # prevent warnings
import joblib

In [2]:
# Site code; it prefixes the name of the cleaned-data CSV that gets loaded.
site = "AWI"

In [13]:
# load the data
# The directory can be overridden through the DATA_CLEAN_DIR environment variable so
# the notebook is not tied to one machine; the original local path stays the default.
data_directory = os.environ.get(
    "DATA_CLEAN_DIR",
    "C://Users//User//Desktop//dissertation_ventient//Data_clean",
)
data_name = site + "_data_clean.csv"

# A relative directory is resolved against the current working directory; os.path.join
# discards the cwd prefix when the directory is absolute on the current platform.
data_path = os.path.join(os.getcwd(), data_directory, data_name)

# The first CSV column is used as the row index.
data_all = pd.read_csv(data_path, index_col=0)

In [14]:
# remove the columns considered irrelevant for this notebook
irrelevant_columns = ['Wind_speed_sd', 'Air_density', 'Humidity', 'TI']
data_all = data_all.drop(columns=irrelevant_columns)

In [15]:
# training size, i.e. the number of rows sampled for each turbine
train_size = 10_000

In [17]:
# sample data
def _sample_turbine(turbine_rows):
    """Draw `train_size` rows from one turbine's rows, seeded identically for every turbine."""
    return turbine_rows.sample(train_size, random_state=22)


# group_keys=False keeps the original row index instead of prepending the turbine ID
rows_by_turbine = data_all.groupby('instanceID', group_keys=False)
data_train = rows_by_turbine.apply(_sample_turbine)

# keep a copy of the sampled training set on disk
data_train.to_csv('data_train.csv')

In [7]:
# preview the sampled training data
data_train

Unnamed: 0,ts,Month,Day,Hour,instanceID,Wind_speed,Temperature,Power
395052,2020-05-10 15:20:00,5,10,15,AWI_WTG01,7.900000,7.738333,1161.069946
489027,2020-06-10 17:10:00,6,10,17,AWI_WTG01,6.137338,11.253334,431.023064
815535,2020-09-26 16:30:00,9,26,16,AWI_WTG01,2.981927,15.045000,22.673340
600978,2020-07-17 17:40:00,7,17,17,AWI_WTG01,3.696848,14.026667,73.544284
751359,2020-09-05 11:10:00,9,5,11,AWI_WTG01,7.433360,12.183333,950.906729
...,...,...,...,...,...,...,...,...
48845,2020-01-17 03:30:00,1,17,3,AWI_WTG21,5.888219,5.418333,562.047208
593165,2020-07-15 03:30:00,7,15,3,AWI_WTG21,4.657002,11.220000,210.426899
546713,2020-06-29 18:50:00,6,29,18,AWI_WTG21,10.369692,10.800000,1990.417444
861167,2020-10-11 18:30:00,10,11,18,AWI_WTG21,3.429893,9.196666,65.283786


### Train

In [8]:
# Load the tuned hyperparameters. Despite the name, the value displayed in this notebook
# is a dict of settings (max_depth, n_estimators, max_samples, random_state), not a fitted model.
# joblib.load unpickles the file, so only load a tuned_forest.pkl that comes from a trusted source.
tuned_forest = joblib.load("tuned_forest.pkl")

In [9]:
# show the loaded hyperparameter settings
tuned_forest

{'max_depth': 9,
 'n_estimators': 170,
 'max_samples': 0.12583648926857596,
 'random_state': 99}

In [10]:
# check which columns the positional slice 5:-1 selects (displayed result: Wind_speed and Temperature)
data_train.iloc[:, 5:-1]

Unnamed: 0,Wind_speed,Temperature
395052,7.900000,7.738333
489027,6.137338,11.253334
815535,2.981927,15.045000
600978,3.696848,14.026667
751359,7.433360,12.183333
...,...,...
48845,5.888219,5.418333
593165,4.657002,11.220000
546713,10.369692,10.800000
861167,3.429893,9.196666


In [11]:
%%time
# ---------------------------------------------------------------------------
# Training: fit one random forest per turbine using the tuned hyperparameters.
# ---------------------------------------------------------------------------

# Name the model inputs and the target explicitly instead of slicing by position
# (previously iloc[:, 5:-1] and iloc[:, -1]), so a reordered or extended CSV
# cannot silently change what the forests are trained on.
feature_columns = ['Wind_speed', 'Temperature']
target_column = 'Power'

turbines = data_train.instanceID.unique()
forest_all = {}  # turbine ID -> fitted RandomForestRegressor


for ID in turbines:

    # select data based on turbine ID
    data_temp = data_train[data_train['instanceID'] == ID]

    # separate X and y
    X = data_temp[feature_columns]
    y = data_temp[target_column]

    # train; the seed is read from the tuned settings rather than duplicated as a
    # literal, falling back to the previously hard-coded value when the key is absent
    temp_forest = RandomForestRegressor(
        max_depth=tuned_forest["max_depth"],
        max_samples=tuned_forest["max_samples"],
        n_estimators=tuned_forest["n_estimators"],
        n_jobs=-1,
        oob_score=True,
        random_state=tuned_forest.get("random_state", 99),
    )
    temp_forest.fit(X, y)

    forest_all[ID] = temp_forest

    print('Done', ID)
    

Done AWI_WTG01
Done AWI_WTG02
Done AWI_WTG03
Done AWI_WTG04
Done AWI_WTG05
Done AWI_WTG06
Done AWI_WTG07
Done AWI_WTG08
Done AWI_WTG09
Done AWI_WTG10
Done AWI_WTG11
Done AWI_WTG12
Done AWI_WTG13
Done AWI_WTG14
Done AWI_WTG15
Done AWI_WTG16
Done AWI_WTG17
Done AWI_WTG18
Done AWI_WTG19
Done AWI_WTG20
Done AWI_WTG21
Wall time: 12.1 s


### Save models

In [12]:
# Persisting the fitted forests is currently disabled: the call below is commented out,
# so running this cell does not (re)write forest_all.pkl.
# NOTE(review): the recorded cell output ['forest_all.pkl'] suggests the call was run
# uncommented at some point — confirm whether it should be re-enabled.
# joblib.dump(forest_all, "forest_all.pkl")

['forest_all.pkl']