In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.ensemble import RandomForestRegressor
import time
import warnings # prevent warnings
import joblib

In [2]:
# site identifier; it prefixes the file name of the cleaned data set (site + '_data_clean.csv')
site = "C"

In [3]:
# load the data
# The cleaned CSV lives in `Data_clean`, two directories above the working directory.
# Address it with a relative path instead of chdir-ing there and back: changing the
# process-wide working directory makes the cell non-idempotent (if the read fails the
# kernel stays two levels up, and a re-run climbs even further), and restoring from
# sys.path[0] relies on an import-search entry that is not guaranteed to be the
# notebook directory.
data_directory = 'Data_clean'
data_name = site + '_data_clean.csv'
data_path = os.path.join(os.pardir, os.pardir, data_directory, data_name)

# the first CSV column holds the row index
data_all = pd.read_csv(data_path, index_col = 0)

In [4]:
# drop irrelevant columns
# `data_all` is overwritten with its own result, so on a second run of this cell the
# columns are already gone; errors='ignore' keeps the cell re-runnable instead of
# raising KeyError.
data_all = data_all.drop(columns = ['Wind_speed_sd', 'Temperature'], errors = 'ignore')

In [5]:
# number of rows sampled per turbine when the training set is built
train_size = 10_000

In [6]:
# sample data
# Draw `train_size` rows (without replacement) from every turbine, re-seeding with the
# same random_state for each group, and stack the samples in sorted-turbine order.
# Iterating over the groups explicitly (rather than groupby(...).apply(...)) keeps the
# `instanceID` column in the result on every pandas version: apply() operating on the
# grouping column is deprecated in pandas 2.2 and the column is excluded afterwards.
# sample() raises ValueError if a turbine has fewer than `train_size` rows.
data_train = pd.concat(
    [
        turbine_rows.sample(train_size, random_state = 22)
        for _, turbine_rows in data_all.groupby('instanceID')
    ]
)

# keep a copy of the exact training sample on disk
data_train.to_csv('data_train.csv')

In [7]:
# display the sampled training set (index = original row label from the cleaned CSV)
data_train

Unnamed: 0,ts,Month,Day,Hour,instanceID,Wind_speed,TI,Power
395052,2020-05-10 15:20:00,5,10,15,C_WTG01,7.900000,24.246294,1161.069946
489027,2020-06-10 17:10:00,6,10,17,C_WTG01,6.137338,11.976684,431.023064
815535,2020-09-26 16:30:00,9,26,16,C_WTG01,2.981927,16.486589,22.673340
600978,2020-07-17 17:40:00,7,17,17,C_WTG01,3.696848,21.221244,73.544284
751359,2020-09-05 11:10:00,9,5,11,C_WTG01,7.433360,13.744645,950.906729
...,...,...,...,...,...,...,...,...
48845,2020-01-17 03:30:00,1,17,3,C_WTG21,5.888219,13.888784,562.047208
593165,2020-07-15 03:30:00,7,15,3,C_WTG21,4.657002,16.132754,210.426899
546713,2020-06-29 18:50:00,6,29,18,C_WTG21,10.369692,8.756958,1990.417444
861167,2020-10-11 18:30:00,10,11,18,C_WTG21,3.429893,12.941757,65.283786


### Train

In [8]:
# load the tuned hyperparameters; despite the name this is a dict of settings
# (it is indexed by "max_depth", "max_samples", "n_estimators" during training), not a fitted model
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source
tuned_forest = joblib.load("tuned_forest.pkl")

In [9]:
# display the loaded hyperparameters
tuned_forest

{'max_depth': 10,
 'n_estimators': 220,
 'max_samples': 0.1003845683426459,
 'random_state': 99}

In [10]:
# preview the feature columns: everything from column position 5 up to, but not
# including, the last column (the same positional slice the training loop uses for X)
data_train.iloc[:, 5:-1]

Unnamed: 0,Wind_speed,TI
395052,7.900000,24.246294
489027,6.137338,11.976684
815535,2.981927,16.486589
600978,3.696848,21.221244
751359,7.433360,13.744645
...,...,...
48845,5.888219,13.888784
593165,4.657002,16.132754
546713,10.369692,8.756958
861167,3.429893,12.941757


In [11]:
%%time
#################################################### training ######################################################### 

turbines = data_train.instanceID.unique()
forest_all = {}


for ID in turbines:
    
    # select data based on turbine ID
    data_temp = data_train[data_train['instanceID'] == ID]

    # separate X and y
    X = data_temp.iloc[:, 5:-1]
    y = data_temp.iloc[:, -1]
    
    
    # train
    temp_forest = RandomForestRegressor(max_depth=tuned_forest["max_depth"], max_samples=tuned_forest["max_samples"],
                      n_estimators=tuned_forest["n_estimators"], n_jobs=-1, oob_score=True,
                      random_state=99)
    temp_forest.fit(X, y)
                                          
    forest_all[ID] = temp_forest
    
    print('Done', ID)
    

Done C_WTG01
Done C_WTG02
Done C_WTG03
Done C_WTG04
Done C_WTG05
Done C_WTG06
Done C_WTG07
Done C_WTG08
Done C_WTG09
Done C_WTG10
Done C_WTG11
Done C_WTG12
Done C_WTG13
Done C_WTG14
Done C_WTG15
Done C_WTG16
Done C_WTG17
Done C_WTG18
Done C_WTG19
Done C_WTG20
Done C_WTG21
Wall time: 12.9 s


### Save models

In [12]:
# persist the dict of fitted per-turbine forests; the recorded output of this cell
# (['forest_all.pkl']) shows the dump was executed, so keep it live so that a fresh
# Restart & Run All actually writes the models
joblib.dump(forest_all, "forest_all.pkl")

['forest_all.pkl']