# Load data and Config

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from joblib import dump, load
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append(r'D:\Hacks\re_new')

In [2]:
RAND = 4567

N_FOLDS =  10

model_name = "Xgb"
save_models = False
desc = 'testing'

cat_cols = ['turbine_id']

# for wandb
project = "renew-machinehack"

In [3]:
from sklearn.metrics import mean_absolute_percentage_error as mape
def comp_score(y_true, y_pred):
    return mape(y_true, y_pred)

In [4]:
train_data = pd.read_csv('../data/raw/train.csv')
test_data = pd.read_csv('../data/raw/test.csv')
train_data.shape, test_data.shape

((909604, 16), (303202, 15))

In [5]:
org_cols = train_data.drop('timestamp', axis=1).columns
org_cols

Index(['active_power_calculated_by_converter', 'active_power_raw',
       'ambient_temperature', 'generator_speed', 'generator_winding_temp_max',
       'grid_power10min_average', 'nc1_inside_temp', 'nacelle_temp',
       'reactice_power_calculated_by_converter', 'reactive_power',
       'wind_direction_raw', 'wind_speed_raw', 'wind_speed_turbulence',
       'turbine_id', 'Target'],
      dtype='object')

# Engine

In [6]:
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Lasso

In [7]:
def get_model(model=None):
    # model = ExtraTreesRegressor(n_estimators=50, random_state=RAND, verbose=1, n_jobs=-1)
    # model = Lasso()
    # model = XGBRegressor()

    tme = TargetEncoder()
    # pf = PolynomialFeatures(degree=2)
    # pca = PCA(n_components=10, random_state=config['RAND'])
    scaler = StandardScaler()
    ct = make_column_transformer(
        (tme, cat_cols),
        remainder = 'passthrough',
        n_jobs= -1
    )
    model_pipe = make_pipeline(
            ct,
            scaler,
            model
        )
    return model_pipe

In [8]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import argparse
import wandb
from joblib import dump
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import time
np.random.seed(RAND)

# wandb.login(key = wandb_login)

# Load configuration
print(model_name)
print(desc)

Xgb
testing


# Ft. Engineering

In [9]:
numeric_cols = [col for col in train_data.columns if train_data[col].dtype != 'object']
numeric_cols = numeric_cols[:-1]
numeric_cols

['active_power_calculated_by_converter',
 'active_power_raw',
 'ambient_temperature',
 'generator_speed',
 'generator_winding_temp_max',
 'grid_power10min_average',
 'nc1_inside_temp',
 'nacelle_temp',
 'reactice_power_calculated_by_converter',
 'reactive_power',
 'wind_direction_raw',
 'wind_speed_raw',
 'wind_speed_turbulence']

In [10]:
train_data = train_data.where(train_data.values != 0, 0.0001)
test_data = test_data.where(test_data.values != 0, 0.0001)

In [11]:
# generate realtive features
from feature_engine.creation import RelativeFeatures
rf = RelativeFeatures(variables=numeric_cols, reference=numeric_cols, func=['add'])
train_data_trans = rf.fit_transform(train_data.drop(['timestamp','Target'], axis=1))
test_data_trans = rf.transform(test_data.drop('timestamp', axis=1))

In [12]:
print(train_data_trans.info())
print(test_data_trans.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909604 entries, 0 to 909603
Columns: 183 entries, active_power_calculated_by_converter to wind_speed_turbulence_add_wind_speed_turbulence
dtypes: float64(182), object(1)
memory usage: 1.2+ GB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303202 entries, 0 to 303201
Columns: 183 entries, active_power_calculated_by_converter to wind_speed_turbulence_add_wind_speed_turbulence
dtypes: float64(182), object(1)
memory usage: 423.3+ MB
None


In [13]:
train_data_trans['Target'] = train_data.Target
# train_data = train_data_trans
# test_data = test_data_trans

In [14]:
'Target' in train_data_trans.columns

True

# Ft. Sel

In [15]:
sample_data = train_data_trans.sample(frac=0.2, random_state=RAND)
sample_data.shape

(181921, 184)

In [16]:
from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(sample_data.drop('Target', axis=1), sample_data.Target, test_size=0.25, random_state=RAND)
x_tr.shape, y_tr.shape, x_val.shape, y_val.shape

((136440, 183), (136440,), (45481, 183), (45481,))

In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
lasso = get_model(LinearRegression())

print(lasso)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('targetencoder',
                                                  TargetEncoder(),
                                                  ['turbine_id'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])


In [19]:
%%time
lasso.fit(x_tr, y_tr)
preds = lasso.predict(x_val)
comp_score(y_val, preds)

CPU times: total: 18.1 s
Wall time: 10.9 s


0.026223388797549565

In [20]:
lasso[2].coef_

array([ 1.11044412e+00,  5.15012323e+10,  1.14622659e+10,  2.54820388e+11,
        2.35178976e+11,  4.25773332e+11, -1.56770991e+11, -1.10978792e+11,
       -1.20219635e+12,  6.39907260e+10,  3.03791084e+10,  3.00071756e+11,
       -7.87190671e+10,  1.45971193e+10, -1.42834237e+09,  5.57176089e+09,
       -5.76479102e+08, -2.24600664e+10, -6.82024635e+09, -1.03105222e+10,
        6.87254164e+09,  8.01146933e+09,  1.17461007e+10,  4.78198267e+09,
        1.22195733e+10, -1.13550990e+10, -1.45003006e+09,  5.57176089e+09,
        6.18771654e+09,  1.13118858e+10,  3.74945924e+10,  1.61974343e+10,
       -1.16795890e+10,  1.84361642e+10,  3.70432132e+10,  3.90453604e+10,
        7.15447171e+09, -1.30448302e+11, -2.73342600e+10,  8.14655143e+09,
       -2.47596557e+10,  4.42613707e+09,  7.25393877e+09, -5.07109363e+11,
       -1.97269975e+11,  1.61446482e+10, -5.57519223e+10,  1.33197886e+11,
        4.19142493e+10,  2.72270888e+10,  1.04674081e+11, -7.86803837e+10,
        1.21988180e+10, -

In [21]:
lasso.feature_names_in_

array(['active_power_calculated_by_converter', 'active_power_raw',
       'ambient_temperature', 'generator_speed',
       'generator_winding_temp_max', 'grid_power10min_average',
       'nc1_inside_temp', 'nacelle_temp',
       'reactice_power_calculated_by_converter', 'reactive_power',
       'wind_direction_raw', 'wind_speed_raw', 'wind_speed_turbulence',
       'turbine_id',
       'active_power_calculated_by_converter_add_active_power_calculated_by_converter',
       'active_power_raw_add_active_power_calculated_by_converter',
       'ambient_temperature_add_active_power_calculated_by_converter',
       'generator_speed_add_active_power_calculated_by_converter',
       'generator_winding_temp_max_add_active_power_calculated_by_converter',
       'grid_power10min_average_add_active_power_calculated_by_converter',
       'nc1_inside_temp_add_active_power_calculated_by_converter',
       'nacelle_temp_add_active_power_calculated_by_converter',
       'reactice_power_calculated_by_con

In [22]:
wts  = lasso[2].coef_/sum(lasso[2].coef_)

In [23]:
all_fts = {}
for ft_name, coef in zip(lasso.feature_names_in_, wts):
    all_fts[ft_name] = abs(coef)
# all_fts

{'active_power_calculated_by_converter': 3.0370655171358856e-11,
 'active_power_raw': 1.4085591016938617,
 'ambient_temperature': 0.3134930602721683,
 'generator_speed': 6.969339598156212,
 'generator_winding_temp_max': 6.432146805668183,
 'grid_power10min_average': 11.644903914773515,
 'nc1_inside_temp': 4.287687807036323,
 'nacelle_temp': 3.0352708086299187,
 'reactice_power_calculated_by_converter': 32.88007957049923,
 'reactive_power': 1.7501468524250277,
 'wind_direction_raw': 0.8308688505036644,
 'wind_speed_raw': 8.206964855998883,
 'wind_speed_turbulence': 2.1529670937257626,
 'turbine_id': 0.3992313245933589,
 'active_power_calculated_by_converter_add_active_power_calculated_by_converter': 0.03906517497366189,
 'active_power_raw_add_active_power_calculated_by_converter': 0.15238770342866356,
 'ambient_temperature_add_active_power_calculated_by_converter': 0.01576670789543068,
 'generator_speed_add_active_power_calculated_by_converter': 0.61428298929733,
 'generator_winding_tem

In [24]:
all_fts = {k: v for k, v in sorted(all_fts.items(), key=lambda item: item[1], reverse=True)}

In [36]:
# all_fts

In [26]:
# list(all_fts.keys()).index('reactive_power_mul_reactice_power_calculated_by_converter')  # fts. with coef > 3

In [27]:
div_fts = []
for idx, ft in enumerate(list(all_fts.keys())):
    if idx == 10:
        break
    else:
        div_fts.append(ft)

In [28]:
d_fts = ['reactice_power_calculated_by_converter_div_generator_speed',
 'grid_power10min_average_div_generator_speed',
 'nacelle_temp_div_generator_speed',
 'active_power_calculated_by_converter_div_generator_speed',
 'nc1_inside_temp_div_generator_speed',
 'generator_winding_temp_max_div_generator_speed',
 'reactive_power_div_generator_speed',
 'active_power_raw_div_generator_speed',
 'wind_speed_raw_div_generator_speed',
 'generator_winding_temp_max_div_wind_speed_turbulence',
 'ambient_temperature_div_generator_speed',
 'nacelle_temp_div_wind_speed_turbulence',
 'wind_speed_turbulence_div_generator_speed',
 'generator_winding_temp_max_div_grid_power10min_average',
 'wind_speed_raw_div_wind_speed_turbulence',
 'nacelle_temp_div_grid_power10min_average',
 'reactive_power_div_grid_power10min_average',
 'ambient_temperature_div_wind_speed_turbulence',
 'reactice_power_calculated_by_converter_div_grid_power10min_average',
 'wind_direction_raw_div_generator_speed',
 'reactive_power_div_wind_speed_turbulence',
 'wind_direction_raw_div_wind_speed_turbulence',
 'nc1_inside_temp_div_wind_speed_turbulence',
 'ambient_temperature_div_grid_power10min_average',
 'nc1_inside_temp_div_ambient_temperature']


sub_fts = ['nc1_inside_temp',
 'grid_power10min_average',
 'active_power_calculated_by_converter_sub_grid_power10min_average',
 'reactice_power_calculated_by_converter_sub_grid_power10min_average',
 'grid_power10min_average_sub_reactice_power_calculated_by_converter',
 'reactice_power_calculated_by_converter',
 'active_power_raw_sub_grid_power10min_average',
 'grid_power10min_average_sub_active_power_raw',
 'ambient_temperature_sub_grid_power10min_average',
 'grid_power10min_average_sub_ambient_temperature']

In [29]:
div_fts

['reactice_power_calculated_by_converter',
 'generator_speed_add_generator_speed',
 'generator_speed_add_ambient_temperature',
 'ambient_temperature_add_generator_speed',
 'wind_speed_raw_add_nacelle_temp',
 'nacelle_temp_add_wind_speed_raw',
 'grid_power10min_average',
 'wind_direction_raw_add_wind_direction_raw',
 'generator_speed_add_wind_speed_turbulence',
 'wind_speed_turbulence_add_generator_speed']

In [30]:
final_fts = []
final_fts.extend(div_fts)
final_fts.extend(org_cols)
final_fts

['reactice_power_calculated_by_converter',
 'generator_speed_add_generator_speed',
 'generator_speed_add_ambient_temperature',
 'ambient_temperature_add_generator_speed',
 'wind_speed_raw_add_nacelle_temp',
 'nacelle_temp_add_wind_speed_raw',
 'grid_power10min_average',
 'wind_direction_raw_add_wind_direction_raw',
 'generator_speed_add_wind_speed_turbulence',
 'wind_speed_turbulence_add_generator_speed',
 'active_power_calculated_by_converter',
 'active_power_raw',
 'ambient_temperature',
 'generator_speed',
 'generator_winding_temp_max',
 'grid_power10min_average',
 'nc1_inside_temp',
 'nacelle_temp',
 'reactice_power_calculated_by_converter',
 'reactive_power',
 'wind_direction_raw',
 'wind_speed_raw',
 'wind_speed_turbulence',
 'turbine_id',
 'Target']

In [31]:
len(final_fts)

25

In [33]:
tr_, te_ = train_data_trans[final_fts], test_data_trans[final_fts[:-1]]
tr_.shape, te_.shape

((909604, 25), (303202, 24))

In [34]:
tr_.drop('Target', axis=1).columns.equals(te_.columns)

True

# Train and Eval

In [35]:
def train_and_eval(X_train, y_train, X_val, y_val):
    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)
    print('Training Model...')
    model = XGBRegressor(tree_method='gpu_hist', n_estimators=3000, random_state=RAND, n_jobs=-1, verbose=True)
    # model = RandomForestRegressor
    model = get_model(model)
    model.fit(X_train, y_train)
    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)
    train_score = comp_score(model.predict(X_train), y_train)
    print("Training MAPE: ", train_score)

    print('Validating Model..')
    preds = model.predict(X_val)
    val_score = comp_score(y_val, preds)
    print("Validation MAPE: ", val_score)
    print("validation rmse: ", mean_squared_error(y_val, preds, squared=False))

    return model, train_score, val_score


def __cross_validate(holdout=False, cv_predict=False, wandb_track=True):
    cv_scores = []

    drop_cols = ['Target']
    
    if cv_predict:
        cvpreds_test = np.zeros(shape=(len(te_), N_FOLDS))
        cvpreds_train = np.zeros(shape=(len(tr_)))
    
    kf = KFold(n_splits=N_FOLDS, random_state=RAND, shuffle=True)
    for fold, (train_idx, val_idx) in enumerate(kf.split(tr_)):
        print(f'Fold : {fold}')

        train_fold = tr_.iloc[train_idx]
        val_fold = tr_.iloc[val_idx]

        X_train, y_train = train_fold.drop(
            drop_cols, axis=1), train_fold.Target
        X_val, y_val = val_fold.drop(
            drop_cols, axis=1), val_fold.Target
        
        start = time.time() 
        model, train_score, val_score = train_and_eval(
            X_train, y_train, X_val, y_val)
        end = time.time()
        print(f'Time taken: {end - start}')

        if wandb_track:
            # wandb tracking
            wandb.log({
                'fold': fold,
                'Train_score': train_score,
                'Val_score': val_score 
            })

        cv_scores.append(val_score)

        if cv_predict:
            # save predictions for ensembling
            cvpreds_test[:, fold] = model.predict(te_, axis=1)
            cvpreds_train[val_idx] = model.predict(X_val)
            
        print('----------------------------------------------------------')

        # save_model
        if save_models :
            dump(model, model_name + '_' + str(fold))
            print('Model saved')

        if holdout == True:
            break

    if cv_predict:
        print('Saving cross validated predictions...')
        test_cv = pd.DataFrame(cvpreds_test.mean(axis=1), columns=['Target'])
        train_cv = pd.DataFrame(cvpreds_train, columns=['Target'])
        print('Test shape: ', test_cv.shape)
        print('Train shape: ', train_cv.shape)
        test_cv.to_csv(f"{model_name}_test_cv.csv", index=False)
        train_cv.to_csv(f"{model_name}_train_cv.csv", index=False)

    print("AVG mape :", np.array(cv_scores).mean())


def cross_validate(holdout=False, wandb_track=True, cv_predict=False):
    if wandb_track:
        # wandb tracking
        with wandb.init(project=project, name=model_name):
            __cross_validate(holdout, wandb_track=wandb_track, cv_predict=cv_predict)
    else:
        __cross_validate(holdout, wandb_track=wandb_track, cv_predict=cv_predict)

In [37]:
model_name = 'Xgb_3k_add_10'
cross_validate(holdout=True, wandb_track=True)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mk_loki[0m. Use [1m`wandb login --relogin`[0m to force relogin


Fold : 0
(818643, 24) (818643,) (90961, 24) (90961,)
Training Model...
Parameters: { "verbose" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


(818643, 24) (818643,) (90961, 24) (90961,)


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

ValueError: X has 28 features, but StandardScaler is expecting 24 features as input.

In [4]:
# avg mape for rel_fts_v1 
score = [0.01105228392097477, 0.011120003621454082, 0.011174543328309047, 0.011096411224224305, 0.01106832339962325, 0.01119513224812868, 0.01108963556414593, 0.01110345894554849, 0.011117898186473048, 0.011131487805745395]
sum(score)/len(score)

0.011114917824462698