# Config

In [1]:
# Single seed reused for NumPy, the ExtraTrees model and the KFold splitter.
RAND = 4567

# Input data locations (relative paths under ../input).
train_data_path = "../input/re-newmachinehack/ReNew_Participants_Data/train.csv"
test_data_path = "../input/re-newmachinehack/ReNew_Participants_Data/test.csv"
# Timestamps for the test rows are stored in a separate file and attached after loading.
test_timestamp_path = '../input/renew-test-timestamp/test_timestamp.csv'

# Number of KFold splits used during cross-validation.
N_FOLDS =  10

# Run identifier: used as the wandb run name and as the prefix of the saved prediction CSVs.
model_name = "extratree-55-T1"
# Free-text description, printed when the run starts.
desc = 'extra tree 55 on train_T1'

# Columns routed through the target encoder; every other column is passed through unchanged.
cat_cols = ['turbine_id', 'mmtt']

# Weights & Biases project that runs are logged to.
project = "renew-machinehack"

In [2]:
from sklearn.metrics import mean_absolute_percentage_error as mape
def comp_score(y_true, y_pred):
    """Return the competition metric (MAPE) for the given targets and predictions.

    ``y_true`` must be the ground truth and ``y_pred`` the model output; the
    arguments are forwarded to the metric in that order.
    """
    score = mape(y_true, y_pred)
    return score

In [3]:
from kaggle_secrets import UserSecretsClient
# Read the Weights & Biases API key from the secret store (secret named
# "wandb key") so the key is never hardcoded in the notebook.
user_secrets = UserSecretsClient()
wandb_login = user_secrets.get_secret("wandb key")

# Engine

In [4]:
from catboost import CatBoostRegressor
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [5]:
def get_model():
    """Build a fresh, unfitted modelling pipeline.

    The pipeline target-encodes the columns listed in ``cat_cols``, passes all
    remaining columns through untouched, and feeds the result to a 55-tree
    ExtraTrees regressor seeded with ``RAND``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Column transformer followed by the regressor.
    """
    encode_categoricals = make_column_transformer(
        (TargetEncoder(), cat_cols),
        remainder='passthrough',
        n_jobs=-1,
    )
    forest = ExtraTreesRegressor(
        n_estimators=55,
        random_state=RAND,
        verbose=1,
        n_jobs=-1,
    )
    return make_pipeline(encode_categoricals, forest)

# Train and Eval

In [6]:
import warnings
# Silence all library warnings for the rest of the session (keeps fold logs readable).
warnings.filterwarnings('ignore')

import numpy as np
import argparse
import wandb
from joblib import dump
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import time
# Seed NumPy's global RNG with the shared seed.
np.random.seed(RAND)

# Authenticate with Weights & Biases using the key fetched from the secret store.
wandb.login(key = wandb_login)

# Echo the run identity so it appears in the cell output.
print(model_name)
print(desc)

# Load data
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
test_timestamp = pd.read_csv(test_timestamp_path)
# Attach the separately stored timestamps to the test rows.
# NOTE(review): this assigns a whole DataFrame to one column, which presumably
# relies on test_timestamp having exactly one column and the same row order as
# test_data - confirm.
test_data['timestamp'] = test_timestamp

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


extratree-55-T1
extra tree 55 on train_T1


In [7]:
# Feature engineering
# Add calendar features and the month/turbine interaction key
def add_features(df):
    """Add calendar features and the month/turbine key to ``df`` in place.

    The ``timestamp`` column is converted to datetime, then ``month``, ``hour``,
    ``dayofweek`` and ``dayofmonth`` are derived from it. ``mmtt`` joins the
    month (as text) and ``turbine_id`` with an underscore.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``timestamp`` and ``turbine_id`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns appended.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    stamp = df['timestamp'].dt

    # Column order matters downstream, so the features are added in a fixed sequence.
    calendar_parts = (
        ('month', stamp.month),
        ('hour', stamp.hour),
        ('dayofweek', stamp.dayofweek),
        ('dayofmonth', stamp.day),
    )
    for column, values in calendar_parts:
        df[column] = values

    month_label = df['month'].astype(str)
    df['mmtt'] = month_label + '_' + df['turbine_id']
    return df

In [8]:
# Apply the same feature engineering to both splits so the model sees identical columns.
train_data = add_features(train_data)
test_data = add_features(test_data)

In [9]:
def train_and_eval(X_train, y_train, X_val, y_val):
    """Fit a fresh pipeline on one fold and score it on train and validation data.

    Parameters
    ----------
    X_train, y_train
        Features and targets used to fit the model.
    X_val, y_val
        Features and targets used for validation.

    Returns
    -------
    tuple
        ``(model, train_score, val_score)``: the fitted pipeline from
        ``get_model()`` and the MAPE on the training and validation data.
    """
    print('Training Model...')
    model = get_model()
    model.fit(X_train, y_train)
    # MAPE scales each error by the true value, so it is not symmetric: the
    # ground truth must come first, exactly as for the validation score below.
    train_score = comp_score(y_train, model.predict(X_train))
    print("Training MAPE: ", train_score)

    print('Validating Model..')
    preds = model.predict(X_val)
    val_score = comp_score(y_val, preds)
    print("Validation MAPE: ", val_score)
    # Take the root explicitly: the ``squared=False`` flag of
    # mean_squared_error is deprecated and absent from newer scikit-learn.
    print("validation rmse: ", mean_squared_error(y_val, preds) ** 0.5)

    return model, train_score, val_score


def __cross_validate(holdout=False, cv_predict=False, wandb_track=True):
    """Run K-fold cross-validation over ``train_data`` and report the mean MAPE.

    Parameters
    ----------
    holdout : bool
        When true, stop after the first fold (single hold-out evaluation).
    cv_predict : bool
        When true, collect out-of-fold predictions for the training rows and
        per-fold predictions for ``test_data``, then save both as
        ``{model_name}_train_cv.csv`` and ``{model_name}_test_cv.csv``.
    wandb_track : bool
        When true, log the fold index and the train/validation scores to the
        active wandb run after every fold.

    Notes
    -----
    Test predictions are averaged over the folds that actually ran. Training
    rows that never fell in a validation fold (only possible with
    ``holdout=True``) are written as NaN instead of a misleading ``0.0``.
    """
    cv_scores = []

    drop_cols = ['timestamp', 'Target']

    if cv_predict:
        cvpreds_test = np.zeros(shape=(len(test_data), N_FOLDS))
        # NaN marks rows for which no out-of-fold prediction was produced.
        cvpreds_train = np.full(len(train_data), np.nan)
        # The test feature frame is the same for every fold; build it once.
        test_features = test_data.drop(['timestamp'], axis=1)

    folds_run = 0
    kf = KFold(n_splits=N_FOLDS, random_state=RAND, shuffle=True)
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_data)):
        print(f'Fold : {fold}')

        train_fold = train_data.iloc[train_idx]
        val_fold = train_data.iloc[val_idx]

        X_train, y_train = train_fold.drop(
            drop_cols, axis=1), train_fold.Target
        X_val, y_val = val_fold.drop(
            drop_cols, axis=1), val_fold.Target

        start = time.time()
        model, train_score, val_score = train_and_eval(
            X_train, y_train, X_val, y_val)
        end = time.time()
        print(f'Time taken: {end - start}')

        if wandb_track:
            # wandb tracking
            wandb.log({
                'fold': fold,
                'Train_score': train_score,
                'Val_score': val_score
            })

        cv_scores.append(val_score)

        if cv_predict:
            # save predictions for ensembling
            cvpreds_test[:, fold] = model.predict(test_features)
            cvpreds_train[val_idx] = model.predict(X_val)

        folds_run += 1
        print('----------------------------------------------------------')

        if holdout:
            break

    if cv_predict:
        print('Saving cross validated predictions...')
        # Average only the columns that were filled; in holdout mode the
        # remaining columns are still zero and would shrink the mean.
        test_cv = pd.DataFrame(
            cvpreds_test[:, :folds_run].mean(axis=1), columns=['Target'])
        train_cv = pd.DataFrame(cvpreds_train, columns=['Target'])
        print('Test shape: ', test_cv.shape)
        print('Train shape: ', train_cv.shape)
        test_cv.to_csv(f"{model_name}_test_cv.csv", index=False)
        train_cv.to_csv(f"{model_name}_train_cv.csv", index=False)

    print("AVG mape :", np.array(cv_scores).mean())


def cross_validate(holdout=False, wandb_track=True, cv_predict=False):
    """Entry point for cross-validation, optionally wrapped in a wandb run.

    With ``wandb_track`` enabled, a wandb run named ``model_name`` is opened in
    ``project`` for the duration of the cross-validation; otherwise the
    cross-validation runs without one. All flags are forwarded unchanged.
    """
    def _run():
        __cross_validate(holdout, wandb_track=wandb_track, cv_predict=cv_predict)

    if not wandb_track:
        _run()
        return

    # wandb tracking: the run is closed when the context manager exits.
    with wandb.init(project=project, name=model_name):
        _run()

In [10]:
# Run the full N_FOLDS cross-validation, logging each fold to wandb and saving
# the out-of-fold and test predictions to CSV.
cross_validate(wandb_track=True, cv_predict=True)

[34m[1mwandb[0m: Currently logged in as: [33mk_loki[0m. Use [1m`wandb login --relogin`[0m to force relogin


Fold : 0
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.5min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.7s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   14.8s finished


Training MAPE:  3.135911132599308e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.5s finished


Validation MAPE:  0.002423819893504584
validation rmse:  0.1944474449659575
Time taken: 174.44701600074768


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    6.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


----------------------------------------------------------
Fold : 1
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   14.3s finished


Training MAPE:  3.412094952198129e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


Validation MAPE:  0.002447744479722155
validation rmse:  0.20056389505502256
Time taken: 164.03512334823608


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    5.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.9s finished


----------------------------------------------------------
Fold : 2
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.6min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.8s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   14.9s finished


Training MAPE:  3.0436494416732277e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


Validation MAPE:  0.0024445720894162646
validation rmse:  0.19896606288833127
Time taken: 175.65037536621094


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    5.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


----------------------------------------------------------
Fold : 3
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.4s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   14.5s finished


Training MAPE:  2.840027515019466e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


Validation MAPE:  0.002413202706361808
validation rmse:  0.1941674635455659
Time taken: 159.79339265823364


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    5.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.9s finished


----------------------------------------------------------
Fold : 4
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   14.7s finished


Training MAPE:  3.418917748344041e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


Validation MAPE:  0.0024480042280128483
validation rmse:  0.19957036609817994
Time taken: 163.80022764205933


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    5.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.5s finished


----------------------------------------------------------
Fold : 5
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.6s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   14.5s finished


Training MAPE:  2.642531205732451e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.5s finished


Validation MAPE:  0.0024381773510963075
validation rmse:  0.2008137457143468
Time taken: 163.0806794166565


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    5.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


----------------------------------------------------------
Fold : 6
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   14.2s finished


Training MAPE:  2.949412116060336e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


Validation MAPE:  0.0024196932005452155
validation rmse:  0.1958916653231356
Time taken: 152.71819925308228


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.0s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    5.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.5s finished


----------------------------------------------------------
Fold : 7
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   14.5s finished


Training MAPE:  3.014438598036152e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


Validation MAPE:  0.002454515274250163
validation rmse:  0.2013057037973251
Time taken: 158.4831988811493


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    5.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


----------------------------------------------------------
Fold : 8
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.1s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   15.2s finished


Training MAPE:  3.123916767543552e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    2.0s finished


Validation MAPE:  0.00244867444209496
validation rmse:  0.2074952071093046
Time taken: 163.94157767295837


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    5.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


----------------------------------------------------------
Fold : 9
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.6s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   20.4s finished


Training MAPE:  3.1215220925045277e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.7s finished


Validation MAPE:  0.0024475968712442953
validation rmse:  0.20749366788643364
Time taken: 173.27117133140564


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    5.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    1.6s finished


----------------------------------------------------------
Saving cross validated predictions...
Test shape:  (303202, 1)
Train shape:  (909604, 1)
AVG mape : 0.00243860005362486


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_score,▅█▅▃█▁▄▄▅▅
Val_score,▃▇▆▁▇▅▂█▇▇
fold,▁▂▃▃▄▅▆▆▇█

0,1
Train_score,0.0
Val_score,0.00245
fold,9.0
