# Config

In [1]:
# Single seed reused for NumPy, the KFold splitter and the regressor.
RAND = 4567

# data: both CSVs live in the same competition directory
DATA_DIR = "../input/re-newmachinehack/ReNew_Participants_Data"
train_data_path = f"{DATA_DIR}/train.csv"
test_data_path = f"{DATA_DIR}/test.csv"

# Number of cross-validation splits.
N_FOLDS = 10

# Run identity: model_name is used as the W&B run name and as the prefix of
# the saved prediction CSVs; desc is only printed.
model_name = "extratree reg 50"
desc = "base line extra tree regressor"

# Columns that get target-encoded; every other column is passed through.
cat_cols = ["turbine_id"]

# for wandb
project = "renew-machinehack"

In [2]:
from sklearn.metrics import mean_absolute_percentage_error as mape
def comp_score(y_true, y_pred):
    """Competition metric: mean absolute percentage error of ``y_pred`` against ``y_true``.

    The argument order matters: ground truth first, predictions second.
    """
    score = mape(y_true, y_pred)
    return score

In [3]:
# Read the W&B API key from the notebook's Kaggle user secrets instead of
# hardcoding it in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# The secret is stored under the label "wandb key"; the value is passed to
# wandb.login() in the training section.
wandb_login = user_secrets.get_secret("wandb key")

# Engine

In [4]:
from catboost import CatBoostRegressor
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [5]:
def get_model():
    """Build a fresh, unfitted modelling pipeline.

    The pipeline target-encodes the columns listed in ``cat_cols``, passes
    every other column through unchanged, and feeds the result to a 50-tree
    ExtraTreesRegressor seeded with ``RAND``.

    Returns
    -------
    sklearn Pipeline
        ``ColumnTransformer`` followed by ``ExtraTreesRegressor``.
    """
    encoder = TargetEncoder()
    preprocess = make_column_transformer(
        (encoder, cat_cols),
        remainder='passthrough',
        n_jobs=-1,
    )

    regressor = ExtraTreesRegressor(
        n_estimators=50,
        random_state=RAND,
        verbose=1,
        n_jobs=-1,
    )

    return make_pipeline(preprocess, regressor)

# Train and Eval

In [6]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import argparse
import wandb
from joblib import dump
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import time
# Seed NumPy's global RNG with the shared seed from the config cell.
np.random.seed(RAND)

# Authenticate with Weights & Biases using the key read from the notebook secrets.
wandb.login(key = wandb_login)

# Echo which run (name and description) is about to be executed
print(model_name)
print(desc)

# Load data
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


extratree reg 50
base line extra tree regressor


In [7]:
def train_and_eval(X_train, y_train, X_val, y_val):
    """Fit a fresh pipeline on one fold and score it on train and validation data.

    Parameters
    ----------
    X_train, y_train
        Features and target the model from ``get_model()`` is fitted on.
    X_val, y_val
        Held-out features and target used for the validation metrics.

    Returns
    -------
    tuple
        ``(model, train_score, val_score)``: the fitted model and the
        ``comp_score`` values on the training and validation data.
    """
    print('Training Model...')
    model = get_model()
    model.fit(X_train, y_train)
    # comp_score takes (y_true, y_pred). The percentage error is normalised by
    # the first argument, so the ground truth must come first.
    train_score = comp_score(y_train, model.predict(X_train))
    print("Training MAPE: ", train_score)

    print('Validating Model..')
    preds = model.predict(X_val)
    val_score = comp_score(y_val, preds)
    print("Validation MAPE: ", val_score)
    # Take the root explicitly instead of passing squared=False, which newer
    # scikit-learn releases no longer accept.
    print("validation rmse: ", np.sqrt(mean_squared_error(y_val, preds)))

    return model, train_score, val_score


def __cross_validate(holdout=False, cv_predict=False, wandb_track=True):
    """Run shuffled K-fold cross-validation on the module-level ``train_data``.

    For every fold the ``timestamp`` and ``Target`` columns are dropped from
    the features, a new model is fitted through ``train_and_eval`` and the
    validation score is collected. The mean validation score is printed at
    the end.

    Parameters
    ----------
    holdout : bool
        When true, stop after the first fold (single hold-out split).
    cv_predict : bool
        When true, collect out-of-fold predictions for ``train_data`` and
        fold-averaged predictions for ``test_data`` and write them to
        ``{model_name}_train_cv.csv`` and ``{model_name}_test_cv.csv``.
        With ``holdout=True`` only the rows of the evaluated fold receive an
        out-of-fold prediction; all other rows stay 0.
    wandb_track : bool
        When true, log the fold index and both scores with ``wandb.log``.
        ``cross_validate`` opens the W&B run before calling this function.

    Returns
    -------
    None
    """
    cv_scores = []

    drop_cols = ['timestamp', 'Target']

    if cv_predict:
        cvpreds_test = np.zeros(shape=(len(test_data), N_FOLDS))
        cvpreds_train = np.zeros(shape=(len(train_data)))
        # Score the test set on the same feature columns used for fitting.
        # errors='ignore' because the test file may lack some of drop_cols.
        X_test = test_data.drop(columns=drop_cols, errors='ignore')

    kf = KFold(n_splits=N_FOLDS, random_state=RAND, shuffle=True)
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_data)):
        print(f'Fold : {fold}')

        train_fold = train_data.iloc[train_idx]
        val_fold = train_data.iloc[val_idx]

        X_train, y_train = train_fold.drop(drop_cols, axis=1), train_fold.Target
        X_val, y_val = val_fold.drop(drop_cols, axis=1), val_fold.Target

        start = time.time()
        model, train_score, val_score = train_and_eval(
            X_train, y_train, X_val, y_val)
        end = time.time()
        print(f'Time taken: {end - start}')

        if wandb_track:
            # wandb tracking
            wandb.log({
                'fold': fold,
                'Train_score': train_score,
                'Val_score': val_score
            })

        cv_scores.append(val_score)

        if cv_predict:
            # save predictions for ensembling
            cvpreds_test[:, fold] = model.predict(X_test)
            cvpreds_train[val_idx] = model.predict(X_val)

        print('----------------------------------------------------------')

        if holdout:
            break

    if cv_predict:
        print('Saving cross validated predictions...')
        # Average only the folds that actually ran. With holdout=True the
        # remaining columns are still zero and would shrink the prediction
        # by a factor of N_FOLDS.
        folds_run = len(cv_scores)
        test_cv = pd.DataFrame(
            cvpreds_test[:, :folds_run].mean(axis=1), columns=['Target'])
        train_cv = pd.DataFrame(cvpreds_train, columns=['Target'])
        print('Test shape: ', test_cv.shape)
        print('Train shape: ', train_cv.shape)
        test_cv.to_csv(f"{model_name}_test_cv.csv", index=False)
        train_cv.to_csv(f"{model_name}_train_cv.csv", index=False)

    print("AVG mape :", np.array(cv_scores).mean())


def cross_validate(holdout=False, wandb_track=True, cv_predict=False):
    """Entry point for cross-validation, optionally wrapped in a W&B run.

    Parameters
    ----------
    holdout : bool
        Forwarded to ``__cross_validate``; true stops after the first fold.
    wandb_track : bool
        When true, the whole run executes inside
        ``wandb.init(project=project, name=model_name)`` and per-fold scores
        are logged; when false, W&B is not touched.
    cv_predict : bool
        Forwarded to ``__cross_validate``; true saves the fold predictions.
    """
    if not wandb_track:
        __cross_validate(holdout, wandb_track=wandb_track, cv_predict=cv_predict)
        return

    # wandb tracking: the context manager closes the run when the CV finishes
    with wandb.init(project=project, name=model_name):
        __cross_validate(holdout, wandb_track=wandb_track, cv_predict=cv_predict)

In [8]:
# Run every fold (holdout defaults to False), log per-fold scores to W&B and
# save the out-of-fold / test predictions to CSV.
cross_validate(wandb_track=True, cv_predict=True)

[34m[1mwandb[0m: Currently logged in as: [33mk_loki[0m. Use [1m`wandb login --relogin`[0m to force relogin


Fold : 0
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   18.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


Training MAPE:  1.4582848459893644e-13
Validating Model..


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.9s finished


Validation MAPE:  0.012707009765460806
validation rmse:  0.9817101964022367
Time taken: 141.77010893821716


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    7.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.1s finished


----------------------------------------------------------
Fold : 1
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   22.1s finished


Training MAPE:  1.5997371290961593e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.3s finished


Validation MAPE:  0.012738509851205821
validation rmse:  0.9846533652908813
Time taken: 139.15156435966492


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.9s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    8.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.4s finished


----------------------------------------------------------
Fold : 2
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.1min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   17.3s finished


Training MAPE:  1.4969872009679996e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.0s finished


Validation MAPE:  0.012772263109305262
validation rmse:  0.9822109147756151
Time taken: 150.24044013023376


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    6.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.9s finished


----------------------------------------------------------
Fold : 3
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.2min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   18.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   21.6s finished


Training MAPE:  1.371174016672583e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.7s finished


Validation MAPE:  0.01278818259631395
validation rmse:  0.9953380665132532
Time taken: 162.0699338912964


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.5s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    7.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.1s finished


----------------------------------------------------------
Fold : 4
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.1min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   17.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


Training MAPE:  1.3646568392650719e-13
Validating Model..


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.0s finished


Validation MAPE:  0.012698884846675421
validation rmse:  0.9780555035151091
Time taken: 146.2797245979309


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.9s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    7.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.9s finished


----------------------------------------------------------
Fold : 5
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   17.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   20.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


Training MAPE:  1.2263045558324882e-13
Validating Model..


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.2s finished


Validation MAPE:  0.012826329893370642
validation rmse:  0.9972826929842272
Time taken: 145.36818647384644


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    7.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.1s finished


----------------------------------------------------------
Fold : 6
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   17.8s finished


Training MAPE:  1.4612774716937908e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.8s finished


Validation MAPE:  0.012703322684202533
validation rmse:  0.9866302911894511
Time taken: 142.4574773311615


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    6.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.7s finished


----------------------------------------------------------
Fold : 7
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   17.0s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   19.6s finished


Training MAPE:  1.4211690705478046e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.0s finished


Validation MAPE:  0.01274607237580086
validation rmse:  0.9829838392264535
Time taken: 143.17980933189392


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.5s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    6.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.9s finished


----------------------------------------------------------
Fold : 8
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.4s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   16.9s finished


Training MAPE:  1.5513043320466953e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.8s finished


Validation MAPE:  0.012712809046239771
validation rmse:  0.9763526442743194
Time taken: 138.36032629013062


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    5.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.8s finished


----------------------------------------------------------
Fold : 9
Training Model...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   18.4s finished


Training MAPE:  1.763849305894882e-13
Validating Model..


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.0s finished


Validation MAPE:  0.01284016403015586
validation rmse:  0.9954205803888113
Time taken: 142.53774571418762


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.9s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    7.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    2.0s finished


----------------------------------------------------------
Saving cross validated predictions...
Test shape:  (303202, 1)
Train shape:  (909604, 1)
AVG mape : 0.012753354819873091


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Train_score,▄▆▅▃▃▁▄▄▅█
Val_score,▁▃▅▅▁▇▁▃▂█
fold,▁▂▃▃▄▅▆▆▇█

0,1
Train_score,0.0
Val_score,0.01284
fold,9.0
