# Config

In [1]:
# Global random seed; reused for NumPy's global RNG and for the KFold shuffle.
RAND = 4567

# data
# Locations of the train/test CSV files read by the data-loading cell.
train_data_path = "../input/re-newmachinehack/ReNew_Participants_Data/train.csv"
test_data_path = "../input/re-newmachinehack/ReNew_Participants_Data/test.csv"

# Number of cross-validation folds (also the number of per-fold test predictions).
N_FOLDS =  10

# Run identifiers; model_name and desc are echoed when the data is loaded.
model_name = "lr-rel_fts_v1"
# In the visible notebook this flag is only read by commented-out code.
save_models = False
desc = 'lr'

# Columns routed through the target encoder inside get_model().
cat_cols = ['turbine_id']

# for wandb
# Project name; in the visible notebook it is only referenced by commented-out code.
project = "renew-machinehack"

In [2]:
from sklearn.metrics import mean_absolute_percentage_error as mape
def comp_score(y_true, y_pred):
    """Score predictions with mean absolute percentage error (MAPE).

    Thin wrapper around scikit-learn's ``mean_absolute_percentage_error``
    (imported as ``mape``).

    Args:
        y_true: ground-truth target values.
        y_pred: predicted target values, aligned with ``y_true``.

    Returns:
        Whatever ``mape(y_true, y_pred)`` returns for these inputs.
    """
    score = mape(y_true, y_pred)
    return score

In [3]:
from kaggle_secrets import UserSecretsClient
# Read the Weights & Biases API key from the secret stored under the label
# "wandb key", so the key itself never appears in the notebook source.
user_secrets = UserSecretsClient()
wandb_login = user_secrets.get_secret("wandb key")

# Engine

In [4]:
from catboost import CatBoostRegressor
from category_encoders.target_encoder import TargetEncoder
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression

In [5]:
def get_model():
    """Build a fresh, unfitted modelling pipeline.

    Steps, in order:
      1. ``ColumnTransformer`` - target-encodes the columns listed in
         ``cat_cols`` and passes every other column through untouched.
      2. ``StandardScaler`` - standardises the transformer's output.
      3. ``LinearRegression`` - the final estimator.

    Earlier experiments swapped the estimator for ``ExtraTreesRegressor`` or
    ``XGBRegressor`` and considered ``PolynomialFeatures`` / ``PCA`` steps;
    none of those are active here.

    Returns:
        The pipeline produced by ``make_pipeline`` (not yet fitted).
    """
    encode_categoricals = make_column_transformer(
        (TargetEncoder(), cat_cols),
        remainder='passthrough',
        n_jobs=-1,
    )
    return make_pipeline(
        encode_categoricals,
        StandardScaler(),
        LinearRegression(),
    )

# Train and Eval

In [6]:
import warnings
# Suppress every warning for the rest of the session (keeps cell output short,
# but also hides deprecation and convergence warnings).
warnings.filterwarnings('ignore')

import numpy as np
import argparse
import wandb
from joblib import dump
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import time
# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
np.random.seed(RAND)

# Authenticate with Weights & Biases using the key fetched from the secrets store.
wandb.login(key = wandb_login)

# Echo the run identifiers so the cell output records which configuration ran.
print(model_name)
print(desc)

# Load data
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


lr-rel_fts_v1
lr


In [7]:
# Numeric feature columns: every non-object column except the label.
# 'Target' is excluded by name instead of slicing off the last list element,
# so the result no longer depends on the label being the last numeric column
# in the CSV (a positional slice would silently drop a real feature otherwise).
numeric_cols = [
    col for col in train_data.columns
    if train_data[col].dtype != 'object' and col != 'Target'
]
numeric_cols

['active_power_calculated_by_converter',
 'active_power_raw',
 'ambient_temperature',
 'generator_speed',
 'generator_winding_temp_max',
 'grid_power10min_average',
 'nc1_inside_temp',
 'nacelle_temp',
 'reactice_power_calculated_by_converter',
 'reactive_power',
 'wind_direction_raw',
 'wind_speed_raw',
 'wind_speed_turbulence']

In [8]:
# Sanity check: how many numeric feature columns were selected.
len(numeric_cols)

13

In [9]:
! pip install feature_engine -q

[0m

In [10]:
# Swap every exact zero for a small positive constant. The 'div' relative
# features computed in a later cell divide by these columns, so this presumably
# guards against division by zero - confirm.
# NOTE(review): the replacement is applied to the whole frame, so a zero in any
# column (including the label in train_data) is affected as well.
train_data = train_data.mask(train_data.values == 0, 0.0001)
test_data = test_data.mask(test_data.values == 0, 0.0001)

In [11]:
# Generate relative features: combine each of the first six numeric columns
# with each of the remaining numeric columns via subtraction, multiplication
# and division.
from feature_engine.creation import RelativeFeatures

variable_cols = numeric_cols[:6]
reference_cols = numeric_cols[6:]
rf = RelativeFeatures(
    variables=variable_cols,
    reference=reference_cols,
    func=['sub', 'mul', 'div'],
)
# Fit on the training features only (timestamp and label removed), then apply
# the same transformation to the test set.
train_data_trans = rf.fit_transform(train_data.drop(columns=['timestamp', 'Target']))
test_data_trans = rf.transform(test_data)

In [12]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after each report.
train_data_trans.info()
test_data_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909604 entries, 0 to 909603
Columns: 140 entries, active_power_calculated_by_converter to grid_power10min_average_div_wind_speed_turbulence
dtypes: float64(139), object(1)
memory usage: 971.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303202 entries, 0 to 303201
Columns: 140 entries, active_power_calculated_by_converter to grid_power10min_average_div_wind_speed_turbulence
dtypes: float64(139), object(1)
memory usage: 323.9+ MB
None


In [13]:
# Confirm the label is absent from the engineered training features before it
# is re-attached. The column is spelled 'Target' (capital T); a lowercase
# lookup is always False and therefore checks nothing.
'Target' in train_data_trans.columns

False

In [14]:
# Put the label back onto the engineered training frame, then make the
# engineered frames the working train/test sets for the rest of the notebook.
# NOTE: this rebinds train_data/test_data, so the earlier feature-engineering
# cells cannot simply be re-run without reloading the CSVs first.
train_data_trans['Target'] = train_data['Target']
train_data, test_data = train_data_trans, test_data_trans

In [15]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after each report.
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 909604 entries, 0 to 909603
Columns: 141 entries, active_power_calculated_by_converter to Target
dtypes: float64(140), object(1)
memory usage: 978.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303202 entries, 0 to 303201
Columns: 140 entries, active_power_calculated_by_converter to grid_power10min_average_div_wind_speed_turbulence
dtypes: float64(139), object(1)
memory usage: 323.9+ MB
None


In [16]:
# Neither working frame should still carry the raw timestamp column.
('timestamp' in train_data.columns, 'timestamp' in test_data.columns)

(False, False)

In [17]:
# Expect (True, False): only the training frame carries the label column.
'Target' in train_data.columns, 'Target' in test_data.columns

(True, False)

In [18]:
# Prediction buffers filled by the cross-validation loop:
#   cvpreds_test  - one column of test-set predictions per fold
#   cvpreds_train - out-of-fold prediction for every training row
n_test_rows, n_train_rows = len(test_data), len(train_data)
cvpreds_test = np.zeros((n_test_rows, N_FOLDS))
cvpreds_train = np.zeros(n_train_rows)


In [19]:
model = get_model()
print(model)

# The feature/label split does not depend on the fold, so build it once instead
# of re-copying the whole training frame twice in every fold.
X_all = train_data.drop(columns='Target')
y_all = train_data['Target']

# Validation MAPE of each fold, kept so a summary can be reported at the end.
fold_scores = []

# K-fold cross-validation: each fold is fitted on the other folds, scored on
# its own held-out rows, and also used to predict the full test set.
kf = KFold(n_splits=N_FOLDS, random_state=RAND, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(kf.split(X_all)):
    print(f'Fold : {fold}')

    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_val, y_val = X_all.iloc[val_idx], y_all.iloc[val_idx]

    print('training,..')
    # The same pipeline object is re-fitted in every fold.
    model.fit(X_train, y_train)
    print('validating...')
    preds = model.predict(X_val)
    mape_score = comp_score(y_val, preds)
    fold_scores.append(mape_score)
    print('MAPE Score: ', mape_score)

    # test data predictions
    cvpreds_test[:, fold] = model.predict(test_data)
    # out-of-fold predictions for the rows held out in this fold
    cvpreds_train[val_idx] = preds
    print('--------------------------------------------------------------------')

# Summary across folds: mean of the per-fold scores, and the score of the
# stitched-together out-of-fold predictions over the whole training set.
print('Mean MAPE over folds: ', np.mean(fold_scores))
print('Out-of-fold MAPE: ', comp_score(y_all, cvpreds_train))


# ran in 1475 sec.

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(n_jobs=-1, remainder='passthrough',
                                   transformers=[('targetencoder',
                                                  TargetEncoder(),
                                                  ['turbine_id'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])
Fold : 0
training,..
validating...
MAPE Score:  0.025506144259611403
--------------------------------------------------------------------
Fold : 1
training,..
validating...
MAPE Score:  0.02564579501314742
--------------------------------------------------------------------
Fold : 2
training,..
validating...
MAPE Score:  0.03477561386859414
--------------------------------------------------------------------
Fold : 3
training,..
validating...
MAPE Score:  0.025848497293301947
--------------------------------------------------------------------
Fold : 4
training,..
val

In [20]:
# Persist predictions for later ensembling: the test prediction is the mean
# over the per-fold columns, the train file holds the out-of-fold predictions.
test_cv = pd.DataFrame({'Target': cvpreds_test.mean(axis=1)})
train_cv = pd.DataFrame({'Target': cvpreds_train})

test_cv.to_csv('test_lr_rel_fts_v1_oof.csv', index=False)
train_cv.to_csv('train_lr_rel_fts_v1_oof.csv', index=False)

In [21]:
# model[2].feature_importances_  # NOTE: only valid for tree-based estimators; LinearRegression exposes `coef_` instead

In [22]:

# def train_and_eval(model, X_train, y_train, X_val, y_val):
#     print('Training Model...')
#     model.fit(X_train, y_train)
#     train_score = comp_score(y_train, model.predict(X_train))  # (y_true, y_pred) order; MAPE is not symmetric
#     print("Training MAPE: ", train_score)

#     print('Validating Model..')
#     preds = model.predict(X_val)
#     val_score = comp_score(y_val, preds)
#     print("Validation MAPE: ", val_score)
#     print("validation rmse: ", mean_squared_error(y_val, preds, squared=False))

#     return model, train_score, val_score


# def __cross_validate(holdout=False, cv_predict=False, wandb_track=True):
#     cv_scores = []

#     drop_cols = ['Target']
#     model = get_model()

    
#     if cv_predict:
#         cvpreds_test = np.zeros(shape=(len(test_data), N_FOLDS))
#         cvpreds_train = np.zeros(shape=(len(train_data)))
    
#     kf = KFold(n_splits=N_FOLDS, random_state=RAND, shuffle=True)
#     for fold, (train_idx, val_idx) in enumerate(kf.split(train_data)):
#         print(f'Fold : {fold}')

#         train_fold = train_data.iloc[train_idx]
#         val_fold = train_data.iloc[val_idx]

#         X_train, y_train = train_fold.drop(
#             drop_cols, axis=1), train_fold.Target
#         X_val, y_val = val_fold.drop(
#             drop_cols, axis=1), val_fold.Target
        
#         start = time.time() 
#         model, train_score, val_score = train_and_eval(model,
#             X_train, y_train, X_val, y_val)
#         end = time.time()
#         print(f'Time taken: {end - start}')

#         if wandb_track:
#             # wandb tracking
#             wandb.log({
#                 'fold': fold,
#                 'Train_score': train_score,
#                 'Val_score': val_score 
#             })

#         cv_scores.append(val_score)

#         if cv_predict:
#             # save predictions for ensembling
#             cvpreds_test[:, fold] = model.predict(test_data)
#             cvpreds_train[val_idx] = model.predict(X_val)
            
#         print('----------------------------------------------------------')

#         # save_model
#         if save_models :
#             dump(model, model_name + '_' + str(fold))
#             print('Model saved')

#         if holdout == True:
#             break

#     if cv_predict:
#         print('Saving cross validated predictions...')
#         test_cv = pd.DataFrame(cvpreds_test.mean(axis=1), columns=['Target'])
#         train_cv = pd.DataFrame(cvpreds_train, columns=['Target'])
#         print('Test shape: ', test_cv.shape)
#         print('Train shape: ', train_cv.shape)
#         test_cv.to_csv(f"{model_name}_test_cv.csv", index=False)
#         train_cv.to_csv(f"{model_name}_train_cv.csv", index=False)

#     print("AVG mape :", np.array(cv_scores).mean())


# def cross_validate(holdout=False, wandb_track=True, cv_predict=False):
#     if wandb_track:
#         # wandb tracking
#         with wandb.init(project=project, name=model_name):
#             __cross_validate(holdout, wandb_track=wandb_track, cv_predict=cv_predict)
#     else:
#         __cross_validate(holdout, wandb_track=wandb_track, cv_predict=cv_predict)

In [23]:
# cross_validate(holdout=True, wandb_track=False, cv_predict=True)