In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [2]:
# Read both competition CSVs, using the `id` column as the row index
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        "../input/30-days-of-ml/train.csv",
        "../input/30-days-of-ml/test.csv",
    )
)

# Show the first rows of the training frame
train.head()

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,B,B,B,C,B,B,A,E,C,N,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
2,B,B,A,A,B,D,A,F,A,O,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
3,A,A,A,C,B,D,A,D,A,F,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
4,B,B,A,C,B,D,A,E,C,K,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
6,A,A,A,C,B,D,A,E,A,N,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training frame
train.describe()

Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
count,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0,300000.0
mean,0.527335,0.460926,0.490498,0.496689,0.491654,0.510526,0.467476,0.537119,0.498456,0.474872,0.474492,0.473216,0.494561,0.508273,8.241979
std,0.230599,0.214003,0.253346,0.219199,0.240074,0.228232,0.210331,0.21814,0.23992,0.218007,0.255949,0.222022,0.247292,0.22295,0.746555
min,-0.118039,-0.069309,-0.056104,0.130676,0.255908,0.045915,-0.224689,0.203763,-0.260275,0.117896,0.048732,0.052608,-0.074208,0.15105,0.140329
25%,0.405965,0.310494,0.300604,0.329783,0.284188,0.354141,0.342873,0.355825,0.332486,0.306874,0.276017,0.308151,0.289074,0.300669,7.742071
50%,0.497053,0.427903,0.502462,0.465026,0.39047,0.488865,0.429383,0.504661,0.439151,0.43462,0.459975,0.433812,0.422887,0.4724,8.191373
75%,0.66806,0.615113,0.647512,0.664451,0.696599,0.669625,0.573383,0.703441,0.606056,0.614333,0.691579,0.642057,0.714502,0.758447,8.728634
max,1.058443,0.887253,1.034704,1.03956,1.055424,1.067649,1.111552,1.032837,1.040229,0.982922,1.05596,1.071444,0.975035,0.905992,10.411992


In [4]:
# Split the training frame into the regression target and the predictors
features = train.drop(columns=['target'])
y = train['target']

# Group the column names by the substring they contain
categorical_cols = [name for name in train.columns if 'cat' in name]
continuous_cols = [name for name in train.columns if 'cont' in name]

# First rows of the predictor frame
features.head()

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,B,B,B,C,B,B,A,E,C,N,...,0.610706,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985
2,B,B,A,A,B,D,A,F,A,O,...,0.276853,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083
3,A,A,A,C,B,D,A,D,A,F,...,0.285074,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846
4,B,B,A,C,B,D,A,E,C,K,...,0.284667,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682
6,A,A,A,C,B,D,A,E,A,N,...,0.287595,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823


In [5]:
# Columns excluded from modelling: eight categorical and one continuous
to_remove = [
    'cat0', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat9',
    'cont1',
]

# X keeps every remaining predictor column
X = features.drop(columns=to_remove)
X.head()

Unnamed: 0_level_0,cat1,cat8,cont0,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,B,C,0.20147,0.669699,0.136278,0.610706,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985
2,B,A,0.743068,1.021605,0.365798,0.276853,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083
3,A,A,0.742708,-0.012673,0.576957,0.285074,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846
4,B,C,0.429551,0.577942,0.28061,0.284667,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682
6,A,A,1.058291,-0.052389,0.232407,0.287595,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823


In [6]:
# Hold out a validation set. No test_size is passed, so scikit-learn's default
# split ratio applies; random_state=42 makes the shuffle repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42)

In [7]:
# Route each kept column to its transformer. The column lists are built with
# order-preserving comprehensions rather than set differences: the iteration
# order of a set of strings can change between interpreter runs (hash
# randomisation), which would reorder the features handed to the model from
# one kernel restart to the next and make results hard to reproduce.
kept_continuous = [col for col in continuous_cols if col not in to_remove]
kept_categorical = [col for col in categorical_cols if col not in to_remove]

preprocessor = ColumnTransformer(
    transformers=[
        # Scale the continuous columns with MinMaxScaler
        ('continuous', MinMaxScaler(), kept_continuous),
        # Encode the categorical columns as ordinal codes
        ('categorical', OrdinalEncoder(), kept_categorical)
    ])

In [8]:
# XGBoost hyper-parameters, collected in a dict and unpacked into the
# constructor (same convention as the CatBoost parameters further down).
xgb_parameters = {
    'n_estimators': 10000,
    'learning_rate': 0.03,
    'subsample': 0.8,
    'colsample_bytree': 0.1,
    'max_depth': 3,
    'booster': 'gbtree',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'reg_lambda': 0.0009,
    'reg_alpha': 23,
    'random_state': 42,
}

model = XGBRegressor(**xgb_parameters)

In [9]:
#pipeline = Pipeline(
#    steps=[
#        ('preprocessor', preprocessor),
#        ('model', model)
#    ])

In [10]:
#pipeline.fit(X_train, y_train)

In [11]:
#mean_squared_error(y_train, pipeline.predict(X_train), squared=False)

In [12]:
#mean_squared_error(y_valid, pipeline.predict(X_valid), squared=False)

In [13]:
#pipeline = Pipeline(
#    steps=[
#        ('preprocessor', preprocessor),
#        ('model', model)
#    ])

#pipeline.fit(X, y)

In [14]:
#final_predictions1 = pd.DataFrame(zip(test.index, pipeline.predict(test.drop(to_remove, axis=1))), columns=['id', 'target'])

In [15]:
#lgbm_parameters_1 = {
#    'metric': 'RMSE',
#    'feature_pre_filter': False,
#    'reg_alpha': 0.497, 
#    'reg_lambda': 0.327, 
#    'num_leaves': 50, 
#    'learning_rate': 0.032,                      
#    'max_depth': 40,                     
#    'n_estimators': 4060, 
#    'min_child_weight': 0.0173,
#    'subsample': 0.949, 
#    'colsample_bytree': 0.532, 
#    'min_child_samples': 80
#}
#
#lgbm_model_1 = LGBMRegressor(**lgbm_parameters_1)
#pipeline1 = Pipeline(
#    steps=[
#        ('preprocessor', preprocessor),
#        ('model', lgbm_model_1)
#    ])
#pipeline1.fit(X_train, y_train)
#mean_squared_error(y_train, pipeline1.predict(X_train), squared=False)

In [16]:
#mean_squared_error(y_valid, pipeline1.predict(X_valid), squared=False)

In [17]:
#pipeline1.fit(X, y)
#final_predictions2 = pd.DataFrame(zip(test.index, pipeline1.predict(test.drop(to_remove, axis=1))), columns=['id', 'target'])

In [18]:
# CatBoost hyper-parameters for the regressor used in the final submission.
cat_parameters_1 = {
    'iterations': 1600,
    'learning_rate': 0.024,
    'l2_leaf_reg': 20,
    'random_strength': 1.5,
    'grow_policy': 'Depthwise',
    'leaf_estimation_method': 'Newton',
    'bootstrap_type': 'Bernoulli',
    'thread_count': 4,
    'verbose': False,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    # NOTE(review): 'od_type' configures the overfitting detector, but no
    # eval_set is passed to fit() below - confirm this setting has any effect.
    'od_type': 'Iter'
}

cat_model_1 = CatBoostRegressor(**cat_parameters_1)

# Preprocessing and model chained so the same transforms are applied at
# fit time and at predict time.
pipeline2 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', cat_model_1)
    ])
pipeline2.fit(X_train, y_train)

# Training RMSE. The square root is taken explicitly instead of passing
# squared=False, which recent scikit-learn releases deprecate and then remove.
np.sqrt(mean_squared_error(y_train, pipeline2.predict(X_train)))

0.6996803705175187

In [19]:
# Validation RMSE. The square root is taken explicitly instead of passing
# squared=False, which recent scikit-learn releases deprecate and then remove.
np.sqrt(mean_squared_error(y_valid, pipeline2.predict(X_valid)))

0.7196372790530492

In [20]:
# Refit the same pipeline on every labelled row before scoring the test set
pipeline2.fit(X, y)

# Apply the same column removal to the test frame, then predict
test_features = test.drop(columns=to_remove)
test_predictions = pipeline2.predict(test_features)

# Pair each test id with its prediction and write the submission file
final_predictions3 = pd.DataFrame(
    zip(test.index, test_predictions),
    columns=['id', 'target'],
)
final_predictions3.to_csv('submission.csv', index=False)

In [21]:
#preds = [final_predictions1, final_predictions2, final_predictions3]
#weights = [100., 50., 500.] 
#total_sum = sum(weights)

In [22]:
#sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
#sample_submission.target = 0.0

#for pred, weight in zip(preds, weights):
#    sample_submission.target += weight * pred / total_sum

#sample_submission.to_csv('submission.csv', index=False)