In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for visualization 
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from IPython.display import clear_output
from sklearn import preprocessing
from sklearn.model_selection import *
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor
from numpy import nan
from sklearn.metrics import mean_squared_error
import math # to use sqrt function
import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv


In [2]:
#Read the test file and train file into dataframes
X_full = pd.read_csv('/kaggle/input/30-days-of-ml/train.csv', index_col = 0) #Include target (y)
X = pd.read_csv('/kaggle/input/30-days-of-ml/train.csv', index_col = 0) #Target y dropped
X_test = pd.read_csv('/kaggle/input/30-days-of-ml/test.csv')
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [3]:
#Separate the target (dependent variable) into a different variable and drop it
#from the training dataset (in turn modifying the dataset)
y = X.target
X.drop('target',axis = 'columns', inplace=True)
X['kfold'] = -1

In [4]:
#Divide category variables and numeric variables for ML
category_var = [col for col in X.columns if X[col].dtype == 'object']
numeric_var = [col for col in X.columns if 'cont' in col]
useful_features = [col for col in X.columns if col not in ("id", "target", "kfold")]

#Count the cardinality of the category variables
low_cardinality_cols = [col for col in category_var if X[col].nunique() < 10]
high_cardinality_cols = list(set(category_var) - set(low_cardinality_cols))

In [5]:
Standardizer = preprocessing.StandardScaler()
# preprocessing transformer for categorical and numeric variables to be used in pipeline
dataPreProcessingTransformer = ColumnTransformer(transformers = [
    ('category',OrdinalEncoder(),category_var),
    ('numerical',Standardizer,numeric_var)
    ],remainder='passthrough') # no change to non-categorical variables, they will
                                # will be processed as is

In [6]:
#Models used: 3 different models (for 4th attempt with 5 kfolds)

# Change the model for different ML algorithm, replaced with the best tuned parameters
model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1.0, gamma=1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=10, missing=nan, monotone_constraints='()',
             n_estimators=600, n_jobs=4, num_parallel_tree=1,
             random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.6,
             validate_parameters=1, verbosity=None) 
# gpu_id=0, tree_method='gpu_hist', predictor='gpu_predictor'
                    #Using GPU
                    
        #Use GPU
#         'device':'gpu','gpu_platform_id': -1, 'gpu_device_id': -1,
model_2_params = {'early_stopping_rounds':5,
     'objective': 'rmse', 'subsample_for_bin': 923,  'min_split_gain': 0.8980555793561906, 
     'min_child_weight': 0.004449972204357289,  'n_estimators': 12398, 
     'reg_alpha': 0.7239349780460461,  'reg_lambda': 0.0011824098455801091, 
     'tree_learner': 'serial',  'application': 'regression_l2', 
     'bagging_freq': 3,  'bagging_fraction': 0.7064001568577296,  'feature_fraction': 0.26030506934030706, 
     'colsample_bytree': 0.5,  'subsample': 1.0,  'learning_rate': 0.008, 
     'max_depth': 100,  'num_leaves': 157,  'min_child_samples': 134, 
     'cat_smooth': 74, 'metric':'rmse', 'random_state': 0, 'verbose' : -100,
                  'silent': True
}
model2 = LGBMRegressor(**model_2_params)

model3 = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=500, 
                                   subsample=1.0, criterion='mse', min_samples_split=2,
                                   min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
                                   min_impurity_decrease=0.0, min_impurity_split=None, init=None,
                                   random_state=None, max_features=None, alpha=0.9, verbose=0,
                                   max_leaf_nodes=None, warm_start=False, validation_fraction=0.2,
                                   n_iter_no_change=10, tol=0.00001, ccp_alpha=0.0,
                                   )

# model3 = CatBoostRegressor(task_type="GPU",
#                            devices='0:1', learning_rate=0.1,depth=None,l2_leaf_reg=None,
#                         model_size_reg=None,rsm=None,loss_function='RMSE',
#                         verbose=False,
#                         max_depth=8,
#                         n_estimators=1000,
#                         early_stopping_rounds=10)

#Pipeline to run preprocessing above and regression model
myPipeLine = Pipeline(steps=[('preprocessor', dataPreProcessingTransformer),
                              ('model', model)
                             ])

myPipeLine2 = Pipeline(steps=[('preprocessor', dataPreProcessingTransformer),
                              ('model', model2)
                             ])
myPipeLine3 = Pipeline(steps=[('preprocessor', dataPreProcessingTransformer),
                              ('model', model3)
                             ])

processingPipeline = Pipeline([('preprocessor', dataPreProcessingTransformer)])

In [7]:
#Model 1: using XGBregressor
#Splitting into 4 different datasets: X_train, X_valid, y_train, y_valid
#Recreate 5 folds of data
kf5 = KFold(n_splits = 5, shuffle = False, random_state = 0)
final_test_predictions = []
final_valid_predictions = {}
scores = []

for fold, (train_index, valid_index) in enumerate(kf5.split(X, y), start=1):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index] 
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index] 
    #Take out the columns not used in the model (id, kfold, target)
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    valid_ids = X_valid.index.values.tolist()
    
    # Fitting the model
    myPipeLine.fit(X_train,y_train)

    # Prediction from X_valid
    preds_valid = myPipeLine.predict(X_valid)
    
    #Final predictions
    test_preds = myPipeLine.predict(X_test)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    
    #Scoring the model
    foldScore = mean_squared_error(y_valid, preds_valid, squared = False)
    scores.append(foldScore)
    #Using mean squared error to grade the algorithm, take the sqr root to find the RMSE
    print(f"RMSE for the {fold}th fold is {foldScore}") 
    
#Get the fold with the best score:
fold_scores = {i+1:scores[i] for i in range(len(scores))}
best_kfold = min(fold_scores, key=fold_scores.get)
print("the best kth fold is the " + str(best_kfold) +"th fold")

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient ='index').reset_index()
final_valid_predictions.columns = ['id','target_pred_1']
final_valid_predictions.to_csv('train_pred_1.csv', index = False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis = 1) 
sample_submission.columns = ['id','target_pred_1']
sample_submission.to_csv('test_pred_1.csv', index = False)

RMSE for the 1th fold is 0.7188257399902477
RMSE for the 2th fold is 0.7252054530101001
RMSE for the 3th fold is 0.7191736303613289
RMSE for the 4th fold is 0.7226923059933567
RMSE for the 5th fold is 0.7207345258113744
the best kth fold is the 1th fold


In [8]:
#Drop the id column so 2nd model's prediction can work
X_test.drop('id',axis ='columns',inplace=True)

In [9]:
#Model 2: using LGBM Regressor with parameters from Can

#Recreate 5 folds of data
kf5 = KFold(n_splits = 5, shuffle = False, random_state = 0)
final_test_predictions = []
final_valid_predictions = {}
scores = []

for fold, (train_index, valid_index) in enumerate(kf5.split(X, y), start=1):
    X_train_pre, y_train = X.iloc[train_index], y.iloc[train_index] 
    X_valid_pre, y_valid = X.iloc[valid_index], y.iloc[valid_index] 
    #Take out the columns not used in the model (id, kfold, target)
    X_train_pre = X_train_pre[useful_features]
    X_valid_pre = X_valid_pre[useful_features]
    
    valid_ids = X_valid_pre.index.values.tolist()
    X_train = processingPipeline.fit_transform(X_train_pre)
    X_valid = processingPipeline.transform(X_valid_pre)
    X_test_run = processingPipeline.transform(X_test)
    
    # Fitting the model
    model2.fit(X_train,y_train, eval_set=[(X_valid, y_valid)],verbose = False)

    # Prediction from X_valid
    preds_valid = model2.predict(X_valid)
    
    #Final predictions
    test_preds = model2.predict(X_test_run)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    
    #Scoring the model
    foldScore = mean_squared_error(y_valid, preds_valid, squared = False)
    scores.append(foldScore)
    #Using mean squared error to grade the algorithm, take the sqr root to find the RMSE
    print(f"RMSE for the {fold}th fold is {foldScore}") 
    
#Get the fold with the best score:
fold_scores = {i+1:scores[i] for i in range(len(scores))}
best_kfold = min(fold_scores, key=fold_scores.get)
print("the best kth fold is the " + str(best_kfold) +"th fold")

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient ='index').reset_index()
final_valid_predictions.columns = ['id','target_pred_2']
final_valid_predictions.to_csv('train_pred_2.csv', index = False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis = 1) 
sample_submission.columns = ['id','target_pred_2']
sample_submission.to_csv('test_pred_2.csv', index = False)

RMSE for the 1th fold is 0.7180624139997346
RMSE for the 2th fold is 0.7234292240147855
RMSE for the 3th fold is 0.7168600036018065
RMSE for the 4th fold is 0.7206809620398457
RMSE for the 5th fold is 0.7189565508387971
the best kth fold is the 3th fold


In [10]:
#Model 3a: using Gradient Boosting Regressor

#Recreate 5 folds of data
kf5 = KFold(n_splits = 5, shuffle = False, random_state = 0)
final_test_predictions = []
final_valid_predictions = {}
scores = []

for fold, (train_index, valid_index) in enumerate(kf5.split(X, y), start=1):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index] 
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index] 
    #Take out the columns not used in the model (id, kfold, target)
    X_train = X_train[useful_features]
    X_valid = X_valid[useful_features]
    
    valid_ids = X_valid.index.values.tolist()
    
    # Fitting the model
    myPipeLine3.fit(X_train,y_train)

    # Prediction from X_valid
    preds_valid = myPipeLine3.predict(X_valid)
    
    #Final predictions
    test_preds = myPipeLine3.predict(X_test)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    
    #Scoring the model
    foldScore = mean_squared_error(y_valid, preds_valid, squared = False)
    scores.append(foldScore)
    #Using mean squared error to grade the algorithm, take the sqr root to find the RMSE
    print(f"RMSE for the {fold}th fold is {foldScore}") 
    
#Get the fold with the best score:
fold_scores = {i+1:scores[i] for i in range(len(scores))}
best_kfold = min(fold_scores, key=fold_scores.get)
print("the best kth fold is the " + str(best_kfold) +"th fold")

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient ='index').reset_index()
final_valid_predictions.columns = ['id','target_pred_3']
final_valid_predictions.to_csv('train_pred_3.csv', index = False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis = 1) 
sample_submission.columns = ['id','target_pred_3']
sample_submission.to_csv('test_pred_3.csv', index = False)

RMSE for the 1th fold is 0.7201492949576149
RMSE for the 2th fold is 0.7262897786868145
RMSE for the 3th fold is 0.720424395509168
RMSE for the 4th fold is 0.7237476831484214
RMSE for the 5th fold is 0.7216764132313171
the best kth fold is the 1th fold


In [11]:
# #Model 3b: using Catboost regressor (old)

# #Recreate 5 folds of data
# kf5 = KFold(n_splits = 5, shuffle = False, random_state = 0)
# final_test_predictions = []
# final_valid_predictions = {}
# scores = []

# for fold, (train_index, valid_index) in enumerate(kf5.split(X, y), start=1):
#     X_train, y_train = X.iloc[train_index], y.iloc[train_index] 
#     X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index] 
#     #Take out the columns not used in the model (id, kfold, target)
#     X_train = X_train[useful_features]
#     X_valid = X_valid[useful_features]
    
#     valid_ids = X_valid.index.values.tolist()
    
#     # Fitting the model
#     myPipeLine3.fit(X_train,y_train)

#     # Prediction from X_valid
#     preds_valid = myPipeLine3.predict(X_valid)
    
#     #Final predictions
#     test_preds = myPipeLine3.predict(X_test)
#     final_test_predictions.append(test_preds)
#     final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    
#     #Scoring the model
#     foldScore = mean_squared_error(y_valid, preds_valid, squared = False)
#     scores.append(foldScore)
#     #Using mean squared error to grade the algorithm, take the sqr root to find the RMSE
#     print(f"RMSE for the {fold}th fold is {foldScore}") 
    
# #Get the fold with the best score:
# fold_scores = {i+1:scores[i] for i in range(len(scores))}
# best_kfold = min(fold_scores, key=fold_scores.get)
# print("the best kth fold is the " + str(best_kfold) +"th fold")

# final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient ='index').reset_index()
# final_valid_predictions.columns = ['id','target_pred_3']
# final_valid_predictions.to_csv('train_pred_3.csv', index = False)

# sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis = 1) 
# sample_submission.columns = ['id','target_pred_3']
# sample_submission.to_csv('test_pred_3.csv', index = False)

In [12]:
#Combining results of 3 models together
X = pd.read_csv('/kaggle/input/30-days-of-ml/train.csv', index_col = 0)
X_test_final = pd.read_csv('/kaggle/input/30-days-of-ml/test.csv')
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

df1 = pd.read_csv('train_pred_1.csv')
df2 = pd.read_csv('train_pred_2.csv')
df3 = pd.read_csv('train_pred_3.csv')

df_test1 = pd.read_csv('test_pred_1.csv')
df_test2 = pd.read_csv('test_pred_2.csv')
df_test3 = pd.read_csv('test_pred_3.csv')

X = X.merge(df1, on='id', how='left')
X = X.merge(df2, on='id', how='left')
X = X.merge(df3, on='id', how='left')

X_test_final = X_test_final.merge(df_test1, on ='id', how ='left')
X_test_final = X_test_final.merge(df_test2, on ='id', how ='left')
X_test_final = X_test_final.merge(df_test3, on ='id', how ='left')

X_test_final.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target_pred_1,target_pred_2,target_pred_3
0,0,B,B,B,C,B,B,A,E,E,...,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702,8.041658,8.041658,8.041658
1,5,A,B,A,C,B,C,A,E,C,...,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.69394,8.322353,8.322353,8.322353
2,15,B,A,A,A,B,B,A,E,D,...,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099,8.402033,8.402033,8.402033
3,16,B,B,A,C,B,D,A,E,A,...,0.644315,1.024017,0.39109,0.98834,0.411828,0.393585,0.461372,8.455472,8.455472,8.455472
4,17,B,B,A,C,B,C,A,E,C,...,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412,8.205237,8.205237,8.205237


In [13]:
#Model blending 
useful_preds_as_features = ['target_pred_1', 'target_pred_2','target_pred_3']
X_test_final = X_test_final[useful_preds_as_features]

#Recreate 5 folds of data
kf5 = KFold(n_splits = 5, shuffle = False, random_state = 0)
final_predictions = []
scores = []

for fold, (train_index, valid_index) in enumerate(kf5.split(X, y), start=1):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index] 
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index] 
    #Replace the columns with the results of the 3 previous models 
    X_train = X_train[useful_preds_as_features]
    X_valid = X_valid[useful_preds_as_features]
    
    #Using linear regression to get the weighted values of these
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Prediction from X_valid
    preds_valid = model.predict(X_valid)
    
    #Final predictions
    test_preds = model.predict(X_test_final)
    final_predictions.append(test_preds)
       
    #Scoring the model
    foldScore = mean_squared_error(y_valid, preds_valid, squared = False)
    scores.append(foldScore)
    #Using mean squared error to grade the algorithm, take the sqr root to find the RMSE
    print(f"RMSE for the {fold}th fold is {foldScore}") 
    
#Get the fold with the best score:
fold_scores = {i+1:scores[i] for i in range(len(scores))}
best_kfold = min(fold_scores, key=fold_scores.get)
print("the best kth fold is the " + str(best_kfold) +"th fold")
print('the average fold score is '+ str(sum(scores)/len(scores)))

RMSE for the 1th fold is 0.7170793215300751
RMSE for the 2th fold is 0.7228291918981864
RMSE for the 3th fold is 0.7163768897303818
RMSE for the 4th fold is 0.7199182961342376
RMSE for the 5th fold is 0.7182984816114351
the best kth fold is the 3th fold
the average fold score is 0.7189004361808632


In [14]:
# Run the code to save predictions in the format used for competition scoring
sample_submission.target = np.mean(np.column_stack(final_predictions), axis = 1) 
sample_submission.to_csv('5th_submission_model_blend_no_GPU.csv', index = False)