In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# For one-hot encoding categorical variables
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

# from sklearn.model_selection import train_test_split We won't be needing this anymore!

# For the construction of the pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# For training the XGBoost model
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_20_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_50_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_15_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_30_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_5_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_25_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_10_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_12_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_60_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_40_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_3_folds.csv
/kaggle/input/train-folds-k-folds-30-days-of-ml/train_folds_6_folds.csv


### 1st Model, Optimized XGBoost, Standardization, no Target Encoding

In [2]:
# Load the training and test data. 
fold_num = 10
X_full = pd.read_csv("../input/train-folds-k-folds-30-days-of-ml/train_folds_"+ str(fold_num)+ "_folds.csv")
X_test_full = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")


In [3]:
# We select all features except "id", "target" and "kfold", as these are not predictors of our target.
useful_features = [c for c in X_full.columns if c not in ("id", "target", "kfold")]

# Select numerical columns by data type, not by column name
num_cols = [col for col in X_full[useful_features] if X_full[col].dtype in ['int64', 'float64']]

# We select categorical columns. Note that we dropped the cardinality check.
object_cols = [col for col in useful_features if 'cat' in col]

# We build X_test out of X_test_full, but only selecting the useful features.
X_test = X_test_full[useful_features]

# Preprocessing for numerical data
numerical_transformer = preprocessing.StandardScaler()

# Preprocessing for categorical data and one-hot encoding
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, num_cols),('cat', categorical_transformer, object_cols)])

# Define the model 
model = XGBRegressor(tree_method='gpu_hist',
                     gpu_id=0, 
                     predictor="gpu_predictor",
                     n_estimators = 25000,
                     learning_rate=0.011185700021155284,
                     reg_lambda=3.689489715666498e-07,
                     reg_alpha=1.219041306467414e-05,
                     subsample=0.5713235792096898,
                     colsample_bytree=0.449446046,
                     max_depth=3)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# We set up a list to store the final test and valid predictions.
final_test_predictions = []
final_valid_predictions = {}

# We set up a list for storing the mean non squared error scores.
scores = []

# We set the loop to loop across all of the folds.
for fold in range(fold_num):
    X_train = X_full[X_full.kfold != fold].reset_index(drop=True) # We set the training data to be all folds different from the current fold number in the loop. We also reset the indices.
    X_valid = X_full[X_full.kfold == fold].reset_index(drop=True) # The validation data is the current fold number in the loop. We also reset the indices.
    X_test_copy = X_test.copy() # We copy the original X_test to not alter or overwrite over it.
    
    valid_ids = X_valid.id.values.tolist()
    
    y_train = X_train.target # We set the training target equal to the target in the training set. This has to be done every iteration (as the fold and the data changes).
    y_valid = X_valid.target # We set the validation target equal to the target in the validation set. This has to be done every iteration (as the fold and the data changes).
    
    X_train = X_train[useful_features] # We set our training data to be the previously defined useful features of X_train.
    X_valid = X_valid[useful_features] # We set our validation data to be the previously defined useful features of X_valid.
    
    # We activate the pipeline, which preprocesses the training data and fits the model (will take about 10 minutes to run)
    my_pipeline.fit(X_train, y_train)

    preds_valid = my_pipeline.predict(X_valid) # We instruct the pipeline to make predictions on X_valid.
    preds_test = my_pipeline.predict(X_test) # We instruct the pipeline to make predictions on X_test.
    
    final_test_predictions.append(preds_test) # We append each of the test predictions on to our final_predictions list.
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    
    rmse = mean_squared_error(y_valid, preds_valid, squared=False) # We store the mean non squared error in a variable.
    print(fold, rmse) # Print the fold number, and the mean non squared error for each fold.
    scores.append(rmse) # We append the rmse value to the scores list.
    
print(np.mean(scores), np.std(scores)) # Print the mean non square error average, and its standard deviation

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("train_pred_1.csv", index=False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_1"]
sample_submission.to_csv("test_pred_1.csv", index=False)

0 0.7155336816958349
1 0.716993928592888
2 0.7206393577105712
3 0.7273639518846338
4 0.7203349565079591
5 0.7186480903753499
6 0.7173526913226549
7 0.7210420259736751
8 0.7162417362246509
9 0.7120861252032764
0.7186236545491494 0.0038918586806434455


### 2nd Model, Optimized XGBoost, Standardization, Target Encoding

In [4]:
# Load the training and test data. 
X_full = pd.read_csv("../input/train-folds-k-folds-30-days-of-ml/train_folds_"+ str(fold_num)+ "_folds.csv")
X_test_full = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# We select all features except "id", "target" and "kfold", as these are not predictors of our target.
useful_features = [c for c in X_full.columns if c not in ("id", "target", "kfold")]

# Select numerical columns by data type, not by column name
num_cols = [col for col in X_full[useful_features] if X_full[col].dtype in ['int64', 'float64']]

# We select categorical columns. Note that we dropped the cardinality check.
object_cols = [col for col in useful_features if col.startswith("cat")]

# We build X_test out of X_test_full, but only selecting the useful features.
X_test = X_test_full[useful_features]

# Next up, we set up the for loop which will perform the target encoding:
for col in object_cols: 
    temp_X_full = [] # We create a temporary list to store the dataframes.
    temp_test_feature = None # We create a temporary feature for the test set.
    
    for fold in range(fold_num): # We loop across all folds
        X_train = X_full[X_full.kfold != fold].reset_index(drop=True) 
        X_valid = X_full[X_full.kfold == fold].reset_index(drop=True) 
        feat = X_train.groupby(col)["target"].agg("mean") # We group the columns by target, and then we get the mean value of the values in "target" column.
        feat = feat.to_dict() # We convert the dataframe into a dictionary.
        X_valid.loc[:, f"tar_enc_{col}"] = X_valid[col].map(feat) # We map the mean values to a new column in X_valid.
        temp_X_full.append(X_valid) # We append X_valid to our temporary list.
        
        if temp_test_feature is None: # If we don't have a temp_test_feature...
            temp_test_feature = X_test[col].map(feat) # ...we assign it this value.
            
        else: # If its not None, (for folds above 0)...
            temp_test_feature = temp_test_feature + X_test[col].map(feat) # ...add to it the present value.
            
    temp_test_feature = temp_test_feature/fold_num # We divide by the number of folds to get the average.
    X_test.loc[:, f"tar_enc_{col}"] = temp_test_feature # We assign the temp_test_feat value to a new column.
    X_full = pd.concat(temp_X_full) # We build the new X_full dataframe with the new target encoding columns.
    
# Preprocessing for numerical data, we use a StandardScaler to apply standardization.
numerical_transformer = preprocessing.StandardScaler()

# Preprocessing for categorical data and one-hot encoding.
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, num_cols),('cat', categorical_transformer, object_cols)])

# Define the model 
model_2 = XGBRegressor(tree_method='gpu_hist',
                     gpu_id=0, 
                     predictor="gpu_predictor",
                     n_estimators = 25000,
                     learning_rate=0.03313934079213014,
                     reg_lambda=7.795455194937734e-07,
                     reg_alpha=11.375472681850685,
                     subsample=0.8202458209691414,
                     colsample_bytree=0.10071397127578051,
                     max_depth=3)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline_2 = Pipeline(steps=[('preprocessor', preprocessor), ('model', model_2)])

# We set up a list to store the final test and valid predictions.
final_test_predictions_2 = []
final_valid_predictions_2 = {}

# We set up a list for storing the mean non squared error scores.
scores_2 = []

# We set the loop to loop across all of the folds.
for fold in range(fold_num):
    X_train = X_full[X_full.kfold != fold].reset_index(drop=True) # We set the training data to be all folds different from the current fold number in the loop. We also reset the indices.
    X_valid = X_full[X_full.kfold == fold].reset_index(drop=True) # The validation data is the current fold number in the loop. We also reset the indices.
    X_test_copy = X_test.copy() # We copy the original X_test to not alter or overwrite over it.
    
    valid_ids = X_valid.id.values.tolist()
    
    y_train = X_train.target # We set the training target equal to the target in the training set. This has to be done every iteration (as the fold and the data changes).
    y_valid = X_valid.target # We set the validation target equal to the target in the validation set. This has to be done every iteration (as the fold and the data changes).
    
    X_train = X_train[useful_features] # We set our training data to be the previously defined useful features of X_train.
    X_valid = X_valid[useful_features] # We set our validation data to be the previously defined useful features of X_valid.
    
    # We activate the pipeline, which preprocesses the training data and fits the model (will take about 10 minutes to run)
    my_pipeline_2.fit(X_train, y_train)

    preds_valid = my_pipeline_2.predict(X_valid) # We instruct the pipeline to make predictions on X_valid.
    preds_test = my_pipeline_2.predict(X_test) # We instruct the pipeline to make predictions on X_test.
    
    final_test_predictions_2.append(preds_test) # We append each of the test predictions on to our final_predictions list.
    final_valid_predictions_2.update(dict(zip(valid_ids, preds_valid)))
    
    final_test_predictions_2.append(preds_test) # We append each of the test predictions on to our final_predictions list.
    rmse = mean_squared_error(y_valid, preds_valid, squared=False) # We store the mean non squared error in a variable.
    print(fold, rmse) # Print the fold number, and the mean non squared error for each fold.
    scores_2.append(rmse) # We append the rmse value to the scores list.
    
print(np.mean(scores_2), np.std(scores_2)) # Print the mean non square error average, and its standard deviation

final_valid_predictions_2 = pd.DataFrame.from_dict(final_valid_predictions_2, orient="index").reset_index()
final_valid_predictions_2.columns = ["id", "pred_2"]
final_valid_predictions_2.to_csv("train_pred_2.csv", index=False)

sample_submission.target = np.mean(np.column_stack(final_test_predictions_2), axis=1)
sample_submission.columns = ["id", "pred_2"]
sample_submission.to_csv("test_pred_2.csv", index=False)



0 0.7156060662672703




1 0.717208611267233




2 0.7209856692202776




3 0.7276157070892456




4 0.7204396971547282




5 0.718747089188121




6 0.7173712429619467




7 0.7212983795269985




8 0.7165080570375956




9 0.712405897990994
0.718818641770441 0.003903028226237659


### Model Blend

In [5]:
X_full = pd.read_csv("../input/train-folds-k-folds-30-days-of-ml/train_folds_"+ str(fold_num)+ "_folds.csv")
X_test_full = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

df1 = pd.read_csv("train_pred_1.csv")
df2 = pd.read_csv("train_pred_2.csv")

df_test1 = pd.read_csv("test_pred_1.csv")
df_test2 = pd.read_csv("test_pred_2.csv")

X_full = X_full.merge(df1, on="id", how="left")
X_full = X_full.merge(df2, on="id", how="left")

X_test_full = X_test_full.merge(df_test1, on="id", how="left")
X_test_full = X_test_full.merge(df_test2, on="id", how="left")

X_full.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold,pred_1,pred_2
0,1,B,B,B,C,B,B,A,E,C,...,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,9,8.527834,8.538169
1,2,B,B,A,A,B,D,A,F,A,...,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,1,8.331272,8.350676
2,3,A,A,A,C,B,D,A,D,A,...,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,8,8.22294,8.206738
3,4,B,B,A,C,B,D,A,E,C,...,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,2,8.400646,8.46139
4,6,A,A,A,C,B,D,A,E,A,...,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,1,8.209665,8.299101


In [6]:
useful_features = ["pred_1", "pred_2"]
X_test_full = X_test_full[useful_features]

final_predictions = []
scores = []
for fold in range(fold_num):
    X_train = X_full[X_full.kfold != fold].reset_index(drop=True) # We set the training data to be all folds different from the current fold number in the loop. We also reset the indices.
    X_valid = X_full[X_full.kfold == fold].reset_index(drop=True) # The validation data is the current fold number in the loop. We also reset the indices.
    X_test = X_test_full.copy() # We copy the original X_test to not alter or overwrite over it.
    
    y_train = X_train.target # We set the training target equal to the target in the training set. This has to be done every iteration (as the fold and the data changes).
    y_valid = X_valid.target # We set the validation target equal to the target in the validation set. This has to be done every iteration (as the fold and the data changes).
    
    X_train = X_train[useful_features] # We set our training data to be the previously defined useful features of X_train.
    X_valid = X_valid[useful_features] # We set our validation data to be the previously defined useful features of X_valid.
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    preds_valid = model.predict(X_valid)
    test_preds = model.predict(X_test)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(y_valid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.7151671615781626
1 0.716550659102781
2 0.7203973853382206
3 0.7270745422373982
4 0.7199691399633833
5 0.7181413316688922
6 0.7168325916818599
7 0.7205987961221332
8 0.7158858086237615
9 0.7117439291860375
0.718236134550263 0.003915163318265625


In [7]:
sample_submission.target = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.to_csv("submission.csv", index=False)