In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor, DMatrix

import matplotlib.pyplot as plt
%matplotlib inline            
import seaborn as sns

from time import time
import pprint
import joblib
from functools import partial

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer

# Model selection
from sklearn.model_selection import KFold


----------------------------------------------------------

In [None]:
# Baseline experiment: ordinal-encoded categoricals + standardized numerical
# features, one XGBoost model per fold.
df = pd.read_csv("../input/30days-kfolds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the identifier, the label and the fold assignment is a feature.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold
# NOTE(review): assumes train_folds.csv labels its folds 0..5 — confirm.
for fold in range(6):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the encoded/scaled columns are assigned back below, and
    # assigning into a column-sliced frame triggers SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Encoder is fitted on the training part of the fold only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # Standardization, likewise fitted on the training part only.
    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    # NOTE(review): xgboost >= 2.0 deprecates gpu_hist / gpu_id / predictor in
    # favour of tree_method='hist', device='cuda' — confirm the installed version.
    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
        objective='reg:squarederror',
    )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE computed explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))



0.7251556496577392 0.0019415578945857727<br>
0.7250257964049016 0.001937036680455837 <br>
0.725143199009303 0.00192218924080936<br>


In [None]:
# Polynomial features (degree 2, interaction terms only) on the numerical columns.

df = pd.read_csv("../input/30days-kfolds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
# The transformer is fitted on the training data only; test is just transformed.
test_poly = poly.transform(df_test[numerical_cols])

# Same generated names for train and test so the columns line up.
# NOTE(review): the output also contains the degree-1 terms, so the original
# numerical columns are duplicated under poly_* names — confirm this is intended.
poly_names = [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_names)
df_test_poly = pd.DataFrame(test_poly, columns=poly_names)

df = pd.concat([df, df_poly], axis=1)
df_test = pd.concat([df_test, df_test_poly], axis=1)

# Recompute the feature lists now that the poly_* columns exist.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold
# NOTE(review): assumes train_folds.csv labels its folds 0..5 — confirm.
for fold in range(6):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the encoded columns are assigned back below, and assigning
    # into a column-sliced frame triggers SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Encoder is fitted on the training part of the fold only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE computed explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))


In [None]:
# Polynomial features (degree 3, interaction terms only) on the numerical columns.

df = pd.read_csv("../input/30days-kfolds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

poly = preprocessing.PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
# The transformer is fitted on the training data only; test is just transformed.
test_poly = poly.transform(df_test[numerical_cols])

# Same generated names for train and test so the columns line up.
# NOTE(review): the output also contains the degree-1 terms, so the original
# numerical columns are duplicated under poly_* names — confirm this is intended.
poly_names = [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_names)
df_test_poly = pd.DataFrame(test_poly, columns=poly_names)

df = pd.concat([df, df_poly], axis=1)
df_test = pd.concat([df_test, df_test_poly], axis=1)

# Recompute the feature lists now that the poly_* columns exist.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold
# NOTE(review): assumes train_folds.csv labels its folds 0..5 — confirm.
for fold in range(6):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the encoded columns are assigned back below, and assigning
    # into a column-sliced frame triggers SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Encoder is fitted on the training part of the fold only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE computed explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

degree 2= 0.7276382385675535 0.001629719053959<br>
degree 3= 0.7297184719788427 0.002042522648024433

## Binning the numerical features
pd.cut

In [None]:
# Show the column labels currently present in df.
df.columns

In [None]:
# Histogram of each continuous feature (cont0 .. cont13) on a 7x2 grid.
# sns.distplot is deprecated; sns.histplot is its replacement for plain
# histograms (kde is off by default, matching the former kde=False).
cont_cols = [f"cont{i}" for i in range(14)]
fig, axes = plt.subplots(7, 2, figsize=(28, 40))
# axes.ravel() walks the grid row by row, i.e. the same left-to-right,
# top-to-bottom placement as before.
for ax, col in zip(axes.ravel(), cont_cols):
    # bins=50 mirrors the cap distplot applied to its automatic bin count.
    sns.histplot(data=df, x=col, bins=50, ax=ax)

In [None]:
# Quartile boundaries (min, Q1, median, Q3, max) of cont0.
quantile_list = [0, 0.25, 0.5, 0.75, 1.0]
quantile = df["cont0"].quantile(q=quantile_list)
quantile

In [None]:
# Bin cont0 into quartiles twice: once keeping the interval itself, once with
# a readable label per quartile. Relies on quantile_list from the previous cell.
quantile_labels = ['0-25Q', '25-50Q', '50-75Q', '75-100Q']
df['cont0_quantile_range'] = pd.qcut(df['cont0'], q=quantile_list)
df['cont0_quantile_label'] = pd.qcut(df['cont0'], q=quantile_list, labels=quantile_labels)
df


In [None]:
# One-hot encoded categoricals + standardized numerical features.

df = pd.read_csv("../input/30days-kfolds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold
# NOTE(review): assumes train_folds.csv labels its folds 0..5 — confirm.
for fold in range(6):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # The encoder keeps its default sparse output and is densified with
    # .toarray(): the `sparse=False` keyword was renamed in newer scikit-learn,
    # so this form works regardless of the installed version.
    # handle_unknown="ignore": categories unseen during fit encode as all zeros.
    ohe = preprocessing.OneHotEncoder(handle_unknown="ignore")
    xtrain_ohe = ohe.fit_transform(xtrain[object_cols]).toarray()
    xvalid_ohe = ohe.transform(xvalid[object_cols]).toarray()
    xtest_ohe = ohe.transform(xtest[object_cols]).toarray()

    # Shared generated names so train / valid / test columns line up.
    ohe_names = [f"ohe_{i}" for i in range(xtrain_ohe.shape[1])]

    # Replace the raw categorical columns with their one-hot expansion.
    xtrain = pd.concat(
        [xtrain.drop(columns=object_cols), pd.DataFrame(xtrain_ohe, columns=ohe_names)], axis=1
    )
    xvalid = pd.concat(
        [xvalid.drop(columns=object_cols), pd.DataFrame(xvalid_ohe, columns=ohe_names)], axis=1
    )
    xtest = pd.concat(
        [xtest.drop(columns=object_cols), pd.DataFrame(xtest_ohe, columns=ohe_names)], axis=1
    )

    # Standardization of the numerical columns, fitted on the training part only.
    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE computed explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0.725143199009303 0.00192218924080936<br>
0.7253373267677583 0.001674973296325871

In [None]:
# One-hot encoding + standardization of both the one-hot and the numerical columns.

df = pd.read_csv("../input/30days-kfolds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold
# NOTE(review): assumes train_folds.csv labels its folds 0..5 — confirm.
for fold in range(6):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # The encoder keeps its default sparse output and is densified with
    # .toarray(): the `sparse=False` keyword was renamed in newer scikit-learn,
    # so this form works regardless of the installed version.
    # handle_unknown="ignore": categories unseen during fit encode as all zeros.
    ohe = preprocessing.OneHotEncoder(handle_unknown="ignore")
    xtrain_ohe = ohe.fit_transform(xtrain[object_cols]).toarray()
    xvalid_ohe = ohe.transform(xvalid[object_cols]).toarray()
    xtest_ohe = ohe.transform(xtest[object_cols]).toarray()

    # Shared generated names so train / valid / test columns line up.
    ohe_names = [f"ohe_{i}" for i in range(xtrain_ohe.shape[1])]

    # Replace the raw categorical columns with their one-hot expansion.
    xtrain = pd.concat(
        [xtrain.drop(columns=object_cols), pd.DataFrame(xtrain_ohe, columns=ohe_names)], axis=1
    )
    xvalid = pd.concat(
        [xvalid.drop(columns=object_cols), pd.DataFrame(xvalid_ohe, columns=ohe_names)], axis=1
    )
    xtest = pd.concat(
        [xtest.drop(columns=object_cols), pd.DataFrame(xtest_ohe, columns=ohe_names)], axis=1
    )

    # Standardization of every column (numerical and one-hot), fitted on the
    # training part only; the frames are rebuilt with the original column names.
    scaler = preprocessing.StandardScaler()
    xtrain = pd.DataFrame(scaler.fit_transform(xtrain), columns=xtrain.columns)
    xvalid = pd.DataFrame(scaler.transform(xvalid), columns=xvalid.columns)
    xtest = pd.DataFrame(scaler.transform(xtest), columns=xtest.columns)

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE computed explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
# Count encoding: add, for every categorical column, how often each category occurs.

df = pd.read_csv("../input/30days-kfolds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
# Copy: new columns are added to df_test below, and adding them to a
# column-sliced frame triggers SettingWithCopyWarning.
df_test = df_test[useful_features].copy()

for col in object_cols:
    # Counts come from the training data and are reused for the test set, so the
    # feature means the same thing in both. Counting inside the test set instead
    # would put the feature on a different scale whenever the two sets differ in size.
    train_counts = df[col].value_counts()
    df[f"cont_{col}"] = df[col].map(train_counts)
    # Categories never seen in training get a count of 0.
    df_test[f"cont_{col}"] = df_test[col].map(train_counts).fillna(0).astype("int64")

# Recompute the feature lists now that the count columns exist. startswith('cat')
# is required here because the new "cont_cat*" names also contain 'cat'.
# numerical_cols is intentionally left as computed above, so the count columns
# are not standardized.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith('cat')]
df_test = df_test[useful_features]

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold
# NOTE(review): assumes train_folds.csv labels its folds 0..5 — confirm.
for fold in range(6):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the encoded/scaled columns are assigned back below, and
    # assigning into a column-sliced frame triggers SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Encoder is fitted on the training part of the fold only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # Standardization of the original numerical columns, fitted on the training part only.
    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE computed explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0.7252247379444503 0.0018250341871516866 <br>

0.7252235823355567 0.0017443383032877378

# Combine cat variables
cat1_cat2
df[cat1] + "-" + df[cat2]

# Combine cat variables + numerical using groupby (mean, max..) 
cat1_cat2
df[cat1] + "-" + df[cat2]

In [None]:
# Average the per-fold test predictions row by row. final_predictions holds
# whatever the most recently executed experiment cell left in it.
fold_matrix = np.column_stack(final_predictions)  # one column per fold
preds = fold_matrix.mean(axis=1)

In [None]:
# Write the averaged predictions into the submission template and save it.
# Bracket assignment is used because attribute-style assignment only updates a
# column that already exists; it would not create one if "target" were missing.
sample_submission["target"] = preds
sample_submission.to_csv("submission.csv", index=False)