In [4]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from xgboost import XGBRegressor, DMatrix

import matplotlib.pyplot as plt
%matplotlib inline            
import seaborn as sns

from time import time
import pprint
import joblib
from functools import partial

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer

# Model selection
from sklearn.model_selection import KFold


from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import lightgbm as lgb



In [7]:
df = pd.read_csv("input/train_5folds.csv")
df_test = pd.read_csv("input/test.csv")
sample_submission = pd.read_csv("input/sample_submission.csv")

df = df.drop(df[df['target'].lt(6)].index)
print(df.shape)

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]


# target encoding
for col in object_cols:
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        xtrain =  df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].agg("mean")
        feat = feat.to_dict()
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        if temp_test_feat is None:
            temp_test_feat = df_test[col].map(feat)
        else:
            temp_test_feat += df_test[col].map(feat)
    
    temp_test_feat /= 5
    df_test.loc[:, f"tar_enc_{col}"] = temp_test_feat
    df = pd.concat(temp_df)

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]


# Processing categoricals with SVD encoding
X_seq = df[object_cols].apply(lambda x: " ".join(list([str(y) + str(i) for i, y in enumerate(x)])), axis=1)
X_test_seq = df_test[object_cols].apply(lambda x: " ".join(list([str(y) + str(i) for i, y in enumerate(x)])), axis=1)


latent_dims = 24

svd_feats = ['svd_'+str(l) for l in range(latent_dims)]
vectorizer = TfidfVectorizer()

dim_reductio = TruncatedSVD(n_components=24, random_state=0)
df[svd_feats] =  dim_reductio.fit_transform(vectorizer.fit_transform(X_seq))
df_test[svd_feats] = dim_reductio.transform(vectorizer.transform(X_test_seq))


    
# Processing categoricals with frequency encoding
object_cols = [item for item in df.columns if 'cat' in item]

for cat in object_cols:
    counts = dict(df[cat].value_counts() / len(df))
    df[cat+'_freq'] = df[cat].replace(counts)
    df_test[cat+'_freq'] = df_test[cat].replace(counts)

frequencies = [cat+'_freq' for cat in object_cols]


useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]




final_predictions = []
scores=[]
for fold in range(5):
    xtrain =  df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target
    
    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

   
    # standarization
    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])



    # label encode columns 
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.fit_transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.fit_transform(xtest[object_cols])
 
    model= lgb.LGBMRegressor(boosting_type='gbdt',
                        metric='rmse',
                        n_jobs=-1, 
                        verbose=-1,
                        random_state=42,
                        n_estimators= 2683,
                        learning_rate= 0.010250629304555186,
                        num_leaves= 79,
                        max_depth= 256,
                        subsample= 0.7778732709684482,
                        subsample_freq= 9,
                        colsample_bytree= 0.35917838955653647,
                        reg_lambda= 2.943257012154159,
                        reg_alpha= 2.416846681288718                     
                        )


    print(ytrain)
    
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold,rmse)
    scores.append(rmse)
    
print (np.mean(scores),np.std(scores))



FileNotFoundError: [Errno 2] No such file or directory: '/input/train_folds.csv'

0.7289161215950625  ordinal + stand   <br>
0.7359939686055645  ordinal + normalizer <br>
0.7339300282189536  ordinal + standard + normalizer <br>
0.7359933943841589  ordinal + normalizer +standard   <br>
0.7288775872910201  ohe + stand   <br>
0.7359376887794242   ohe + normalizer   <br>
0.7289403434752502 (ohe+ 3 ordinal) + stand   <br>
0.7291070689887855  poly3 (T,F) (ohe+ 1 ordinal) + stand   <br>
0.72914235959686  poly3 (F,F) (ohe+ 1 ordinal) + stand   <br>
0.728907008813998  poly2 (F,F) (ohe+ 1 ordinal) + stand   <br>
0.7289321289479873   poly2 (F,T) (ohe+ 1 ordinal) + stand   <br>
0.7289501787229472   poly2 (T,T) (ohe+ 1 ordinal) + stand   <br>
0.7289416327232601   poly2 (T,F) (ohe+ 1 ordinal) + stand   <br>
0.7288644838881868  (ohe+ 1 ordinal) + stand   <br>
0.7189543356528036   T_outliers+ (ohe+ 1 ordinal) + stand <br>  
0.7205793549092518    T_encoding + T_outliers+ (ohe+ 1 ordinal) + stand <br>

-0.7086640526467916  opt + T_encoding + T_outliers+ (ohe+ 1 ordinal) + stand <br>
0.7119660598199944  opt + SVD encoding + T_encoding + T_outliers+ (ohe+ 1 ordinal) + stand <br>
6.7606918549139055 No_opt + SVD encoding + T_encoding + T_outliers+ (ohe+ 1 ordinal) + stand + LIGHTGBM (dart) <br>
0.7112411646909086 No_opt + SVD encoding + T_encoding + T_outliers+ (ohe+ 1 ordinal) + stand + LIGHTGBM (gbdt) <br>
_