In [None]:
# pip install bayesian-optimization

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import numpy as np
import missingno as msno
from lightgbm import LGBMRegressor
import random
from sklearn.impute import SimpleImputer
import time 
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization
import lightgbm as lgb
import xgboost as xgb
import warnings
from tqdm import tqdm


warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Load the competition table; the `row_id` column becomes the DataFrame index.
data = pd.read_csv("../input/tabular-playground-series-jun-2022/data.csv", index_col='row_id')

In [None]:
# (number of rows, number of columns) of the loaded table
data.shape

In [None]:
# Preview the loaded table
data

In [None]:
# missingno matrix plot of the table, to see where values are missing
msno.matrix(data)

In [None]:
# Share of missing cells over the whole table, expressed in percent
n_missing_cells = data.isna().sum().sum()
nans_percentage = n_missing_cells / data.size * 100
print(f"Dataset contains {nans_percentage} % of Nans")

In [None]:
# Number of columns per dtype
data.dtypes.value_counts()

In [None]:
# Boolean mask over the columns: True where the column dtype is int64.
# The mask is reused by later cells, so the name is kept.
mask_int64 = data.dtypes.eq('int64')

# Names of the int64 columns only
data.columns[mask_int64]

In [None]:
def split_cols(col):
    """Return the second and third '_'-separated fields of a column name.

    Example: 'F_4_1' -> ('4', '1').

    Raises:
        IndexError: if `col` has fewer than three '_'-separated fields.
    """
    fields = col.split('_')
    group, position = fields[1], fields[2]
    return group, position

In [None]:
# Columns whose group field (second '_'-separated field of the name) is '2'
list_col_2 = [name for name in data.columns if split_cols(name)[0] == '2']
data_col_2 = data[list_col_2]

# Total number of missing cells across that group
nans_in_col_2 = data_col_2.isna().sum().sum()
print(f"Columns starting with F_2 have {nans_in_col_2} Nans")

In [None]:
# Pairwise Pearson correlation between all columns that are NOT int64
# (presumably these are the columns that contain NaNs -- verify against the dtype cell)
non_int64_cols = data.columns[~mask_int64]
pearsoncorr = data[non_int64_cols].corr()

# Heatmap of the absolute correlations, annotated and labelled with column names
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(pearsoncorr.abs(),
            xticklabels=pearsoncorr.columns,
            yticklabels=pearsoncorr.columns,
            cmap='RdBu_r',
            annot=True,
            linewidth=0.5,
            ax=ax)

In [None]:
# Restrict the correlation matrix to the F_4 group on both axes.
# `pearsoncorr` comes from DataFrame.corr(), so its index and columns carry the same labels.
labels_4 = [label for label in pearsoncorr.index if label.split('_')[1] == '4']
pearsoncorr_4 = pearsoncorr.loc[labels_4, labels_4]

# Heatmap of the absolute correlations inside the F_4 group
fig, ax = plt.subplots(figsize=(15, 7))
sns.heatmap(pearsoncorr_4.abs(),
            xticklabels=pearsoncorr_4.columns,
            yticklabels=pearsoncorr_4.columns,
            cmap='RdBu_r',
            annot=True,
            linewidth=0.5,
            ax=ax)

In [None]:
# Names of the columns that contain at least one missing value
list_col_nans = data.columns[data.isna().any()].tolist()

# Fraction of missing values per affected column, computed once and summarised below
nan_share = data[list_col_nans].isna().mean()
avg_pct = round(nan_share.mean()*100, 2)
min_pct = round(nan_share.min()*100, 2)
max_pct = round(nan_share.max()*100, 2)

print(f"Columns with nans have an average of {avg_pct} "
      f"% nans with a mininum of {min_pct} "
      f"% of nans and a maximum of {max_pct} "
      f"% of nans")

In [None]:
# Defining the training scope for Bayesian Optimization of group 4 columns:
# a random subset of the rows where the target column is observed.

list_col_4 = [col for col in data.columns if (col.split('_')[1] == '4')]
target_col = 'F_4_1'
training_size = 100000
sampling_seed = 42  # fixed so the sampled subset is identical on every run

# Rows where the target is missing cannot be used for training
target_nan_idx = data[data[target_col].isnull()].index

train_set = data.drop(target_nan_idx, axis=0)
test_set = data[data.index.isin(target_nan_idx)]

X = train_set.drop([target_col], axis=1)
y = train_set[target_col]

# Sample row labels without replacement from a dedicated, seeded generator.
# The size is capped at the number of available rows, because random.sample
# raises ValueError when asked for more items than the population holds.
subset_idx = random.Random(sampling_seed).sample(list(X.index),
                                                 min(training_size, len(X)))

X = X.loc[subset_idx]
y = y.loc[subset_idx]

In [None]:
# Shape of the sampled training features
X.shape

In [None]:
# XGBoost training matrix built from the sampled rows. The library-specific
# name keeps it from being overwritten by the LightGBM dataset that is built
# later in the notebook, so this cell and the search that uses it stay
# re-runnable in any order.
dtrain_xgb = xgb.DMatrix(X, label=y)


def xgb_evaluate(max_depth, gamma, eta, colsample_bytree):
    """Objective for the Bayesian search: negative 3-fold CV RMSE of XGBoost.

    Parameters are the hyper-parameters proposed by the optimiser. They arrive
    as floats, so `max_depth` is truncated to an int before use.

    Returns:
        The negated mean test RMSE of the last boosting round (higher is better).
    """
    params = {'eval_metric': 'rmse',
              'max_depth': int(max_depth),
              'subsample': 0.8,
              'eta': eta,
              'gamma': gamma,
              'colsample_bytree': colsample_bytree}
    # Only 100 rounds here to keep the search cheap
    # (original note: around 1000 boosting rounds were used in the full model)
    cv_result = xgb.cv(params, dtrain_xgb, num_boost_round=100, nfold=3)

    # Bayesian only knows how to maximize so return the negative RMSE
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [None]:
# Search space for the XGBoost hyper-parameters. `random_state` is fixed so the
# initial random probes (and therefore the whole search) are reproducible.
xgb_bo = BayesianOptimization(xgb_evaluate, {'max_depth': (1, 10),
                                             'gamma': (0, 5),
                                             'eta': (0.01, 0.1),
                                             'colsample_bytree': (0.3, 0.9)},
                              random_state=42)
# Use the expected improvement acquisition function to handle negative numbers
# Optimally needs quite a few more initiation points and number of iterations
# NOTE(review): the `acq` keyword is only accepted by older bayes_opt releases;
# confirm against the installed version.
xgb_bo.maximize(init_points=3, n_iter=5, acq='ei')

In [None]:
# LightGBM dataset built from the sampled rows. The library-specific name keeps
# it from overwriting the XGBoost matrix created earlier in the notebook.
dtrain_lgb = lgb.Dataset(data=X, label=y)


def lgb_eval(learning_rate, num_iterations, num_leaves):
    """Objective for the Bayesian search: negative 5-fold CV RMSE of LightGBM.

    Parameters are the hyper-parameters proposed by the optimiser. They arrive
    as floats, so the integer-valued ones are truncated before use.

    Returns:
        The negated minimum of the per-iteration mean CV RMSE (higher is better).
    """
    params = {'objective': 'regression',
              'learning_rate': learning_rate,
              'num_iterations': int(num_iterations),
              'num_leaves': int(num_leaves),
              'max_bin': 50,
              'metric': 'rmse',
              'force_col_wise': 'true',
              'verbose': -1}

    cv_result = lgb.cv(params, train_set=dtrain_lgb, nfold=5, metrics='rmse', stratified=False)
    # The result key is 'rmse-mean' in older LightGBM releases and carries a
    # dataset prefix ('valid rmse-mean') in newer ones, so match on the suffix.
    rmse_key = next(key for key in cv_result if key.endswith('rmse-mean'))
    return -1.0 * min(cv_result[rmse_key])

In [None]:
# Search space for the LightGBM hyper-parameters. `random_state` is fixed so the
# initial random probes (and therefore the whole search) are reproducible.
lgbBO = BayesianOptimization(lgb_eval,
                             {'learning_rate': (0.01, 0.1),
                              'num_iterations': (100, 1000),
                              'num_leaves': (50, 150)
                              },
                             random_state=42)

# NOTE(review): the `acq` keyword is only accepted by older bayes_opt releases;
# confirm against the installed version.
lgbBO.maximize(init_points=3, n_iter=5, acq='ei')

In [None]:
# Impute each F_4 column with a LightGBM regressor trained on the other F_4
# columns, using the rows where the column is observed.
# `predictions[col]` holds the predicted values for the rows where `col` is
# missing, in the row order of `data`.
predictions = {}

data_col_4 = data[list_col_4]

# Loop-local names are deliberately distinct from `target_col`, `train_set`
# and `test_set` defined for the hyper-parameter search, so running this cell
# does not overwrite them.
for col_to_impute in list_col_4:
    missing_mask = data_col_4[col_to_impute].isna()
    if not missing_mask.any():
        # Fully observed column: nothing to predict, and an empty frame
        # should not be handed to the model.
        continue

    observed_rows = data_col_4.loc[~missing_mask]
    missing_rows = data_col_4.loc[missing_mask]

    X_train = observed_rows.drop(columns=[col_to_impute])
    y_train = observed_rows[col_to_impute]
    X_test = missing_rows.drop(columns=[col_to_impute])

    # `n_estimators` and `num_iterations` are aliases of the same LightGBM
    # parameter; the original passed both (100 and 162). Only the value that
    # LightGBM actually honoured (162) is kept.
    # Presumably these values come from the Bayesian search above -- TODO confirm.
    model = LGBMRegressor(n_estimators=162,
                          metric='rmse',
                          learning_rate=0.09,
                          num_leaves=148)
    print('\nProcessing Column Name : ', col_to_impute)
    model.fit(X_train, y_train)
    predictions[col_to_impute] = model.predict(X_test)
    print(col_to_impute, ' processed')

In [None]:
# Write the model predictions into the missing cells of each imputed column.
# A single .loc call is used instead of chained indexing (data[col][mask] = ...),
# which may assign into a temporary copy and leave `data` unchanged.
for col, predicted_values in predictions.items():
    data.loc[data[col].isna(), col] = predicted_values

In [None]:
# Sanity check: count the missing cells remaining in the F_4 group
nans_left_col_4 = data[list_col_4].isna().sum().sum()
print(f"Columns starting with F_4 now have {nans_left_col_4} nans left")

In [None]:
# Mean-impute whatever is still missing after the model-based step.
# Only columns that still contain NaNs are passed through the imputer, so
# complete columns are left untouched. Column means are independent of each
# other, so the filled values are the same as when imputing the whole table.
imp = SimpleImputer(
         missing_values=np.nan,
         strategy='mean')
cols_with_nans = data.columns[data.isna().any()]
if len(cols_with_nans) > 0:
    data[cols_with_nans] = imp.fit_transform(data[cols_with_nans])

In [None]:
# Sanity check: count the missing cells remaining in the whole table
nans_left_total = data.isna().sum().sum()
print(f"The whole dataset now has {nans_left_total} nans left")

In [None]:
path_sample = '../input/tabular-playground-series-jun-2022/sample_submission.csv'

submission = pd.read_csv(path_sample, index_col='row-col')

# Each index label has the form "<row_id>-<column name>"; split it once for
# all rows instead of once per row inside a Python loop.
key_parts = submission.index.to_series().str.split('-')
row_ids = key_parts.str[0].astype(int).to_numpy()
col_names = key_parts.str[1].to_numpy()

# Look the values up one column at a time (one vectorised .loc per column)
# rather than with one scalar lookup and assignment per submission row.
# Assumes the index of `data` is unique, so each row id yields exactly one value.
values = np.empty(len(submission), dtype=float)
for col in tqdm(pd.unique(col_names)):
    in_col = col_names == col
    values[in_col] = data.loc[row_ids[in_col], col].to_numpy()
submission['value'] = values

submission.to_csv('submission.csv')