In [1]:
import numpy as np
import pandas as pd
import catboost as ctb
import lightgbm as lgb
import copy
import pickle
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.callbacks import ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


In [2]:
data = pd.read_excel("/usr/input/flipr-hackathon-dataset/Train_dataset.xlsx")

In [3]:
data = data.drop(["Name", "Designation", "Region", "people_ID"],axis = 1)
data = data.loc[~data.iloc[:,[0,1,3,4,7,10,11,]].isnull().any(axis=1)]
y = data["Infect_Prob"]
data = data.drop(["Infect_Prob"], axis = 1)
#y = (y >= 50).astype("float64")
y /= 100

In [4]:
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(data.values, y, random_state = 42)

In [5]:
cat_features = [0,1,3,4,7,10,11,]

Best parameters of catboost:

In [6]:
clf_cat = ctb.CatBoostRegressor(boosting_type="Plain", loss_function="RMSE", learning_rate=0.01, border_count=254,\
                            verbose=0, n_estimators=10000,\
                            cat_features = cat_features, \
                            depth = 10, l2_leaf_reg = 1, subsample = 0.6,)
clf_cat.fit(X_train_cat, y_train_cat, eval_set = [(X_test_cat, y_test_cat)], early_stopping_rounds = 20,)

<catboost.core.CatBoostRegressor at 0x7fee668ecc50>

In [7]:
mean_squared_error(y_test_cat, clf_cat.predict(X_test_cat))

0.006410789035192972

Best parameters of LiteGBM

In [8]:
# preprocessing
le_list = []
for i in cat_features:
    le = LabelEncoder()
    data.iloc[:,i] = le.fit_transform(data.iloc[:,i])
    le_list.append(copy.copy(le))
X_train, X_test, y_train, y_test = train_test_split(data.values, y, random_state = 42)

In [9]:
clf_lit = lgb.LGBMRegressor(boosting_type="gbdt",objective="regression",learning_rate=0.001, \
                        bagging_freq=1, \
                        max_bin=100000, scale_pos_weight=1, \
                        metric= "mse", n_threads = 3,n_estimators= 99999, \
                        depth = 2, num_leaves = 5, min_child_weight = 250, \
                        colsample_bytree = 0.9, bagging_fraction = 0.8)
clf_lit.fit(X_train, y_train, eval_set = [(X_test, y_test)], early_stopping_rounds = 30, verbose = 0, categorical_feature = [0,1,2])

New categorical_feature is [0, 1, 2]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


LGBMRegressor(bagging_fraction=0.8, bagging_freq=1, boosting_type='gbdt',
              class_weight=None, colsample_bytree=0.9, depth=2,
              importance_type='split', learning_rate=0.001, max_bin=100000,
              max_depth=-1, metric='mse', min_child_samples=20,
              min_child_weight=250, min_split_gain=0.0, n_estimators=99999,
              n_jobs=-1, n_threads=3, num_leaves=5, objective='regression',
              random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              scale_pos_weight=1, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

In [10]:
mean_squared_error(y_test, clf_lit.predict(X_test))

0.006359819464519343

In [11]:
mean_squared_error(y_test, (clf_lit.predict(X_test) + clf_cat.predict(X_test_cat)) / 2)

0.006355923461151434

Deep learning model:

In [12]:
data = pd.read_excel("/kaggle/input/flipr-hackathon-dataset/Train_dataset.xlsx")
data = data.drop(["Name", "Designation", "Region", "people_ID"],axis = 1)
data = data.loc[~data.iloc[:,[0,1,3,4,7,10,11,]].isnull().any(axis=1)]
y = data["Infect_Prob"]
data = data.drop(["Infect_Prob"], axis = 1)
#y = (y >= 50).astype("float64")
y /= 100
cat_features = [0,1,3,4,7,10,11,]
data_one_hot = pd.get_dummies(data = data, columns = data.columns[cat_features])
imputed_data = data_one_hot.fillna(data.mean())
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(imputed_data.values, y, random_state = 42)
# scale the data
scaler = StandardScaler()
X_train_nn = scaler.fit_transform(X_train_nn)
X_test_nn = scaler.transform(X_test_nn)

In [13]:
X_train_nn.shape

(7248, 44)

In [14]:
model = keras.Sequential([layers.Dense(8, activation='relu', input_dim=44, ), \
                          layers.Dropout(0.2), \
                          layers.Dense(4, activation='relu', ), \
                          layers.Dropout(0.2), \
                          layers.Dense(1, activation = "linear"), \
                         ])

In [15]:
model.compile(optimizer = "adam", loss = "mean_squared_error", metrics = ["mse"])
# use early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 10)
# save the best model
mc = ModelCheckpoint('best_model.h5', monitor='val_mse', mode='min', verbose=1, save_best_only=True)

In [16]:
model.fit(X_train_nn, y_train_nn, epochs = 1000, validation_data = (X_test_nn, y_test_nn), callbacks = [es, mc],)

Train on 7248 samples, validate on 2417 samples
Epoch 1/1000
Epoch 00001: val_mse improved from inf to 0.05945, saving model to best_model.h5
Epoch 2/1000
Epoch 00002: val_mse improved from 0.05945 to 0.02393, saving model to best_model.h5
Epoch 3/1000
Epoch 00003: val_mse improved from 0.02393 to 0.01432, saving model to best_model.h5
Epoch 4/1000
Epoch 00004: val_mse improved from 0.01432 to 0.01061, saving model to best_model.h5
Epoch 5/1000
Epoch 00005: val_mse improved from 0.01061 to 0.00927, saving model to best_model.h5
Epoch 6/1000
Epoch 00006: val_mse improved from 0.00927 to 0.00877, saving model to best_model.h5
Epoch 7/1000
Epoch 00007: val_mse improved from 0.00877 to 0.00855, saving model to best_model.h5
Epoch 8/1000
Epoch 00008: val_mse improved from 0.00855 to 0.00835, saving model to best_model.h5
Epoch 9/1000
Epoch 00009: val_mse improved from 0.00835 to 0.00827, saving model to best_model.h5
Epoch 10/1000
Epoch 00010: val_mse improved from 0.00827 to 0.00809, savin

<tensorflow.python.keras.callbacks.History at 0x7fee686bb048>

In [17]:
saved_model = tf.keras.models.load_model('best_model.h5')

In [18]:
saved_model.evaluate(X_test_nn, y_test_nn, verbose = 2)

2417/2417 - 0s - loss: 0.0071 - mse: 0.0071


[0.007133160234519685, 0.0071331593]

In [19]:
saved_model.predict(X_test_nn)

array([[0.48880127],
       [0.48880127],
       [0.48880127],
       ...,
       [0.54803896],
       [0.54184115],
       [0.48880127]], dtype=float32)

In [20]:
mean_squared_error(y_test, ( clf_lit.predict(X_test) + clf_cat.predict(X_test_cat) + saved_model.predict(X_test_nn).ravel() ) / 3 )

0.006495825545635415

Adding neural network does not seem to improve accuracy

Let us try Support Vector Regression:

In [21]:
clf_svr = SVR(kernel = "linear", C = 0.002)
clf_svr.fit(X_train_nn, y_train_nn)
mean_squared_error(y_test_nn, clf_svr.predict(X_test_nn))

0.007702836956989377

In [22]:
clf_svr = SVR(kernel = "rbf", C = 0.1)
clf_svr.fit(X_train_nn, y_train_nn)
mean_squared_error(y_test_nn, clf_svr.predict(X_test_nn))

0.008410247796822276

Let us try Linear Regression:

In [23]:
lr = LinearRegression()
lr.fit(X_train_nn, y_train_nn)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [24]:
mean_squared_error(y_test_nn, lr.predict(X_test_nn))

0.006877525258544001

In [25]:
rr = Lasso(alpha = 0.0007)
rr.fit(X_train_nn, y_train_nn)
mean_squared_error(y_test_nn, rr.predict(X_test_nn))

0.006846409458613856

In [26]:
ss = Ridge(alpha=0.1)
ss.fit(X_train_nn, y_train_nn)
mean_squared_error(y_test_nn, ss.predict(X_test_nn))

0.006858180407403158

In [27]:
mean_squared_error(y_test, ( clf_lit.predict(X_test) + clf_cat.predict(X_test_cat) + saved_model.predict(X_test_nn).ravel() + ss.predict(X_test_nn) \
                           + rr.predict(X_test_nn)) / 5)

0.006548083424637212

Well, seems that our validation scores have worsened! <br>
So, we will stick with our ensemble of LightGBM + CatBoost<br>
Prediction for the test data is:

In [28]:
test_data = pd.read_excel("/kaggle/input/flipr-hackathon-dataset/Test_dataset.xlsx")

In [29]:
# process the test data
series_id = test_data["people_ID"]
test_data = test_data.drop(["Name", "Designation", "Region", "people_ID"],axis = 1)
test_data = test_data.loc[~test_data.iloc[:,[0,1,3,4,7,10,11,]].isnull().any(axis=1)]

In [30]:
pred1 = clf_cat.predict(test_data.values)

In [31]:
np.sum((pred1 < 0) | (pred1 > 1))

0

All the probablility values are valid.

In [32]:
# preprocessing
for i in reversed(cat_features):
    le = le_list.pop()
    test_data.iloc[:,i] = le.transform(test_data.iloc[:,i])

In [33]:
pred2 = clf_lit.predict(test_data.values)

In [34]:
pred_vals = pd.DataFrame((pred1+pred2)/2, index = series_id)

In [35]:
pred_vals.to_csv("/kaggle/working/problem1.csv")

In [36]:
# save the models for use in 2nd question:
pickle.dump(clf_cat,open("cat_boost_model","wb"))
pickle.dump(clf_lit,open("lightgbm_boost_model","wb"))