In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, mean_squared_error, log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.callbacks import ModelCheckpoint, EarlyStopping

Using TensorFlow backend.


In [2]:
tf.__version__

'2.1.0'

In [3]:
data = pd.read_excel("/usr/input/flipr-hackathon-dataset/Train_dataset.xlsx")

In [4]:
data = data.drop(["Name", "Designation",],axis = 1)
data = data.loc[~data.iloc[:,[1,2,3,5,6,9,12,13,]].isnull().any(axis=1)]
y = data["Infect_Prob"]
data = data.drop(["Infect_Prob"], axis = 1)
#y = (y >= 50).astype("float64")
y /= 100

These are the columns with the categorical features:

In [5]:
cat_features = [1,2,3,5,6,9,12,13,]

Convert into one-hot representation for the neural network:

In [6]:
data_one_hot = pd.get_dummies(data = data, columns = data.columns[cat_features])

Fill in the missing values using the mean of each columns:

In [7]:
imputed_data = data_one_hot.fillna(data.mean())

In [8]:
X_train, X_test, y_train, y_test = train_test_split(imputed_data.values, y, random_state = 42)

In [9]:
# scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
model = keras.Sequential([layers.Dense(8, activation='relu', input_dim=50, kernel_regularizer=keras.regularizers.l2(0.01)), \
                          layers.Dropout(0.2), \
                          layers.Dense(4, activation='relu',  kernel_regularizer=keras.regularizers.l2(0.01)), \
                          layers.Dropout(0.2), \
                          layers.Dense(1, activation = "linear"), \
                         ])

In [11]:
model.compile(optimizer = "adam", loss = "mean_squared_error", metrics = ["mse"])
# use early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience = 10)
# save the best model
mc = ModelCheckpoint('best_model.h5', monitor='val_mse', mode='min', verbose=1, save_best_only=True)

In [12]:
model.fit(X_train, y_train, epochs = 1000, validation_data = (X_test, y_test), callbacks = [es, mc])

Train on 7248 samples, validate on 2417 samples
Epoch 1/1000
Epoch 00001: val_mse improved from inf to 0.13313, saving model to best_model.h5
Epoch 2/1000
Epoch 00002: val_mse improved from 0.13313 to 0.03277, saving model to best_model.h5
Epoch 3/1000
Epoch 00003: val_mse improved from 0.03277 to 0.01725, saving model to best_model.h5
Epoch 4/1000
Epoch 00004: val_mse improved from 0.01725 to 0.01214, saving model to best_model.h5
Epoch 5/1000
Epoch 00005: val_mse improved from 0.01214 to 0.01033, saving model to best_model.h5
Epoch 6/1000
Epoch 00006: val_mse improved from 0.01033 to 0.00978, saving model to best_model.h5
Epoch 7/1000
Epoch 00007: val_mse improved from 0.00978 to 0.00965, saving model to best_model.h5
Epoch 8/1000
Epoch 00008: val_mse improved from 0.00965 to 0.00962, saving model to best_model.h5
Epoch 9/1000
Epoch 00009: val_mse did not improve from 0.00962
Epoch 10/1000
Epoch 00010: val_mse did not improve from 0.00962
Epoch 11/1000
Epoch 00011: val_mse did not im

<tensorflow.python.keras.callbacks.History at 0x7f639c5bcf28>

In [13]:
saved_model = tf.keras.models.load_model('best_model.h5')

In [14]:
saved_model.evaluate(X_test, y_test, verbose = 2)

2417/2417 - 0s - loss: 0.0077 - mse: 0.0072


[0.00774691671309904, 0.0072109546]