In [0]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import xgboost as xgb

In [0]:
# Folder that holds the competition spreadsheets.
path = 'drive/My Drive/'

# Read the training sheet with its 28 columns relabelled 0..27, discard
# columns 0, 1, 3 and 4, then relabel the 24 survivors 0..23 so that a
# column label is also its position.
data_train = (
    pd.read_excel(path + 'Train_dataset.xlsx', header=0, names=np.arange(28))
    .drop(columns=[0, 1, 3, 4])
)
data_train.columns = np.arange(24)

In [145]:
# Inspect the dtype of each of the 24 columns (object vs. numeric).
data_train.dtypes

0      object
1      object
2     float64
3      object
4      object
5       int64
6       int64
7      object
8       int64
9       int64
10     object
11     object
12    float64
13    float64
14    float64
15    float64
16    float64
17    float64
18      int64
19      int64
20    float64
21      int64
22    float64
23    float64
dtype: object

In [146]:
# Preview the first five rows (5 is the default for head()).
data_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,Female,YES,1.0,Farmer,Public,2,0,Hypertension,68,8,<400,Normal,441.0,154.0,93.0,233.0,82.0,58.0,27,7,3600000.0,1300000,2.0,49.13501
1,Female,YES,2.0,Farmer,Walk,2,0,Diabetes,64,15,<100,Stage-02,,121.0,56.0,328.0,89.0,68.0,5,6,1600000.0,400000,1.0,51.14788
2,Female,NO,1.0,Cleaner,Public,2,0,,19,13,<300,Elevated,416.0,124.0,137.0,213.0,77.0,43.0,40,6,3400000.0,900000,1.0,73.224
3,Female,YES,1.0,Driver,Car,2,0,Coronary Heart Disease,33,9,<200,Stage-01,410.0,98.0,167.0,275.0,64.0,60.0,27,7,700000.0,2300000,1.0,48.779225
4,Female,NO,2.0,Manufacturing,Car,2,0,Diabetes,23,7,<400,Normal,390.0,21.0,153.0,331.0,71.0,64.0,32,7,3200000.0,1100000,1.0,87.8688


In [147]:
# Per column: does it contain at least one missing value?
data_train.isna().any()

0     False
1     False
2      True
3      True
4      True
5     False
6     False
7      True
8     False
9     False
10    False
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18    False
19    False
20     True
21    False
22     True
23    False
dtype: bool

In [0]:
# Features are every column except the last; the last column is the target.
X_train = data_train.iloc[:, :-1].to_numpy()
Y_train = data_train.iloc[:, -1].to_numpy()

def pre_process(X_train, fitted=None):
  """Impute, one-hot encode and standardise a raw feature matrix.

  Which columns are categorical and which are numeric is read from the dtypes
  of the module-level ``data_train`` frame. Its first ``X_train.shape[1]``
  columns are matched to the columns of ``X_train`` by position, so a trailing
  target column in ``data_train`` is ignored.

  Steps, in order:
    1. categorical columns: missing values -> most frequent value
    2. numeric columns: missing values -> column mean
    3. categorical columns are one-hot encoded, the rest is passed through
       (the encoded columns come first in the output)
    4. every output column is standardised with ``StandardScaler``

  Parameters
  ----------
  X_train : numpy.ndarray
      2-D array of raw features. It is copied; the caller's array is not
      modified.
  fitted : dict, optional
      Holder for the fitted transformers.
      * ``None`` or an empty dict: the transformers are fitted on ``X_train``
        (and stored in the dict when one was passed).
      * a dict filled by an earlier call: the stored transformers are applied
        without refitting, so new data is imputed/encoded/scaled with the
        statistics of the data they were fitted on. With the default
        ``OneHotEncoder`` a category unseen at fit time raises an error.

  Returns
  -------
  numpy.ndarray
      Float matrix of transformed features.
  """
  print(X_train.shape)

  # Work on a copy so repeated calls on the same raw array stay idempotent.
  X = X_train.copy()

  # Split feature positions by dtype. Bool columns count as numeric, matching
  # what DataFrame._get_numeric_data() used to select.
  numeric_labels = set(data_train.select_dtypes(include=["number", "bool"]).columns)
  feature_labels = data_train.columns[:X.shape[1]]
  cols_cat = [pos for pos, label in enumerate(feature_labels) if label not in numeric_labels]
  cols_num = [pos for pos, label in enumerate(feature_labels) if label in numeric_labels]

  reuse = bool(fitted)
  steps = fitted if fitted is not None else {}

  if not reuse:
    steps['imputer_cat'] = SimpleImputer(strategy="most_frequent").fit(X[:, cols_cat])
    steps['imputer_num'] = SimpleImputer(strategy="mean").fit(X[:, cols_num])
  X[:, cols_cat] = steps['imputer_cat'].transform(X[:, cols_cat])
  X[:, cols_num] = steps['imputer_num'].transform(X[:, cols_num])

  print(X[:5,:])

  if not reuse:
    steps['encoder'] = ColumnTransformer(
        [('encoder', OneHotEncoder(), cols_cat)], remainder='passthrough'
    ).fit(X)
  X = steps['encoder'].transform(X)
  print(X.shape)

  # The passthrough columns come out of the object array, hence the cast.
  X = X.astype(float)
  if not reuse:
    steps['scaler'] = StandardScaler().fit(X)
  X = steps['scaler'].transform(X)

  return X

In [0]:
# MSE wrapped as a scikit-learn scorer. greater_is_better=False makes the
# scorer report the negated MSE, so values closer to zero are better.
scorer = make_scorer(mean_squared_error,greater_is_better=False)

In [0]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

In [0]:
from sklearn.model_selection import cross_val_score
# Candidate regressors, all with library-default hyper-parameters.
models = [
    xgb.XGBRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    LinearRegression(),
    Lasso(),
    Ridge(),
    SVR(),
]

In [0]:
# At this point X_train is still the raw object array (strings and NaNs); it
# is only transformed in a later cell. Pre-process a copy here so the
# comparison runs top-to-bottom on a fresh kernel and the raw X_train stays
# available for the final fit.
# NOTE(review): the transformers are fitted on all rows before the folds are
# cut, so each fold sees imputation/scaling statistics of its held-out part.
X_cv = pre_process(X_train.copy())

# 5-fold negated-MSE scores, one array of five values per candidate model.
models_score = []
for model in models:
  score = cross_val_score(model, X_cv, Y_train, scoring=scorer, cv=5)
  models_score.append(score)
print(models_score)

[array([-69.10411515, -72.80988885, -88.15994838, -81.07605218,
       -80.04420178]), array([-70.8085118 , -76.94434265, -88.67253557, -83.4878719 ,
       -83.27827821]), array([-68.69639089, -72.09983516, -87.69230534, -80.62977625,
       -80.61269701]), array([-68.56619074, -72.72034971, -87.3072012 , -81.46475996,
       -80.78212474]), array([-71.89964583, -77.81758656, -92.52174653, -83.04224132,
       -85.43798038]), array([-73.83824819, -80.16311817, -98.24989593, -85.47338412,
       -94.24753681]), array([-71.87840416, -77.9761069 , -92.63499228, -83.14781397,
       -85.42528104]), array([-75.60504177, -80.90406919, -88.71971787, -86.75447247,
       -84.99667854])]


In [151]:
# Transform the raw features and fit the final random forest on every row.
X_train = pre_process(X_train)
model_final = RandomForestRegressor()
model_final.fit(X_train, Y_train)

# Both figures below are measured on the rows the model was trained on.
train_r2 = model_final.score(X_train, Y_train)
print(train_r2)
train_rmse = np.sqrt(mean_squared_error(Y_train, model_final.predict(X_train)))
print(train_rmse)

(10714, 23)
[['Female' 'YES' 1.0 'Farmer' 'Public' 2.0 0.0 'Hypertension' 68.0 8.0
  '<400' 'Normal' 441.0 154.0 93.0 233.0 82.0 58.0 27.0 7.0 3600000.0
  1300000.0 2.0]
 ['Female' 'YES' 2.0 'Farmer' 'Walk' 2.0 0.0 'Diabetes' 64.0 15.0 '<100'
  'Stage-02' 278.80626125225047 121.0 56.0 328.0 89.0 68.0 5.0 6.0
  1600000.0 400000.0 1.0]
 ['Female' 'NO' 1.0 'Cleaner' 'Public' 2.0 0.0 'None' 19.0 13.0 '<300'
  'Elevated' 416.0 124.0 137.0 213.0 77.0 43.0 40.0 6.0 3400000.0
  900000.0 1.0]
 ['Female' 'YES' 1.0 'Driver' 'Car' 2.0 0.0 'Coronary Heart Disease' 33.0
  9.0 '<200' 'Stage-01' 410.0 98.0 167.0 275.0 64.0 60.0 27.0 7.0
  700000.0 2300000.0 1.0]
 ['Female' 'NO' 2.0 'Manufacturing' 'Car' 2.0 0.0 'Diabetes' 23.0 7.0
  '<400' 'Normal' 390.0 21.0 153.0 331.0 71.0 64.0 32.0 7.0 3200000.0
  1100000.0 1.0]]
(10714, 44)
0.9053770400691024
3.2834646313427784


In [0]:
# Same layout as the training sheet minus the target: 27 columns relabelled
# 0..27-1, columns 0, 1, 3 and 4 discarded, the remaining 23 relabelled 0..22.
data_test = (
    pd.read_excel(path + 'Test_dataset.xlsx', header=0, names=np.arange(27))
    .drop(columns=[0, 1, 3, 4])
)
data_test.columns = np.arange(23)

In [0]:
# NOTE(review): pre_process fits its imputers, encoder and scaler on whatever
# it is given, so the test features here are imputed/scaled with test-set
# statistics rather than the ones model_final was trained with.
X_test = pre_process(data_test.to_numpy())
y_pred = model_final.predict(X_test)

(14498, 23)
[['Female' 'YES' 2.0 'Driver' 'Public' 4.0 1.0 'Diabetes' 52.0 3.0 '<200'
  'Stage-01' 388.0 153.0 196.0 240.0 85.0 53.0 17.0 3.0 3900000.0
  1300000.0 1.0]
 ['Male' 'YES' 2.0 'Legal' 'Walk' 4.0 1.0 'Diabetes' 53.0 11.0 '<200'
  'Stage-01' 409.0 95.0 138.0 241.0 81.0 61.0 2.0 5.0 1800000.0 1300000.0
  1.0]
 ['Female' 'YES' 2.0 'Sales' 'Car' 4.0 1.0 'Diabetes' 35.0 9.0 '<100'
  'Stage-02' 440.0 40.0 166.0 236.0 88.0 47.0 24.0 3.0 5000000.0
  2000000.0 2.0]
 ['Female' 'YES' 2.0 'Sales' 'Car' 4.0 1.0 'None' 31.0 12.0 '<300'
  'Elevated' 206.0 78.0 83.0 211.0 87.0 52.0 13.0 6.0 3100000.0 600000.0
  2.0]
 ['Female' 'YES' 2.0 'Business' 'Car' 4.0 1.0 'Diabetes' 51.0 6.0 '<200'
  'Stage-01' 229.0 109.0 207.0 312.0 94.0 68.0 39.0 5.0 2300000.0
  1500000.0 1.0]]
(14498, 44)


In [0]:
# Sanity check: one prediction per test row.
y_pred.shape

(14498,)

In [0]:
# Range of the training target, for comparison with the predictions.
target_max, target_min = max(Y_train), min(Y_train)
print(target_max, target_min)

97.632 29.2896


In [0]:
# Submission frame: the ID column (column 0 of the test sheet) next to the
# predicted value for each row.
out_df = pd.DataFrame({
    'people_ID': pd.read_excel(path + 'Test_dataset.xlsx', header=0, names=np.arange(27))[0],
    'infect_prob': y_pred,
})

In [0]:
# Write the predictions; index=True (the default) also writes the row index
# as an unnamed first column.
out_df.to_csv(path + 'infect_prob_20_mar_2020.csv', index=True)

In [153]:
# Load the 'Diuresis_TS' sheet of the training workbook and preview it.
xls = pd.ExcelFile(path + 'Train_dataset.xlsx')
data_train_1 = pd.read_excel(xls, sheet_name='Diuresis_TS')
data_train_1.head()

Unnamed: 0,people_ID,2020-03-20 00:00:00,2020-03-21 00:00:00,2020-03-22 00:00:00,2020-03-23 00:00:00,2020-03-24 00:00:00,2020-03-25 00:00:00,2020-03-26 00:00:00
0,1,441,544.2,669.64,821.368,1002.2416,1215.68992,1472.627904
1,2,151,197.2,253.24,321.688,399.9256,492.91072,606.892864
2,3,416,515.2,632.44,776.728,947.7736,1150.32832,1395.793984
3,4,410,506.0,621.4,761.88,928.156,1126.7872,1368.34464
4,5,390,483.0,595.0,730.2,892.84,1084.408,1315.8896


In [0]:
# Number of leading rows used for fitting; the remaining rows are held out
# for validation.
N_TRAIN_TS = 10000

# np.asarray accepts both the DataFrame loaded above and the ndarray this
# cell leaves behind, so the cell can be re-run (`.values` would fail on the
# second run).
data_train_1 = np.asarray(data_train_1)

sc = StandardScaler()  # instantiated but not applied: the series stay unscaled

# Column 0 is dropped; the remaining seven columns are the daily readings.
train_dataset = data_train_1[:N_TRAIN_TS, 1:]
val_dataset = data_train_1[N_TRAIN_TS:, 1:]

In [155]:
# Shapes of the two splits, then the first five training series.
print(train_dataset.shape, val_dataset.shape)
print(train_dataset[:5])

(10000, 7) (714, 7)
[[ 441.        544.2       669.64      821.368    1002.2416   1215.68992
  1472.627904]
 [ 151.        197.2       253.24      321.688     399.9256    492.91072
   606.892864]
 [ 416.        515.2       632.44      776.728     947.7736   1150.32832
  1395.793984]
 [ 410.        506.        621.4       761.88      928.156    1126.7872
  1368.34464 ]
 [ 390.        483.        595.        730.2       892.84     1084.408
  1315.8896  ]]


In [0]:
# First six readings are the input sequence, the seventh is the value to
# predict. A trailing axis of length 1 gives the (samples, timesteps, features)
# layout; using np.newaxis instead of a hard-coded row count keeps this
# working for any number of rows in either split.
X_train1 = train_dataset[:, :6, np.newaxis]
Y_train1 = train_dataset[:, 6]
X_val1 = val_dataset[:, :6, np.newaxis]
Y_val1 = val_dataset[:, 6]

In [0]:
# Best-effort selection of TensorFlow 2.x; any failure of the magic is
# deliberately ignored so the import below still runs.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

# Mini-batch size, and the size of the shuffle buffer for the training set.
BATCH_SIZE = 16
BUFFER_SIZE = 10000

# Training pipeline: cache the (sequence, target) pairs, shuffle, batch, and
# repeat indefinitely (the number of batches per epoch is set at fit time).
train_univariate = tf.data.Dataset.from_tensor_slices((X_train1, Y_train1))
train_univariate = train_univariate.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()

# Validation pipeline: same batching, no shuffling, repeated indefinitely.
val_univariate = tf.data.Dataset.from_tensor_slices((X_val1, Y_val1))
val_univariate = val_univariate.batch(BATCH_SIZE).repeat()


In [158]:
# Show the element shapes and dtypes of the training pipeline.
train_univariate

<RepeatDataset shapes: ((None, 6, 1), (None,)), types: (tf.float64, tf.float64)>

In [0]:
# Three stacked LSTM layers (64 -> 32 -> 32 units) with dropout after the
# first one, followed by a single-unit dense head that outputs the forecast.
simple_lstm_model = tf.keras.models.Sequential([
    # input_shape must be (timesteps, features) of the sequence tensor
    # X_train1. The tabular matrix X_train has a different, unrelated shape
    # and must not be used here.
    tf.keras.layers.LSTM(units=64, return_sequences=True, input_shape=X_train1.shape[-2:]),
    tf.keras.layers.Dropout(0.15),
    tf.keras.layers.LSTM(units=32, return_sequences=True),
    # Last recurrent layer returns only its final state for the dense head.
    tf.keras.layers.LSTM(units=32),
    tf.keras.layers.Dense(1)
])

# Mean absolute error as the training loss, Adam with default settings.
simple_lstm_model.compile(optimizer='adam', loss='mae')

In [142]:
# Both datasets repeat forever, so the length of an "epoch" is fixed here:
# EVALUATION_INTERVAL training batches, then 50 validation batches.
EVALUATION_INTERVAL = 200
EPOCHS = 200

simple_lstm_model.fit(
    train_univariate,
    validation_data=val_univariate,
    epochs=EPOCHS,
    steps_per_epoch=EVALUATION_INTERVAL,
    validation_steps=50,
)

Train for 200 steps, validate for 50 steps
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoc

<tensorflow.python.keras.callbacks.History at 0x7feaf99599e8>

In [0]:
# Forecasts for the training sequences (X_train1 has exactly six timesteps,
# so it is passed whole).
y_pred = simple_lstm_model.predict(X_train1)

In [160]:
# Shape of the forecasts returned by the LSTM for the training sequences.
y_pred.shape

(10000, 1)

In [162]:
# RMSE of the LSTM on its own training sequences (arguments in the
# conventional (y_true, y_pred) order; the value is the same either way).
np.sqrt(mean_squared_error(Y_train1, y_pred))

18.047847571498416

In [168]:
# Rows and columns of the full time-series array (ID column included).
data_train_1.shape

(10714, 8)

In [0]:
# Shift the window by one day: columns 2 onwards (the six most recent
# readings) are the input, so the output is the forecast for the day after
# the last column. np.newaxis adds the feature axis without hard-coding the
# number of rows.
y_pred1 = simple_lstm_model.predict(data_train_1[:, 2:, np.newaxis])

In [171]:
y_pred1[:20]

array([[1428.6642 ],
       [ 735.49207],
       [1428.6626 ],
       [1428.6616 ],
       [1428.6545 ],
       [1428.6521 ],
       [1328.4786 ],
       [ 867.361  ],
       [1280.5012 ],
       [1035.2844 ],
       [1428.6149 ],
       [ 867.1195 ],
       [1428.6602 ],
       [ 611.83496],
       [1428.6453 ],
       [1035.953  ],
       [ 886.3091 ],
       [ 834.9343 ],
       [ 836.9475 ],
       [ 971.86053]], dtype=float32)

In [0]:
# Reload the training sheet, this time keeping column 0 (the person IDs)
# aside before the same column clean-up as before: drop 0, 1, 3, 4 and
# relabel the remaining 24 columns 0..23.
train_sheet = pd.read_excel(path + 'Train_dataset.xlsx', header=0, names=np.arange(28))
people_id = train_sheet[0]
data_train = train_sheet.drop(columns=[0, 1, 3, 4])
data_train.columns = np.arange(24)

In [186]:
# Preview the freshly reloaded frame (first five rows).
data_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,Female,YES,1.0,Farmer,Public,2,0,Hypertension,68,8,<400,Normal,441.0,154.0,93.0,233.0,82.0,58.0,27,7,3600000.0,1300000,2.0,49.13501
1,Female,YES,2.0,Farmer,Walk,2,0,Diabetes,64,15,<100,Stage-02,,121.0,56.0,328.0,89.0,68.0,5,6,1600000.0,400000,1.0,51.14788
2,Female,NO,1.0,Cleaner,Public,2,0,,19,13,<300,Elevated,416.0,124.0,137.0,213.0,77.0,43.0,40,6,3400000.0,900000,1.0,73.224
3,Female,YES,1.0,Driver,Car,2,0,Coronary Heart Disease,33,9,<200,Stage-01,410.0,98.0,167.0,275.0,64.0,60.0,27,7,700000.0,2300000,1.0,48.779225
4,Female,NO,2.0,Manufacturing,Car,2,0,Diabetes,23,7,<400,Normal,390.0,21.0,153.0,331.0,71.0,64.0,32,7,3200000.0,1100000,1.0,87.8688


In [189]:
# IDs taken from column 0 of the training sheet, one per row.
people_id

0            1
1            2
2            3
3            4
4            5
         ...  
10709    22691
10710    22692
10711    22693
10712    22694
10713    22695
Name: 0, Length: 10714, dtype: int64

In [0]:
# Overwrite column 12 with the LSTM forecast (flattened from (n, 1) to (n,)).
data_train[12] = y_pred1.ravel()

In [188]:
# Confirm that column 12 now carries the forecast values.
data_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,Female,YES,1.0,Farmer,Public,2,0,Hypertension,68,8,<400,Normal,1428.664185,154.0,93.0,233.0,82.0,58.0,27,7,3600000.0,1300000,2.0,49.13501
1,Female,YES,2.0,Farmer,Walk,2,0,Diabetes,64,15,<100,Stage-02,735.492065,121.0,56.0,328.0,89.0,68.0,5,6,1600000.0,400000,1.0,51.14788
2,Female,NO,1.0,Cleaner,Public,2,0,,19,13,<300,Elevated,1428.662598,124.0,137.0,213.0,77.0,43.0,40,6,3400000.0,900000,1.0,73.224
3,Female,YES,1.0,Driver,Car,2,0,Coronary Heart Disease,33,9,<200,Stage-01,1428.661621,98.0,167.0,275.0,64.0,60.0,27,7,700000.0,2300000,1.0,48.779225
4,Female,NO,2.0,Manufacturing,Car,2,0,Diabetes,23,7,<400,Normal,1428.654541,21.0,153.0,331.0,71.0,64.0,32,7,3200000.0,1100000,1.0,87.8688


In [0]:
# Feature matrix for the second prediction round: the updated training frame
# without its last (target) column.
data_test = data_train.iloc[:, :-1].to_numpy()

In [194]:
# NOTE(review): pre_process refits its scaler on this matrix, so the shifted
# column 12 is standardised with its own new mean/std rather than the ones
# model_final was trained with — confirm this is intended.
X_test = pre_process(data_test)

(10714, 23)
[['Female' 'YES' 1.0 'Farmer' 'Public' 2.0 0.0 'Hypertension' 68.0 8.0
  '<400' 'Normal' 1428.6641845703125 154.0 93.0 233.0 82.0 58.0 27.0 7.0
  3600000.0 1300000.0 2.0]
 ['Female' 'YES' 2.0 'Farmer' 'Walk' 2.0 0.0 'Diabetes' 64.0 15.0 '<100'
  'Stage-02' 735.4920654296875 121.0 56.0 328.0 89.0 68.0 5.0 6.0
  1600000.0 400000.0 1.0]
 ['Female' 'NO' 1.0 'Cleaner' 'Public' 2.0 0.0 'None' 19.0 13.0 '<300'
  'Elevated' 1428.66259765625 124.0 137.0 213.0 77.0 43.0 40.0 6.0
  3400000.0 900000.0 1.0]
 ['Female' 'YES' 1.0 'Driver' 'Car' 2.0 0.0 'Coronary Heart Disease' 33.0
  9.0 '<200' 'Stage-01' 1428.66162109375 98.0 167.0 275.0 64.0 60.0 27.0
  7.0 700000.0 2300000.0 1.0]
 ['Female' 'NO' 2.0 'Manufacturing' 'Car' 2.0 0.0 'Diabetes' 23.0 7.0
  '<400' 'Normal' 1428.654541015625 21.0 153.0 331.0 71.0 64.0 32.0 7.0
  3200000.0 1100000.0 1.0]]
(10714, 44)


In [0]:
# Random-forest predictions for the rows with the forecast in column 12.
y_pred2 = model_final.predict(X_test)

In [196]:
# Range of the second-round predictions.
pred_max, pred_min = max(y_pred2), min(y_pred2)
print(pred_max, pred_min)

89.55783359999987 38.27174400000004


In [0]:
# Second submission frame: person IDs next to the second-round predictions.
out_df1 = pd.DataFrame({'people_ID': people_id, 'infect_prob': y_pred2})


In [198]:
# Preview the first five rows of the second submission.
out_df1.head()

Unnamed: 0,people_ID,infect_prob
0,1,49.074164
1,2,51.093959
2,3,69.738538
3,4,48.9161
4,5,79.81416


In [0]:
# Write the second-round predictions; index=True (the default) also writes
# the row index as an unnamed first column.
out_df1.to_csv(path + 'infect_prob_27_mar_2020.csv', index=True)