## importing

In [1]:
import pandas as pd
import numpy as np

In [2]:
unscaled_data = pd.read_csv("unscaled_inputs.csv")
unscaled_data.head()

Unnamed: 0,Bengaluru,Bhubaneshwar,Chandigarh,Chennai,Delhi,Female,Male,Married,Children,occ_group1,...,Diuresis,Platelets,HBB,d-dimer,Heart rate,HDL cholesterol,Charlson Index,Blood Glucose,FT/month,Infect_Prob
0,0,1,0,0,0,1,0,1,1.0,0,...,441.0,154.0,93.0,233.0,82.0,58.0,27,7,2.0,49.13501
1,0,1,0,0,0,1,0,1,2.0,0,...,276.0,121.0,56.0,328.0,89.0,68.0,5,6,1.0,51.14788
2,0,1,0,0,0,1,0,0,1.0,1,...,416.0,124.0,137.0,213.0,77.0,43.0,40,6,1.0,73.224
3,0,1,0,0,0,1,0,1,1.0,1,...,410.0,98.0,167.0,275.0,64.0,60.0,27,7,1.0,48.779225
4,0,1,0,0,0,1,0,0,2.0,1,...,390.0,21.0,153.0,331.0,71.0,64.0,32,7,1.0,87.8688


In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [4]:
unscaled_inputs = unscaled_data.iloc[:,:-1]

targets = unscaled_data['Infect_Prob']

## standardizing the data

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

## custome scaler

In [6]:
# import the libraries needed to create the Custom Scaler
# note that all of them are a part of the sklearn package
# moreover, one of them is actually the StandardScaler module, 
# so you can imagine that the Custom Scaler is build on it

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# create the Custom Scaler class

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    # init or what information we need to declare a CustomScaler object
    # and what is calculated/declared as we do
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        
        # scaler is nothing but a Standard Scaler object
        self.scaler = StandardScaler(copy,with_mean,with_std)
        # with some columns 'twist'
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    
    # the fit method, which, again based on StandardScale
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    # the transform method which does the actual scaling

    def transform(self, X, y=None, copy=None):
        
        # record the initial order of the columns
        init_col_order = X.columns
        
        # scale all features that you chose when creating the instance of the class
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        
        # declare a variable containing all information that was not scaled
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        
        # return a data frame which contains all scaled features and all 'not scaled' features
        # use the original order (that you recorded in the beginning)
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [7]:
unscaled_inputs.columns.values

array(['Bengaluru', 'Bhubaneshwar', 'Chandigarh', 'Chennai', 'Delhi',
       'Female', 'Male', 'Married', 'Children', 'occ_group1',
       'occ_group2', 'safe_transport', 'unsafe_transport', 'cases/1M',
       'Deaths/1M', 'heart_disease', 'No_disease', 'Age', 'Coma score',
       'Pulmonary score', 'Normal_pressure', 'High_pressure', 'Diuresis',
       'Platelets', 'HBB', 'd-dimer', 'Heart rate', 'HDL cholesterol',
       'Charlson Index', 'Blood Glucose', 'FT/month'], dtype=object)

In [8]:
columns_to_scale = ['Children','cases/1M','Deaths/1M','Age', 'Coma score','Pulmonary score','Diuresis','Platelets', 'HBB', 'd-dimer', 'Heart rate', 'HDL cholesterol','Charlson Index', 'Blood Glucose', 'FT/month']

In [9]:
# declare a scaler object, specifying the columns you want to scale
scaler = CustomScaler(columns_to_scale)



In [10]:
scaler.fit(unscaled_inputs)



CustomScaler(columns=['Children', 'cases/1M', 'Deaths/1M', 'Age', 'Coma score',
                      'Pulmonary score', 'Diuresis', 'Platelets', 'HBB',
                      'd-dimer', 'Heart rate', 'HDL cholesterol',
                      'Charlson Index', 'Blood Glucose', 'FT/month'],
             copy=None, with_mean=None, with_std=None)

In [11]:
scaled_inputs = scaler.transform(unscaled_inputs)
scaled_inputs.head()

Unnamed: 0,Bengaluru,Bhubaneshwar,Chandigarh,Chennai,Delhi,Female,Male,Married,Children,occ_group1,occ_group2,safe_transport,unsafe_transport,cases/1M,Deaths/1M,heart_disease,No_disease,Age,Coma score,Pulmonary score,Normal_pressure,High_pressure,Diuresis,Platelets,HBB,d-dimer,Heart rate,HDL cholesterol,Charlson Index,Blood Glucose,FT/month
0,0,1,0,0,0,1,0,1,-0.038658,0,1,0,1,-0.472554,-0.612505,1,0,1.691367,-0.272711,1.336249,1,0,1.715549,1.654509,-0.420281,-1.024927,0.507079,0.515137,0.589446,1.418137,1.286246
1,0,1,0,0,0,1,0,1,1.172395,0,1,1,0,-0.472554,-0.612505,1,0,1.430316,1.611645,-1.345259,0,1,-0.027667,0.829587,-1.091361,1.273339,1.004442,1.475128,-1.270564,0.703701,0.076445
2,0,1,0,0,0,1,0,0,-0.038658,1,0,0,1,-0.472554,-0.612505,0,1,-1.506507,1.073257,0.442413,1,0,1.451425,0.90458,0.37776,-1.508773,0.151819,-0.924849,1.688543,0.703701,0.076445
3,0,1,0,0,0,1,0,1,-0.038658,1,0,1,0,-0.472554,-0.612505,1,0,-0.592828,-0.003518,-0.451423,0,1,1.388036,0.254641,0.921879,-0.008851,-0.771855,0.707135,0.589446,1.418137,0.076445
4,0,1,0,0,0,1,0,0,1.172395,1,0,1,0,-0.472554,-0.612505,1,0,-1.245456,-0.541905,1.336249,1,0,1.176737,-1.670179,0.667956,1.345916,-0.274492,1.091131,1.012176,1.418137,0.076445


In [12]:
new_list = []
for i in range (len(targets)):
        if targets[i]>=50:
            targets[i] =1
        else:
            targets[i] =0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


## shuffling_the _dataset

In [13]:
shuffled_indices = np.arange(scaled_inputs.shape[0])

np.random.shuffle(shuffled_indices)

In [14]:
shuffled_indices

array([ 340, 8882, 1116, ..., 7148, 2109, 1017])

In [15]:
shuffled_inputs = scaled_inputs.iloc[shuffled_indices]

shuffled_targets = targets[shuffled_indices]

## train_test_validation

In [16]:
sample_count = scaled_inputs.shape[0]

In [17]:
train_count = int(0.8 * sample_count)

validation_count = int(0.1 * sample_count)

test_count = sample_count - (train_count+validation_count)

In [18]:
#spliting data

train_inputs = shuffled_inputs[:train_count]
train_targets = shuffled_targets[:train_count]

In [19]:
train_targets.head()

340     0.0
8882    1.0
1116    0.0
383     1.0
7030    0.0
Name: Infect_Prob, dtype: float64

In [20]:
validation_inputs =shuffled_inputs[train_count:train_count+validation_count]
validation_targets = shuffled_targets[train_count:train_count+validation_count]

In [21]:
test_inputs = shuffled_inputs[train_count+validation_count:]
test_targets =shuffled_targets[train_count+validation_count:]

## saving in npz form for tensorflow inputs

In [22]:
np.savez('covid_data_train', inputs=train_inputs, targets=train_targets)
np.savez('covid_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('covid_data_test', inputs=test_inputs, targets=test_targets)