In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import metrics

### Load Data

In [2]:
data = pd.read_csv('FinalDataset.csv')

In [3]:
data.isnull().sum()

Declaration Type                     0
State                                0
Disaster Type                        0
Disaster Title                       0
Start Date                           0
End Date                             0
Individual Assistance Program        0
Individuals & Households Program     0
Public Assistance Program            0
Hazard Mitigation Program            0
Year                                 0
State Population                    66
Land Area                           66
Total Area                          66
Animals                             66
Arts, Culture, Humanities           66
Community Development               66
Education                           66
Environment                         66
Health                              66
Human Services                      66
Human and Civil Rights              66
International                       66
Religion                            66
Research and Public Policy          66
Total Nonprofits         

In [4]:
# drop rows with Null values
data.dropna(axis=0, inplace=True)

### Split data into train and test

In [5]:
# define X and y variable
X = data.loc[:, ['Disaster Type', 'Individual Assistance Program', 'Individuals & Households Program', 'Public Assistance Program',
                'Hazard Mitigation Program', 'State Population', 'Total Area', 'Animals', 'Education', 'Environment', 'Health',
                'Human Services', 'Human and Civil Rights', 'International', 'Religion', 'Research and Public Policy', 
                 'Total Nonprofits']]

y = data.loc[:, ['No. of Days']]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

### Split train data into numerical and categorical variables

In [7]:
cat_feats = X_train.dtypes[X_train.dtypes == 'object'].index.tolist()
num_feats = X_train.dtypes[~X_train.dtypes.index.isin(cat_feats)].index.tolist()

In [8]:
# use own function in Pipeline - to transform numerical and categorical columns
def numFeat(data):
    return data[num_feats]

def catFeat(data):
    return data[cat_feats]

In [40]:
# start two separate pipelines for each type of features
keep_num = FunctionTransformer(numFeat)
keep_cat = FunctionTransformer(catFeat)

### Create Pipelines

#### Pipeline for Categorical features

In [10]:
# to dense function - to apply after one hot encoding
class ToDenseTransformer():

    # here you define the operation it should perform
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    # just return self
    def fit(self, X, y=None, **fit_params):
        return self

In [11]:
# categorical features pipeline
pipeline_cat = Pipeline([
    ('categorical_features', keep_cat),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('to_dense',ToDenseTransformer())])

#### Pipeline for Numerical features

In [12]:
# numerical features pipeline
pipeline_num = Pipeline([
    ('numerical_features', keep_num),
    ('scaling', StandardScaler())])

#### Combine both pipelines

In [13]:
# combine pipelines by creating a feature union
feature_union = FeatureUnion([('num_variables', pipeline_num), 
                              ('cat_variables', pipeline_cat)])

#### Create main pipeline

In [14]:
# create main pipeline
pipeline = Pipeline(steps=[('features', feature_union),
                           ('regressor', LinearRegression())])

### Fit and Predict model

In [15]:
# fit the model create through pipeline
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('num_variables',
                                                 Pipeline(steps=[('numerical_features',
                                                                  FunctionTransformer(func=<function numFeat at 0x7f006241c0d0>)),
                                                                 ('scaling',
                                                                  StandardScaler())])),
                                                ('cat_variables',
                                                 Pipeline(steps=[('categorical_features',
                                                                  FunctionTransformer(func=<function catFeat at 0x7f006241c160>)),
                                                                 ('ohe',
                                                                  OneHotEncoder(handle_unknown='ignore')),
                                                      

In [16]:
X_train['Disaster Type'].unique()

array(['Storm', 'Snow', 'Hurricane', 'Flood', 'Winter', 'Fire', 'Tornado',
       'Ice', 'Earthquake', 'Other', 'Terrorism', 'Dam/Levee Break',
       'Chemical', 'Tsunami', 'Mud/Landslide'], dtype=object)

In [17]:
X_test['Disaster Type'].unique()

array(['Storm', 'Hurricane', 'Fire', 'Flood', 'Earthquake', 'Ice', 'Snow',
       'Tornado', 'Other', 'Winter', 'Mud/Landslide', 'Volcano',
       'Chemical', 'Tsunami'], dtype=object)

In [18]:
# predict the data 
y_pred = pipeline.predict(X_test)

In [19]:
# prediction score
pipeline.score(X_test,y_test)

0.23144355606305045

In [20]:
print(r2_score(y_test, y_pred))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.23144355606305045
Mean Absolute Error: 19.68571295805295
Mean Squared Error: 1042.9178012723435
Root Mean Squared Error: 32.29423789582816


### Prediction with RandomForestRegresser

In [21]:
# initialize regressor
clf = RandomForestRegressor(n_estimators=100)

In [22]:
# create main pipeline for RandomForestRegressor
pipeline_clf = Pipeline(steps=[('features', feature_union),
                           ('regressor', clf)])

In [23]:
# fit the RandomForestRegressor through pipeline
pipeline_clf.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('num_variables',
                                                 Pipeline(steps=[('numerical_features',
                                                                  FunctionTransformer(func=<function numFeat at 0x7f006241c0d0>)),
                                                                 ('scaling',
                                                                  StandardScaler())])),
                                                ('cat_variables',
                                                 Pipeline(steps=[('categorical_features',
                                                                  FunctionTransformer(func=<function catFeat at 0x7f006241c160>)),
                                                                 ('ohe',
                                                                  OneHotEncoder(handle_unknown='ignore')),
                                                      

In [24]:
# predict the data with RandomForestRegressor
y_pred_clf = pipeline_clf.predict(X_test)

In [25]:
# prediction score with RandomForestRegressor
pipeline_clf.score(X_test,y_test)

0.7705375285641054

In [35]:
print(r2_score(y_test, y_pred_clf))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_clf))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_clf))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_clf)))

0.7705375285641054
Mean Absolute Error: 8.723727145164979
Mean Squared Error: 311.37660489653445
Root Mean Squared Error: 17.64586651022087


### GridSearch

In [27]:
param_grid = { 
    'regressor__n_estimators': [200, 500],
    'regressor__max_depth' : [4,5,6,7,8]}

#{'regressor_min_samples_split': range(2, 403, 10)}

In [28]:
grid_clf = GridSearchCV(pipeline_clf,
                  param_grid=param_grid, cv=5)

In [29]:
# ravel target variable
# y_train = np.array(y_train)
#y_train = y_train.ravel()
#y_test = np.array(y_test)
#y_test = y_test.ravel()

In [31]:
grid_clf.fit(X_train,y_train.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('num_variables',
                                                                        Pipeline(steps=[('numerical_features',
                                                                                         FunctionTransformer(func=<function numFeat at 0x7f006241c0d0>)),
                                                                                        ('scaling',
                                                                                         StandardScaler())])),
                                                                       ('cat_variables',
                                                                        Pipeline(steps=[('categorical_features',
                                                                                         FunctionTransformer(func=<function catFeat at 0x7f006241c160>)),
        

In [32]:
# predict the data with GridSearch
y_pred_grid = grid_clf.predict(X_test)

In [33]:
grid_clf.score(X_test,y_test)

0.7023593879727535

In [34]:
print(r2_score(y_test, y_pred_grid))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_grid))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_grid))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_grid)))

0.7023593879727535
Mean Absolute Error: 12.32645099648816
Mean Squared Error: 403.8931624523288
Root Mean Squared Error: 20.09709338318178


### RandomForestRegressor without nonprofit information

#### Split the dataset

In [36]:
# define X and y variable
X1 = data.loc[:, ['Disaster Type', 'Individual Assistance Program', 'Individuals & Households Program', 'Public Assistance Program',
                'Hazard Mitigation Program', 'State Population', 'Total Area']]

y1 = data.loc[:, ['No. of Days']]

In [37]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.30)

In [38]:
# split data into numerical and categorical features
cat_feats1 = X_train1.dtypes[X_train1.dtypes == 'object'].index.tolist()
num_feats1 = X_train1.dtypes[~X_train1.dtypes.index.isin(cat_feats)].index.tolist()

In [41]:
# use own function in Pipeline - to transform numerical and categorical columns
def numFeat1(data):
    return data[num_feats1]

def catFeat1(data):
    return data[cat_feats1]

In [42]:
# start two separate pipelines for each type of features
keep_num1 = FunctionTransformer(numFeat1)
keep_cat1 = FunctionTransformer(catFeat1)

In [43]:
# categorical features pipeline
pipeline_cat1 = Pipeline([
    ('categorical_features', keep_cat1),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('to_dense',ToDenseTransformer())])

In [44]:
# numerical features pipeline
pipeline_num1 = Pipeline([
    ('numerical_features', keep_num1),
    ('scaling', StandardScaler())])

In [45]:
# combine pipelines by creating a feature union
feature_union1 = FeatureUnion([('num_variables', pipeline_num1), 
                              ('cat_variables', pipeline_cat1)])

In [46]:
# create main pipeline
# create main pipeline for RandomForestRegressor
pipeline_clf1 = Pipeline(steps=[('features', feature_union1),
                           ('regressor', clf)])

In [48]:
# fit the RandomForestRegressor through pipeline
pipeline_clf1.fit(X_train1, y_train1.values.ravel())

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('num_variables',
                                                 Pipeline(steps=[('numerical_features',
                                                                  FunctionTransformer(func=<function numFeat1 at 0x7f005e034dc0>)),
                                                                 ('scaling',
                                                                  StandardScaler())])),
                                                ('cat_variables',
                                                 Pipeline(steps=[('categorical_features',
                                                                  FunctionTransformer(func=<function catFeat1 at 0x7f005e0349d0>)),
                                                                 ('ohe',
                                                                  OneHotEncoder(handle_unknown='ignore')),
                                                    

In [49]:
# predict the data with RandomForestRegressor
y_pred_clf1 = pipeline_clf1.predict(X_test1)

In [51]:
# prediction score with RandomForestRegressor
pipeline_clf1.score(X_test1,y_test1)

0.7774515172045482

In [52]:
print(r2_score(y_test1, y_pred_clf1))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test1, y_pred_clf1))
print('Mean Squared Error:', metrics.mean_squared_error(y_test1, y_pred_clf1))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test1, y_pred_clf1)))

0.7774515172045482
Mean Absolute Error: 8.724613964166862
Mean Squared Error: 301.74015517604244
Root Mean Squared Error: 17.370669393435662


In [53]:
X_train1.columns

Index(['Disaster Type', 'Individual Assistance Program',
       'Individuals & Households Program', 'Public Assistance Program',
       'Hazard Mitigation Program', 'State Population', 'Total Area'],
      dtype='object')