In [1]:
#######
# load data to pandas
# train
# tune parameters on holdout set
# test
# kfolds cross validation
#######
# comparisons:
# linear regression
# decision trees (random forest via scikit learn)
# ff neural network (lin, relu, lin) higher order polyn

In [1]:
%load_ext autoreload
%autoreload 2

# Imports
import pandas as pd
from core_insure.assessor.home_assessor import HomeAssessor
from ruamel.yaml import YAML
import numpy as np
import math
from torch import nn

LABEL_KEY = 'repairAmount'

In [2]:
# Load data to pandas
# This dataset was downloaded from: 
# https://www.fema.gov/openfema-dataset-individual-assistance-housing-registrants-large-disasters-v1
file = open('core_insure/data/IndividualAssistanceHousingRegistrantsLargeDisasters.csv', 'r')
pd_data = pd.read_csv(file)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Columns available, y = repairAmount
# For now, filter on just numerical data
# Later, can convert string columns (embedding, bins, num categories)
all_columns = list(pd_data.columns.values)
only_numerical_data = pd_data._get_numeric_data()
sorted_numerical_df = pd_data.reindex(sorted(only_numerical_data.columns), axis=1)
numeric_columns = sorted(sorted_numerical_df.columns.values)
# print(list(numeric_columns))

# remove all data where repairs were not made
final_sorted_df = sorted_numerical_df[sorted_numerical_df[LABEL_KEY].notnull()]

# change categorical to -1, 1
# final_sorted_df = final_sorted_df.replace(0,-1)
final_sorted_df = final_sorted_df.fillna(0)
columns = [
    'damagedZipCode',
    'disasterNumber',
    'repairAmount', 
    'foundationDamageAmount', 
    'grossIncome', 
    'householdComposition',
    'roofDamageAmount',
    'waterLevel'
]
final_sorted_df = pd.DataFrame(final_sorted_df, columns=columns)

In [4]:
print(final_sorted_df.head(10))

     damagedZipCode  disasterNumber  repairAmount  foundationDamageAmount  \
52            738.0            4339       2578.09                     0.0   
57            652.0            4339      15397.22                     0.0   
58          32258.0            4337       2164.50                     0.0   
70            757.0            4339       4189.79                     0.0   
82          34448.0            4337        499.50                     0.0   
94            956.0            4339       6048.26                     0.0   
128           795.0            4339        139.64                     0.0   
152           738.0            4339       2269.65                     0.0   
158           703.0            4339       1953.13                     0.0   
165           680.0            4339        269.81                     0.0   

     grossIncome  householdComposition  roofDamageAmount  waterLevel  
52        7080.0                     3            2064.6         0.0  
57        

In [5]:
# Processing and training methods
def value_of_interest(value):
    if pd.isna(value):
        return False
    return True

def get_row_xy(row):
    y_label = row[LABEL_KEY] if value_of_interest(row[LABEL_KEY]) else 0
    row.drop([LABEL_KEY])
    rowiter = row.iteritems()
    x_array = [item[1] if value_of_interest(item[1]) else 0 for item in rowiter]
    return x_array, y_label

def get_all_xy(df):
    all_x = []
    all_y = []
    for row in df.iterrows():
        row_contents = row[1]
        x, y = get_row_xy(row_contents)
        all_x.append(x)
        all_y.append([y])
    return all_x, all_y
    
# def train_with_config(linreg_config, df_data):
#     epochs = linreg_config.get('epochs')
#     epoch_loss = 0
#     for epoch in range(epochs):
#         for row in df_data.iterrows():
#             row_contents = row[1]
#             x, y = get_row_xy(row_contents)
#             
#             y_pred = self.model(self._torch_var(x))
#             loss = self.loss(y_pred, self._torch_var(y))
#             epoch_loss = loss.data[0]
#     
#             self.optimizer.zero_grad()
#             loss.backward()
#             self.optimizer.step()
#         print(f'Epoch {epoch} loss: {epoch_loss}')

# train_with_config(linreg_config, sorted_numerical_df)

In [6]:
# 80-20 Split
total_rows = final_sorted_df.shape[0]
print(f'Total test set: {total_rows}')
train_size = math.floor(total_rows*.80)
test_size = total_rows - train_size
print(f'Training set: {train_size}, Testing set: {test_size}')

train_x, train_y = get_all_xy(final_sorted_df.head(100))
print("Get training set")
test_x, test_y = get_all_xy(final_sorted_df.tail(10))
print("Get testing set")

Total test set: 274047
Training set: 219237, Testing set: 54810
Get training set
Get testing set


In [7]:
# Get config
default_config_file = open('core_insure/config.yaml', 'r')
yaml = YAML()
config = yaml.load(default_config_file)
print(config)

ordereddict([('base', ordereddict([('filepath', '.')])), ('assessor', ordereddict([('model', 'linear_regression'), ('linear_regression', ordereddict([('output_size', 1), ('lr', 0.01), ('momentum', 0.001), ('epochs', 500)])), ('simple_nn', ordereddict([('output_size', 1), ('lr', 0.001), ('epochs', 500), ('hidden_size', 50)])), ('filepath', '.')]))])


In [10]:
# Train linear regression
assessor_config =  config['assessor']
assessor_config['linear_regression']['epochs'] = 10
assessor = HomeAssessor(assessor_config)

# exploding gradients?
train_output = assessor.train(train_x, train_y)

# Train loss 
model = assessor.model
train_pred_y = model.eval(train_x)
loss = model.loss
train_loss = loss(train_pred_y, model._torch_var(train_y))
print(f'Train Loss: {train_loss}')
# Test linear regression
#pred_y = assessor.eval(test_x)
#val_loss = nn.MSELoss(y_pred, y_true)
#print(val_loss)

Epoch 0, Loss: 4958.27783203125, y_pred preview: tensor([-121.9391])
Epoch 1, Loss: 2335454.25, y_pred preview: tensor([1263215.1250])
Epoch 2, Loss: 248054.4375, y_pred preview: tensor([-92960.3828])
Epoch 3, Loss: 2335211.0, y_pred preview: tensor([1263122.2500])
Epoch 4, Loss: 248054.59375, y_pred preview: tensor([-92960.4297])
Epoch 5, Loss: 2335211.0, y_pred preview: tensor([1263122.2500])
Epoch 6, Loss: 248054.59375, y_pred preview: tensor([-92960.4297])
Epoch 7, Loss: 2335211.0, y_pred preview: tensor([1263122.2500])
Epoch 8, Loss: 248054.59375, y_pred preview: tensor([-92960.4297])
Epoch 9, Loss: 2335211.0, y_pred preview: tensor([1263122.2500])


TypeError: smooth_l1_loss(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [67]:
# Train nn module
assessor_config =  config['assessor']
assessor_config['simple_nn']['epochs'] = 10
assessor_config['model'] = 'simple_nn'
assessor = HomeAssessor(assessor_config)

train_output = assessor.train(train_x, train_y)

Epoch 0, Loss: 2.1970219765740666e+26, y_pred preview: tensor([-15725909508096.])
Epoch 1, Loss: 2.1970219765740666e+26, y_pred preview: tensor([-15725909508096.])
Epoch 2, Loss: 2.1970219765740666e+26, y_pred preview: tensor([-15725909508096.])
Epoch 3, Loss: 2.1970219765740666e+26, y_pred preview: tensor([-15725909508096.])
Epoch 4, Loss: 2.1970219765740666e+26, y_pred preview: tensor([-15725909508096.])
Epoch 5, Loss: 2.1970219765740666e+26, y_pred preview: tensor([-15725909508096.])
Epoch 6, Loss: 2.1970219765740666e+26, y_pred preview: tensor([-15725909508096.])
Epoch 7, Loss: 2.1970219765740666e+26, y_pred preview: tensor([-15725909508096.])
Epoch 8, Loss: 2.1970219765740666e+26, y_pred preview: tensor([-15725909508096.])
Epoch 9, Loss: 2.1970219765740666e+26, y_pred preview: tensor([-15725909508096.])
