In [1]:
#######
# load data to pandas
# train
# tune parameters on holdout set
# test
# kfolds cross validation
#######
# comparisons:
# linear regression
# decision trees (random forest via scikit learn)
# feed-forward neural network (linear -> ReLU -> linear) for higher-order polynomial relationships

In [1]:
%load_ext autoreload
%autoreload 2

# Imports
import copy
import math

import numpy as np
import pandas as pd
from ruamel.yaml import YAML
from torch import nn

from core_insure.assessor.home_assessor import HomeAssessor

LABEL_KEY = 'claimAmount'

In [2]:
# Load data to pandas
# This dataset was downloaded from:
# https://www.fema.gov/openfema-dataset-individual-assistance-housing-registrants-large-disasters-v1
# The file is opened in a context manager so the handle is closed as soon as
# pandas has finished reading it.
with open('core_insure/data/IndividualAssistanceHousingRegistrantsLargeDisasters.csv', 'r') as file:
    pd_data = pd.read_csv(file)

  interactivity=interactivity, compiler=compiler, result=result)


In [22]:
# Columns available
all_columns = list(pd_data.columns.values)

# calculate label: claimAmount based on repairs and replacements
# Missing values are zero-filled first so the sum is defined for every row.
pd_data = pd_data.fillna(0)
pd_data[LABEL_KEY] = pd_data['repairAmount'] + pd_data['replacementAmount']

# For now, filter on just numerical data
# Later, can convert string columns (embedding, bins, num categories)
# select_dtypes is the public counterpart of the private DataFrame._get_numeric_data();
# 'bool' is listed explicitly so boolean flag columns are still treated as numeric.
only_numerical_data = pd_data.select_dtypes(include=['number', 'bool'])
numeric_columns = sorted(only_numerical_data.columns)
sorted_numerical_df = only_numerical_data[numeric_columns]
print(numeric_columns)

# remove all data where repairs were not made
# Every zero (including the zero-filled gaps above) becomes NaN, then rows whose
# label is NaN are dropped.
sorted_numerical_df = sorted_numerical_df.replace(0, np.nan)
sorted_numerical_df = sorted_numerical_df[sorted_numerical_df[LABEL_KEY].notnull()]

['censusBlockId', 'censusYear', 'claimAmount', 'damagedZipCode', 'destroyed', 'disasterNumber', 'floodDamage', 'floodInsurance', 'foundationDamage', 'foundationDamageAmount', 'grossIncome', 'habitabilityRepairsRequired', 'homeOwnersInsurance', 'householdComposition', 'inspected', 'personalPropertyEligible', 'ppfvl', 'primaryResidence', 'rentalAssistanceAmount', 'rentalAssistanceEligible', 'rentalResourceZipCode', 'repairAmount', 'repairAssistanceEligible', 'replacementAmount', 'replacementAssistanceEligible', 'roofDamage', 'roofDamageAmount', 'rpfvl', 'sbaEligible', 'specialNeeds', 'tsaCheckedIn', 'tsaEligible', 'waterLevel']


In [75]:
# Wider candidate feature set, kept for reference:
# columns = [
#     'damagedZipCode',
#     'disasterNumber',
#     'foundationDamageAmount', 
#     'grossIncome', 
#     'householdComposition',
#     'roofDamageAmount',
#     'waterLevel',
#     LABEL_KEY
# ]
columns = [
    'repairAmount',
    LABEL_KEY
]
# Filter rows and pick columns in a single .loc so the repairAmount filter is
# actually applied to the frame that is kept (selecting the columns from
# sorted_numerical_df in a separate step would discard the row filter).
has_repair_amount = sorted_numerical_df['repairAmount'].notnull()
final_sorted_df = sorted_numerical_df.loc[has_repair_amount, columns]
# final_sorted_df = final_sorted_df.replace(np.nan,1)
# NOTE(review): zeros are mapped to 1 here; the reason is not visible in this
# notebook -- confirm it is still needed.
final_sorted_df = final_sorted_df.replace(0,1)
print(final_sorted_df.head(10))

     repairAmount  claimAmount
52        2578.09      2578.09
57       15397.22     15397.22
58        2164.50      2164.50
70        4189.79      4189.79
82         499.50       499.50
94        6048.26      6048.26
128        139.64       139.64
152       2269.65      2269.65
158       1953.13      1953.13
165        269.81       269.81


In [76]:
# Processing and training methods
def value_of_interest(value):
    """Return True when `value` is present, False when pandas considers it missing."""
    return False if pd.isna(value) else True

def get_row_xy(row):
    """Split one record into its feature values and its label.

    Args:
        row: pandas Series for a single record; must contain the LABEL_KEY entry.

    Returns:
        A tuple (x_array, y_label): the remaining values of the row in their
        existing order, and the label value. Missing entries are replaced by 0
        in both.
    """
    y_label = row[LABEL_KEY] if value_of_interest(row[LABEL_KEY]) else 0
    features = row.drop([LABEL_KEY])
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    x_array = [value if value_of_interest(value) else 0 for _, value in features.items()]
    return x_array, y_label

def get_all_xy(df):
    """Convert every row of `df` into feature/label lists via get_row_xy.

    Returns:
        A tuple (all_x, all_y): all_x holds one feature list per row, all_y
        holds one single-element list ([label]) per row, both in row order.
    """
    pairs = [get_row_xy(record) for _, record in df.iterrows()]
    features = [x for x, _ in pairs]
    # Each label is wrapped in its own list, giving one [y] entry per row.
    labels = [[y] for _, y in pairs]
    return features, labels

def calculate_train_val_loss(curr_assessor, train_x, train_y, test_x, test_y):
    """Train the assessor, then report its loss on the training and test data.

    Args:
        curr_assessor: object exposing train(x, y) and a `model` attribute; the
            model must provide eval(x), loss(pred, target) and _torch_var(values).
        train_x, train_y: training inputs and labels passed to train() and used
            for the training loss.
        test_x, test_y: held-out inputs and labels used for the test loss.

    Returns:
        A tuple (train_loss, test_loss) as returned by model.loss.
    """
    # exploding gradients?
    curr_assessor.train(train_x, train_y)

    # The model is read after training, matching the order train() -> model.
    model = curr_assessor.model
    loss = model.loss

    # Train loss
    train_pred_y = model.eval(train_x)
    train_loss = loss(model._torch_var(train_pred_y), model._torch_var(train_y))
    print(f'Train Loss: {train_loss}')

    # Test loss on the held-out data
    test_y_pred = model.eval(test_x)
    test_loss = loss(model._torch_var(test_y_pred), model._torch_var(test_y))
    print(f'Test Loss: {test_loss}')

    return train_loss, test_loss

# def train_with_config(linreg_config, df_data):
#     epochs = linreg_config.get('epochs')
#     epoch_loss = 0
#     for epoch in range(epochs):
#         for row in df_data.iterrows():
#             row_contents = row[1]
#             x, y = get_row_xy(row_contents)
#             
#             y_pred = self.model(self._torch_var(x))
#             loss = self.loss(y_pred, self._torch_var(y))
#             epoch_loss = loss.data[0]
#     
#             self.optimizer.zero_grad()
#             loss.backward()
#             self.optimizer.step()
#         print(f'Epoch {epoch} loss: {epoch_loss}')

# train_with_config(linreg_config, sorted_numerical_df)

In [77]:
# 80-20 Split
total_rows = final_sorted_df.shape[0]
print(f'Total data set: {total_rows}')
train_size = math.floor(total_rows*.80)
test_size = total_rows - train_size
print(f'Training set: {train_size}, Testing set: {test_size}')

# Partition first, then subsample each side, so the small subsets below can
# never share rows, even when the frame has fewer rows than the subset sizes.
train_partition = final_sorted_df.iloc[:train_size]
test_partition = final_sorted_df.iloc[train_size:]

# Only the first 100 training rows and the last 10 test rows are converted
# (presumably to keep experiments fast -- widen these to use the full split).
train_x_split, train_y_split = get_all_xy(train_partition.head(100))
print("Get training set")
test_x_split, test_y_split = get_all_xy(test_partition.tail(10))
print("Get testing set")

Total test set: 277242
Training set: 221793, Testing set: 55449
Get training set
Get testing set


In [78]:
# Get config
# The YAML file is read inside a context manager so the handle is closed once
# parsing is done.
yaml = YAML()
with open('core_insure/config.yaml', 'r') as default_config_file:
    config = yaml.load(default_config_file)
print(config)

ordereddict([('base', ordereddict([('filepath', '.')])), ('assessor', ordereddict([('model', 'linear_regression'), ('linear_regression', ordereddict([('output_size', 1), ('lr', 1e-05), ('momentum', 0), ('epochs', 500)])), ('simple_nn', ordereddict([('output_size', 1), ('lr', 0.001), ('epochs', 500), ('hidden_size', 50)])), ('filepath', '.')]))])


In [79]:
# Train linear regression
# Work on a private copy so the overrides below do not leak into the shared
# `config` that other training cells read.
assessor_config = copy.deepcopy(config['assessor'])
# Select the model explicitly rather than relying on whatever value is currently
# stored in config['assessor']['model'] (another cell sets it to 'simple_nn').
assessor_config['model'] = 'linear_regression'
assessor_config['linear_regression']['epochs'] = 1000
assessor = HomeAssessor(assessor_config)

calculate_train_val_loss(assessor, train_x_split, train_y_split, test_x_split, test_y_split)

  print(f'Epoch {epoch}, Loss: {loss}, y_pred preview: {y_pred.data[0]}')


Epoch 0, Loss: 4133.81103515625, y_pred preview: tensor([161.8046])
Epoch 1, Loss: 4016.253662109375, y_pred preview: tensor([250.1991])
Epoch 2, Loss: 3898.694580078125, y_pred preview: tensor([338.5935])
Epoch 3, Loss: 3781.1357421875, y_pred preview: tensor([426.9880])
Epoch 4, Loss: 3663.578369140625, y_pred preview: tensor([515.3824])
Epoch 5, Loss: 3546.0185546875, y_pred preview: tensor([603.7769])
Epoch 6, Loss: 3428.460205078125, y_pred preview: tensor([692.1714])
Epoch 7, Loss: 3310.901611328125, y_pred preview: tensor([780.5659])
Epoch 8, Loss: 3193.34375, y_pred preview: tensor([868.9603])
Epoch 9, Loss: 3075.785400390625, y_pred preview: tensor([957.3548])
Epoch 10, Loss: 2958.226806640625, y_pred preview: tensor([1045.7493])
Epoch 11, Loss: 2840.668212890625, y_pred preview: tensor([1134.1438])
Epoch 12, Loss: 2723.1103515625, y_pred preview: tensor([1222.5382])
Epoch 13, Loss: 2605.551025390625, y_pred preview: tensor([1310.9326])
Epoch 14, Loss: 2487.992431640625, y_pre

Epoch 349, Loss: 965.4285278320312, y_pred preview: tensor([2544.1777])
Epoch 350, Loss: 992.0347900390625, y_pred preview: tensor([2632.5503])
Epoch 351, Loss: 965.4578247070312, y_pred preview: tensor([2544.1560])
Epoch 352, Loss: 992.0054931640625, y_pred preview: tensor([2632.5286])
Epoch 353, Loss: 965.4863891601562, y_pred preview: tensor([2544.1340])
Epoch 354, Loss: 991.9763793945312, y_pred preview: tensor([2632.5068])
Epoch 355, Loss: 965.5154418945312, y_pred preview: tensor([2544.1123])
Epoch 356, Loss: 991.9476318359375, y_pred preview: tensor([2632.4849])
Epoch 357, Loss: 965.54443359375, y_pred preview: tensor([2544.0906])
Epoch 358, Loss: 991.9181518554688, y_pred preview: tensor([2632.4631])
Epoch 359, Loss: 965.5736694335938, y_pred preview: tensor([2544.0686])
Epoch 360, Loss: 991.8897705078125, y_pred preview: tensor([2632.4417])
Epoch 361, Loss: 965.6021118164062, y_pred preview: tensor([2544.0471])
Epoch 362, Loss: 991.861083984375, y_pred preview: tensor([2632.42

Epoch 652, Loss: 988.258544921875, y_pred preview: tensor([2629.7109])
Epoch 653, Loss: 969.2327270507812, y_pred preview: tensor([2541.3164])
Epoch 654, Loss: 988.2354736328125, y_pred preview: tensor([2629.6938])
Epoch 655, Loss: 969.2557983398438, y_pred preview: tensor([2541.2993])
Epoch 656, Loss: 988.2125854492188, y_pred preview: tensor([2629.6765])
Epoch 657, Loss: 969.27880859375, y_pred preview: tensor([2541.2820])
Epoch 658, Loss: 988.1893920898438, y_pred preview: tensor([2629.6594])
Epoch 659, Loss: 969.3016357421875, y_pred preview: tensor([2541.2649])
Epoch 660, Loss: 988.166748046875, y_pred preview: tensor([2629.6421])
Epoch 661, Loss: 969.3244018554688, y_pred preview: tensor([2541.2476])
Epoch 662, Loss: 988.1436767578125, y_pred preview: tensor([2629.6250])
Epoch 663, Loss: 969.3471069335938, y_pred preview: tensor([2541.2305])
Epoch 664, Loss: 988.1212768554688, y_pred preview: tensor([2629.6077])
Epoch 665, Loss: 969.3699340820312, y_pred preview: tensor([2541.213

Epoch 968, Loss: 984.7402954101562, y_pred preview: tensor([2627.0657])
Epoch 969, Loss: 972.7496948242188, y_pred preview: tensor([2538.6714])
Epoch 970, Loss: 984.7189331054688, y_pred preview: tensor([2627.0496])
Epoch 971, Loss: 972.771728515625, y_pred preview: tensor([2538.6550])
Epoch 972, Loss: 984.6970825195312, y_pred preview: tensor([2627.0332])
Epoch 973, Loss: 972.7932739257812, y_pred preview: tensor([2538.6387])
Epoch 974, Loss: 984.6754150390625, y_pred preview: tensor([2627.0168])
Epoch 975, Loss: 972.8152465820312, y_pred preview: tensor([2538.6226])
Epoch 976, Loss: 984.6536865234375, y_pred preview: tensor([2627.0005])
Epoch 977, Loss: 972.8369750976562, y_pred preview: tensor([2538.6062])
Epoch 978, Loss: 984.6318969726562, y_pred preview: tensor([2626.9844])
Epoch 979, Loss: 972.858154296875, y_pred preview: tensor([2538.5898])
Epoch 980, Loss: 984.6105346679688, y_pred preview: tensor([2626.9680])
Epoch 981, Loss: 972.8799438476562, y_pred preview: tensor([2538.5

(tensor(984.3937), tensor(3260.6987))

In [None]:
# Train nn module
# Work on a private copy so switching the model and epoch count here does not
# change the shared `config` used by the other training cells.
assessor_config = copy.deepcopy(config['assessor'])
assessor_config['simple_nn']['epochs'] = 10
assessor_config['model'] = 'simple_nn'
assessor = HomeAssessor(assessor_config)

calculate_train_val_loss(assessor, train_x_split, train_y_split, test_x_split, test_y_split)