In [1]:
#######
# load data to pandas
# train
# tune parameters on holdout set
# test
# kfolds cross validation
#######
# comparisons:
# linear regression
# decision trees (random forest via scikit learn)
# ff neural network (lin, relu, lin) higher order polyn

In [1]:
%load_ext autoreload
%autoreload 2

# Imports
import pandas as pd
from core_insure.assessor.home_assessor import HomeAssessor
from ruamel.yaml import YAML
import numpy as np
import math
from torch import nn

LABEL_KEY = 'claimAmount'

In [None]:
# Processing and training methods
def value_of_interest(value):
    if pd.isna(value):
        return False
    return True

def get_row_xy(row):
    y_label = row[LABEL_KEY] if value_of_interest(row[LABEL_KEY]) else 0
    row = row.drop([LABEL_KEY])
    rowiter = row.iteritems()
    x_array = [item[1] if value_of_interest(item[1]) else 0 for item in rowiter]
    return x_array, y_label

def get_all_xy(df):
    all_x = []
    all_y = []
    for row in df.iterrows():
        row_contents = row[1]
        x, y = get_row_xy(row_contents)
        all_x.append(x)
        all_y.append([y])
    return all_x, all_y

def calculate_train_val_loss(curr_assessor, train_x, train_y, test_x, test_y):
    # exploding gradients?
    train_output = curr_assessor.train(train_x, train_y)

    # Train loss 
    model = curr_assessor.model
    train_pred_y = model.eval(train_x)
    loss = model.loss
    train_loss = loss(model._torch_var(train_pred_y), model._torch_var(train_y))
    print(f'Train Loss: {train_loss}')
    
    # Test linear regression
    test_y_pred = model.eval(test_x)
    test_loss = loss(model._torch_var(test_y_pred), model._torch_var(test_y))
    print(f'Test Losss: {test_loss}')
    
    return train_loss, test_loss

def train_test_split(df, train_percent, test_percent):
    total_rows = df.shape[0]
    print(f'Total test set: {total_rows}')
    train_size = math.floor(total_rows*(train_percent/100))
    test_size = total_rows - train_size
    print(f'Training set: {train_size}, Testing set: {test_size}')
    train_x, train_y = get_all_xy(final_sorted_df.head(train_size))
    print(f'Train set created')
    test_x, test_y = get_all_xy(final_sorted_df.tail(test_size))
    print(f'Test set created')
    return {
        'train_x': train_x,
        'train_y': train_y,
        'test_x': test_x,
        'test_y': test_y
    }

# def train_with_config(linreg_config, df_data):
#     epochs = linreg_config.get('epochs')
#     epoch_loss = 0
#     for epoch in range(epochs):
#         for row in df_data.iterrows():
#             row_contents = row[1]
#             x, y = get_row_xy(row_contents)
#             
#             y_pred = self.model(self._torch_var(x))
#             loss = self.loss(y_pred, self._torch_var(y))
#             epoch_loss = loss.data[0]
#     
#             self.optimizer.zero_grad()
#             loss.backward()
#             self.optimizer.step()
#         print(f'Epoch {epoch} loss: {epoch_loss}')

# train_with_config(linreg_config, sorted_numerical_df)

In [2]:
# Load data to pandas
# This dataset was downloaded from: 
# https://www.fema.gov/openfema-dataset-individual-assistance-housing-registrants-large-disasters-v1
file = open('core_insure/data/IndividualAssistanceHousingRegistrantsLargeDisasters.csv', 'r')
pd_data = pd.read_csv(file)

  interactivity=interactivity, compiler=compiler, result=result)


In [100]:
# Get config
default_config_file = open('core_insure/config.yaml', 'r')
yaml = YAML()
config = yaml.load(default_config_file)
print(config)

ordereddict([('base', ordereddict([('filepath', '.')])), ('assessor', ordereddict([('model', 'linear_regression'), ('linear_regression', ordereddict([('output_size', 1), ('lr', 1e-05), ('momentum', 0), ('epochs', 500)])), ('simple_nn', ordereddict([('output_size', 1), ('lr', 0.001), ('epochs', 500), ('hidden_size', 50)])), ('filepath', '.')]))])


In [112]:
# Data processing
# Columns available
all_columns = list(pd_data.columns.values)

# calculate label: claimAmount based on repairs and replacements
pd_data = pd_data.fillna(0)
pd_data[LABEL_KEY] = pd_data['repairAmount'] + pd_data['replacementAmount']

# For now, filter on just numerical data
# Later, can convert string columns (embedding, bins, num categories)
only_numerical_data = pd_data._get_numeric_data()
sorted_numerical_df = pd_data.reindex(sorted(only_numerical_data.columns), axis=1)
numeric_columns = sorted(sorted_numerical_df.columns.values)
print(list(numeric_columns))

['censusBlockId', 'censusYear', 'claimAmount', 'damagedZipCode', 'destroyed', 'disasterNumber', 'floodDamage', 'floodInsurance', 'foundationDamage', 'foundationDamageAmount', 'grossIncome', 'habitabilityRepairsRequired', 'homeOwnersInsurance', 'householdComposition', 'inspected', 'personalPropertyEligible', 'ppfvl', 'primaryResidence', 'rentalAssistanceAmount', 'rentalAssistanceEligible', 'rentalResourceZipCode', 'repairAmount', 'repairAssistanceEligible', 'replacementAmount', 'replacementAssistanceEligible', 'roofDamage', 'roofDamageAmount', 'rpfvl', 'sbaEligible', 'specialNeeds', 'tsaCheckedIn', 'tsaEligible', 'waterLevel']


In [113]:
# remove all data where repairs were not made
nonzero_numerical_df = sorted_numerical_df.replace(0, np.nan)
nonzero_numerical_df = nonzero_numerical_df[sorted_numerical_df[LABEL_KEY].notnull()] 

In [114]:
# Feature Extraction
# columns = [
#     'damagedZipCode',
#     'disasterNumber',
#     'foundationDamageAmount', 
#     'grossIncome', 
#     'householdComposition',
#     'roofDamageAmount',
#     'waterLevel',
#     LABEL_KEY
# ]
columns = [
    'repairAmount',
    LABEL_KEY
]

final_sorted_df = pd.DataFrame(sorted_numerical_df, columns=columns)
final_sorted_df = final_sorted_df.replace(np.nan,0)

# final_sorted_df = sorted_numerical_df[sorted_numerical_df['repairAmount'].notnull()] 
# final_sorted_df = final_sorted_df.replace(0,1)
print(final_sorted_df.head(10))

   repairAmount  claimAmount
0           0.0          0.0
1           0.0          0.0
2           0.0          0.0
3           0.0          0.0
4           0.0          0.0
5           0.0          0.0
6           0.0          0.0
7           0.0          0.0
8           0.0          0.0
9           0.0          0.0


In [127]:
# 80-20 Split
# full_dataset = final_sorted_df
full_dataset = final_sorted_df.head(1000)
train_test_dataset = train_test_split(full_dataset, 80, 20)

Total test set: 1000
Training set: 800, Testing set: 200


In [129]:
# Train linear regression
assessor_config =  config['assessor']
assessor_config['linear_regression']['epochs'] = 500
assessor = HomeAssessor(assessor_config)

calculate_train_val_loss(assessor, 
                         train_test_dataset['train_x'], 
                         train_test_dataset['train_y'],
                         train_test_dataset['test_x'],
                         train_test_dataset['test_y'])

Epoch 0, Loss: 575.173095703125, y_pred preview: tensor([0.2446])
Epoch 1, Loss: 574.6365356445312, y_pred preview: tensor([0.2446])
Epoch 2, Loss: 574.0999755859375, y_pred preview: tensor([0.2446])
Epoch 3, Loss: 573.5633544921875, y_pred preview: tensor([0.2446])
Epoch 4, Loss: 573.0267333984375, y_pred preview: tensor([0.2446])
Epoch 5, Loss: 572.4902954101562, y_pred preview: tensor([0.2446])
Epoch 6, Loss: 571.9535522460938, y_pred preview: tensor([0.2446])
Epoch 7, Loss: 571.4169921875, y_pred preview: tensor([0.2446])
Epoch 8, Loss: 570.8803100585938, y_pred preview: tensor([0.2446])
Epoch 9, Loss: 570.3436889648438, y_pred preview: tensor([0.2446])
Epoch 10, Loss: 569.80712890625, y_pred preview: tensor([0.2446])
Epoch 11, Loss: 569.2705078125, y_pred preview: tensor([0.2446])
Epoch 12, Loss: 568.7339477539062, y_pred preview: tensor([0.2446])
Epoch 13, Loss: 568.1973266601562, y_pred preview: tensor([0.2446])
Epoch 14, Loss: 567.6607055664062, y_pred preview: tensor([0.2446])

Epoch 306, Loss: 410.9715270996094, y_pred preview: tensor([0.2442])
Epoch 307, Loss: 410.4350280761719, y_pred preview: tensor([0.2442])
Epoch 308, Loss: 409.8983459472656, y_pred preview: tensor([0.2442])
Epoch 309, Loss: 409.3617858886719, y_pred preview: tensor([0.2442])
Epoch 310, Loss: 408.8251647949219, y_pred preview: tensor([0.2442])
Epoch 311, Loss: 408.28851318359375, y_pred preview: tensor([0.2442])
Epoch 312, Loss: 407.7519226074219, y_pred preview: tensor([0.2442])
Epoch 313, Loss: 407.2152404785156, y_pred preview: tensor([0.2441])
Epoch 314, Loss: 406.6786804199219, y_pred preview: tensor([0.2441])
Epoch 315, Loss: 406.1420593261719, y_pred preview: tensor([0.2441])
Epoch 316, Loss: 405.6054992675781, y_pred preview: tensor([0.2441])
Epoch 317, Loss: 405.06890869140625, y_pred preview: tensor([0.2441])
Epoch 318, Loss: 404.5322265625, y_pred preview: tensor([0.2441])
Epoch 319, Loss: 403.99566650390625, y_pred preview: tensor([0.2441])
Epoch 320, Loss: 403.4590148925781

(tensor(306.8697), tensor(346.6814))

In [None]:
# Train nn module
assessor_config =  config['assessor']
assessor_config['simple_nn']['epochs'] = 10
assessor_config['model'] = 'simple_nn'
assessor = HomeAssessor(assessor_config)

calculate_train_val_loss(assessor, train_x_split, train_y_split, test_x_split, test_y_split)