In [1]:
#######
# load data to pandas
# train
# tune parameters on holdout set
# test
# kfolds cross validation
#######
# comparisons:
# linear regression
# decision trees (random forest via scikit learn)
# feedforward neural network (lin, relu, lin) higher order polyn

In [8]:
%load_ext autoreload
%autoreload 2

# Imports
import pandas as pd
from core_insure.assessor.home_assessor import HomeAssessor
from ruamel.yaml import YAML
import numpy as np
import math

LABEL_KEY = 'repairAmount'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# Load data to pandas
# This dataset was downloaded from: 
# https://www.fema.gov/openfema-dataset-individual-assistance-housing-registrants-large-disasters-v1
file = open('core_insure/data/IndividualAssistanceHousingRegistrantsLargeDisasters.csv', 'r')
pd_data = pd.read_csv(file)

  interactivity=interactivity, compiler=compiler, result=result)


In [32]:
# Columns available, y = repairAmount
# For now, filter on just numerical data
# Later, can convert string columns (embedding, bins, num categories)
all_columns = list(pd_data.columns.values)
only_numerical_data = pd_data._get_numeric_data()
sorted_numerical_df = pd_data.reindex(sorted(only_numerical_data.columns), axis=1)
numeric_columns = sorted(sorted_numerical_df.columns.values)
# print(list(numeric_columns))

# remove all data where repairs were not made
final_sorted_numerical_df = sorted_numerical_df[sorted_numerical_df[LABEL_KEY].notnull()]

In [45]:
# Get config
default_config_file = open('core_insure/config.yaml', 'r')
yaml = YAML()
config = yaml.load(default_config_file)
print(config)

ordereddict([('base', ordereddict([('filepath', '.')])), ('assessor', ordereddict([('model', 'linear_regression'), ('linear_regression', ordereddict([('output_size', 1), ('lr', 0.001), ('momentum', 0.01), ('epochs', 500)])), ('simple_nn', ordereddict([('output_size', 1), ('lr', 0.001), ('momentum', 0.01), ('epochs', 500), ('hidden_size', 50)])), ('filepath', '.')]))])


In [21]:
# Processing and training methods
def value_of_interest(value):
    if pd.isna(value):
        return False
    return True

def get_row_xy(row):
    y_label = row[LABEL_KEY] if value_of_interest(row[LABEL_KEY]) else 0
    row.drop([LABEL_KEY])
    rowiter = row.iteritems()
    x_array = [item[1] if value_of_interest(item[1]) else 0 for item in rowiter]
    return x_array, y_label

def get_all_xy(df):
    all_x = []
    all_y = []
    for row in df.iterrows():
        row_contents = row[1]
        x, y = get_row_xy(row_contents)
        all_x.append(x)
        all_y.append([y])
    return all_x, all_y

# def train_with_config(linreg_config, df_data):
#     epochs = linreg_config.get('epochs')
#     epoch_loss = 0
#     for epoch in range(epochs):
#         for row in df_data.iterrows():
#             row_contents = row[1]
#             x, y = get_row_xy(row_contents)
#             
#             y_pred = self.model(self._torch_var(x))
#             loss = self.loss(y_pred, self._torch_var(y))
#             epoch_loss = loss.data[0]
#     
#             self.optimizer.zero_grad()
#             loss.backward()
#             self.optimizer.step()
#         print(f'Epoch {epoch} loss: {epoch_loss}')

# train_with_config(linreg_config, sorted_numerical_df)

In [41]:
# 80-20 Split
total_rows = final_sorted_numerical_df.shape[0]
print(f'Total test set: {total_rows}')
train_size = math.floor(total_rows*.80)
test_size = total_rows - train_size
print(f'Training set: {train_size}, Testing set: {test_size}')

train_x, train_y = get_all_xy(final_sorted_numerical_df.head(1000))
print("Get training set")
test_x, test_y = get_all_xy(final_sorted_numerical_df.tail(10))
print("Get testing set")

# Train linear regression
assessor_config =  config['assessor']
assessor_config['linear_regression']['epochs'] = 10
assessor = HomeAssessor(assessor_config)

train_output = assessor.train(train_x, train_y)
print(train_output)

Total test set: 274047
Training set: 219237, Testing set: 54810
Get training set
Get testing set
Epoch 0, Loss: 9020984262656.0, y_pred: tensor([[-10035882098688.0000],
        [-10030331985920.0000],
        [ -1675733368832.0000],
        [-10047137513472.0000],
        [ -1673843965952.0000],
        [           3699.0261],
        [-10039024680960.0000],
        [-10035882098688.0000],
        [-10029485785088.0000],
        [-10042000539648.0000],
        [-10043177041920.0000],
        [           2429.0459],
        [ -1683392954368.0000],
        [-10047577915392.0000],
        [-10044908240896.0000],
        [-10044227715072.0000],
        [-10047577915392.0000],
        [-10042000539648.0000],
        [ -1683392823296.0000],
        [-10029485785088.0000],
        [-10031408873472.0000],
        [-10028956254208.0000],
        [-10031988736000.0000],
        [ -1683666108416.0000],
        [-10046476910592.0000],
        [-10040612225024.0000],
        [-10036455669760.0000],

In [49]:
# Train nn module
assessor_config =  config['assessor']
assessor_config['simple_nn']['epochs'] = 10
assessor_config['model'] = 'simple_nn'
assessor = HomeAssessor(assessor_config)

train_output = assessor.train(train_x, train_y)
print(train_output)

Epoch 0, Loss: 1.5164996360694735e+26, y_pred: tensor([[-13065313583104.0000],
        [-13058088894464.0000],
        [ -2181570232320.0000],
        [-13079966384128.0000],
        [ -2179110273024.0000],
        [          -2122.1736],
        [-13069407223808.0000],
        [-13065313583104.0000],
        [-13056983695360.0000],
        [-13073278566400.0000],
        [-13074811584512.0000],
        [          -1369.3966],
        [ -2191541665792.0000],
        [-13080541003776.0000],
        [-13077061828608.0000],
        [-13076176830464.0000],
        [-13080541003776.0000],
        [-13073278566400.0000],
        [ -2191541665792.0000],
        [-13056983695360.0000],
        [-13059487694848.0000],
        [-13056297926656.0000],
        [-13060244766720.0000],
        [ -2191897919488.0000],
        [-13079104454656.0000],
        [-13071472918528.0000],
        [-13066060169216.0000],
        [-13060244766720.0000],
        [-13068222332928.0000],
        [-13075937755136.

  loss = self.loss(y_pred, self._torch_var(y_labels))
