<h1> Biomass Regression Model </h1>

In [None]:
import array
from PIL import Image, ImageDraw
import numpy as np
import pandas as pd
import json
import math
import statsmodels.api as sm
from matplotlib import pyplot as plt
# Show up to 500 rows when a DataFrame is displayed.
# The old 'display.height' option was deprecated and later removed from pandas
# (setting it raises OptionError on current versions), so only
# 'display.max_rows' is configured here.
pd.set_option('display.max_rows', 500)

%matplotlib inline

<h1> Create dataframe with relevant features </h1>

In [None]:
# Root directory of the dataset; per-sample annotations are read from
# <data_dir_base>/annotations/annot_<idx>.json.
data_dir_base = '/root/data/blender_v3/'
COMPLETE_SAMPLE_SIZE = 4007

# Collect one record per annotation file and build the DataFrame once at the
# end. Appending to a DataFrame row by row copies the whole frame on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
annotation_records = []
for idx in range(COMPLETE_SAMPLE_SIZE):
    # get annotation data
    annotation_file_name = 'annot_{}.json'.format(idx)
    annotation_file_path = '{}/{}/{}'.format(data_dir_base, 'annotations', annotation_file_name)
    # the context manager closes the file handle as soon as it has been parsed
    with open(annotation_file_path, 'rb') as annotation_file:
        annotation_records.append(json.load(annotation_file))
df = pd.DataFrame(annotation_records)

# add log-transformed columns to the DataFrame (inputs to the log-linear models)
df['log_volume'] = np.log(df.volume)
df['log_length'] = np.log(df.length)
# NOTE(review): 'log_width' is computed from the `height` annotation and
# 'log_breadth' from the `width` annotation -- confirm this mapping is intended.
df['log_width'] = np.log(df.height)
df['log_breadth'] = np.log(df.width)

<h1> Simple Allometric Models </h1>

We first consider the model $W = al^b$, where $W$ and $l$ represent biomass and length, respectively. Taking logarithms of both sides gives $\log{W} = \log{a} + b\log{l}$, which is linear in $\log{l}$. We will fit the data points to this linear form.

In [None]:
features = ['log_length']
target = 'log_volume'

# sm.OLS fits exactly the columns it is given and does not add an intercept by
# itself. The model log W = log a + b log l has an intercept term, so a
# constant column is added explicitly.
model = sm.OLS(df[target], sm.add_constant(df.filter(features))).fit()
print(model.summary())

# Cross-check with plain numpy least squares: the design matrix holds the
# feature column followed by a column of ones (the intercept).
A = np.vstack([df.filter(features).values.T, np.ones(df.shape[0])]).T
# rcond=None selects numpy's current default cutoff and avoids the
# FutureWarning emitted when rcond is omitted.
res = np.linalg.lstsq(A, df[target].values, rcond=None)
# res[0] follows the column order of A: slope first, then intercept
# (the intercept `a` here is log a in the notation W = a l^b).
b, a = res[0]

In [None]:
# Preview the first rows only; the full frame has COMPLETE_SAMPLE_SIZE rows.
df.head()

Now consider the following model, which is the first model but generalized to incorporate other dimensions as well (width and breadth):

$$W = a\,l^b\,w^c\,t^d$$

Here, $W$, $l$, $w$ and $t$ represent biomass, length, width, and breadth, respectively (breadth is written $t$ so that it is not confused with the exponent $b$). Taking logarithms, this gives

$$ \log{W} = \log{a} + b\log{l} + c\log{w} + d\log{t} $$

In [None]:
features = ['log_length', 'log_width', 'log_breadth']
target = 'log_volume'

# sm.OLS does not add an intercept by itself. The log-linear model above has
# an intercept term, so a constant column is added explicitly.
model = sm.OLS(df[target], sm.add_constant(df.filter(features))).fit()
print(model.summary())


In [None]:
# Estimated coefficients of the most recent statsmodels OLS fit.
model.params

In [None]:
features = ['log_length', 'log_width', 'log_breadth']
# Design matrix: one column per log-feature followed by a column of ones
# (the intercept). `target` is reused from the statsmodels cell above.
A = np.vstack([df[features].values.T, np.ones(df.shape[0])]).T
# rcond=None selects numpy's current default cutoff and avoids the
# FutureWarning emitted when rcond is omitted.
res = np.linalg.lstsq(A, df[target].values, rcond=None)


In [None]:
# In-sample fitted values: least-squares coefficients (res[0]) applied to every row of the design matrix A.
np.dot(res[0].T, A.T)

In [None]:
# Same expression as the previous cell (repeated): fitted values from the lstsq coefficients.
np.dot(res[0].T, A.T)

In [None]:
# Shapes of the coefficient vector and the transposed design matrix used in the dot product above.
res[0].T.shape, A.T.shape

In [None]:
# Full np.linalg.lstsq result tuple: (solution, residuals, rank, singular values).
res

<h1> k-fold Cross Validation </h1>

In [None]:
# specify the allometric models that we want to compare
models = {}
models['model_1'] = {
    'features': ['log_length'],
    'target': 'log_volume'
}
models['model_2'] = {
    'features': ['log_length', 'log_width', 'log_breadth'],
    'target': 'log_volume'
}

# specify the cross validation size
k = 10


def _design_matrix(frame, feature_names):
    """Return the feature columns of `frame` followed by a column of ones (intercept)."""
    return np.column_stack([frame[feature_names].values, np.ones(frame.shape[0])])


# Fold boundaries as integer row positions. Folds are contiguous blocks of rows
# in their current order (no shuffling), and they are selected by position so
# the split does not depend on the values of df.index.
bucket_endpoints = np.linspace(0, df.shape[0], k + 1).round().astype(int)
row_positions = np.arange(df.shape[0])

# perform k-fold cross validation for all of the models
# (the loop variable is `model_spec` so the fitted statsmodels `model` from an
# earlier cell is not overwritten)
cv_records = []
for model_name, model_spec in models.items():
    features = model_spec['features']
    target = model_spec['target']
    for fold_start, fold_end in zip(bucket_endpoints[:-1], bucket_endpoints[1:]):
        # specify in-sample and out-of-sample subsets
        out_of_sample_mask = (row_positions >= fold_start) & (row_positions < fold_end)
        in_sample_data = df.loc[~out_of_sample_mask]
        out_of_sample_data = df.loc[out_of_sample_mask]

        # train the model on the in-sample rows
        coefficients = np.linalg.lstsq(
            _design_matrix(in_sample_data, features),
            in_sample_data[target].values,
            rcond=None,
        )[0]

        # Predict every out-of-sample row. The previous code indexed the
        # prediction vector with [0], which assigned the first row's prediction
        # to all rows of the fold and produced a wrong error estimate.
        predictions = np.dot(_design_matrix(out_of_sample_data, features), coefficients)

        # compute mean squared error on the held-out fold
        mse = np.mean((predictions - out_of_sample_data[target].values) ** 2)
        cv_records.append({
            'model_name': model_name,
            'features': features,
            'target': target,
            'mse': mse
        })

# Build the results table once; DataFrame.append was removed in pandas 2.0.
results_df = pd.DataFrame(cv_records)
        
        
        

    
    


In [None]:
# Out-of-sample mean squared error, one row per (model, fold) pair.
results_df