<h1> Biomass Regression Model </h1>

In [1]:
import array
from PIL import Image, ImageDraw
import numpy as np
import pandas as pd
import json
import math
import statsmodels.api as sm
from matplotlib import pyplot as plt
pd.set_option('display.height', 500)
pd.set_option('display.max_rows', 500)

%matplotlib inline

<h1> Create dataframe with relevant features </h1>

In [2]:
data_dir_base = '/root/data/blender_v3/'
COMPLETE_SAMPLE_SIZE = 4007
df = pd.DataFrame()
for idx in range(COMPLETE_SAMPLE_SIZE):
    # get annotation data
    annotation_file_name = 'annot_{}.json'.format(idx)
    annotation_file_path = '{}/{}/{}'.format(data_dir_base, 'annotations', annotation_file_name)
    annotation_data = json.load(open(annotation_file_path, 'rb'))
    df = df.append(annotation_data, ignore_index=True)

# add columns to DataFrame
df['log_volume'] = np.log(df.volume)
df['log_length'] = np.log(df.length)
df['log_width'] = np.log(df.height)
df['log_breadth'] = np.log(df.width)

<h1> Simple Allometric Models </h1>

We first consider the model $ W = al^b $, where $W$ and $l$ represent biomass and weight, respectively. First, rewrite the model as $\log{W} = a + b\log{l}$. We will fit the points to this model.

In [3]:
features = ['log_length']
target = 'log_volume'

model = sm.OLS(df[target], df.filter(features)).fit()
print(model.summary())

A = np.vstack([df.filter(features).values.T, np.ones(df.shape[0])]).T
res = np.linalg.lstsq(A, df[target].values)
b, a = res[0]

                            OLS Regression Results                            
Dep. Variable:             log_volume   R-squared:                       0.998
Model:                            OLS   Adj. R-squared:                  0.998
Method:                 Least Squares   F-statistic:                 1.927e+06
Date:                Thu, 14 Jun 2018   Prob (F-statistic):               0.00
Time:                        20:10:26   Log-Likelihood:                -1558.2
No. Observations:                4007   AIC:                             3118.
Df Residuals:                    4006   BIC:                             3125.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
log_length     1.7590      0.001   1388.162      0.0

  


In [4]:
df

Unnamed: 0,focal_length,height,interocular_distance,length,sensor_height,sensor_width,unit,unit_system,volume,width,log_volume,log_length,log_width,log_breadth
0,1.0,20.760459,0.65,103.802296,3.2,1.8,centimeter,METRIC,4410.102565,17.300383,8.391653,4.642488,3.033050,2.850729
1,1.0,21.529521,0.65,107.647604,3.2,1.8,centimeter,METRIC,4918.065291,17.941267,8.500670,4.678863,3.069425,2.887103
2,1.0,22.001732,0.65,110.008658,3.2,1.8,centimeter,METRIC,5248.575445,18.334776,8.565712,4.700559,3.091121,2.908800
3,1.0,18.319376,0.65,91.596881,3.2,1.8,centimeter,METRIC,3031.227486,15.266147,8.016723,4.517397,2.907959,2.725638
4,1.0,18.734798,0.65,93.673988,3.2,1.8,centimeter,METRIC,3241.954948,15.612331,8.083932,4.539821,2.930383,2.748061
5,1.0,16.761576,0.65,83.807878,3.2,1.8,centimeter,METRIC,2322.505339,13.967980,7.750402,4.428527,2.819089,2.636768
6,1.0,20.545638,0.65,102.728189,3.2,1.8,centimeter,METRIC,4274.685574,17.121365,8.360466,4.632087,3.022649,2.840327
7,1.0,21.861212,0.65,109.306060,3.2,1.8,centimeter,METRIC,5148.768655,18.217677,8.546513,4.694152,3.084714,2.902392
8,1.0,23.221622,0.65,116.108112,3.2,1.8,centimeter,METRIC,6170.065679,19.351352,8.727465,4.754522,3.145084,2.962762
9,1.0,11.978642,0.65,59.893212,3.2,1.8,centimeter,METRIC,848.807730,9.982202,6.743833,4.092563,2.483125,2.300804


Now consider the following model, which is the first model but generalized to incorporate other dimensions as well (width and breadth):

$$W=al^b w^c b^d$$

Here, $W$, $l$, $w$ and $b$ represent biomass, length, width, and breadth, respectively. Taking logarithms, this gives 

$$ \log{W} = a + b\log{l} + c\log{w} + d\log{b} $$

In [15]:
features = ['log_length', 'log_width', 'log_breadth']
target = 'log_volume'

model = sm.OLS(df[target], df.filter(features)).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:             log_volume   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.663e+11
Date:                Thu, 24 May 2018   Prob (F-statistic):               0.00
Time:                        23:17:38   Log-Likelihood:                 30051.
No. Observations:                4007   AIC:                        -6.010e+04
Df Residuals:                    4005   BIC:                        -6.009e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
log_length     -0.2384   1.18e-05  -2.02e+04      

ValueError: too many values to unpack (expected 2)

In [19]:
model.params

log_length    -0.238440
log_width      1.517973
log_breadth    1.716944
dtype: float64

In [16]:
features = ['log_length', 'log_width', 'log_breadth']
A = np.vstack([df[features].values.T, np.ones(df.shape[0])]).T
res = np.linalg.lstsq(A, df[target].values)


In [5]:
np.dot(res[0].T, A.T)

array([[ 8.39167452,  8.50067115,  8.56568289, ...,  7.32076955,
         8.72807741,  8.77813947]])

In [6]:
np.dot(res[0].T, A.T)

array([[ 8.39167452,  8.50067115,  8.56568289, ...,  7.32076955,
         8.72807741,  8.77813947]])

In [7]:
res[0].T.shape, A.T.shape

((1, 4), (4, 4007))

In [10]:
res

(array([ -7.19356492e+07,   9.15796487e+07,  -1.96439965e+07,
          1.12194436e+08]),
 array([], dtype=float64),
 3,
 array([  3.60299720e+02,   8.58442905e+00,   1.08146712e-13,
          9.54306990e-15]))

<h1> k-fold Cross Validation </h1>

In [8]:
results_df = pd.DataFrame()

# specify the allometric models that we want to compare

models = {}
models['model_1'] = {
    'features': ['log_length'],
    'target': 'log_volume'
}
models['model_2'] = {
    'features': ['log_length', 'log_width', 'log_breadth'],
    'target': 'log_volume'
}

# specify the cross validation size
k = 10

# perform k-fold cross validation for all of the models
bucket_endpoints = np.linspace(0, df.shape[0], k + 1).round()
for model_name, model in models.items():
    features = model['features']
    target = model['target']
    for idx in range(len(bucket_endpoints)-1):
        # specify in-sample and out-of-sample subsets
        out_of_sample_mask = (df.index >= bucket_endpoints[idx]) & (df.index < bucket_endpoints[idx+1])
        in_sample_data = df.loc[~out_of_sample_mask].copy(deep=True)
        
        # train the model
        A_in_sample = np.vstack([in_sample_data[features].values.T, np.ones(in_sample_data.shape[0])]).T
        res = np.linalg.lstsq(A_in_sample, in_sample_data[target])
        
        # test the model on out of sample data (the performance metric we will use here is mean squared error)
        out_of_sample_data = df.loc[out_of_sample_mask].copy(deep=True)
        A_out_of_sample = np.vstack([out_of_sample_data[features].values.T, np.ones(out_of_sample_data.shape[0])]).T
        out_of_sample_data['prediction'] = np.dot(res[0].T, A_out_of_sample.T)[0]
        print 
        
        # compute mean squared eror
        mse = ((out_of_sample_data['prediction'] - out_of_sample_data[target])**2).mean()
        line_to_append = {
            'model_name': model_name,
            'features': features,
            'target': target,
            'mse': mse
        }
        results_df = results_df.append(line_to_append, ignore_index=True)
        
        
        

    
    


In [9]:
results_df

Unnamed: 0,features,model_name,mse,target
0,[log_length],model_1,1.216388,log_volume
1,[log_length],model_1,1.72725,log_volume
2,[log_length],model_1,1.901433,log_volume
3,[log_length],model_1,0.770773,log_volume
4,[log_length],model_1,1.294196,log_volume
5,[log_length],model_1,3.821907,log_volume
6,[log_length],model_1,2.336363,log_volume
7,[log_length],model_1,1.956853,log_volume
8,[log_length],model_1,0.974054,log_volume
9,[log_length],model_1,3.687536,log_volume
