In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn import preprocessing
from sklearn.linear_model import Lasso

In [2]:
# Load the dataset from v.csv into the working frame `v`.
v = pd.read_csv('v.csv')
# Parse the crush date strings into datetimes. The former
# `infer_datetime_format=True` argument is deprecated in pandas 2.x (it has
# no effect there and only triggers a warning), so it is no longer passed.
v['crushdate'] = pd.to_datetime(v['crushdate'])
# Show the last rows as a quick sanity check of the load.
v.tail()

Unnamed: 0,crushdate,year,month,day,pblk,avg_brix,tier,origin,long,lat,...,humidity_2,humidity_3,cloud_cover_1,cloud_cover_2,cloud_cover_3,maxtemp_1,maxtemp_2,maxtemp_3,earliest_date,time_diff
6098,2014-09-17,2014,9,17,138553,25.471429,VAL,CAL,-8469.2237,2653.3059,...,56.87029,-0.058875,47.576303,-6.902911,-7.489102,6729.152438,5.38641,-0.172832,2014-07-28,51 days 00:00:00.000000000
6099,2014-09-18,2014,9,18,102722,23.533333,OAK,LDT,-8506.0542,2681.2464,...,58.821047,1.169062,54.530384,-17.83442,-14.386652,6644.170037,9.770275,1.167371,2014-07-28,52 days 00:00:00.000000000
6100,2014-09-18,2014,9,18,131157,26.166667,VAL,CAL,-8475.6392,2657.1293,...,57.186324,0.197274,46.818752,-11.44135,-9.490292,6714.229281,3.716271,-3.304802,2014-07-28,52 days 00:00:00.000000000
6101,2014-09-18,2014,9,18,133581,27.275,OAK,LDT,-8507.8084,2685.452,...,58.561797,1.032702,54.900021,-17.776238,-17.790562,6661.151729,5.188015,-2.01681,2014-07-28,52 days 00:00:00.000000000
6102,2014-09-18,2014,9,18,138553,24.934783,VAL,CAL,-8469.2237,2653.3059,...,56.87029,-0.058875,47.576303,-6.902911,-7.489102,6729.152438,5.38641,-0.172832,2014-07-28,52 days 00:00:00.000000000


In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) of the avg_brix column.
v.loc[:, 'avg_brix'].describe()

count    6103.000000
mean       23.598565
std         1.592588
min        17.345000
25%        22.800000
50%        23.842857
75%        24.621429
max        29.500000
Name: avg_brix, dtype: float64

In [3]:
# Convert the textual time_diff values (e.g. "51 days 00:00:00.000000000")
# into an integer day count. The whole leading integer is extracted instead of
# slicing the first two characters, so gaps of 100 days or more are no longer
# truncated. Values that do not match the pattern become NaN, which makes
# astype(int) raise rather than silently producing a wrong number.
v['time_diff'] = (
    v['time_diff']
    .str.extract(r'^\s*(-?\d+)\s+days?\b', expand=False)
    .astype(int)
)
# 'day' and 'earliest_date' are not used in the rest of the notebook's
# preprocessing, so they are removed here.
v = v.drop(['day', 'earliest_date'], axis=1)

In [4]:
# Display the last five rows of the working frame.
v.tail(n=5)

Unnamed: 0,crushdate,year,month,pblk,avg_brix,tier,origin,long,lat,yesterday_ddays50,...,humidity_1,humidity_2,humidity_3,cloud_cover_1,cloud_cover_2,cloud_cover_3,maxtemp_1,maxtemp_2,maxtemp_3,time_diff
6098,2014-09-17,2014,9,138553,25.471429,VAL,CAL,-8469.2237,2653.3059,24.03,...,4.985406,56.87029,-0.058875,47.576303,-6.902911,-7.489102,6729.152438,5.38641,-0.172832,51
6099,2014-09-18,2014,9,102722,23.533333,OAK,LDT,-8506.0542,2681.2464,22.29,...,5.213777,58.821047,1.169062,54.530384,-17.83442,-14.386652,6644.170037,9.770275,1.167371,52
6100,2014-09-18,2014,9,131157,26.166667,VAL,CAL,-8475.6392,2657.1293,24.57,...,5.043957,57.186324,0.197274,46.818752,-11.44135,-9.490292,6714.229281,3.716271,-3.304802,52
6101,2014-09-18,2014,9,133581,27.275,OAK,LDT,-8507.8084,2685.452,23.02,...,5.189385,58.561797,1.032702,54.900021,-17.776238,-17.790562,6661.151729,5.188015,-2.01681,52
6102,2014-09-18,2014,9,138553,24.934783,VAL,CAL,-8469.2237,2653.3059,25.37,...,4.985406,56.87029,-0.058875,47.576303,-6.902911,-7.489102,6729.152438,5.38641,-0.172832,52


In [5]:
# Standardize continuous columns and create dummy columns for categorical ones.
#
# Column groups:
#   continuous  - numeric predictors that are rescaled with StandardScaler
#   categorical - columns replaced by one indicator column per distinct value
#   useless     - columns dropped before modelling
continuous = ['time_diff', 'long', 'lat', 'yesterday_ddays50', 'yesterday_ddays50_97', 'yesterday_ddays97',
              'yesterday_humidity', 'yesterday_cloud_cover', 'yesterday_pressure', 'yesterday_maxtemp',
              'yesterday_mintemp', 'week_ddays50', 'week_ddays50_97', 'week_ddays97', 'week_humidity',
              'week_cloud_cover', 'week_pressure', 'week_maxtemp', 'week_mintemp', 'month_ddays50',
              'month_ddays50_97', 'month_ddays97', 'month_humidity', 'month_cloud_cover', 'month_pressure',
              'month_maxtemp', 'month_mintemp', 'month3_ddays50', 'month3_ddays50_97', 'month3_ddays97',
              'month3_humidity', 'month3_cloud_cover', 'month3_pressure', 'month3_maxtemp', 'month3_mintemp',
              'ddays50_1', 'ddays50_2', 'ddays50_3', 'humidity_1', 'humidity_2', 'humidity_3', 'cloud_cover_1',
              'cloud_cover_2', 'cloud_cover_3', 'maxtemp_1', 'maxtemp_2', 'maxtemp_3']
categorical = ['year', 'month', 'origin', 'tier']
useless = ['crushdate', 'pblk']

for item in categorical:
    # One indicator column per distinct value, named "<column>_<value>".
    # dtype=float keeps the indicators numeric (0.0 / 1.0) regardless of the
    # pandas version's default dummy dtype.
    dummy = pd.get_dummies(v[item], prefix=item, dtype=float)
    # `DataFrame.ix` no longer exists in current pandas; the old
    # `dummy.ix[:, :]` selected every row and column, so joining `dummy`
    # directly is equivalent.
    v = v.join(dummy).drop(item, axis=1)

# Rescale each continuous column to zero mean and unit variance.
# The fitted scaler stays available as `scalar`.
scalar = preprocessing.StandardScaler()
v[continuous] = scalar.fit_transform(v[continuous])

v = v.drop(useless, axis=1)

In [6]:
# Display the first five rows of the preprocessed frame.
v.head(n=5)

Unnamed: 0,avg_brix,long,lat,yesterday_ddays50,yesterday_ddays50_97,yesterday_ddays97,yesterday_humidity,yesterday_cloud_cover,yesterday_pressure,yesterday_maxtemp,...,tier_MRU,tier_OAK,tier_OFX,tier_OKX,tier_PRE,tier_PRS,tier_RWC,tier_SVL,tier_TLV,tier_VAL
0,22.9,-0.609619,0.507389,-0.644272,-0.495267,-0.323883,0.906516,0.553765,1.556668,-0.431837,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,23.2,-0.601403,0.574836,-0.517227,-0.324215,-0.323883,0.597813,0.13596,1.468461,-0.289593,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,22.571429,-0.050168,-0.263187,-0.225024,0.069205,-0.323883,0.803615,-0.142577,1.37642,-0.166217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,21.675,0.40631,-0.675187,-0.019937,0.345332,-0.323883,0.597813,-0.281846,1.257532,-0.082032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,23.55,-0.350124,0.522309,-0.402887,-0.170268,-0.323883,0.700714,-0.142577,1.437781,-0.15896,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# List every column name currently in the frame.
list(v.columns)

['avg_brix',
 'long',
 'lat',
 'yesterday_ddays50',
 'yesterday_ddays50_97',
 'yesterday_ddays97',
 'yesterday_humidity',
 'yesterday_cloud_cover',
 'yesterday_pressure',
 'yesterday_maxtemp',
 'yesterday_mintemp',
 'week_ddays50',
 'week_ddays50_97',
 'week_ddays97',
 'week_humidity',
 'week_cloud_cover',
 'week_pressure',
 'week_maxtemp',
 'week_mintemp',
 'month_ddays50',
 'month_ddays50_97',
 'month_ddays97',
 'month_humidity',
 'month_cloud_cover',
 'month_pressure',
 'month_maxtemp',
 'month_mintemp',
 'month3_ddays50',
 'month3_ddays50_97',
 'month3_ddays97',
 'month3_humidity',
 'month3_cloud_cover',
 'month3_pressure',
 'month3_maxtemp',
 'month3_mintemp',
 'ddays50_1',
 'ddays50_2',
 'ddays50_3',
 'humidity_1',
 'humidity_2',
 'humidity_3',
 'cloud_cover_1',
 'cloud_cover_2',
 'cloud_cover_3',
 'maxtemp_1',
 'maxtemp_2',
 'maxtemp_3',
 'time_diff',
 'year_2001',
 'year_2002',
 'year_2003',
 'year_2004',
 'year_2005',
 'year_2006',
 'year_2007',
 'year_2008',
 'year_2009',
 'y

In [7]:
# Separate the regression target (avg_brix) from the predictor columns.
y = v.loc[:, 'avg_brix']
X = v.drop('avg_brix', axis=1)

# Fit an L1-regularised linear model; fit() returns the estimator itself.
model = Lasso(alpha=0.01).fit(X, y)

# Note: the score is computed on the same rows the model was fitted on.
r_squared = model.score(X, y)
print('R^2 score: ', r_squared)

R^2 score:  0.65822325792


In [8]:
# calculate mean absolute error for training set
y_pred = model.predict(X)
print('Mean absolute error: ', abs(y - y_pred).mean())
print('Mwan absolute percentage error: ', (abs(y - y_pred) / y).mean())

Mean absolute error:  0.717353627830613
Mwan absolute percentage error:  0.030766834182286682


In [52]:
# Pair each predictor name with its fitted Lasso coefficient.
coef = pd.DataFrame(
    {'feature': X.columns.tolist(), 'coef': model.coef_},
    columns=['feature', 'coef'],
)

print('Intercept: ', model.intercept_)

# Sort ascending so the most negative coefficients come first, then show the top 30 rows.
coef = coef.sort_values(by='coef')
coef.head(n=30)

Intercept:  23.8546442408


Unnamed: 0,feature,coef
77,tier_CHP,-2.070239
51,year_2005,-0.54132
65,origin_CAL,-0.467825
42,cloud_cover_3,-0.344833
92,tier_VAL,-0.318719
28,month3_ddays97,-0.169926
19,month_ddays50_97,-0.14025
5,yesterday_humidity,-0.101727
0,long,-0.101248
58,year_2012,-0.096878
