###  Do some often needed feature transformations and save.

One thing that's become apparent: if I want to try a bunch of models, it will be very helpful to pre-compute everything once and save it in a master modeling dataset.

Here's the set I want to end up with.  Don't lose original source fields; I might need them.

Before I save, I want to question whether I need to save out install_week and install_month.  I can see keeping install_date.
But I can't think of a reason to use the other two for modeling - sklearn can't deal with them (AFAIK).  I'll dispense with them for now.

1. install_date ==> num_days
1. install_week ==> num_weeks
1. install_month ==> num_months
1. size_kw ==> scaleSize
1. state ==> one-hot rep  -- actually, since it's very fast to do this, I'll leave state alone (saves a lot of i/o)
1. cost_per_watt gets left alone

In [1]:
import datetime

# the usual suspects
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.rc('figure', figsize=(10, 8))
np.set_printoptions(precision=4, suppress=False)
# please show all columns
pd.set_option("display.max_columns", 60)

import seaborn as sns
sns.set()

# Import sklearn stuff
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# Load the modeling dataset, indexed by row_id, with install_date parsed as datetimes.
dfMod = pd.read_csv(
    '../local/data/LBNL_openpv_tts_data/ModelData.csv',
    index_col='row_id',
    parse_dates=['install_date'],
    encoding='iso-8859-1',  # avoids windows encoding issue
)

In [3]:
# Some transformed fields are already present, i.e. scaleSize and num_days.
dfMod.head()

Unnamed: 0_level_0,num_days,size_kw,state,cost_per_watt,install_date,scaleSize
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,2.2824,CA,10.734315,1998-01-09,0
2,21,1.8504,CA,11.108701,1998-01-30,0
4,26,2.3076,CA,8.667013,1998-02-04,0
6,84,2.3316,CA,13.270286,1998-04-03,0
7,111,0.93,CA,14.654839,1998-04-30,0


In [4]:
# Capture the installation month for each row as a monthly Period.
# The vectorized .dt accessor replaces a per-row Python lambda.
month = dfMod.install_date.dt.to_period('M')

# Save in a new column (assign returns a new frame, so rebind dfMod to it).
dfMod = dfMod.assign(install_month=month)

In [26]:
# NOTE(review): this cell carries execution count 26, and its saved output already
# shows num_weeks/num_months, which are only created further down. It was run out
# of order; re-execute with Restart & Run All to refresh the output.
dfMod.head()

Unnamed: 0_level_0,num_days,num_weeks,num_months,size_kw,scaleSize,state,cost_per_watt,install_date
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.0,0.0,0.0,2.2824,0.0,CA,10.734315,1998-01-09
2,21.0,3.0,0.0,1.8504,0.0,CA,11.108701,1998-01-30
4,26.0,4.0,1.0,2.3076,0.0,CA,8.667013,1998-02-04
6,84.0,12.0,3.0,2.3316,0.0,CA,13.270286,1998-04-03
7,111.0,16.0,3.0,0.93,0.0,CA,14.654839,1998-04-30


In [6]:
# Capture the installation week for each row as a weekly Period, using the
# vectorized .dt accessor rather than a per-row Python lambda.
week = dfMod.install_date.dt.to_period('W')

In [7]:
# Save the weekly periods in a new install_week column and preview the result.
dfMod = dfMod.assign(install_week=week)
dfMod.head()

Unnamed: 0_level_0,num_days,size_kw,state,cost_per_watt,install_date,scaleSize,install_month,install_week
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,2.2824,CA,10.734315,1998-01-09,0,1998-01,1998-01-05/1998-01-11
2,21,1.8504,CA,11.108701,1998-01-30,0,1998-01,1998-01-26/1998-02-01
4,26,2.3076,CA,8.667013,1998-02-04,0,1998-02,1998-02-02/1998-02-08
6,84,2.3316,CA,13.270286,1998-04-03,0,1998-04,1998-03-30/1998-04-05
7,111,0.93,CA,14.654839,1998-04-30,0,1998-04,1998-04-27/1998-05-03


In [8]:
### go from install_week to number of weeks
# Baseline week = install week of the first row.
# NOTE(review): presumably the rows are sorted by install_date, making this the
# earliest week - confirm.
week0 = dfMod.install_week.iloc[0]

In [9]:
# Check the type of the baseline value (a pandas Period is expected).
type(week0)

pandas._libs.period.Period

In [10]:
# Weeks elapsed between each row's install week and the baseline week, as floats.
# Period.ordinal counts periods at the Period's own frequency, so a difference of
# ordinals is a plain integer week count. This avoids Period - Period subtraction,
# which returns DateOffset objects (not numbers) in newer pandas and so cannot be
# cast to float.
num_weeks = dfMod['install_week'].map(lambda p: p.ordinal) - week0.ordinal
# Builtin float replaces the np.float alias, which was removed from NumPy.
num_weeks = num_weeks.astype(float)
num_weeks[:5]

row_id
1     0.0
2     3.0
4     4.0
6    12.0
7    16.0
Name: install_week, dtype: float64

In [11]:
# Attach the week counts as a num_weeks column, then preview the frame.
dfMod = dfMod.assign(num_weeks=num_weeks)
dfMod.head()

Unnamed: 0_level_0,num_days,size_kw,state,cost_per_watt,install_date,scaleSize,install_month,install_week,num_weeks
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,2.2824,CA,10.734315,1998-01-09,0,1998-01,1998-01-05/1998-01-11,0.0
2,21,1.8504,CA,11.108701,1998-01-30,0,1998-01,1998-01-26/1998-02-01,3.0
4,26,2.3076,CA,8.667013,1998-02-04,0,1998-02,1998-02-02/1998-02-08,4.0
6,84,2.3316,CA,13.270286,1998-04-03,0,1998-04,1998-03-30/1998-04-05,12.0
7,111,0.93,CA,14.654839,1998-04-30,0,1998-04,1998-04-27/1998-05-03,16.0


In [12]:
### same thing with months
### go from install_month to number of months
# Baseline month = install month of the first row.
# NOTE(review): presumably the rows are sorted by install_date - confirm.
month0 = dfMod.install_month.iloc[0]

In [13]:
# Check the type of the baseline value (a pandas Period is expected).
type(month0)

pandas._libs.period.Period

In [14]:
# Months elapsed between each row's install month and the baseline month, as floats.
# Period.ordinal counts periods at the Period's own frequency, so a difference of
# ordinals is a plain integer month count. This avoids Period - Period subtraction,
# which returns DateOffset objects (not numbers) in newer pandas and so cannot be
# cast to float.
num_months = dfMod['install_month'].map(lambda p: p.ordinal) - month0.ordinal
# Builtin float replaces the np.float alias, which was removed from NumPy.
num_months = num_months.astype(float)
num_months[:5]

row_id
1    0.0
2    0.0
4    1.0
6    3.0
7    3.0
Name: install_month, dtype: float64

In [15]:
# Attach the month counts as a num_months column, then preview the frame.
dfMod = dfMod.assign(num_months=num_months)
dfMod.head()

Unnamed: 0_level_0,num_days,size_kw,state,cost_per_watt,install_date,scaleSize,install_month,install_week,num_weeks,num_months
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,2.2824,CA,10.734315,1998-01-09,0,1998-01,1998-01-05/1998-01-11,0.0,0.0
2,21,1.8504,CA,11.108701,1998-01-30,0,1998-01,1998-01-26/1998-02-01,3.0,0.0
4,26,2.3076,CA,8.667013,1998-02-04,0,1998-02,1998-02-02/1998-02-08,4.0,1.0
6,84,2.3316,CA,13.270286,1998-04-03,0,1998-04,1998-03-30/1998-04-05,12.0,3.0
7,111,0.93,CA,14.654839,1998-04-30,0,1998-04,1998-04-27/1998-05-03,16.0,3.0


In [16]:
# List the column names currently in the frame.
dfMod.columns

Index(['num_days', 'size_kw', 'state', 'cost_per_watt', 'install_date',
       'scaleSize', 'install_month', 'install_week', 'num_weeks',
       'num_months'],
      dtype='object')

In [17]:
### reorder columns; install_month and install_week are left out of the selection
dfMod = dfMod[[
    'num_days', 'num_weeks', 'num_months',
    'size_kw', 'scaleSize',
    'state', 'cost_per_watt', 'install_date',
]]

In [18]:
# Preview the frame after the column selection.
dfMod.head()

Unnamed: 0_level_0,num_days,num_weeks,num_months,size_kw,scaleSize,state,cost_per_watt,install_date
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,0.0,0.0,2.2824,0,CA,10.734315,1998-01-09
2,21,3.0,0.0,1.8504,0,CA,11.108701,1998-01-30
4,26,4.0,1.0,2.3076,0,CA,8.667013,1998-02-04
6,84,12.0,3.0,2.3316,0,CA,13.270286,1998-04-03
7,111,16.0,3.0,0.93,0,CA,14.654839,1998-04-30


In [19]:
# Cast num_days to float64 and store it back in the frame.
# Builtin float is used because the np.float alias was deprecated in NumPy 1.20
# and removed in 1.24; the resulting dtype (float64) is the same.
_days = dfMod.num_days.astype(float)
dfMod = dfMod.assign(num_days=_days)

In [20]:
# Cast scaleSize to float64 and store it back in the frame.
# Builtin float is used because the np.float alias was deprecated in NumPy 1.20
# and removed in 1.24; the resulting dtype (float64) is the same.
_scaleSize = dfMod.scaleSize.astype(float)
dfMod = dfMod.assign(scaleSize=_scaleSize)

In [21]:
# Write the master modeling dataset to CSV; the row_id index is written as well.
dfMod.to_csv('../local/data/LBNL_openpv_tts_data/model_data_2.csv')

In [22]:
# Read the saved file straight back in to check that it round-trips.
thing = pd.read_csv(
    '../local/data/LBNL_openpv_tts_data/model_data_2.csv',
    index_col='row_id',
    parse_dates=['install_date'],
)

In [23]:
# Preview the re-read copy.
thing.head()

Unnamed: 0_level_0,num_days,num_weeks,num_months,size_kw,scaleSize,state,cost_per_watt,install_date
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.0,0.0,0.0,2.2824,0.0,CA,10.734315,1998-01-09
2,21.0,3.0,0.0,1.8504,0.0,CA,11.108701,1998-01-30
4,26.0,4.0,1.0,2.3076,0.0,CA,8.667013,1998-02-04
6,84.0,12.0,3.0,2.3316,0.0,CA,13.270286,1998-04-03
7,111.0,16.0,3.0,0.93,0.0,CA,14.654839,1998-04-30


In [24]:
# Dtypes and non-null counts of the in-memory frame, to compare with the re-read copy.
dfMod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 364212 entries, 1 to 1094908
Data columns (total 8 columns):
num_days         364212 non-null float64
num_weeks        364212 non-null float64
num_months       364212 non-null float64
size_kw          364212 non-null float64
scaleSize        364212 non-null float64
state            364212 non-null object
cost_per_watt    364212 non-null float64
install_date     364212 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 25.0+ MB


In [25]:
# Same summary for the copy re-read from disk.
thing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 364212 entries, 1 to 1094908
Data columns (total 8 columns):
num_days         364212 non-null float64
num_weeks        364212 non-null float64
num_months       364212 non-null float64
size_kw          364212 non-null float64
scaleSize        364212 non-null float64
state            364212 non-null object
cost_per_watt    364212 non-null float64
install_date     364212 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 25.0+ MB


### some information about scale sizes that's needed when size_kw is a feature instead of scaleSize

In [28]:
# NOTE(review): repeats the earlier dfMod.info() cell; the saved output is identical.
dfMod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 364212 entries, 1 to 1094908
Data columns (total 8 columns):
num_days         364212 non-null float64
num_weeks        364212 non-null float64
num_months       364212 non-null float64
size_kw          364212 non-null float64
scaleSize        364212 non-null float64
state            364212 non-null object
cost_per_watt    364212 non-null float64
install_date     364212 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 25.0+ MB


In [30]:
# Number of rows in each scaleSize class.
dfMod.scaleSize.value_counts()

1.0    116541
2.0    111929
3.0     60571
0.0     36554
4.0     24431
5.0      7780
6.0      3310
7.0      1667
8.0       893
9.0       536
Name: scaleSize, dtype: int64

#### Median size in scaleSize 1, 2 and 3 (the biggest groups)

In [39]:
# Median size_kw within each of the three largest scaleSize classes.
[
    dfMod.loc[dfMod.scaleSize == size_class, 'size_kw'].median()
    for size_class in [1.0, 2.0, 3.0]
]

[3.84, 6.0, 8.5]

#### Overall median size

In [41]:
# Median and mean of size_kw over all rows, shown as a (median, mean) tuple.
(dfMod['size_kw'].median(), dfMod['size_kw'].mean())

(5.5, 6.042169246822802)