In [1]:
%matplotlib inline

In [2]:
# Numerical / tabular libraries used throughout the notebook.
import numpy as np
import pandas as pd
# Seed NumPy's global random number generator.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default matplotlib figure size of 10 x 6.
plt.rc('figure', figsize=(10, 6))
# Print NumPy floats with 4 digits of precision; suppress=False leaves
# scientific notation available for very small values.
np.set_printoptions(precision=4, suppress=False)
# Show up to 60 DataFrame columns before pandas truncates the display.
pd.set_option("display.max_columns", 60)
import seaborn as sns
# Apply seaborn's default styling to matplotlib.
# NOTE(review): depending on the seaborn version this call may also reset the
# figure size configured via plt.rc above -- confirm if figure size matters.
sns.set()

In [3]:
# Load the raw extract into `dftts`.
#   * -9999 is treated as a missing-value sentinel,
#   * install_date is parsed into datetimes,
#   * malformed lines are skipped with a warning instead of aborting the read.
# NOTE(review): error_bad_lines / warn_bad_lines were deprecated in pandas 1.3
# and removed in 2.0 (replacement: on_bad_lines='warn'); they are kept here so
# the cell still runs on the pandas version this notebook was written for.
dftts = pd.read_csv(
    '../local/data/LBNL_openpv_tts_data/live_20180118',
    encoding='utf-8',
    na_values=[-9999],
    parse_dates=['install_date'],
    warn_bad_lines=True,
    error_bad_lines=False,
)
dftts.head(n=5)

Unnamed: 0.1,Unnamed: 0,row_id,file_row,data_provider,sysid_dp,sysid_tts,install_date,size_kw,price,appraised_value,cust_type,new_const,tracking,ground_mounted,battery,zipcode,city,county,state,third-party,uinverter,dc_optimizer,cost_per_watt
0,0,1078236,20378236,Austin Energy,4,TX_AE_6232,1998-01-01,100.0,,False,GOV,,,,,,,,TX,0.0,,,
1,1,108220,10108220,California Public Utilities Commission (Curren...,PGE-INT-11328 & CA_ERP_24698,CA-NEM-12257,1998-01-09,2.2824,24500.0,False,RES,0.0,,,,94107.0,San Francisco,San Francisco,CA,,,,10.734315
2,2,107162,10107162,California Public Utilities Commission (Curren...,PGE-INT-11220 & CA_ERP_24687,CA-NEM-11180,1998-01-30,1.8504,20555.54,False,RES,0.0,,,,95949.0,Nevada City,Nevada,CA,,,,11.108701
3,3,969877,20269877,Public Service Company of New Mexico,Qfs<10kW #1,NM-PNM-496,1998-02-01,1.1,,False,RES,,,,,87505.0,Santa Fe,,NM,,,,
4,4,107618,10107618,California Public Utilities Commission (Curren...,PGE-INT-11268 & CA_ERP_24540,CA-NEM-11641,1998-02-04,2.3076,20000.0,False,RES,0.0,,,,94518.0,Concord,Contra Costa,CA,,,0.0,8.667013


#### Get rid of useless columns and rename the index for future reading

In [5]:
# Remove the two columns this notebook treats as useless. Rebinding the name
# (rather than mutating in place) keeps the step explicit.
dftts = dftts.drop(columns=['Unnamed: 0', 'row_id'])

In [6]:
# Label the index 'row_id' so it is written out (and read back) under that name.
dftts = dftts.rename_axis('row_id', axis='index')

In [7]:
# Preview the first five rows after dropping columns and naming the index.
dftts.head(n=5)

Unnamed: 0_level_0,file_row,data_provider,sysid_dp,sysid_tts,install_date,size_kw,price,appraised_value,cust_type,new_const,tracking,ground_mounted,battery,zipcode,city,county,state,third-party,uinverter,dc_optimizer,cost_per_watt
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,20378236,Austin Energy,4,TX_AE_6232,1998-01-01,100.0,,False,GOV,,,,,,,,TX,0.0,,,
1,10108220,California Public Utilities Commission (Curren...,PGE-INT-11328 & CA_ERP_24698,CA-NEM-12257,1998-01-09,2.2824,24500.0,False,RES,0.0,,,,94107.0,San Francisco,San Francisco,CA,,,,10.734315
2,10107162,California Public Utilities Commission (Curren...,PGE-INT-11220 & CA_ERP_24687,CA-NEM-11180,1998-01-30,1.8504,20555.54,False,RES,0.0,,,,95949.0,Nevada City,Nevada,CA,,,,11.108701
3,20269877,Public Service Company of New Mexico,Qfs<10kW #1,NM-PNM-496,1998-02-01,1.1,,False,RES,,,,,87505.0,Santa Fe,,NM,,,,
4,10107618,California Public Utilities Commission (Curren...,PGE-INT-11268 & CA_ERP_24540,CA-NEM-11641,1998-02-04,2.3076,20000.0,False,RES,0.0,,,,94518.0,Concord,Contra Costa,CA,,,0.0,8.667013


#### Write it out. `row_id` is now the index.

In [12]:
# Persist the trimmed frame as UTF-8 CSV; the index is written as a column too.
dftts.to_csv(
    '../local/data/LBNL_openpv_tts_data/live20180119.csv',
    encoding='utf-8',
)

#### Read it back in to make sure all is well.

In [18]:
# Round-trip check: reload the file that was just written, using row_id as the
# index again. zipcode is read as object (text) instead of letting pandas infer
# a numeric column.
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24; both name the same dtype.
# Note: install_date is not re-parsed here, so it comes back as object dtype.
dftts = pd.read_csv('../local/data/LBNL_openpv_tts_data/live20180119.csv',
                    index_col='row_id',
                    encoding='utf-8',
                    dtype={'zipcode': object})

  mask |= (ar1 == a)


In [19]:
# Preview the first five rows of the reloaded frame.
dftts.head(n=5)

Unnamed: 0_level_0,file_row,data_provider,sysid_dp,sysid_tts,install_date,size_kw,price,appraised_value,cust_type,new_const,tracking,ground_mounted,battery,zipcode,city,county,state,third-party,uinverter,dc_optimizer,cost_per_watt
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,20378236,Austin Energy,4,TX_AE_6232,1998-01-01,100.0,,False,GOV,,,,,,,,TX,0.0,,,
1,10108220,California Public Utilities Commission (Curren...,PGE-INT-11328 & CA_ERP_24698,CA-NEM-12257,1998-01-09,2.2824,24500.0,False,RES,0.0,,,,94107.0,San Francisco,San Francisco,CA,,,,10.734315
2,10107162,California Public Utilities Commission (Curren...,PGE-INT-11220 & CA_ERP_24687,CA-NEM-11180,1998-01-30,1.8504,20555.54,False,RES,0.0,,,,95949.0,Nevada City,Nevada,CA,,,,11.108701
3,20269877,Public Service Company of New Mexico,Qfs<10kW #1,NM-PNM-496,1998-02-01,1.1,,False,RES,,,,,87505.0,Santa Fe,,NM,,,,
4,10107618,California Public Utilities Commission (Curren...,PGE-INT-11268 & CA_ERP_24540,CA-NEM-11641,1998-02-04,2.3076,20000.0,False,RES,0.0,,,,94518.0,Concord,Contra Costa,CA,,,0.0,8.667013


#### Looks good

#### What else needs to happen?

1.  Think about NaNs.  What do they mean in the pseudo-Boolean columns (*new_const, tracking, ground_mounted, battery, third-party, uinverter, dc_optimizer*)?  In the numeric columns?
    * Resolved in the conversion cells further down: these columns are converted to boolean.  A value is True only where the original value is 1.0; False otherwise (0.0 or NaN).

1. There are 250k unpriced installations.  Those don't need to be here.

1. cust_type: there are 45k installs with no type.  Don't want to keep those.

1. zipcode: roughly 35k installs are missing a zipcode (city and county are missing even more values, while every row has a value for state).  Don't want to keep those.  We can hope that deleting the rows with no zipcode will clean up city and county too.

Okay, that seems to cover missing data.  Delete the rows called out above.


In [49]:
# Summary statistics using describe()'s default column selection.
dftts.describe()

Unnamed: 0,file_row,size_kw,price,cost_per_watt
count,1094909.0,1094909.0,825792.0,825792.0
mean,13901750.0,11.77723,339592.8,67739480000.0
std,4731976.0,81.32362,244462200.0,61556960000000.0
min,10000000.0,9.768e-16,0.01,2.184503e-09
25%,10273730.0,4.07,19360.14,4.0
50%,10547450.0,5.836735,27930.0,4.914778
75%,20121180.0,8.25,39781.06,5.82
max,20394910.0,9001.656,222000000000.0,5.593872e+16


In [50]:
# Summarise the object (string-like) columns: count, unique, top, freq.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24; it selects the same columns.
dftts.describe(include=[object])

Unnamed: 0,data_provider,sysid_dp,sysid_tts,install_date,cust_type,zipcode,city,county,state
count,1094909,1033970,1094909,1094909,1049192,1059502.0,1054671,846825,1094909
unique,66,1025242,1094909,5512,7,11756.0,12842,582,26
top,California Public Utilities Commission (Curren...,18,CA-NEM-490195,2016-01-19,RES,93312.0,SAN DIEGO,SAN DIEGO,CA
freq,405758,14,1,1637,1005492,4058.0,30845,95960,652529


In [51]:
# Summarise the boolean columns: count, unique, top, freq.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24; it selects the same columns.
dftts.describe(include=[bool])

Unnamed: 0,appraised_value,new_const,tracking,ground_mounted,battery,third-party,uinverter,dc_optimizer
count,1094909,1094909,1094909,1094909,1094909,1094909,1094909,1094909
unique,2,2,2,2,2,2,2,2
top,False,False,False,False,False,False,False,False
freq,775331,1060365,1091464,1081225,1094730,636050,871631,946179


In [22]:
# Frequency of each customer type (missing values are not counted).
dftts['cust_type'].value_counts()

RES           1005492
COM             24949
NON-RES          9408
GOV              4516
SCHOOL           2383
NON-PROFIT       2371
TAX-EXEMPT         73
Name: cust_type, dtype: int64

In [62]:
# Per-column dtype and non-null count for the whole frame.
dftts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1094909 entries, 0 to 1094908
Data columns (total 21 columns):
file_row           1094909 non-null int64
data_provider      1094909 non-null object
sysid_dp           1033970 non-null object
sysid_tts          1094909 non-null object
install_date       1094909 non-null object
size_kw            1094909 non-null float64
price              825792 non-null float64
appraised_value    1094909 non-null bool
cust_type          1049192 non-null object
new_const          1094909 non-null bool
tracking           1094909 non-null bool
ground_mounted     1094909 non-null bool
battery            1094909 non-null bool
zipcode            1059502 non-null object
city               1054671 non-null object
county             846825 non-null object
state              1094909 non-null object
third-party        1094909 non-null bool
uinverter          1094909 non-null bool
dc_optimizer       1094909 non-null bool
cost_per_watt      825792 non-null float64
d

In [23]:
# Distribution of the new_const flag (missing values are not counted).
dftts['new_const'].value_counts()

0.0    314867
1.0     34544
Name: new_const, dtype: int64

#### For the pseudo-Boolean columns (*new_const, tracking, ground_mounted, battery, third-party, uinverter, dc_optimizer*), I think the right thing is to assume NA means False.  Each is a characteristic that is True if and only if the column contains 1.0; otherwise (0.0 or NaN) it is False.

In [31]:
# Treat a missing flag as "not set": replace NaN with 0 in every pseudo-Boolean
# column.
# The result is assigned back to the frame instead of calling
# `dftts[col].fillna(..., inplace=True)`: that form mutates an intermediate
# Series, which warns on recent pandas and silently leaves `dftts` unchanged
# under Copy-on-Write.
for col in ['new_const', 'tracking', 'ground_mounted', 'battery',
            'third-party', 'uinverter', 'dc_optimizer']:
    dftts[col] = dftts[col].fillna(0)

In [42]:
# Cast the pseudo-Boolean columns to real booleans (non-zero -> True, 0 -> False).
# The builtin `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
# Caution: NaN casts to True, so this relies on the missing values in these
# columns having already been filled with 0.
for col in ['new_const', 'tracking', 'ground_mounted', 'battery',
            'third-party', 'uinverter', 'dc_optimizer']:
    dftts[col] = dftts[col].astype(bool)

In [43]:
# Check the dtypes: the flag columns should now all be reported as bool.
dftts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1094909 entries, 0 to 1094908
Data columns (total 21 columns):
file_row           1094909 non-null int64
data_provider      1094909 non-null object
sysid_dp           1033970 non-null object
sysid_tts          1094909 non-null object
install_date       1094909 non-null object
size_kw            1094909 non-null float64
price              825792 non-null float64
appraised_value    1094909 non-null bool
cust_type          1049192 non-null object
new_const          1094909 non-null bool
tracking           1094909 non-null bool
ground_mounted     1094909 non-null bool
battery            1094909 non-null bool
zipcode            1059502 non-null object
city               1054671 non-null object
county             846825 non-null object
state              1094909 non-null object
third-party        1094909 non-null bool
uinverter          1094909 non-null bool
dc_optimizer       1094909 non-null bool
cost_per_watt      825792 non-null float64
d

In [61]:
# Report how many rows are True in each boolean column.
# Summing a boolean Series counts its True entries directly. This avoids the
# `== True` comparison and building a filtered copy of the whole frame per
# column just to take its length. The printed output is unchanged.
for col in ['appraised_value', 'new_const', 'tracking', 'ground_mounted', 'battery',
            'third-party', 'uinverter', 'dc_optimizer']:
    yes = int(dftts[col].sum())
    print('{:>15}: {:>8} True'.format(col, yes))

appraised_value:   319578 True
      new_const:    34544 True
       tracking:     3445 True
 ground_mounted:    13684 True
        battery:      179 True
    third-party:   458859 True
      uinverter:   223278 True
   dc_optimizer:   148730 True


In [48]:
# Number of installations per state, most frequent first.
dftts['state'].value_counts()

CA    652529
AZ     91584
MA     67462
NJ     66995
NY     64731
CO     33460
NV     21252
CT     19519
TX     14936
NM     11996
MD      8843
PA      7333
MO      6841
VT      3956
NH      3538
OR      3036
DE      2981
DC      2708
WI      2627
OH      2238
FL      2167
MN      1345
IL      1207
UT       965
ME       555
AR       105
Name: state, dtype: int64

In [64]:
# Keep only the rows that have a price, a customer type and a zipcode; a row is
# dropped if any one of the three is missing. `dftts` itself is left untouched.
ttsClean = dftts.dropna(
    subset=['price', 'cust_type', 'zipcode'],
    how='any',
    axis='index',
)

In [65]:
# Per-column dtype and non-null count for the cleaned frame.
ttsClean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 786580 entries, 1 to 1094908
Data columns (total 21 columns):
file_row           786580 non-null int64
data_provider      786580 non-null object
sysid_dp           772671 non-null object
sysid_tts          786580 non-null object
install_date       786580 non-null object
size_kw            786580 non-null float64
price              786580 non-null float64
appraised_value    786580 non-null bool
cust_type          786580 non-null object
new_const          786580 non-null bool
tracking           786580 non-null bool
ground_mounted     786580 non-null bool
battery            786580 non-null bool
zipcode            786580 non-null object
city               782628 non-null object
county             602759 non-null object
state              786580 non-null object
third-party        786580 non-null bool
uinverter          786580 non-null bool
dc_optimizer       786580 non-null bool
cost_per_watt      786580 non-null float64
dtypes: bool(8), flo

In [68]:
# Summary statistics for the numeric columns of the cleaned frame.
ttsClean.describe(include=[np.number])

Unnamed: 0,file_row,size_kw,price,cost_per_watt
count,786580.0,786580.0,786580.0,786580.0
mean,13760650.0,11.03484,353846.7,71116380000.0
std,4685861.0,69.27191,250481200.0,63072650000000.0
min,10000000.0,9.768e-16,0.01,2.184503e-09
25%,10298810.0,4.08,19428.65,3.99387
50%,10550570.0,5.87,27955.2,4.92
75%,20118330.0,8.25,39892.69,5.817021
max,20394910.0,7569.0,222000000000.0,5.593872e+16


In [69]:
# Summarise the boolean columns of the cleaned frame.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24; it selects the same columns.
ttsClean.describe(include=[bool])

Unnamed: 0,appraised_value,new_const,tracking,ground_mounted,battery,third-party,uinverter,dc_optimizer
count,786580,786580,786580,786580,786580,786580,786580,786580
unique,2,2,2,2,2,2,2,2
top,False,False,False,False,False,False,False,False
freq,556275,756513,783355,773882,786418,434114,624869,662043


In [70]:
# Summarise the object (string-like) columns of the cleaned frame.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24; it selects the same columns.
ttsClean.describe(include=[object])

Unnamed: 0,data_provider,sysid_dp,sysid_tts,install_date,cust_type,zipcode,city,county,state
count,786580,772671,786580,786580,786580,786580.0,782628,602759,786580
unique,52,765891,786580,5436,7,9730.0,9506,477,19
top,California Public Utilities Commission (Curren...,21,CA-NEM-485581,2015-09-30,RES,95762.0,SAN DIEGO,SAN DIEGO,CA
freq,229489,10,1,1511,755954,2655.0,22736,69828,466987


In [73]:
# Persist the cleaned frame as UTF-8 CSV (index included).
ttsClean.to_csv(
    '../local/data/LBNL_openpv_tts_data/ttsclean20180119.csv',
    encoding='utf-8',
)