# Further Data Cleaning 

#### OpenPV Dataset

In [29]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import pickle
import fancyimpute
from pprint import pprint
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_validate
pd.set_option("max_columns", 100)

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [30]:
sns.set_style('darkgrid')

In [31]:
with open("pv_data.pkl", "rb") as f:
    df = pickle.load(f)

In [32]:
df.drop(['tech_1', 'model1_clean'], axis=1, inplace=True)

In [33]:
df.zipcode.nunique()

11780

In [34]:
# list(zip(df.groupby("state")['date_installed'].count(), df.groupby("state")['annual_PV_prod'].count()))

In [35]:
df.annual_PV_prod.median()

8327.2

In [36]:
df.shape

(742808, 37)

In [37]:
# df.groupby('state')[['annual_PV_prod', 'annual_insolation']].median()

In [38]:
def adjust_inflation(amount, start_year, end_year):
    '''
    Returns the amount in dollars when adjusting
    for inflation from start_year to end_year.
    Amount must be an integer of float. 
    Start_year and end_year inputs must be strings.
    '''
    cpi_dict={
     '1980': '78.0',
     '1981': '87.2',
     '1982': '94.4',
     '1983': '97.9',
     '1984': '102.1',
     '1985': '105.7',
     '1986': '109.9',
     '1987': '111.4',
     '1988': '116.0',
     '1989': '121.2',
     '1990': '127.5',
     '1991': '134.7',
     '1992': '138.3',
     '1993': '142.8',
     '1994': '146.3',
     '1995': '150.5',
     '1996': '154.7',
     '1997': '159.4',
     '1998': '162.0',
     '1999': '164.7',
     '2000': '169.3',
     '2001': '175.6',
     '2002': '177.7',
     '2003': '182.600',
     '2004': '186.300',
     '2005': '191.600',
     '2006': '199.300',
     '2007': '203.437',
     '2008': '212.174',
     '2009': '211.933',
     '2010': '217.488',
     '2011': '221.187',
     '2012': '227.842',
     '2013': '231.679',
     '2014': '235.347',
     '2015': '234.836',
     '2016': '237.990',
     '2017': '244.028',
     '2018': '249.245'}

    return float("{0:.2f}".format(amount*(float(cpi_dict[end_year])/float(cpi_dict[start_year]))))

In [39]:
adjust_inflation(400, '1980', '2018')

1278.18

In [40]:
cost_df = df[['cost', 'year']]
cost_df['end_year'] = '2018'
cost_df['year'] = cost_df['year'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [41]:
adj_cost = []
for cost, year, end_year in list(zip(cost_df.cost, cost_df.year, cost_df.end_year)):
    adj_cost.append(adjust_inflation(cost, year, end_year))

In [42]:
df['adjusted_cost'] = pd.Series(adj_cost)

In [43]:
costperw_df = df[['cost_per_watt', 'year']]
costperw_df['end_year'] = '2018'
costperw_df['year'] = cost_df['year'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [44]:
adj_cost_per_watt = []
for cost_per_watt, year, end_year in list(zip(costperw_df.cost_per_watt, costperw_df.year, costperw_df.end_year)):
    adj_cost_per_watt.append(adjust_inflation(cost_per_watt, year, end_year))

In [45]:
df['adj_cost_per_watt'] = pd.Series(adj_cost_per_watt)

In [46]:
df.columns.tolist()

['state',
 'date_installed',
 'incentive_prog_names',
 'type',
 'size_kw',
 'zipcode',
 'install_type',
 'installer',
 'cost_per_watt',
 'cost',
 'lbnl_tts',
 'city',
 'utility_clean',
 'county',
 'annual_PV_prod',
 'annual_insolation',
 'rebate',
 'sales_tax_cost',
 'Agricultural',
 'Commercial',
 'Educational',
 'Government',
 'Nonprofit',
 'Utility',
 'month',
 'year',
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 'adjusted_cost',
 'adj_cost_per_watt']

In [47]:
# Rearranging order of columns
solar_columns = df.columns.tolist()
solar_columns = solar_columns[:10] + solar_columns[-2:] + solar_columns[10:37]

In [48]:
df = df[solar_columns]

In [49]:
time_df = df.groupby('year')['adjusted_cost'].median()

In [51]:
# df.groupby(['state', 'zipcode'])['adjusted_cost'].median()

In [52]:
# df[['incentive_prog_names', 'installer', 'utility_clean']]
# df.incentive_prog_names.value_counts()

In [53]:
df_copy = df.copy()

# NEW PICKLE

In [54]:
with open("pv_data_2.pkl", "wb") as f:
    pickle.dump(df_copy, f)

In [55]:
with open("pv_data_2.pkl", "rb") as f:
    df_copy = pickle.load(f)

# Imputation of Missing Values in `annual_PV_prod` and `annual_insolation`

In [56]:
df_copy.annual_PV_prod.isnull().sum()

202568

In [57]:
df_copy.drop(['cost', 'cost_per_watt'], axis=1, inplace=True)

In [58]:
ann_pv = df_copy[['size_kw', 'annual_PV_prod']]
#ann_pv

In [1]:
res = fancyimpute.MICE(init_fill_method='median').complete(ann_pv)

In [60]:
newdf = pd.DataFrame(res)
annual_pv_imputed = pd.Series(newdf[1])

In [61]:
df_copy['annual_pv_imputed'] = annual_pv_imputed

In [62]:
df_copy['annual_pv_imputed'] = abs(df_copy['annual_pv_imputed'])

In [63]:
df_copy.annual_pv_imputed.isnull().sum()

0

In [64]:
print(f"Annual PV Produced Median (With NaNs): {df_copy.annual_PV_prod.median()}")
print(f"Annual PV Produced Median (Imputed): {df_copy.annual_pv_imputed.median()}")

Annual PV Produced Median (With NaNs): 8327.2
Annual PV Produced Median (Imputed): 8305.638518498721


In [2]:
pd.set_option('max_rows', 100)
df_copy[['zipcode', 'adjusted_cost', 'size_kw', 'annual_PV_prod', 'annual_pv_imputed']]

In [67]:
# Dummy variables for the States 
state_dummies = pd.get_dummies(df_copy['state'])
del state_dummies['CA']
df_copy = pd.concat([df_copy, state_dummies], axis=1)

In [68]:
#df_copy

In [69]:
zipcode_counts = df_copy.groupby('zipcode')['date_installed'].count()
zipcode_counts[zipcode_counts > 0].sort_values()

zipcode
00000       1
38606       1
38611       1
38673       1
38677       1
38801       1
38866       1
38916       1
38930       1
39057       1
39074       1
39090       1
39094       1
38583       1
39110       1
39145       1
39149       1
39170       1
39232       1
39305       1
39307       1
39341       1
39464       1
39482       1
39648       1
39667       1
39701       1
39111       1
39730       1
38542       1
38476       1
37135       1
37153       1
37167       1
37205       1
37321       1
37356       1
37367       1
37391       1
37397       1
37405       1
37421       1
37642       1
38483       1
37650       1
37774       1
37777       1
37814       1
37821       1
37862       1
         ... 
95404    1226
92234    1226
92336    1228
92270    1230
95060    1237
94566    1244
92131    1248
94558    1250
93720    1258
93722    1261
92040    1284
92129    1292
85379    1309
91350    1314
92584    1380
92262    1385
95765    1395
85326    1403
95037    1407
93727    140

In [70]:
labels_to_drop = df_copy[(df_copy.year == 1983) | (df_copy.year == 1996) | (df_copy.year == 1995)].index.tolist()

In [71]:
df_copy.drop(labels_to_drop, inplace=True)

In [72]:
df_copy.year.value_counts()

2015    186427
2012    103864
2011     75802
2013     75776
2010     72482
2014     71955
2009     48416
2008     28571
2007     26656
2006     16127
2004     10019
2005      9858
2003      6637
2002      5024
2001      2722
1999      1172
2000       497
2016       333
2017       309
1998        84
1997        41
2018        32
Name: year, dtype: int64

# Encode `type` column 

### uir = 0 
### pir = 1

In [73]:
df_copy.type.replace('uir', 0, inplace=True)
df_copy.type.replace('pir', 1, inplace=True)

In [75]:
# df_copy.corr()

# Updated Pickle

In [77]:
 with open("pv_data_2.pkl", "wb") as f:
    pickle.dump(df_copy, f)

In [78]:
with open("pv_data_2.pkl", "rb") as f:
    df_copy = pickle.load(f)