In [100]:
import pandas as pd
import numpy as np
import pickle

In [196]:
# Read the RECS 2015 public-use CSV into a DataFrame, using the DOEID column
# as the row index.
recs = pd.read_csv(
    'Datasets/recs2015_public_v4.csv',
    index_col='DOEID',
)

In [197]:
# Load the pickled object stored at Datasets/variable_map.pickle into var_map.
# NOTE(review): pickle.load can execute arbitrary code; presumably this file
# is project-generated and trusted — confirm.
with open('Datasets/variable_map.pickle', 'rb') as pickle_file:
    var_map = pickle.load(pickle_file)

In [198]:
# Load the pickled object stored at Datasets/variable_descriptions.pickle
# into var_des (used below as a mapping whose values are description strings).
# NOTE(review): pickle.load can execute arbitrary code; presumably this file
# is project-generated and trusted — confirm.
with open('Datasets/variable_descriptions.pickle', 'rb') as pickle_file:
    var_des = pickle.load(pickle_file)

In [199]:
# Load the pickled object stored at Datasets/variable_labels.pickle into
# var_labels (used below as a mapping whose values may or may not be strings).
# NOTE(review): pickle.load can execute arbitrary code; presumably this file
# is project-generated and trusted — confirm.
with open('Datasets/variable_labels.pickle', 'rb') as pickle_file:
    var_labels = pickle.load(pickle_file)

In [200]:
# Load the pickled object stored at Datasets/data_info.pickle into data_info
# (its 'data_type' and 'include' fields are read further down).
# NOTE(review): pickle.load can execute arbitrary code; presumably this file
# is project-generated and trusted — confirm.
with open('Datasets/data_info.pickle', 'rb') as pickle_file:
    data_info = pickle.load(pickle_file)

In [201]:
# Number of non-null values in each column. Despite its name, `nullcounts`
# holds counts of present values, not of nulls.
nullcounts = recs.count()
# Show only the columns with at least one missing value. Comparing against the
# frame's actual row count avoids hard-coding the dataset size (5686 rows).
nullcounts[nullcounts < len(recs)]

NGXBTU    3304
dtype: int64

In [202]:
# Number of rows whose USENG value equals 0 (counted from the boolean mask
# instead of materialising the filtered frame).
int((recs['USENG'] == 0).sum())

2382

In [203]:
# Print a summary of the frame: index range, column count, dtypes and memory use.
recs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5686 entries, 10001 to 15686
Columns: 758 entries, REGIONC to ZLPAMOUNT
dtypes: float64(262), int64(492), object(4)
memory usage: 32.9+ MB


In [204]:
# Mark every variable whose description contains the text 'Imputation'.
# NOTE(review): the mask is applied to recs by position, so this assumes
# var_des lists its variables in the same order as the columns of recs — confirm.
imputed_mask = pd.Series(
    ['Imputation' in description for description in var_des.values()],
    index=var_des.keys(),
)

# Split the column labels into flag columns and everything else.
all_cols = recs.columns
flag_cols = all_cols[imputed_mask.values]
non_flag_cols = all_cols[~imputed_mask.values]

In [205]:
# Tally the values found in each flag column (one row per distinct value, one
# column per flag); a value that never occurs in a column counts as 0.
# pd.Series.value_counts is used because the top-level pd.value_counts is
# deprecated in recent pandas releases.
# NOTE(review): .iloc[1] picks the second row by position; in the recorded
# output that row is labelled 0 ("Name: 0"), so these look like counts of flag
# value 0 rather than 1 — confirm this is the intended row.
imputed_counts = (
    recs[flag_cols]
    .apply(pd.Series.value_counts)
    .fillna(0)
    .iloc[1]
)
imputed_counts.describe()

count     217.000000
mean     4022.534562
std      1965.258396
min        48.000000
25%      2193.000000
50%      5066.000000
75%      5589.000000
max      5686.000000
Name: 0, dtype: float64

In [206]:
# In the flag columns, turn every 1 into NaN so those entries read as missing
# in the isnull() check performed on the flag columns later on.
flag_positions = imputed_mask.values
flag_block = recs.iloc[:, flag_positions]
recs.iloc[:, flag_positions] = flag_block.replace(to_replace=1, value=np.nan)

In [207]:
# Strip the leading character from each flag name and keep the results that
# are actual column names in recs.
imputed_cols = [
    data_col
    for data_col in (flag_name[1:] for flag_name in flag_cols)
    if data_col in all_cols
]

In [208]:
# Map each flag column to the data column it refers to (the flag name minus
# its leading character), keeping only flags whose data column exists in recs.
# The pairs are built directly from flag_cols so every flag stays aligned with
# its own column: zipping flag_cols against the already-filtered imputed_cols
# list would shift every pair after the first flag that has no matching column.
imputed_dict = {flag: flag[1:] for flag in flag_cols if flag[1:] in all_cols}

In [209]:
# Per-fuel column lists; the cells below attach each list to an amount flag
# and blank these columns out on rows where that flag is missing.
tng_cols = ['CUFEETNG', 'BTUNG']
el_cols = ['BTUEL', 'KWH']
# 'GALLONFO' was listed twice; each column only needs to appear once.
fo_cols = ['GALLONFO', 'BTUFO']
lp_cols = ['GALLONLP', 'BTULP']

In [210]:
# Point each fuel-amount flag at its list of columns, overwriting any entry
# already present for that flag.
imputed_dict.update({
    'ZELAMOUNT': el_cols,
    'ZFOAMOUNT': fo_cols,
    'ZLPAMOUNT': lp_cols,
    'ZNGAMOUNT': tng_cols,
})

In [211]:
# For every flag, set its associated data column(s) to NaN on the rows where
# the flag itself is missing.
for flag_name, targets in imputed_dict.items():
    flag_is_missing = recs[flag_name].isna()
    recs.loc[flag_is_missing, targets] = np.nan

In [212]:
# Mark every variable whose label is a plain str containing 'know';
# non-string labels are marked False.
dk_col_mask = pd.Series(
    [type(label) is str and 'know' in label for label in var_labels.values()],
    index=var_labels.keys(),
)

In [213]:
# Columns selected (by position) by the 'know' label mask.
dk_cols = recs.columns[dk_col_mask.values]
# Replace the -9 code with NaN in those columns.
dk_recoded = recs.loc[:, dk_cols].replace(to_replace=-9, value=np.nan)
recs.loc[:, dk_cols] = dk_recoded

In [214]:
# Integer codes for the text-valued columns: each category is numbered from 1
# in the order it is listed here.
metro_encoding = {
    label: code
    for code, label in enumerate(('METRO', 'MICRO', 'NONE'), start=1)
}
ua_encoding = {
    label: code
    for code, label in enumerate(('U', 'C', 'R'), start=1)
}
climate_encoding = {
    label: code
    for code, label in enumerate(
        ('Cold/Very Cold', 'Hot-Dry/Mixed-Dry', 'Hot-Humid', 'Mixed-Humid', 'Marine'),
        start=1,
    )
}
iecc_encoding = {
    label: code
    for code, label in enumerate(
        ('1A-2A', '2B', '3A', '3B-4B', '3C', '4A', '4C', '5A', '5B-5C', '6A-6B',
         '7A-7B-7AK-8AK'),
        start=1,
    )
}

In [215]:
# Swap the text categories in these four columns for their integer codes.
# Values that are not keys of the encoding dict are left as they are.
recs['METROMICRO'] = recs['METROMICRO'].replace(metro_encoding)
recs['UATYP10'] = recs['UATYP10'].replace(ua_encoding)
recs['CLIMATE_REGION_PUB'] = recs['CLIMATE_REGION_PUB'].replace(climate_encoding)
recs['IECC_CLIMATE_PUB'] = recs['IECC_CLIMATE_PUB'].replace(iecc_encoding)

In [216]:
# Mark every variable whose description contains the substring 'conv'.
conv_mask = pd.Series(
    ['conv' in description for description in var_des.values()],
    index=var_des.keys(),
)

In [217]:
# Column labels of recs selected (by position) by the 'conv' description mask.
conv_cols = recs.columns[conv_mask.values]

In [218]:
# Mark every variable whose label is a plain str containing 'Not applicable';
# non-string labels are marked False.
na_mask = pd.Series(
    [type(label) is str and 'Not applicable' in label for label in var_labels.values()],
    index=var_labels.keys(),
)
# Mark every variable that data_info lists with data type 'INTEGER'.
int_mask = data_info['data_type'] == 'INTEGER'
# In the columns selected by both masks, recode -2 as -1.
na_int_cols = na_mask & int_mask
recs.loc[:, na_int_cols] = recs.loc[:, na_int_cols].replace(-2, -1)

In [219]:
# Keep only the columns that data_info marks for inclusion, then summarise
# the reduced frame.
recs_slim = recs.loc[:, data_info['include']]
recs_slim.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5686 entries, 10001 to 15686
Columns: 300 entries, REGIONC to PERIODLP
dtypes: float64(252), int64(48)
memory usage: 13.1 MB


In [220]:
# How many columns were identified as flag columns.
flag_cols.size

217

In [221]:
# How many columns were not identified as flag columns.
non_flag_cols.size

541

In [222]:
# Print the summary of the reduced frame again (same call as in the cell that
# created recs_slim; nothing has modified the frame in between).
recs_slim.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5686 entries, 10001 to 15686
Columns: 300 entries, REGIONC to PERIODLP
dtypes: float64(252), int64(48)
memory usage: 13.1 MB


In [223]:
# Persist the reduced frame (included columns only) as a pickle file.
recs_slim.to_pickle('Datasets/recs.pickle')

In [224]:
# Persist the full cleaned frame (all columns) as a separate pickle file.
recs.to_pickle('Datasets/recs_large.pickle')