# Data Wrangling
1. Data Collection 
2. Data Definition
3. Data Cleaning

## 1. Data Collection

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Importing raw data
#
# The data directory defaults to the original local checkout, but it can be
# redirected by setting the BBNP_DATA_DIR environment variable so the notebook
# runs on another machine without editing this cell.

data_dir = os.environ.get(
    'BBNP_DATA_DIR',
    'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data',
)

measures_file = f'{data_dir}/raw/bbnp_installed_measures_res_sf.csv'
project_file = f'{data_dir}/raw/bbnp_retrofit_project_table_res_sf.csv'

# low_memory=False parses each file in a single pass so that column dtypes are
# inferred consistently instead of chunk by chunk.
measures_df = pd.read_csv(measures_file, index_col=None, low_memory=False)
project_df = pd.read_csv(project_file, index_col=None, low_memory=False)

In [3]:
# (rows, columns) of each raw table, measures first and projects second
tuple(frame.shape for frame in (measures_df, project_df))

((75110, 89), (75110, 31))

In [4]:
# Column names, non-null counts and dtypes of the installed-measures table
measures_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75110 entries, 0 to 75109
Data columns (total 89 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   AWARDEENUMBER                75110 non-null  int64  
 1   PROJECTID                    75110 non-null  int64  
 2   INST_FURNACE                 75110 non-null  int64  
 3   INST_FURNACE_AFUE            3478 non-null   float64
 4   INST_FURNACE_FUEL            3629 non-null   object 
 5   INST_BOILER                  75110 non-null  int64  
 6   INST_BOILER_AFUE             963 non-null    float64
 7   INST_BOILER_FUEL             1062 non-null   object 
 8   INST_WOOD_STOVE              75110 non-null  int64  
 9   INST_WATER_HEATER            75110 non-null  int64  
 10  INST_WATER_HEATER_FUEL       1835 non-null   object 
 11  INST_WATER_HEATER_EF         976 non-null    float64
 12  INST_HEAT_PUMP               75110 non-null  int64  
 13  INST_HEAT_PUMP_T

In [5]:
# Column names, non-null counts and dtypes of the retrofit-project table
project_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75110 entries, 0 to 75109
Data columns (total 31 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   AWARDEENUMBER                           75110 non-null  int64  
 1   PROJECTID                               75110 non-null  int64  
 2   RETROFITJOBHOURS                        63670 non-null  float64
 3   RETROFITINVOICEDCOST                    66281 non-null  float64
 4   BUILDINGACTIVITYTYPE                    71212 non-null  float64
 5   OCCUPANCY                               54373 non-null  float64
 6   ISLOWINCOME                             32566 non-null  object 
 7   FLOORAREA                               75110 non-null  object 
 8   FLOORAREAUNIT                           58863 non-null  object 
 9   ISDIRECTINSTALLATIONUSED                56195 non-null  object 
 10  LOANAMOUNT                              15118 non-null  fl

In [13]:
# Number of distinct PROJECTID values in each table; compare against the row
# counts to confirm that every project appears exactly once per table.
tuple(frame['PROJECTID'].nunique() for frame in (measures_df, project_df))

(75110, 75110)

In [8]:
# Consolidating data
#
# Join the two tables on PROJECTID. validate='one_to_one' makes pandas raise a
# MergeError if the key is duplicated on either side, instead of silently
# multiplying rows. The identifier columns are dropped afterwards because they
# are only needed for the join (AWARDEENUMBER exists in both tables, hence the
# _x/_y suffixes added by the merge).

df = (
    pd.merge(
        measures_df,
        project_df,
        how='inner',
        on='PROJECTID',
        validate='one_to_one',
    )
    .drop(columns=['AWARDEENUMBER_x', 'AWARDEENUMBER_y', 'PROJECTID'])
)

**Many variables are not useful for our analysis at this point:**
1. Project information that is not relevant.
2. Installed retrofit measures that are undefined.
3. Building conditioning systems and construction elements that were removed (the `RMV` columns).

In [11]:
# Drop columns that carry little or no useful information (cases 1 and 2)

columns2drop = [
    'STATE_BASED_ON',
    'FLOORAREAUNIT',
    'RETROFITSTARTMONTHYEAR',
    'RETROFITCOMPLETIONMONTHYEAR',
    'AUDITCOMPLETIONMONTHYEAR',
    'LOANAPPROVALMONTHYEAR',
    'INST_CORE_ENERGY',
    'INST_HEALTH_SAFETY',
    'REP_WINDOWS',
    'LOANAMOUNT',
    'DAYSFROMRETROFITSTARTTOCOMPLETION',
    'DAYSFROMAUDITCOMPLETIONTORETROFITSTART',
]

df = df.drop(columns=columns2drop)

In [9]:
# Drop every column whose name starts with 'RMV' (case 3)

columns2drop = [label for label in df.columns if label.startswith('RMV')]
df = df.drop(columns=columns2drop)

In [12]:
# (rows, columns) of the consolidated frame after the column removals
df.shape

(75110, 88)

In [13]:
# Saving consolidated data
#
# The data directory defaults to the original local checkout, but it can be
# redirected by setting the BBNP_DATA_DIR environment variable so the notebook
# runs on another machine without editing this cell.

data_dir = os.environ.get(
    'BBNP_DATA_DIR',
    'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data',
)

file = f'{data_dir}/interim/consolidated.csv'

# NOTE: to_csv writes the row index as an unnamed first column by default;
# read the file back with index_col=0 to avoid an extra 'Unnamed: 0' column.
df.to_csv(file)