# 3. Pre-processing

1. Importing packages and dataset
2. Create dummies for categorical variables
3. Scale numerical variables
4. Split dataset in train and test sets

### 3.1. Importing packages and dataset

In [1]:
# Import the required packages

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Import pre-processed dataset and load it into a dataframe

# Create path variable
file = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data/interim/clean.csv'

# Create dictionary with categorial variables to specify when loading csv in dataframe
cat_dict = {'COOL_TYPE':'category',
            'EQUIP_AGE':'category',
            'HEAT_TYPE':'category',
            'CLIMATE_REGION':'category',
            'OWNERSHIP':'category',
            'UNIT_TYPE':'category',
            'DECADE_BUILT':'category',
            'ATTIC':'category',
            'BASEMENT':'category',
            'BEDROOMS':'category',
            'BATHROOMS':'category',
            'HIGH_CEIL':'category',
            'THERMOSTAT':'category',
            'STORIES':'category',
            'SIZEOFGARAGE':'category'
              }

# Load csv in dataframe
df_noprocessed = pd.read_csv(file, index_col=0, dtype=cat_dict)

In [3]:
# Check # of columns and rows imported
df_noprocessed.shape

(5666, 22)

In [4]:
# Print info

df_noprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5666 entries, 0 to 5685
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   UNIT_TYPE           5666 non-null   category
 1   CLIMATE_REGION      5666 non-null   category
 2   DECADE_BUILT        5666 non-null   category
 3   HDD65_log           5666 non-null   float64 
 4   CDD65_log           5666 non-null   float64 
 5   FLOOR_AREA_log      5666 non-null   float64 
 6   COOL_TYPE           5666 non-null   category
 7   EQUIP_AGE           5666 non-null   category
 8   HEAT_TYPE           5666 non-null   category
 9   THERMOSTAT          5666 non-null   category
 10  HIGH_CEIL           5666 non-null   category
 11  ATTIC               5666 non-null   category
 12  BASEMENT            5666 non-null   category
 13  BEDROOMS            5666 non-null   category
 14  BATHROOMS           5666 non-null   category
 15  SIZEOFGARAGE        5666 non-null   ca

---
### 3.2. Create dummies for categorical variables 

In [5]:
# Get dummy variables for each categorical variable
df = pd.get_dummies(df_noprocessed, drop_first=True)

# Check number of total variables
df.shape

(5666, 63)

---
### 3.3. Scale numerical variables

In [6]:
# Initialize a StandardScaler
ss= StandardScaler()

# Transform data
scaled = ss.fit_transform(df)

# Converto to a DataFrame
columns = list(df.columns)
df_scaled = pd.DataFrame(scaled, columns=columns)

# Print sample
df_scaled.sample(5)

Unnamed: 0,HDD65_log,CDD65_log,FLOOR_AREA_log,COOL_COST_log,HEAT_COST_log,WATERHEAT_COST_log,ELECT_COST_log,UNIT_TYPE_Apartment in bld with 5+ units,UNIT_TYPE_Attached house,UNIT_TYPE_Detached house,...,BATHROOMS_5.5,SIZEOFGARAGE_1.0,SIZEOFGARAGE_2.0,SIZEOFGARAGE_3.0,STORIES_1,STORIES_2,STORIES_3,STORIES_Split-level,OWNERSHIP_Owned,OWNERSHIP_Rented
1240,1.389703,-1.426682,1.158314,-0.67212,1.812598,0.967992,1.299843,-0.420985,-0.303539,0.718181,...,-0.044104,2.958404,-0.648937,-0.237279,-1.499618,1.631739,-0.122672,-0.141358,0.669047,-0.652234
5342,0.48857,-0.032963,-1.528677,-0.848361,-0.512813,0.032348,-0.47244,2.375382,-0.303539,-1.392407,...,-0.044104,-0.33802,-0.648937,-0.237279,0.666837,-0.612843,-0.122672,-0.141358,-1.494663,1.533193
4294,0.72693,-0.214864,1.492132,0.365246,0.300791,-0.19779,0.842732,-0.420985,-0.303539,0.718181,...,-0.044104,-0.33802,-0.648937,4.21445,-1.499618,1.631739,-0.122672,-0.141358,0.669047,-0.652234
1554,1.345237,-1.279294,1.859747,-0.14911,1.917198,2.073511,1.842578,-0.420985,-0.303539,0.718181,...,-0.044104,-0.33802,1.540983,-0.237279,-1.499618,1.631739,-0.122672,-0.141358,0.669047,-0.652234
5234,-0.698325,0.128392,-0.183397,0.340561,0.094772,-5.415794,-0.515094,-0.420985,3.294474,-1.392407,...,-0.044104,2.958404,-0.648937,-0.237279,-1.499618,1.631739,-0.122672,-0.141358,0.669047,-0.652234


---
### 3.4. Split dataset in train and test sets

In [7]:
# Select features and target variables
X = df_scaled.drop(['COOL_COST_log', 'HEAT_COST_log', 'WATERHEAT_COST_log', 'ELECT_COST_log'], axis=1)
y = df_scaled[['COOL_COST_log', 'HEAT_COST_log', 'WATERHEAT_COST_log', 'ELECT_COST_log']]

# Slit dataset in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Save train, test sets for modelling

In [8]:
# Save processed dataset for next step

X_train_path = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data/processed/X_train.csv'
X_test_path = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data/processed/X_test.csv'
y_train_path = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data/processed/y_train.csv'
y_test_path = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data/processed/y_test.csv'

# Dump train, test sets in csv files
X_train.to_csv(X_train_path)
X_test.to_csv(X_test_path)
y_train.to_csv(y_train_path)
y_test.to_csv(y_test_path)