# 3. Pre-processing

1. Importing packages and dataset
2. Create dummies for categorical variables
3. Scale numerical variables
4. Split dataset into train and test sets

### 3.1. Importing packages and dataset

In [1]:
# Import the required packages

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
# Load the cleaned dataset written by the previous step into a dataframe

# Location of the interim csv file
file = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data/interim/clean.csv'

# Columns that must be parsed as pandas categoricals instead of having their dtype inferred
categorical_columns = [
    'COOL_TYPE', 'EQUIP_AGE', 'HEAT_TYPE', 'CLIMATE_REGION', 'OWNERSHIP',
    'UNIT_TYPE', 'DECADE_BUILT', 'ATTIC', 'BASEMENT', 'BEDROOMS',
    'BATHROOMS', 'HIGH_CEIL', 'THERMOSTAT', 'STORIES', 'SIZEOFGARAGE',
]

# Column name -> dtype mapping handed to read_csv
cat_dict = {column: 'category' for column in categorical_columns}

# Read the csv, using its first column as the dataframe index
df_noprocessed = pd.read_csv(file, dtype=cat_dict, index_col=0)

In [3]:
# Sanity check: display the (rows, columns) shape of the dataframe that was just loaded
df_noprocessed.shape

(5686, 21)

In [4]:
# Print each column's dtype and non-null count, to confirm the categorical dtypes
# requested at load time were applied and to reveal columns with missing values

df_noprocessed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5686 entries, 0 to 5685
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   UNIT_TYPE           5686 non-null   category
 1   CLIMATE_REGION      5686 non-null   category
 2   DECADE_BUILT        5686 non-null   category
 3   FLOOR_AREA_log      5686 non-null   float64 
 4   COOL_TYPE           5686 non-null   category
 5   EQUIP_AGE           5686 non-null   category
 6   HEAT_TYPE           5686 non-null   category
 7   THERMOSTAT          5686 non-null   category
 8   HIGH_CEIL           5686 non-null   category
 9   ATTIC               5686 non-null   category
 10  BASEMENT            5686 non-null   category
 11  BEDROOMS            5555 non-null   category
 12  BATHROOMS           5683 non-null   category
 13  SIZEOFGARAGE        5686 non-null   category
 14  STORIES             5686 non-null   category
 15  OWNERSHIP           5686 non-null   ca

---
### 3.2. Create dummies for categorical variables 

In [5]:
# One-hot encode every categorical column; drop_first=True removes the first level of
# each variable, leaving k-1 dummy columns for a variable with k levels.
# NOTE(review): the info() output above reports fewer non-null values than rows for
# BEDROOMS and BATHROOMS. With the default dummy_na=False, a row whose category is
# missing gets all-zero dummies, which is the same encoding as the dropped first level.
# Confirm this is intended, or impute / add a missing-value indicator beforehand.
df = pd.get_dummies(df_noprocessed, drop_first=True)

# Display the shape to see how many columns the encoding produced
df.shape

(5686, 53)

---
### 3.3. Split dataset into train and test sets

In [6]:
# Select features and target variables.
# The five log-transformed cost columns are the targets; every remaining column is a feature.
target_cols = ['COOL_COST_log', 'HEAT_COST_log', 'WATERHEAT_COST_log', 'ELECT_COST_log', 'TOT_COST_log']
X = df.drop(columns=target_cols)
y = df[target_cols]

# Split dataset into train and test sets (30 % of the rows held out for testing).
# random_state fixes the shuffle so that re-running the notebook reproduces the same
# partition; without it, every run would write a different train/test split to disk.
# The value itself is arbitrary -- any fixed integer works.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Save train, test sets for modelling

In [7]:
# Persist the train/test splits so the modelling step can load them

# Destination csv file for each split
X_train_path = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data/processed/X_train.csv'
X_test_path = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data/processed/X_test.csv'
y_train_path = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data/processed/y_train.csv'
y_test_path = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-2/data/processed/y_test.csv'

# Write every split to its own csv file (to_csv defaults apply, so the index is written too)
for split, destination in (
    (X_train, X_train_path),
    (X_test, X_test_path),
    (y_train, y_train_path),
    (y_test, y_test_path),
):
    split.to_csv(destination)