# Capstone 2

## Preprocessing and Training Data Development
In this step, the data will be preprocessed with the following steps:

1. Create dummy or indicator features for categorical variables
2. Standardize the magnitude of numeric features using a scaler (z-scoring)
3. Split the data into test and training datasets

In [1]:
import pandas as pd
reds = pd.read_csv('../downloads/DataFolder/winequality-red.csv',sep=';')
whites = pd.read_csv('../downloads/DataFolder/winequality-white.csv',sep=';')
whites['type'] = 'white'
reds['type'] = 'red'

all_wines = pd.concat([whites,reds])

## Create dummy features

In [2]:
all_wines = pd.get_dummies(all_wines,drop_first=True)
all_wines.sample(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type_white
3853,6.6,0.32,0.26,4.6,0.031,26.0,120.0,0.99198,3.4,0.73,12.5,7,1
1632,7.1,0.28,0.49,6.5,0.041,28.0,111.0,0.9926,3.41,0.58,12.2,8,1
1586,9.3,0.31,0.49,1.3,0.042,34.0,147.0,0.9948,3.11,0.46,9.8,5,1
3127,7.6,0.3,0.38,2.1,0.043,10.0,98.0,0.99296,3.17,0.65,11.0,5,1
2220,7.0,0.22,0.32,1.6,0.045,40.0,120.0,0.9914,2.98,0.44,10.5,6,1
2885,6.9,0.4,0.3,10.6,0.033,24.0,87.0,0.99265,3.15,0.45,12.8,6,1
537,8.1,0.825,0.24,2.1,0.084,5.0,13.0,0.9972,3.37,0.77,10.7,6,0
4886,6.2,0.21,0.28,5.7,0.028,45.0,121.0,0.99168,3.21,1.08,12.15,7,1
1744,6.6,0.22,0.58,1.1,0.133,52.0,136.0,0.9932,3.1,0.3,9.1,5,1
1941,6.8,0.31,0.32,7.6,0.052,35.0,143.0,0.9959,3.14,0.38,9.0,5,1


## Center and scale the data

In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

columns_to_scale = set(all_wines.columns)
unscaled_columns = set(['quality','type_white'])
columns_to_scale = list(columns_to_scale - unscaled_columns)

std_data = pd.DataFrame(scaler.fit_transform(all_wines[columns_to_scale]),columns = columns_to_scale)
std_data.index = all_wines.index
std_data['quality'] = all_wines['quality']
std_data['type_white'] = all_wines['type_white']
std_data.sample(5)

Unnamed: 0,total sulfur dioxide,pH,fixed acidity,density,sulphates,chlorides,volatile acidity,residual sugar,free sulfur dioxide,alcohol,citric acid,quality,type_white
3799,1.083831,-0.115073,-0.166089,-1.012737,0.260304,-0.400614,-0.240949,-0.681719,-0.536697,0.761516,-0.265874,5,1
3936,-0.13703,1.377698,-1.168917,0.054484,0.529131,-0.400614,-0.666161,0.526915,-0.818419,-0.160823,-0.472334,6,1
993,-0.844775,1.439897,-0.088949,0.748177,-0.344558,0.741235,0.002029,-0.723758,0.026746,-0.915464,-0.265874,5,0
1016,0.358392,-1.483446,-0.32037,0.701486,2.343715,0.427227,0.184263,1.210056,0.759221,-1.083162,1.041706,5,1
20,-0.72092,0.693511,-0.783214,-1.833163,-0.949419,-0.771715,1.945855,-0.891916,-0.085943,1.935402,1.110526,8,1


## Split the data into training and test sets

In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(std_data, train_size = 0.8)

std_data.to_csv('../data/std_data.csv',index=False)
train.to_csv('../data/train.csv',index=False)
test.to_csv('../data/test.csv',index=False)