# Capstone 2

## Preprocessing and Training Data Development
In this step, the data will be preprocessed with the following steps:

1. Create dummy or indicator features for categorical variables
2. Standardize the magnitude of numeric features using a scaler (z-scoring)
3. Split the data into test and training datasets

In [1]:
import pandas as pd
reds = pd.read_csv('../downloads/DataFolder/winequality-red.csv',sep=';')
whites = pd.read_csv('../downloads/DataFolder/winequality-white.csv',sep=';')
whites['type'] = 'white'
reds['type'] = 'red'

all_wines = pd.concat([whites,reds])

## Create dummy features

In [2]:
all_wines = pd.get_dummies(all_wines,drop_first=True)
all_wines.sample(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type_white
4677,6.0,0.28,0.52,6.2,0.028,37.0,104.0,0.99161,3.28,0.51,11.8,7,1
198,5.4,0.835,0.08,1.2,0.046,13.0,93.0,0.9924,3.57,0.85,13.0,7,0
2776,7.0,0.36,0.32,10.5,0.045,35.0,135.0,0.9935,3.09,0.33,11.6,8,1
4308,6.2,0.26,0.29,2.0,0.036,16.0,87.0,0.99081,3.33,0.61,11.8,6,1
3448,6.7,0.47,0.29,4.75,0.034,29.0,134.0,0.99056,3.29,0.46,13.0,7,1
910,6.3,0.33,0.2,5.8,0.04,24.0,144.0,0.99425,3.15,0.63,9.9,5,1
4872,6.0,0.42,0.41,12.4,0.032,50.0,179.0,0.99622,3.14,0.6,9.7,5,1
4356,6.2,0.29,0.29,5.6,0.046,35.0,178.0,0.99313,3.25,0.51,10.533333,5,1
798,9.4,0.5,0.34,3.6,0.082,5.0,14.0,0.9987,3.29,0.52,10.7,6,0
4003,6.8,0.14,0.35,1.5,0.047,40.0,117.0,0.99111,3.07,0.72,11.1,6,1


## Center and scale the data

In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

columns_to_scale = set(all_wines.columns)
unscaled_columns = set(['quality','type_white'])
columns_to_scale = list(columns_to_scale - unscaled_columns)

std_data = pd.DataFrame(scaler.fit_transform(all_wines[columns_to_scale]),columns = columns_to_scale)
std_data.index = all_wines.index
std_data['quality'] = all_wines['quality']
std_data['type_white'] = all_wines['type_white']
std_data.sample(5)

Unnamed: 0,residual sugar,alcohol,total sulfur dioxide,pH,sulphates,free sulfur dioxide,chlorides,citric acid,volatile acidity,fixed acidity,density,quality,type_white
606,0.432326,-0.747766,0.747652,1.315499,-1.016626,0.421155,-0.20079,-0.197054,-0.544672,0.065333,0.601434,5,1
3579,0.379777,0.845365,-0.667839,-1.732242,-1.083833,-0.931107,-0.800261,-0.265874,-0.544672,0.142473,-0.772612,7,1
1772,0.852721,0.342271,1.101525,-1.110254,-0.680592,0.08309,0.084672,-0.403514,0.366496,0.451036,0.267928,6,1
2694,1.020878,0.174573,-0.13703,-0.115073,-1.15104,-0.254976,-0.743168,-0.334694,-0.362438,-0.783214,-0.272353,6,1
3822,0.053971,1.683855,1.561559,-0.301669,0.730751,0.4775,-0.65753,-0.265874,-0.058716,-0.166089,-0.979386,6,1


## Split the data into training and test sets

In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(std_data, train_size = 0.8)

std_data.to_csv('../data/std_data.csv',index=False)
train.to_csv('../data/train.csv',index=False)
test.to_csv('../data/test.csv',index=False)