# Capstone 2

## Preprocessing and Training Data Development
In this step, the data will be preprocessed with the following steps:

1. Create dummy or indicator features for categorical variables
2. Standardize the magnitude of numeric features using a scaler (z-scoring)
3. Split the data into test and training datasets

In [2]:
import pandas as pd
reds = pd.read_csv('../downloads/DataFolder/winequality-red.csv',sep=';')
whites = pd.read_csv('../downloads/DataFolder/winequality-white.csv',sep=';')
whites['type'] = 'white'
reds['type'] = 'red'

all_wines = pd.concat([whites,reds])

## Create dummy features

In [3]:
all_wines = pd.get_dummies(all_wines,drop_first=True)
all_wines.sample(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type_white
2355,7.6,0.48,0.33,7.0,0.024,14.0,130.0,0.9918,3.25,0.45,12.5,7,1
612,7.5,0.23,0.68,11.0,0.047,37.0,133.0,0.9978,2.99,0.38,8.8,5,1
1988,8.0,0.22,0.32,10.4,0.043,63.0,201.0,0.997,3.11,0.53,9.5,6,1
847,7.4,0.68,0.16,1.8,0.078,12.0,39.0,0.9977,3.5,0.7,9.9,6,0
2405,6.6,0.23,0.18,8.5,0.044,59.0,188.0,0.99558,3.16,0.49,9.5,5,1
109,6.9,0.33,0.28,1.3,0.051,37.0,187.0,0.9927,3.27,0.6,10.3,5,1
4437,6.5,0.29,0.3,9.15,0.051,25.0,166.0,0.99339,3.24,0.56,11.35,6,1
678,5.8,0.32,0.38,4.75,0.033,23.0,94.0,0.991,3.42,0.42,11.8,7,1
1803,6.4,0.26,0.21,7.1,0.04,35.0,162.0,0.9956,3.39,0.58,9.9,6,1
4293,7.2,0.21,0.36,15.7,0.045,68.0,183.0,0.99922,3.25,0.76,9.4,5,1


## Split the data into training and test sets

In [28]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(all_wines.drop('quality',axis=1),all_wines['quality'], test_size=0.33, stratify=all_wines.quality)

### Check to see counts of each quality level in the split

In [29]:
print('Training:\n',y_train.value_counts().sort_index(),sep='')
print('\nTest:\n',y_test.value_counts().sort_index(),sep='')

Training:
3      20
4     145
5    1432
6    1900
7     723
8     129
9       3
Name: quality, dtype: int64

Test:
3     10
4     71
5    706
6    936
7    356
8     64
9      2
Name: quality, dtype: int64


## Center and scale the data

In [32]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

column_names = all_wines.drop('quality',axis=1).columns
X_train = pd.DataFrame(scaler.transform(X_train),columns=column_names)
X_test = pd.DataFrame(scaler.transform(X_test),columns=column_names)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type_white
0,0.922462,1.695213,-1.377428,-0.763721,0.514091,0.422540,-1.228998,0.120298,1.007637,2.461123,1.079902,-1.751368
1,-0.236731,-0.358129,0.651995,-0.784649,-0.176887,-1.144738,0.360983,-0.559007,-0.365949,1.226977,0.745943,0.570982
2,-0.854967,0.366580,-0.047806,-0.952073,-0.239703,-0.417074,0.961643,-0.559007,0.008665,1.638359,-0.840363,0.570982
3,-0.545849,-1.022445,-0.047806,-0.826505,-0.459560,0.198643,0.095986,-1.251632,-0.990307,-0.487114,-0.005465,0.570982
4,2.468052,0.245795,1.001896,-0.700937,1.519149,-1.368635,-1.864991,0.653086,-0.615692,0.815596,1.079902,-1.751368
...,...,...,...,...,...,...,...,...,...,...,...,...
2140,-0.468569,-0.720483,1.491757,2.019713,-0.114071,-0.473048,0.360983,1.305752,-1.739536,-1.241314,-0.923852,0.570982
2141,-1.086805,-0.478914,-0.957547,-0.889289,-0.773640,0.534488,-0.027679,-1.291591,1.257380,-0.898496,0.411984,0.570982
2142,2.158934,-0.297736,0.582015,-0.533512,1.330700,-1.144738,-1.582328,1.971738,1.631995,2.598251,-0.673383,-1.751368
2143,-0.159451,-1.082838,0.022174,0.596604,-0.333927,0.422540,0.184319,0.373372,-0.053770,-1.309878,-1.090832,0.570982


In [36]:
X_train.to_csv('../data/X_train.csv')
X_test.to_csv('../data/X_test.csv')
y_train.to_csv('../data/y_train.csv')
y_test.to_csv('../data/y_test.csv')