In [3]:
# Determine quality from other traits
# Cluster and determine wine type

import pandas as pd
from pandas import DataFrame, Series
from patsy import dmatrices
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import tree
import statsmodels.api as sm
import os

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
# Read each semicolon-delimited wine CSV and tag its rows with the wine color
dfRed = pd.read_csv('data/winequality-red.csv', sep=";").assign(color='red')
dfWhite = pd.read_csv('data/winequality-white.csv', sep=";").assign(color='white')

# Stack red on top of white and renumber the rows 0..n-1
df = pd.concat([dfRed, dfWhite], ignore_index=True)

# Replace the column labels positionally with underscore-separated names
# (these are the names the formula cells refer to)
names = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
    'color',
]
df.columns = names

# Preview the first five rows
df.head(5)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [5]:
# Class balance: number of rows for each wine color in the combined frame
df.loc[:, 'color'].value_counts()

white    4898
red      1599
Name: color, dtype: int64

In [6]:
# 'color' is categorical and 'quality' is the response, so neither is used
# as a predictor when regressing on quality
traitsToExclude = ['color', 'quality']

# Assemble the intercept-free ('0 +') patsy formula from the remaining columns
predictorColumns = [column for column in df.columns if column not in traitsToExclude]
initialFormula = 'quality ~ 0 + ' + " + ".join(predictorColumns)
initialFormula

'quality ~ 0 + fixed_acidity + volatile_acidity + citric_acid + residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide + density + pH + sulphates + alcohol'

In [7]:
# Load initial design matrix: y is the response frame, X the predictor frame
y, X = dmatrices(initialFormula, data=df, return_type='dataframe')

# Hold out 30% of the rows; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit initial model by ordinary least squares on the training rows only
model = sm.OLS(y_train, X_train)
result = model.fit()

# NOTE(review): initialFormula is built without an intercept ('0 +'); per the
# statsmodels docs the reported R-squared is then the uncentered one, so it is
# not comparable to the usual centered R-squared -- confirm before interpreting.
# print() as a function call works on both Python 2 and Python 3.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                quality   R-squared:                       0.984
Model:                            OLS   Adj. R-squared:                  0.984
Method:                 Least Squares   F-statistic:                 2.577e+04
Date:                Sat, 29 Apr 2017   Prob (F-statistic):               0.00
Time:                        15:28:56   Log-Likelihood:                -5075.8
No. Observations:                4547   AIC:                         1.017e+04
Df Residuals:                    4536   BIC:                         1.024e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
fixed_acidity            0.0036 

In [8]:
# Refit with a reduced predictor set: besides 'color' and 'quality', leave
# fixed_acidity, density, citric_acid and chlorides out of the formula
exemptions = ['color', 'quality', 'fixed_acidity', 'density', 'citric_acid', 'chlorides']
formula = 'quality ~ 0 + ' + " + ".join([column for column in df if column not in exemptions])
y, X = dmatrices(formula, data=df, return_type='dataframe')

# Fit new model on the same 70/30 split (same test_size and random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
model = sm.OLS(y_train, X_train)
result = model.fit()

# NOTE(review): the formula has no intercept ('0 +'), so the R-squared in the
# summary is the uncentered one -- confirm before using it to compare models.
# print() as a function call works on both Python 2 and Python 3.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                quality   R-squared:                       0.984
Model:                            OLS   Adj. R-squared:                  0.984
Method:                 Least Squares   F-statistic:                 3.997e+04
Date:                Sat, 29 Apr 2017   Prob (F-statistic):               0.00
Time:                        15:28:57   Log-Likelihood:                -5107.0
No. Observations:                4547   AIC:                         1.023e+04
Df Residuals:                    4540   BIC:                         1.027e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
volatile_acidity        -1.4151 

In [9]:
# Display the coefficient estimates stored on the fitted `result` object
result.params

volatile_acidity       -1.415096
residual_sugar          0.029670
free_sulfur_dioxide     0.005345
total_sulfur_dioxide   -0.001572
pH                      0.588491
sulphates               0.663992
alcohol                 0.372699
dtype: float64

In [10]:
# Second task: determine wine color from all the other columns

# Every column except 'color' itself (so 'quality' is included) becomes a
# predictor in an intercept-free ('0 +') patsy formula
colorPredictors = [column for column in df.columns if column != 'color']
formula = "color ~ 0 + " + " + ".join(colorPredictors)
formula

'color ~ 0 + fixed_acidity + volatile_acidity + citric_acid + residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide + density + pH + sulphates + alcohol + quality'

In [11]:
# Expand the color formula into response (Y) and predictor (X) frames
Y, X = dmatrices(formula, data=df, return_type="dataframe")
# Color is binary, so the 'color[red]' indicator column alone serves as the target
y = Y.loc[:, 'color[red]'].values

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Fit a logistic regression with default settings; fit() returns the estimator
# itself, so `result` and `model` refer to the same fitted object
model = LogisticRegression().fit(X_train, y_train)
result = model
result

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Predict on the held-out rows and report the fraction classified correctly
prediction = model.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=prediction)

0.97692307692307689

In [13]:
# Label each logistic-regression coefficient with its design-matrix column
# and list them from most negative to most positive
weights = pd.Series(data=model.coef_[0], index=X.columns.values)
weights.sort_values(ascending=True)

density                -2.315902
citric_acid            -0.858144
alcohol                -0.749702
quality                -0.189203
residual_sugar         -0.146158
total_sulfur_dioxide   -0.064950
free_sulfur_dioxide     0.045675
fixed_acidity           0.685169
pH                      1.918495
chlorides               2.020058
sulphates               6.924124
volatile_acidity        8.143458
dtype: float64

In [14]:
# Formula for the decision tree: quality as the response, color as an explicit
# categorical term, plus every remaining column as a predictor
traitsToExclude = ['color', 'quality']
numericTraits = [column for column in df.columns if column not in traitsToExclude]
formula = 'quality ~ 0 + C(color) + ' + " + ".join(numericTraits)
formula

'quality ~ 0 + C(color) + fixed_acidity + volatile_acidity + citric_acid + residual_sugar + chlorides + free_sulfur_dioxide + total_sulfur_dioxide + density + pH + sulphates + alcohol'

In [15]:
# Build the response and predictor frames for the tree formula, then pull
# the quality column out as a plain array to use as class labels
Y, X = dmatrices(formula, data=df, return_type='dataframe')
y = Y.loc[:, 'quality'].values

In [16]:
# Hold out 30% of the rows for evaluation; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [17]:
# Classify quality with an entropy-based decision tree capped at depth 10.
# random_state is fixed so that ties between equally good splits are broken the
# same way on every run (the train/test split is already seeded the same way).
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=1)
result = model.fit(X_train, y_train)

# Accuracy on the held-out rows; print() as a function call works on both
# Python 2 and Python 3
prediction = model.predict(X_test)
print(metrics.accuracy_score(y_test, prediction))

0.551282051282


In [18]:
# Write the fitted tree to tree.dot. out_file is passed explicitly because its
# default differs between scikit-learn releases: older ones write "tree.dot",
# newer ones return the DOT text and write no file at all.
tree.export_graphviz(model, out_file='tree.dot', feature_names=X.columns)

# Render the DOT file to a PNG with the Graphviz `dot` command; the value shown
# for this cell is the command's exit status (0 means success)
os.system('dot -Tpng tree.dot -o tree.png')



0