In [39]:
# Analysis goals for this notebook:
#   1. Determine quality from the other traits
#   2. Determine color from the other traits
#   3. Cluster the wines and determine wine type
#   4. Find the most important traits for wine quality

import pandas as pd
from patsy import dmatrices
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import statsmodels.api as sm

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [51]:
# Read the two semicolon-delimited wine-quality files and tag every row
# with the color of the wine it came from.
dfRed = pd.read_csv('data/winequality-red.csv', sep=";").assign(color='red')
dfWhite = pd.read_csv('data/winequality-white.csv', sep=";").assign(color='white')

# Stack reds on top of whites; ignore_index renumbers the rows 0..n-1 so the
# combined frame does not carry duplicate index labels from the two sources.
df = pd.concat([dfRed, dfWhite], ignore_index=True)

# Preview the first five rows of the combined frame.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [41]:
# Number of wines of each color in the combined frame (most frequent first).
colorCounts = df['color'].value_counts()
colorCounts

white    4898
red      1599
Name: color, dtype: int64

In [42]:
# Build the patsy formula that regresses quality on every remaining column.
# 'color' is categorical and 'quality' is the response itself, so neither is
# allowed on the right-hand side.
traitsToExclude = ['color', 'quality']
predictorNames = [trait for trait in df.columns if trait not in traitsToExclude]

# Column names contain spaces, so each one is wrapped in patsy's Q("...") quoting.
quotedTerms = ['Q("' + trait + '")' for trait in predictorNames]

# The leading `0 +` tells patsy not to add an intercept column.
initialFormula = 'quality ~ 0 + ' + " + ".join(quotedTerms)
initialFormula

'quality ~ 0 + Q("fixed acidity") + Q("volatile acidity") + Q("citric acid") + Q("residual sugar") + Q("chlorides") + Q("free sulfur dioxide") + Q("total sulfur dioxide") + Q("density") + Q("pH") + Q("sulphates") + Q("alcohol")'

In [43]:
# Build the response (y) and design matrix (X) described by initialFormula.
y, X = dmatrices(initialFormula, data=df, return_type='dataframe')

# Hold out 30% of the rows; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit ordinary least squares on the training rows only.
# NOTE(review): initialFormula starts with `0 +`, so the model has no intercept.
# statsmodels then reports the *uncentered* R-squared, which is not comparable
# to the usual centered value -- confirm that dropping the intercept is intended.
model = sm.OLS(y_train, X_train)
result = model.fit()

# print(...) with a single argument behaves identically on Python 2 and 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                quality   R-squared:                       0.984
Model:                            OLS   Adj. R-squared:                  0.984
Method:                 Least Squares   F-statistic:                 2.577e+04
Date:                Sat, 29 Apr 2017   Prob (F-statistic):               0.00
Time:                        13:59:53   Log-Likelihood:                -5075.8
No. Observations:                4547   AIC:                         1.017e+04
Df Residuals:                    4536   BIC:                         1.024e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Q("fixed acidity")    

In [44]:
# Reduced model: the same no-intercept regression on quality, additionally
# leaving out four of the predictors.
# NOTE(review): the reason these four were dropped is not recorded in this cell --
# presumably based on the summary of the full fit; confirm.
simpleExcluded = ['color', 'quality', 'fixed acidity', 'density', 'citric acid', 'chlorides']
simpleFormula = 'quality ~ 0 + ' + " + ".join(['Q("'+x+'")' for x in df if x not in simpleExcluded])
y, X = dmatrices(simpleFormula, data=df, return_type='dataframe')

# Same 70/30 split and seed as the full model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit ordinary least squares on the training rows only. With no intercept in the
# formula, the R-squared statsmodels reports is the uncentered one.
model = sm.OLS(y_train, X_train)
result = model.fit()

# print(...) with a single argument behaves identically on Python 2 and 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                quality   R-squared:                       0.984
Model:                            OLS   Adj. R-squared:                  0.984
Method:                 Least Squares   F-statistic:                 3.997e+04
Date:                Sat, 29 Apr 2017   Prob (F-statistic):               0.00
Time:                        13:59:53   Log-Likelihood:                -5107.0
No. Observations:                4547   AIC:                         1.023e+04
Df Residuals:                    4540   BIC:                         1.027e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Q("volatile acidity") 

In [50]:
# Show the fitted coefficients of whichever model `result` currently holds
# (one value per term of the formula used in the most recent fit).
result.params

Q("volatile acidity")       -1.415096
Q("residual sugar")          0.029670
Q("free sulfur dioxide")     0.005345
Q("total sulfur dioxide")   -0.001572
Q("pH")                      0.588491
Q("sulphates")               0.663992
Q("alcohol")                 0.372699
dtype: float64