In [0]:
# Classification by Wine Type

%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [0]:
# Load the red and white wine quality datasets from the working directory.
# NOTE(review): the white file is parsed with sep=';' while the red file uses the
# default comma delimiter — confirm the two CSVs really are delimited differently.
red_wine = pd.read_csv('./winequality-red.csv')
white_wine = pd.read_csv('./winequality-white.csv', sep=';')

In [0]:
# Preview the first rows of the white wine data
white_wine.head()

In [0]:
# Preview the first rows of the red wine data
red_wine.head()

In [0]:
# quality scores
def plot_quality_scores(df, kind):
    """
    Draw a horizontal bar chart of how many wines received each quality score.

    Every bar is annotated with the share of all rows in `df` it represents.

    Parameters:
        - df: DataFrame with a `quality` column.
        - kind: Name of the wine type; it is title-cased into the plot title.

    Returns:
        None. The chart is drawn through pandas plotting and labelled via pyplot.
    """
    # Scores are sorted in descending index order before plotting
    score_counts = df.quality.value_counts().sort_index(ascending=False)
    ax = score_counts.plot.barh(
        title=f'{kind.title()} Wine Quality Scores', figsize=(12, 3)
    )

    # For a horizontal bar the width is the count; write the percentage at the bar's end
    for patch in ax.patches:
        count = patch.get_width()
        label_y = patch.get_y() + patch.get_height()/4
        ax.text(count, label_y, f'{count/df.shape[0]:.1%}')

    plt.xlabel('count of wines')
    plt.ylabel('quality score')

# Distribution of quality scores for the white wines
plot_quality_scores(white_wine, "white")

In [0]:
# Distribution of quality scores for the red wines
plot_quality_scores(red_wine, "red")

In [0]:
# Combining red and white wine data
# Each frame is tagged with its wine type in a new `kind` column before stacking.
# Row labels are kept as-is (no ignore_index), so labels from the two files may repeat.
labeled_frames = [
    frame.assign(kind=label)
    for label, frame in (('white', white_wine), ('red', red_wine))
]
wine = pd.concat(labeled_frames)

# Show a reproducible random sample of the combined data
wine.sample(5, random_state=10)

In [0]:
# Column dtypes and non-null counts for the combined data
wine.info()

In [0]:
# How many wines of each kind are in the combined data
wine.kind.value_counts()

In [0]:
# Summary statistics for every remaining column, split by wine kind (quality score excluded)
wine.drop(columns='quality').groupby('kind').describe()

In [0]:
# How do chemical properties of the wine correlate to each other and the wine type?
fig = plt.figure(figsize=(7, 7))

# Encode the wine type as a 0/1 indicator, then drop the original string column:
# DataFrame.corr() only works on numeric data, and pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them.
wine_correlations = (
    wine.drop(columns='quality')
    .assign(is_red=lambda x: np.where(x.kind == 'red', 1, 0))
    .drop(columns='kind')
    .corr()
)

# Correlations are signed, so the diverging colormap is centered on 0
# (center=True would have been interpreted as the number 1).
sns.heatmap(wine_correlations, center=0, square=True, annot=True, fmt='.1g')

In [0]:
# Comparison of Red and White Wines by Their Chemical Properties

import math

# Every column except the quality score and the wine type is a chemical property
chemical_properties = wine.columns.drop(['quality', 'kind']).tolist()

# Long format: one row per (wine, property) pair with the wine's kind alongside
melted = pd.melt(wine.drop(columns='quality'), id_vars=['kind'])

# Lay the plots out on a grid that is four subplots wide
n_cols = 4
n_rows = math.ceil(len(chemical_properties) / n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10))
axes = axes.flatten()

# One subplot per property, with red and white wines side by side
for prop, ax in zip(chemical_properties, axes):
    prop_values = melted[melted.variable == prop]
    sns.boxplot(data=prop_values, x='variable', y='value', hue='kind', ax=ax)

# drop the grid cells that have no property to show
unused_axes = axes[len(chemical_properties):]
for ax in unused_axes:
    ax.remove()

plt.suptitle('Comparing Chemical Properties of Red and White Wines')

In [0]:
# Classification of Red and White Wines
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Features: every column except the quality score and the wine type.
# Target: 1 for red wines, 0 for white wines.
wine_X = wine.drop(columns=['quality', 'kind'])
wine_y = np.where(wine.kind == 'red', 1, 0)

# Hold out 25% of the wines for testing, stratifying on the target
w_X_train, w_X_test, w_y_train, w_y_test = train_test_split(
    wine_X,
    wine_y,
    test_size=0.25,
    random_state=0,
    stratify=wine_y,
)

# Scale the features, then fit a logistic regression on the training set
pipeline_steps = [
    ('scale', StandardScaler()),
    ('lr', LogisticRegression(solver='lbfgs', random_state=0)),
]
white_or_red = Pipeline(pipeline_steps).fit(w_X_train, w_y_train)

In [0]:
# Predicted class labels for the held-out test set (1 = red, 0 = white)
kind_preds = white_or_red.predict(w_X_test)

In [0]:
from sklearn.metrics import classification_report
# Text report comparing test-set labels with the predictions (class 1 = red, class 0 = white)
print(classification_report(w_y_test, kind_preds))

In [0]:
from utils import plot_roc

# An ROC curve is traced by sweeping the decision threshold, so it needs the
# predicted probability of the positive class (column 1 = red) rather than the
# hard 0/1 labels in `kind_preds`, which only describe a single threshold.
# NOTE(review): assumes utils.plot_roc treats its second argument as scores for
# the positive class — confirm against the helper.
plot_roc(w_y_test, white_or_red.predict_proba(w_X_test)[:, 1])

In [0]:
from utils import confusion_matrix_visual

# Compare actual and predicted wine kinds; labels are listed in 0/1 order (white = 0, red = 1).
# NOTE(review): assumes utils.confusion_matrix_visual expects class labels in ascending class order — confirm.
confusion_matrix_visual(w_y_test, kind_preds, ['white', 'red'])