# Difference Distribution Chart of Wine Quality: Quality Feature

## Prepare the data

In [1]:
# Import library dependencies
import pandas as pd
import datetime
from whylogs import get_or_create_session

# Load the red-wine quality data (semicolon-delimited CSV)
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
wine = pd.read_csv(url, sep=";")

# Split the wines in two groups by alcohol content.
# The explicit .copy() makes each group an independent DataFrame, so the
# assignments below do not raise SettingWithCopyWarning and no global
# warning suppression is needed.
cond_reference = (wine['alcohol'] <= 11)
wine_reference = wine.loc[cond_reference].copy()

cond_target = (wine['alcohol'] > 11)
wine_target = wine.loc[cond_target].copy()

# Add some missing values to the target group for data variability
ixs = wine_target.iloc[100:110].index
wine_target.loc[ixs, 'citric acid'] = None

# Transform numerical 'quality' into a categorical feature:
# 'good' for anything above 6.5, otherwise 'bad'.
# pd.cut uses right-closed intervals by default: (2, 6.5] -> 'bad', (6.5, 8] -> 'good'.
bins = (2, 6.5, 8)
group_names = ['bad', 'good']

wine_reference['quality'] = pd.cut(wine_reference['quality'], bins=bins, labels=group_names)
wine_target['quality'] = pd.cut(wine_target['quality'], bins=bins, labels=group_names)

## Profile the prepared dataframes with `whylogs`

In [2]:
import datetime
from whylogs import get_or_create_session


# Profile the reference group, timestamped one day before the current time.
session = get_or_create_session()
now = datetime.datetime.now()
reference_timestamp = now - datetime.timedelta(days=1)

with session.logger("drift-test", dataset_timestamp=reference_timestamp) as logger:
    logger.log_dataframe(wine_reference)
    # Keep a handle on the profile built by this logger
    reference_profile = logger.profile

WARN: Missing config


In [3]:
# Profile the target group, timestamped with the current time.
session = get_or_create_session()
now = datetime.datetime.now()
target_timestamp = now

with session.logger("drift-test", dataset_timestamp=target_timestamp) as logger:
    logger.log_dataframe(wine_target)
    # Keep a handle on the profile built by this logger
    target_profile = logger.profile

## Instantiate profile visualizer with the reference and target profiles

Instantiate `NotebookProfileViewer` and set the reference and target profiles:

In [4]:
from whylogs.viz import NotebookProfileViewer

# Create the viewer and register both profiles with it
visualization = NotebookProfileViewer()
visualization.set_profiles(
    target_profile=target_profile,
    reference_profile=reference_profile,
)

# View a Difference Distribution Chart of Quality

Enter the single line of code needed to look at the difference between distributions in a single chart:

In [5]:
# Chart the difference between the profiled distributions of the 'quality' feature.
# This call is the cell's final expression, so its result is what the notebook displays.
visualization.difference_distribution_chart(
    feature_names="quality",
)


# Downloading the Visualization Output

All of the visualizations in the post can be downloaded in `HTML` format for further inspection. To try this for the Difference Distribution Chart, just run:

In [6]:
import os

# Build the chart, then hand it to the viewer's download helper.
# The file name is given as a path under the current working directory;
# os.path.join avoids hard-coding the path separator.
chart_html = visualization.difference_distribution_chart(feature_names="quality")
visualization.download(html=chart_html, html_file_name=os.path.join(os.getcwd(), "example"))

Simply replace `visualization.difference_distribution_chart(feature_names="quality")` with the visualization you want to download, and update the file name if you'd like to make it easier to recognize.