In [86]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

red_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep = ';')
white_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep = ';')

# create a new variable 'wine_type'
red_wine['wine_type'] = 'red'
white_wine['wine_type'] = 'white'


# bucket wine quality scores into qualitative quality labels
red_wine['quality_label'] = red_wine['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')

red_wine['quality_label'] = pd.Categorical(red_wine['quality_label'],
categories=['low', 'medium', 'high'])

white_wine['quality_label'] = white_wine['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')

white_wine['quality_label'] = pd.Categorical(white_wine['quality_label'],
categories=['low', 'medium', 'high'])

wines = pd.concat([red_wine, white_wine])

In [87]:
wines.shape

(6497, 14)

In [88]:
wines.isnull().sum()


fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
wine_type               0
quality_label           0
dtype: int64

In [89]:
pwd

'c:\\Users\\Karsten Werner\\Documents\\Python\\git\\antelope'

In [90]:
wine_sales = pd.read_csv('wine_sales_data.csv')

In [91]:
wine_sales.isnull().sum()

Unnamed: 0         0
country            5
description        0
designation    45735
points             0
price          13695
province           5
region_1       25060
region_2       89977
variety            0
winery             0
dtype: int64

In [92]:
winw_sales_red = wine_sales[['country','province','points','price']]

In [93]:
winw_sales_red.isnull().sum()

country         5
province        5
points          0
price       13695
dtype: int64

In [94]:
winw_sales_red.shape


(150930, 4)

In [95]:
winw_sales_red.head

<bound method NDFrame.head of        country            province  points  price
0           US          California      96  235.0
1        Spain      Northern Spain      96  110.0
2           US          California      96   90.0
3           US              Oregon      96   65.0
4       France            Provence      95   66.0
...        ...                 ...     ...    ...
150925   Italy      Southern Italy      91   20.0
150926  France           Champagne      91   27.0
150927   Italy      Southern Italy      91   20.0
150928  France           Champagne      90   52.0
150929   Italy  Northeastern Italy      90   15.0

[150930 rows x 4 columns]>

In [96]:
port_vv = winw_sales_red[(winw_sales_red['country'] == 'Portugal') & (winw_sales_red['province'] == 'Vinho Verde')]

In [97]:
port_vv.shape

(396, 4)

In [98]:
port_vv.isnull().sum()

country      0
province     0
points       0
price       86
dtype: int64

In [99]:
port_vv

Unnamed: 0,country,province,points,price
277,Portugal,Vinho Verde,92,35.0
993,Portugal,Vinho Verde,85,10.0
994,Portugal,Vinho Verde,85,9.0
1378,Portugal,Vinho Verde,91,
1558,Portugal,Vinho Verde,84,10.0
...,...,...,...,...
141129,Portugal,Vinho Verde,83,9.0
141143,Portugal,Vinho Verde,82,8.0
141148,Portugal,Vinho Verde,82,6.0
143472,Portugal,Vinho Verde,86,9.0


In [100]:
port_vv_clean = port_vv.dropna()

In [52]:
port_vv_clean.isnull().sum()

country     0
province    0
points      0
price       0
dtype: int64

In [101]:
from scipy.stats import iqr

iqr(port_vv_clean['price'])

5.0

In [102]:
port_vv_clean.drop(['country', 'province'], axis = 1, inplace = True)

In [103]:
port_vv_clean.describe()

Unnamed: 0,points,price
count,310.0,310.0
mean,86.351613,11.4
std,2.512541,4.821205
min,80.0,5.0
25%,84.0,8.0
50%,86.0,10.0
75%,88.0,13.0
max,92.0,45.0


In [104]:
price_irq = port_vv_clean[port_vv_clean['price'].between(port_vv_clean['price'].quantile(0.25), port_vv_clean['price'].quantile(0.75))]

In [105]:
price_irq.describe()

Unnamed: 0,points,price
count,198.0,198.0
mean,85.979798,10.287879
std,2.279816,1.719684
min,80.0,8.0
25%,84.0,9.0
50%,85.5,10.0
75%,88.0,12.0
max,91.0,13.0
