# Wine dataset

https://archive.ics.uci.edu/dataset/109/wine

1) Alcohol
2) Malic acid
3) Ash
4) Alcalinity of ash  
5) Magnesium
6) Total phenols
7) Flavanoids
8) Nonflavanoid phenols
9) Proanthocyanins
10) Color intensity
11) Hue
12) OD280/OD315 of diluted wines
13) Proline 

https://archive.ics.uci.edu/dataset/186/wine+quality

1. fixed acidity
2. volatile acidity
3. citric acid
4. residual sugar
5. chlorides
6. free sulfur dioxide
7. total sulfur dioxide
8. density
9. pH
10. sulphates
11. alcohol

Output variable (based on sensory data):

12. quality (score between 0 and 10)

In [1]:
import urllib
import urllib.request

import numpy as np

## Cargar los datos

In [5]:
# Option 1: read the red-wine CSV straight from its URL.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
# The context manager closes the HTTP response as soon as parsing is done,
# instead of leaving the connection open for the lifetime of the kernel.
with urllib.request.urlopen(url) as response:
    # Fields are ';'-separated and the first row holds the column names.
    data_red = np.genfromtxt(response, delimiter=";", skip_header=1)
print(data_red.ndim)
print(data_red.shape)

2
(1599, 12)


In [28]:
# Option 2: read from a local file. Do not pass dtype=int here, because the
# decimal part of the values would be lost.
data_red = np.genfromtxt(
    '../../data/winequality-red.csv',
    skip_header=1,  # first row holds the column names
    delimiter=";",
)
# Number of dimensions, then the shape, one per line.
print(data_red.ndim, data_red.shape, sep="\n")

2
(1599, 12)


In [7]:
# Read the white-wine samples from the local ';'-separated CSV file.
data_white = np.genfromtxt(
    '../../data/winequality-white.csv',
    skip_header=1,  # first row holds the column names
    delimiter=";",
)
# Number of dimensions, then the shape, one per line.
print(data_white.ndim, data_white.shape, sep="\n")

2
(4898, 12)


In [42]:
# Summary statistics for the alcohol column (index 10) of the red wines.
red_alcohol = data_red[:, 10]

# Mean and standard deviation are rounded to two decimals for display only;
# the other statistics use the default float formatting.
print(
    f"mean: {red_alcohol.mean():.2f}",
    f"median: {np.median(red_alcohol)}",
    f"max: {red_alcohol.max()}",
    f"min: {red_alcohol.min()}",
    f"std: {red_alcohol.std():.2f}",
    sep="\n",
)


mean: 10.42
median: 10.2
max: 14.9
min: 8.4
std: 1.07


In [43]:
# Summary statistics for the alcohol column (index 10) of the white wines.
white_alcohol = data_white[:, 10]

# Mean and standard deviation are rounded to two decimals for display only;
# the other statistics use the default float formatting.
print(
    f"mean: {white_alcohol.mean():.2f}",
    f"median: {np.median(white_alcohol)}",
    f"max: {white_alcohol.max()}",
    f"min: {white_alcohol.min()}",
    f"std: {white_alcohol.std():.2f}",
    sep="\n",
)

mean: 10.51
median: 10.4
max: 14.2
min: 8.0
std: 1.23


In [51]:
# Stack the red and white samples row-wise (red rows first) into one array.
data_all = np.concatenate([data_red, data_white], axis=0)
# Left as the last expression so the notebook displays the combined shape.
data_all.shape

(6497, 12)

In [52]:
# Summary statistics for the alcohol column (index 10) over all wines.
all_alcohol = data_all[:, 10]

# Mean and standard deviation are rounded to two decimals for display only;
# the other statistics use the default float formatting.
print(
    f"mean: {all_alcohol.mean():.2f}",
    f"median: {np.median(all_alcohol)}",
    f"max: {all_alcohol.max()}",
    f"min: {all_alcohol.min()}",
    f"std: {all_alcohol.std():.2f}",
    sep="\n",
)

mean: 10.49
median: 10.3
max: 14.9
min: 8.0
std: 1.19
