# Wine Quality Predictor Report

## Summary

## Introduction

## Methods

In [None]:
#imports 
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
import numpy as np
import seaborn as sns

### Data

In [None]:
# Source CSVs from the UCI Machine Learning Repository (Wine Quality dataset).
url_red = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
url_white = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"

# Both files are read with ';' as the field separator.
red_wine_data = pd.read_csv(url_red, sep=';')
white_wine_data = pd.read_csv(url_white, sep=';')

# Only the last expression of a cell is rendered automatically, so the first
# preview has to be displayed explicitly; show just the head of each frame
# instead of dumping a full table.
display(red_wine_data.head())
white_wine_data.head()

In [None]:
### EDA

In [None]:
# Column overview: all features are numeric and the target (quality) is an integer.
# n = 1599 rows.
red_wine_data.info()

In [None]:
# Count missing values per column: there are no missing values in any feature.
red_wine_data.isna().sum()

In [None]:
# Summary statistics of the features (the target column "quality" is excluded).
red_wine_data.drop(columns=["quality"]).describe()

In [None]:
# Observations from the feature histograms:
# - Some variables are right-skewed: residual sugar, chlorides, free sulfur dioxide,
#   total sulfur dioxide and sulphates.
# - Some features have narrow ranges: density (0.990-1.004), pH (2.9-3.7),
#   fixed acidity (95% around 6-9), so scaling is necessary. Outliers stand out
#   relative to these small ranges.
# - Density could be correlated with sugar/alcohol (overlapping information).
# - High-alcohol wines are few, but alcohol is expected to be one of the best
#   predictors of quality.
# - Outliers: residual sugar (>10), chlorides (>0.4), total sulfur dioxide (>200),
#   free sulfur dioxide (>70).
# - The target is left out of the analysis in order to prevent leakage.

# Drop the target with the same `columns=` form used by the other EDA cells.
red_wine_data.drop(columns=["quality"]).hist(bins=50, figsize=(20, 15))

# Finish like the other plotting cells: tidy the subplot spacing and render the
# figure explicitly so the cell does not print the array-of-Axes repr.
plt.tight_layout()
plt.show()

In [None]:
# Observations from the box plots:
# - Scale matters: free sulfur dioxide and total sulfur dioxide are orders of
#   magnitude larger than the rest of the features.
# - Several features have heavy tails and outliers: residual sugar, free sulfur
#   dioxide, total sulfur dioxide and alcohol.
# - Some box plots are tight: fixed acidity, volatile acidity, citric acid,
#   chlorides and pH, so they may have weak predictive power.
# - Alcohol has a larger spread, so it may have better predictive value.

# Plot every feature except the target.
feature_frame = red_wine_data.drop(columns=["quality"])

box_fig, box_ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=feature_frame, ax=box_ax)

# Tick-label styling: 10pt on both axes, x labels rotated 45 degrees.
plt.setp(box_ax.get_xticklabels(), fontsize=10, rotation=45)
plt.setp(box_ax.get_yticklabels(), fontsize=10)

box_fig.tight_layout()
plt.show()


In [None]:
# Observations from the correlation heatmap:
# - Strong positive relationships: total sulfur dioxide with free sulfur dioxide,
#   and fixed acidity with citric acid and also with density (more acidic wines
#   contain more acid and are denser).
# - Negative relationships: density and alcohol (more alcohol reduces the density).
# - There is a strong positive relationship between alcohol and quality and a
#   negative one between volatile acidity and quality.

# Pairwise correlations over every column, target included.
correlation_matrix = red_wine_data.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=False, cmap="coolwarm", ax=heatmap_ax)

heatmap_fig.tight_layout()
plt.show()

### Analysis

In [None]:
# The selected chemical features show only weak pairwise relationships, with no clear patterns, which suggests low multicollinearity.
# Because the variables don't look similar or redundant, each individual feature seems to carry its own predictive value or information.
relevant_features = ["alcohol", "sulphates", "volatile acidity", "citric acid"]
sns.pairplot(red_wine_data[relevant_features])



## Results & Discussion

## References