# Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Load the red-wine table and separate the 'quality' target from the features.
dataset = pd.read_csv('winequality-red.csv')
features = dataset.drop(columns='quality')
target = dataset['quality']

# Hold out 20% of the rows for evaluation. Stratifying on the target keeps the
# distribution of quality scores the same in both partitions, and the fixed
# seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=42,
)

In [None]:
# Bare expression: the notebook renders the loaded DataFrame as the cell output.
dataset

# Entrenamos un modelo (no tan) simple

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline regressor: a 100-tree random forest with a fixed seed so results are
# reproducible. fit() returns the estimator itself, so construction and
# training are chained into a single expression.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predictions on the held-out rows, reused by the metric and SHAP cells.
y_pred = model.predict(X_test)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Score the baseline predictions against the held-out targets.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print('MAE: ', mae)
print('MSE: ', mse)

## Usando los SHAP values

In [None]:
import shap

# Wrap the fitted forest in SHAP's tree-specific explainer.
explainer = shap.TreeExplainer(model)
# SHAP values for the held-out rows.
# NOTE(review): presumably shaped (rows of X_test, features) for this
# single-output regressor — confirm for the installed shap version.
shap_values = explainer.shap_values(X_test)


## Entire Model

In [None]:
# Whole-model view: summary plot of the SHAP values computed for X_test.
shap.summary_plot(shap_values, X_test)

In [None]:
# Summary plot of the SHAP values for X_test, rendered with the "violin" plot type.
shap.summary_plot(shap_values, X_test, plot_type="violin")

## Individual predictions

In [None]:
# Position (not index label) of the test-set row to explain individually.
row_to_show = 5
# .iloc is positional, so this picks that row of X_test regardless of the
# shuffled index labels left behind by train_test_split.
data_for_prediction = X_test.iloc[row_to_show]

In [None]:
# Bare expression: the notebook renders the selected row's feature values.
data_for_prediction

In [None]:
import numpy as np

# Explain one prediction: the force plot starts at the explainer's base value
# and shows the SHAP contribution of each feature of the selected row.
#
# For a single-output model, TreeExplainer.expected_value is a plain scalar in
# some shap releases and a length-1 array in others. Indexing a scalar with [0]
# raises IndexError, so normalise through np.ravel, which accepts both forms.
base_value = np.ravel(explainer.expected_value)[0]

# matplotlib=True requests the matplotlib rendering of the plot.
shap.force_plot(base_value, shap_values[row_to_show], data_for_prediction, matplotlib=True)

In [None]:
# Report the true quality next to the model's prediction for the explained row.
# One print call with a newline separator emits the same two lines.
print(
    f'calidad del vino en la fila {row_to_show}: {y_test.iloc[row_to_show]}',
    f'predicción de la calidad del vino en la fila {row_to_show}: {y_pred[row_to_show]}',
    sep='\n',
)

## Mejoras

In [None]:
# Retrain on a reduced feature set so it can be compared with the baseline.
# NOTE(review): presumably these three features were chosen because the SHAP
# summary ranked them as least influential — confirm.
removed_features = ['citric acid', 'residual sugar', 'free sulfur dioxide']
reduced_features = dataset.drop(columns=['quality', *removed_features])

# Same test fraction, stratification and seed as the baseline split.
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
    reduced_features,
    dataset['quality'],
    test_size=0.2,
    stratify=dataset['quality'],
    random_state=42,
)

# Same hyper-parameters as the baseline forest; fit() returns the estimator.
new_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(new_X_train, new_y_train)
new_y_pred = new_model.predict(new_X_test)

In [None]:
# Score the reduced-feature model against its held-out targets.
new_mae = mean_absolute_error(new_y_test, new_y_pred)
new_mse = mean_squared_error(new_y_test, new_y_pred)

print('MAE: ', new_mae)
print('MSE: ', new_mse)

In [None]:
import numpy as np

# Explain the same test row with the reduced-feature model.
new_explainer = shap.TreeExplainer(new_model)
new_shap_values = new_explainer.shap_values(new_X_test)

# NOTE: this rebinds data_for_prediction to the row taken from the reduced
# feature set, so from here on it no longer matches the baseline model's
# feature columns.
data_for_prediction = new_X_test.iloc[row_to_show]

# For a single-output model, TreeExplainer.expected_value is a plain scalar in
# some shap releases and a length-1 array in others. Indexing a scalar with [0]
# raises IndexError, so normalise through np.ravel, which accepts both forms.
new_base_value = np.ravel(new_explainer.expected_value)[0]

# matplotlib=True requests the matplotlib rendering of the plot.
shap.force_plot(new_base_value, new_shap_values[row_to_show], data_for_prediction, matplotlib=True)

In [None]:
# Report the true quality next to the reduced-feature model's prediction for
# the explained row. One print call with a newline separator emits the same
# two lines.
print(
    f'calidad del vino en la fila {row_to_show}: {new_y_test.iloc[row_to_show]}',
    f'predicción de la calidad del vino en la fila {row_to_show}: {new_y_pred[row_to_show]}',
    sep='\n',
)