In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
data = pd.read_csv('vivino_clean.csv')

In [3]:
# Preprocesar la columna 'pairing'
data['pairing'] = data['pairing'].str.replace(r'\(.*?\)', '', regex=True)
data['pairing'] = data['pairing'].str.replace(r'\s+', ' ', regex=True).str.strip()

In [4]:
data

Unnamed: 0,winery,wine_name,year,price,score,country,wine_type,pairing,picture,price_quality,Estilo de vino,country_grouped
0,Quinta de Catralvos,Alicante Bouschet - Syrah,2021,14.95,4.5,Portugal,2,"Ternera, Ternera lechal, Aves, Pasta",//images.vivino.com/thumbs/AwtSRa2ZSSWJE1AoViX...,6,Sur Portugal Tinto,0
1,Quinta do Ermizio,Vinha do Cuco,2023,16.95,4.5,Portugal,1,"Marisco, Aperitivos y tentempiés, Pescado blan...",//images.vivino.com/thumbs/M2tFNe1GQQaqrU_wn-q...,6,Norte Portugal Blanco,0
2,Sociedade Vinicola de Palmela,Personalizado,2022,17.95,4.5,Portugal,2,"Ternera, Ternera lechal, Aves, Pasta",//images.vivino.com/thumbs/UgUGQSbaTSGQMHMk6zj...,6,Sur Portugal Tinto,0
3,San Marzano,60 Sessantanni Old Vines Primitivo di Manduria,2018,21.95,4.4,Italia,2,"Ternera, Cordero, Aves, Pasta",//images.vivino.com/thumbs/tLtS1VwaRxCL25zZ-j6...,4,Primitivo (Italia),1
4,Puglia Pop,Fico Susumaniello,2022,19.95,4.4,Italia,2,"Ternera, Cordero, Pasta",//images.vivino.com/thumbs/XAKEoj1sR5SllZQ-x5N...,4,Sur Italia Tinto,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1869,Niepoort,V.V. Bical - Maria Gomes Vinhas Velhas,2018,38.35,4.2,Portugal,1,"Marisco, Pescado azul , Aperitivos y tentempié...",//images.vivino.com/thumbs/fq2KfOb6QgyVmXeEsow...,4,Central Portugal White,0
1870,Forjas del Salnés,Goliardo a Telleira Albariño,2021,34.51,4.2,España,1,"Marisco, Vegetariana, Pasta, Aperitivos y tent...",//images.vivino.com/thumbs/YHiRsXtVRteasXHBETF...,4,Albariño (España),3
1871,Fernández de Piérola,Vitium Reserva,2015,30.20,4.2,España,2,"Ternera, Cordero, Ternera lechal, Aves",//images.vivino.com/thumbs/Sfpl41O_T2S__xayMq3...,4,Rioja Tinto (España),3
1872,Cota 45,Pandorga Tintilla de Rota,2022,39.15,4.2,España,2,"Ternera, Ternera lechal, Aves, Pasta",//images.vivino.com/thumbs/s0OuKAb-SYiyh9n3SP-...,1,Tinto (España),3


## Elimnar columnas innecesarias pero con información relevante

In [5]:
data.drop(columns=['winery','wine_name', 'country', 'picture', 'Estilo de vino'], inplace=True)

## Separar la columna 'pairing' y 'flavours' en listas

In [6]:
# Verificar el resultado
data.head()

Unnamed: 0,year,price,score,wine_type,pairing,price_quality,country_grouped
0,2021,14.95,4.5,2,"Ternera, Ternera lechal, Aves, Pasta",6,0
1,2023,16.95,4.5,1,"Marisco, Aperitivos y tentempiés, Pescado blan...",6,0
2,2022,17.95,4.5,2,"Ternera, Ternera lechal, Aves, Pasta",6,0
3,2018,21.95,4.4,2,"Ternera, Cordero, Aves, Pasta",4,1
4,2022,19.95,4.4,2,"Ternera, Cordero, Pasta",4,1


In [7]:
# Dividir la columna 'pairing' en una lista de elementos
data['pairing'] = data['pairing'].str.split(', ')

# Creamos una lista con todos los elementos únicos de 'pairing'
pairing_dummies = data['pairing'].apply(lambda x: pd.Series(1, index=x)).fillna(0)

# Unir los datos originales con las columnas dummy
data = data.join(pairing_dummies)

# Eliminar la columna 'pairing'
data.drop(columns=['pairing'], inplace=True)

In [8]:
data

Unnamed: 0,year,price,score,wine_type,price_quality,country_grouped,Ternera,Ternera lechal,Aves,Pasta,...,Carne de caza,Queso azul,Pescado azul,Comida picante,Queso tierno y cremoso,Pescado azul.1,Champiñones,Queso curado,Queso de leche de cabra,Aperitivo
0,2021,14.95,4.5,2,6,0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023,16.95,4.5,1,6,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022,17.95,4.5,2,6,0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018,21.95,4.4,2,4,1,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022,19.95,4.4,2,4,1,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1869,2018,38.35,4.2,1,4,0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1870,2021,34.51,4.2,1,4,3,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1871,2015,30.20,4.2,2,4,3,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1872,2022,39.15,4.2,2,1,3,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.model_selection import train_test_split

# Selecionamos las columnas de características
X = data[['year', 'price', 'score', 'wine_type', 'price_quality', 'country_grouped']]

# Selecionamos las columnas objetivo
pairing_columns = [col for col in data.columns if col not in ['year', 'price', 'score', 'wine_type', 'price_quality', 'country_grouped']]
y = data[pairing_columns]

# Separar los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
data

Unnamed: 0,year,price,score,wine_type,price_quality,country_grouped,Ternera,Ternera lechal,Aves,Pasta,...,Carne de caza,Queso azul,Pescado azul,Comida picante,Queso tierno y cremoso,Pescado azul.1,Champiñones,Queso curado,Queso de leche de cabra,Aperitivo
0,2021,14.95,4.5,2,6,0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2023,16.95,4.5,1,6,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022,17.95,4.5,2,6,0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018,21.95,4.4,2,4,1,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022,19.95,4.4,2,4,1,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1869,2018,38.35,4.2,1,4,0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1870,2021,34.51,4.2,1,4,3,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1871,2015,30.20,4.2,2,4,3,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1872,2022,39.15,4.2,2,1,3,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV

# Definir el modelo base
base_model = GradientBoostingClassifier(random_state=42)

# Definir los parámetros para GridSearchCV
param_grid = {
    'estimator__n_estimators': [100, 200],
    'estimator__learning_rate': [0.01, 0.1, 0.2],
    'estimator__max_depth': [3, 5, 7],
    'estimator__min_samples_split': [2, 5],
    'estimator__min_samples_leaf': [1, 2]
}

# Crear el modelo MultiOutputClassifier
multi_target_model = MultiOutputClassifier(base_model)

# Configurar GridSearchCV
grid_search = GridSearchCV(multi_target_model, param_grid, cv=3, scoring='precision_micro', n_jobs=-1)

# Entrenar el modelo con GridSearchCV
grid_search.fit(X_train, y_train)

# Obtener el mejor modelo
best_model = grid_search.best_estimator_
best_model

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hacer predicciones en el conjunto de prueba
y_pred = best_model.predict(X_test)

# Calcular las métricas de evaluación
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')

# Mostrar las métricas de evaluación
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.07
Precision: 0.77
Recall: 0.55
F1 Score: 0.64


In [13]:
# Funcion para probar el modelo
def get_pairing_recommendations(wine_features):
    # Nos aseguramos de que las características sean correctas
    feature_names = ['year', 'price', 'score', 'wine_type', 'price_quality', 'country_grouped']
    wine_features_df = pd.DataFrame([wine_features], columns=feature_names)

    # Predecir las recomendaciones de maridaje
    predicted_pairings = best_model.predict(wine_features_df)
    return pd.DataFrame(predicted_pairings, columns=pairing_columns)

# Ejemplo de uso
wine_features = [2020, 24.5, 4.2, 2, 4, 1]  # year, price, score, wine_type, price_quality, country_grouped
pairing_recommendations = get_pairing_recommendations(wine_features)
pairing_recommendations

Unnamed: 0,Ternera,Ternera lechal,Aves,Pasta,Marisco,Aperitivos y tentempiés,Pescado blanco,Carne adobada,Cordero,Carne de caza,...,Carne de caza.1,Queso azul,Pescado azul,Comida picante,Queso tierno y cremoso,Pescado azul.1,Champiñones,Queso curado,Queso de leche de cabra,Aperitivo
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
## Guardamos el modelo

import joblib

# Guardar el modelo en un archivo
joblib.dump(best_model, 'modelo_maridaje.pkl')

['modelo_maridaje.pkl']