In [2]:
import yaml
 
# Load the project configuration that later cells index into
# (for example config['input_data']['file']).
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only a missing file is reported as "not found"; any other failure
    # (e.g. a YAML syntax error) propagates with its own message.
    # Re-raise so the notebook stops here instead of failing in a later
    # cell with a NameError on `config`.
    print("Yaml configuration file not found!")
    raise

In [3]:
# ------------------------------------------------------------------
# 1 - Load dataset
# ------------------------------------------------------------------

import pandas as pd

# The location of the input CSV is taken from the YAML configuration.
input_csv_path = config['input_data']['file']
df = pd.read_csv(filepath_or_buffer=input_csv_path)

# Last expression of the cell: render the loaded frame.
df

Unnamed: 0,wine_type,wine_category,food_item,food_category,cuisine,pairing_quality,quality_label,description
0,Syrah/Shiraz,Red,smoked sausage,Smoky BBQ,Spanish,2,Poor,Heuristic pairing assessment.
1,Grenache,Red,charcuterie board,Salty Snack,French,3,Neutral,Heuristic pairing assessment.
2,Madeira,Fortified,lemon tart,Dessert,French,4,Good,Acidic wine balances acidic food.
3,Cabernet Sauvignon,Red,roast lamb,Red Meat,Mexican,5,Excellent,Tannic red complements red meat fat.
4,Viognier,White,duck à l’orange,Poultry,Vietnamese,2,Poor,Heuristic pairing assessment.
...,...,...,...,...,...,...,...,...
34928,Merlot,Red,mac and cheese,Cheese,Greek,5,Excellent,Idealized perfect pairing example for contrast.
34929,Sauternes,Dessert,grilled ribeye,Red Meat,Spanish,5,Excellent,Idealized perfect pairing example for contrast.
34930,Sauternes,Dessert,fondue,Cheese,Mexican,1,Terrible,Deliberately bad pairing example for contrast.
34931,Zinfandel,Red,beef stew,Red Meat,Mexican,5,Excellent,Idealized perfect pairing example for contrast.


In [4]:
from itertools import combinations
from scipy.stats import chi2_contingency

# Categorical columns whose pairwise association is tested.
categorical_cols = ['wine_type', 'wine_category', 'food_item', 'food_category', 'cuisine']

# Chi-square test of independence on the contingency table of every
# unordered pair of columns; one result line is printed per pair.
column_pairs = list(combinations(categorical_cols, 2))
for var1, var2 in column_pairs:
    table = pd.crosstab(index=df[var1], columns=df[var2])
    chi2, p, dof, _expected = chi2_contingency(table)
    # A pair is labelled significant when its p-value is below 0.05.
    if p < 0.05:
        result = "✅ Significant"
    else:
        result = "❌ Not significant"
    print(f"{var1} vs {var2}: p-value={p:.4f} → {result}")

wine_type vs wine_category: p-value=0.0000 → ✅ Significant
wine_type vs food_item: p-value=0.0000 → ✅ Significant
wine_type vs food_category: p-value=0.0000 → ✅ Significant
wine_type vs cuisine: p-value=1.0000 → ❌ Not significant
wine_category vs food_item: p-value=0.0000 → ✅ Significant
wine_category vs food_category: p-value=0.0000 → ✅ Significant
wine_category vs cuisine: p-value=1.0000 → ❌ Not significant
food_item vs food_category: p-value=0.0000 → ✅ Significant
food_item vs cuisine: p-value=1.0000 → ❌ Not significant
food_category vs cuisine: p-value=1.0000 → ❌ Not significant


In [5]:
# ---------------------------------------------------------------------------------
# 2 - Create unique IDs for wines and foods and create DataFrame with IDs and names
# ---------------------------------------------------------------------------------

from pathlib import Path

# Each of these text columns gets an integer `<column>_id` companion column on `df`.
id_source_cols = ['wine_type', 'wine_category', 'food_item', 'food_category', 'cuisine']

for col in id_source_cols:
    # Integer code of each distinct value, taken from pandas' category dtype.
    df[f'{col}_id'] = df[col].astype('category').cat.codes

# Lookup table: every name column immediately followed by its ID column.
label_cols = [name for col in id_source_cols for name in (col, f'{col}_id')]
labels_df = df[label_cols]

# Re-read the configuration file before using the output path from it.
try:
    with open("../config.yaml", "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    # Only a missing file is reported as "not found"; other errors (such as
    # invalid YAML) propagate with their own message. Re-raise so the export
    # below never runs against a missing or stale configuration.
    print("Yaml configuration file not found!")
    raise

labels_path = Path(config['output_data']['labels_file'])
# to_csv does not create missing folders, so make sure the target one exists.
labels_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the row index out of the saved file.
labels_df.to_csv(labels_path, index=False)

In [6]:
# Render the name/ID lookup table as the cell output.
labels_df

Unnamed: 0,wine_type,wine_type_id,wine_category,wine_category_id,food_item,food_item_id,food_category,food_category_id,cuisine,cuisine_id
0,Syrah/Shiraz,23,Red,2,smoked sausage,34,Smoky BBQ,9,Spanish,14
1,Grenache,9,Red,2,charcuterie board,10,Salty Snack,7,French,4
2,Madeira,12,Fortified,1,lemon tart,24,Dessert,3,French,4
3,Cabernet Sauvignon,2,Red,2,roast lamb,32,Red Meat,6,Mexican,11
4,Viognier,26,White,5,duck à l’orange,17,Poultry,5,Vietnamese,16
...,...,...,...,...,...,...,...,...,...,...
34928,Merlot,14,Red,2,mac and cheese,25,Cheese,1,Greek,6
34929,Sauternes,21,Dessert,0,grilled ribeye,22,Red Meat,6,Spanish,14
34930,Sauternes,21,Dessert,0,fondue,19,Cheese,1,Mexican,11
34931,Zinfandel,28,Red,2,beef stew,7,Red Meat,6,Mexican,11


In [7]:
# ------------------------------------------------------------------
# 3 - One-Hot Encode categorical variables
# ------------------------------------------------------------------
# Text columns that are expanded into indicator columns.
categorical_cols = ['wine_type', 'wine_category', 'food_item', 'food_category', 'cuisine']

# drop_first=True omits the first level of every encoded column, so each
# variable yields one indicator fewer than it has distinct values.
# All other columns of `df` are carried over unchanged.
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

encoded_shape = df_encoded.shape
print("After One-Hot Encoding:", encoded_shape)
df_encoded.head()

After One-Hot Encoding: (34933, 105)


Unnamed: 0,pairing_quality,quality_label,description,wine_type_id,wine_category_id,food_item_id,food_category_id,cuisine_id,wine_type_Barbera,wine_type_Cabernet Sauvignon,...,cuisine_Indian,cuisine_Italian,cuisine_Japanese,cuisine_Korean,cuisine_Mexican,cuisine_Middle Eastern,cuisine_Moroccan,cuisine_Spanish,cuisine_Thai,cuisine_Vietnamese
0,2,Poor,Heuristic pairing assessment.,23,2,34,9,14,False,False,...,False,False,False,False,False,False,False,True,False,False
1,3,Neutral,Heuristic pairing assessment.,9,2,10,7,4,False,False,...,False,False,False,False,False,False,False,False,False,False
2,4,Good,Acidic wine balances acidic food.,12,1,24,3,4,False,False,...,False,False,False,False,False,False,False,False,False,False
3,5,Excellent,Tannic red complements red meat fat.,2,2,32,6,11,False,True,...,False,False,False,False,True,False,False,False,False,False
4,2,Poor,Heuristic pairing assessment.,26,5,17,5,16,False,False,...,False,False,False,False,False,False,False,False,False,True


In [8]:
# ------------------------------------------------------------------
# 4 - Define target variable (pairing score)
# ------------------------------------------------------------------

target_col = 'pairing_quality'

# Fail fast if the encoded frame does not carry the target column.
encoded_columns = df_encoded.columns
assert target_col in encoded_columns, f"Target column '{target_col}' not found."

# Number of rows per target value, ordered by the value itself.
score_counts = df_encoded[target_col].value_counts()
score_counts.sort_index()

pairing_quality
1    7034
2    6179
3    8108
4    6449
5    7163
Name: count, dtype: int64

In [9]:
# ------------------------------------------------------------------
# 5 - Split into Train / Test sets (80/20)
# ------------------------------------------------------------------

from sklearn.model_selection import train_test_split

# Target vector and feature matrix (everything except the target column).
# NOTE(review): X still holds `quality_label` and `description`, which appear
# to mirror the target in the previewed rows - confirm they are removed
# before any model is trained on these files.
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

# The split is stratified on the original (pre-encoding) wine_type column of
# `df`, not on the target; random_state=0 fixes the shuffle.
# NOTE(review): confirm that stratifying on wine_type rather than on the
# target is intended.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0, stratify=df['wine_type'])
X_train, X_test, y_train, y_test = split_parts

print(f"Train samples: {X_train.shape[0]}  |  Test samples: {X_test.shape[0]}")

Train samples: 27946  |  Test samples: 6987


In [10]:
# ------------------------------------------------------------------
# 6 - Save clean datasets
# ------------------------------------------------------------------
from pathlib import Path

import yaml

# Load YAML config
with open("../config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Output locations for the two splits, as configured.
train_path = Path(config['output_data']['train_file'])
test_path = Path(config['output_data']['test_file'])

# Prepare datasets: copy the features and append the target as the last column.
train_df = X_train.copy()
train_df[target_col] = y_train
test_df = X_test.copy()
test_df[target_col] = y_test

# to_csv does not create missing folders, so make sure the target ones exist.
for out_path in (train_path, test_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

# Save using config paths; index=False keeps the row index out of the files.
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print("✅ Data preparation complete. Files saved:")
print(f"  {config['output_data']['train_file']}")
print(f"  {config['output_data']['test_file']}")

✅ Data preparation complete. Files saved:
  ../data/clean/train_file.csv
  ../data/clean/test_file.csv


In [11]:
# Render the training split (features plus the target column) as the cell output.
train_df

Unnamed: 0,quality_label,description,wine_type_id,wine_category_id,food_item_id,food_category_id,cuisine_id,wine_type_Barbera,wine_type_Cabernet Sauvignon,wine_type_Cava,...,cuisine_Italian,cuisine_Japanese,cuisine_Korean,cuisine_Mexican,cuisine_Middle Eastern,cuisine_Moroccan,cuisine_Spanish,cuisine_Thai,cuisine_Vietnamese,pairing_quality
23065,Poor,Delicate wine overwhelmed by red meat.,21,0,32,6,8,False,False,False,...,True,False,False,False,False,False,False,False,False,2
285,Terrible,Too lean for creamy dish.,15,2,16,2,7,False,False,False,...,False,False,False,False,False,False,False,False,False,1
19261,Excellent,Idealized perfect pairing example for contrast.,3,4,7,6,3,False,False,True,...,False,False,False,False,False,False,False,False,False,5
23716,Terrible,Deliberately bad pairing example for contrast.,20,2,37,11,10,False,False,False,...,False,False,True,False,False,False,False,False,False,1
20067,Terrible,Tannic reds clash with delicate seafood.,21,0,33,8,13,False,False,False,...,False,False,False,False,False,True,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10128,Neutral,Tannic reds clash with delicate seafood.,11,0,33,8,16,False,False,False,...,False,False,False,False,False,False,False,False,True,3
16667,Poor,Heuristic pairing assessment.,6,5,22,6,5,False,False,False,...,False,False,False,False,False,False,False,False,False,2
20275,Neutral,Delicate wine overwhelmed by red meat.,16,2,32,6,13,False,False,False,...,False,False,False,False,False,True,False,False,False,3
8976,Poor,Heuristic pairing assessment.,23,2,34,9,7,False,False,False,...,False,False,False,False,False,False,False,False,False,2


In [12]:
# Render the test split (features plus the target column) as the cell output.
test_df

Unnamed: 0,quality_label,description,wine_type_id,wine_category_id,food_item_id,food_category_id,cuisine_id,wine_type_Barbera,wine_type_Cabernet Sauvignon,wine_type_Cava,...,cuisine_Italian,cuisine_Japanese,cuisine_Korean,cuisine_Mexican,cuisine_Middle Eastern,cuisine_Moroccan,cuisine_Spanish,cuisine_Thai,cuisine_Vietnamese,pairing_quality
10991,Poor,Tannic reds clash with delicate seafood.,7,2,29,8,11,False,False,False,...,False,False,False,True,False,False,False,False,False,2
13018,Poor,Heuristic pairing assessment.,3,4,19,1,5,False,False,True,...,False,False,False,False,False,False,False,False,False,2
27430,Terrible,Deliberately bad pairing example for contrast.,21,0,28,7,14,False,False,False,...,False,False,False,False,False,False,True,False,False,1
28946,Excellent,Acidic wine balances acidic food.,28,2,15,0,8,False,False,False,...,True,False,False,False,False,False,False,False,False,5
16530,Poor,Delicate wine overwhelmed by red meat.,4,4,5,6,3,False,False,False,...,False,False,False,False,False,False,False,False,False,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3014,Neutral,Crisp acidity suits seafood.,4,4,18,8,4,False,False,False,...,False,False,False,False,False,False,False,False,False,3
14147,Neutral,Acidic wine balances acidic food.,12,1,24,3,8,False,False,False,...,True,False,False,False,False,False,False,False,False,3
16724,Good,Heuristic pairing assessment.,28,2,11,1,5,False,False,False,...,False,False,False,False,False,False,False,False,False,4
30578,Excellent,Idealized perfect pairing example for contrast.,26,5,20,0,1,False,False,False,...,False,False,False,False,False,False,False,False,False,5
