In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [2]:

train_df = pd.read_csv("wines_SPA_train.csv")
test_df = pd.read_csv("wines_SPA_test.csv")

# DATASETS INFO
print(train_df.info())
print(train_df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062 entries, 0 to 1061
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       1062 non-null   object 
 1   wine         1062 non-null   object 
 2   year         1062 non-null   object 
 3   rating       1062 non-null   float64
 4   num_reviews  1062 non-null   float64
 5   country      1062 non-null   object 
 6   region       1062 non-null   object 
 7   price        1062 non-null   float64
 8   type         1062 non-null   object 
 9   body         1062 non-null   float64
 10  acidity      1062 non-null   float64
dtypes: float64(5), object(6)
memory usage: 91.4+ KB
None
                winery                   wine  year    rating  num_reviews  \
0  Pago de Carraovejas              El Anejon  2016  1.000000     0.382012   
1      Bodegas El Nido                El Nido  2018  1.000000     0.170971   
2        La Rioja Alta       Gran Reserva 890  1985  1.

In [3]:
# OUR DATA SET WAS ALREADY CLEANED AND PREPROCESSED

In [4]:
# NUMERICAL FEATURES ONLY
feature_columns = ["price", "body", "num_reviews", "acidity"]
target_column = "rating"

X_train = train_df[feature_columns]
y_train = train_df[target_column]

X_test = test_df[feature_columns]
y_test = test_df[target_column]


In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [6]:
# Define models to test
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100),
    "Support Vector Machine": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Gradient Boosting": GradientBoostingRegressor()
}

# Train and evaluate each model
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)  # Train
    y_pred = model.predict(X_valid)  # Predict
    mse = mean_squared_error(y_valid, y_pred)  # Calculate MSE
    r2 = r2_score(y_valid, y_pred)  # R² score
    results[name] = {"MSE": mse, "R²": r2}
    print(f"{name}: MSE = {mse:.4f}, R² = {r2:.4f}")


Linear Regression: MSE = 0.0333, R² = 0.2568
Decision Tree: MSE = 0.0527, R² = -0.1771
Random Forest: MSE = 0.0308, R² = 0.3122
Support Vector Machine: MSE = 0.0327, R² = 0.2695
K-Nearest Neighbors: MSE = 0.0362, R² = 0.1911
Gradient Boosting: MSE = 0.0311, R² = 0.3054


In [7]:

results_df = pd.DataFrame(results).T
print(results_df.sort_values(by="MSE"))


                             MSE        R²
Random Forest           0.030785  0.312216
Gradient Boosting       0.031088  0.305448
Support Vector Machine  0.032698  0.269485
Linear Regression       0.033264  0.256849
K-Nearest Neighbors     0.036208  0.191074
Decision Tree           0.052686 -0.177085


In [13]:
best_model = RandomForestRegressor(n_estimators=100)
best_model.fit(X_train, y_train)

# Predict on the test set
y_test_pred = best_model.predict(X_test)

# Evaluate on the test set
final_mse = mean_squared_error(y_test, y_test_pred)
final_r2 = r2_score(y_test, y_test_pred)
print(f"Final Model Test MSE: {final_mse:.4f}, R²: {final_r2:.4f}")


Final Model Test MSE: 0.0311, R²: 0.1358


In [14]:
# TO work best on unseen data we should select a model with high R2score and less MSE to avoid overfitting