In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix

In [33]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


In [34]:
df = pd.read_csv("/content/sample_data/wines_SPA (1).csv")
df["num_reviews"] = df["num_reviews"].fillna(df["num_reviews"].mean())
df["price"] = df["price"].fillna(df["price"].median())
df["body"] = df["body"].fillna(df["body"].mode()[0])
df["acidity"] = df["acidity"].fillna(df["acidity"].mean())
df.dropna(inplace=True)

In [35]:
df.head()

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,692.96,Ribera Del Duero Red,5.0,3.0
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,778.06,Ribera Del Duero Red,5.0,3.0


In [36]:
X = df[["num_reviews", "price", "body","acidity"]]  # Inputs
y = df["rating"]  # Output

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [39]:
# List of models
models = {
    "Linear Regression": LinearRegression(),
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}


In [48]:
from sklearn.preprocessing import LabelEncoder

l = LabelEncoder()
y_train = l.fit_transform(y_train)
y_test = l.transform(y_test)

In [55]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

output = {}

for a, b in models.items():
    b.fit(X_train, y_train)
    y_pred = b.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    output[a] = {"MAE": mae, "MSE": mse, "RMSE": rmse, "R2_Score": r2}

# Print results for each model
for model, metrics in output.items():
    print(f"\n{model}:")
    print(f"  MAE  = {metrics['MAE']:.4f}")
    print(f"  MSE  = {metrics['MSE']:.4f}")
    print(f"  RMSE = {metrics['RMSE']:.4f}")
    print(f"  R² Score = {metrics['R2_Score']:.4f}")



Linear Regression:
  MAE  = 0.7088
  MSE  = 1.0636
  RMSE = 1.0313
  R² Score = 0.3242

Logistic Regression:
  MAE  = 0.4364
  MSE  = 1.0158
  RMSE = 1.0079
  R² Score = 0.3545

Decision Tree:
  MAE  = 0.2969
  MSE  = 0.5917
  RMSE = 0.7692
  R² Score = 0.6240

Random Forest:
  MAE  = 0.2610
  MSE  = 0.4925
  RMSE = 0.7017
  R² Score = 0.6871

SVM:
  MAE  = 0.3192
  MSE  = 0.7132
  RMSE = 0.8445
  R² Score = 0.5468

KNN:
  MAE  = 0.2768
  MSE  = 0.5457
  RMSE = 0.7387
  R² Score = 0.6533

Gradient Boosting:
  MAE  = 0.2689
  MSE  = 0.5061
  RMSE = 0.7114
  R² Score = 0.6784


In [59]:
for a, b in models.items():
    b.fit(X_train, y_train)
    y_pred = b.predict(X_test)

    m = np.mean(y_test)
    accuracy = (1 - (mean_absolute_error(y_test, y_pred) / m)) * 100 if m != 0 else 0

    print(f"{a}: {accuracy:.2f}%")


Linear Regression: -15.04%
Logistic Regression: 29.17%
Decision Tree: 52.16%
Random Forest: 56.94%
SVM: 48.19%
KNN: 55.08%
Gradient Boosting: 56.36%


In [61]:
#Using GridSearchCV
#For Random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

model = RandomForestRegressor(random_state=42)
grid = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
predictions = best_model.predict(X_test)

mean_actual = np.mean(y_test)
accuracy = (1 - (mean_absolute_error(y_test, predictions) / mean_actual)) * 100 if mean_actual != 0 else 0

print("\nOptimized Random Forest Model:")
print(f"Best Params: {grid.best_params_}")
print(f"Improved Accuracy: {accuracy:.2f}%")

Fitting 5 folds for each of 108 candidates, totalling 540 fits

Optimized Random Forest Model:
Best Params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Improved Accuracy: 59.27%
