In [7]:
# Imports 
import requests
import pandas as pd
import numpy as np
import plotly.express as px
import string
from rapidfuzz import process, fuzz
from sklearn.preprocessing import RobustScaler
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, root_mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.neighbors import KNeighborsRegressor
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import joblib

In [8]:
# Laden der Daten von meinem Google Drive (Dateien sind öffentlich)

file_paths = ["data/Red.csv", "data/White.csv", "data/Rose.csv", "data/Sparkling.csv"]
wine_types = ["Red", "White", "Rose", "Sparkling"]
datasets = []
for path, wine_type in zip(file_paths, wine_types):
    df = pd.read_csv(path)
    df["WineType"] = wine_type # Hinzufügen der Spalte "WineType" 
    datasets.append(df)

df = pd.concat(datasets, ignore_index=True)

## DATA UNDERSTANDING

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13834 entries, 0 to 13833
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             13834 non-null  object 
 1   Country          13834 non-null  object 
 2   Region           13834 non-null  object 
 3   Winery           13834 non-null  object 
 4   Rating           13834 non-null  float64
 5   NumberOfRatings  13834 non-null  int64  
 6   Price            13834 non-null  float64
 7   Year             13834 non-null  object 
 8   WineType         13834 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 972.8+ KB


In [10]:
# Spalte "Year" als einziger Data Preparation Schritt wird hier schon als Integer konvertiert, um bessere Statistiken & Visualisierungen bilden zu können
df["Year"] = pd.to_numeric(df["Year"].replace("N.V.", np.nan))
df["Year"] = pd.to_numeric(df["Year"], downcast="integer")
df["Year"] = pd.to_numeric(df["Year"].fillna(df["Year"].median()))
print(df.describe())
df.head()

             Rating  NumberOfRatings         Price          Year
count  13834.000000     13834.000000  13834.000000  13834.000000
mean       3.865664       428.322466     33.024850   2015.556961
std        0.296427      1838.413812     70.899893      3.188885
min        2.200000        25.000000      3.150000   1961.000000
25%        3.700000        56.000000      9.902500   2015.000000
50%        3.900000       129.000000     15.950000   2016.000000
75%        4.100000       336.000000     32.500000   2018.000000
max        4.900000     94287.000000   3410.790000   2020.000000


Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,WineType
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011.0,Red
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017.0,Red
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015.0,Red
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019.0,Red
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016.0,Red


In [11]:
# Anzahl unique values nach Spalte für nicht-numerische Spalten
print("Countries:", df["Country"].nunique())
print("Wineries:",df["Winery"].nunique())
print("Regions",df["Region"].nunique())
print("Names:",df["Name"].nunique())
print("Types:",df["WineType"].nunique())
print("-------Verteilung der Wine Types-------\n",df["WineType"].value_counts())

Countries: 33
Wineries: 3505
Regions 861
Names: 10934
Types: 4
-------Verteilung der Wine Types-------
 WineType
Red          8666
White        3764
Sparkling    1007
Rose          397
Name: count, dtype: int64


In [12]:
# Bar Plot: Verteilung der Ratings 

counts_nach_rating = df["Rating"].value_counts().reset_index()
counts_nach_rating= counts_nach_rating.sort_values("Rating")

counts_nach_rating["Rating"] = counts_nach_rating["Rating"].astype(str)
counts_nach_rating["label"] = counts_nach_rating["count"].apply(lambda x: str(x) if x < 10 else "")

fig = px.bar(counts_nach_rating, x="Rating", y="count", text="label", color_discrete_sequence=["#146c91"])
fig.update_traces(textangle=45)
fig.update_layout(yaxis_title="Anzahl der Weine", xaxis_title="Rating")
fig.show()

In [13]:
# Heatmap: Korrelation numerischer Spalten
numerische_spalten = df[["Year", "Rating", "NumberOfRatings", "Price"]]
heatmap = numerische_spalten.corr().round(2)

fig = px.imshow(heatmap, text_auto=True, color_continuous_scale="YlGnBu")
fig.show()

In [14]:
# Plot: Average Price per Country UND average Rating per country
fig = go.Figure()
fig = make_subplots(specs=[[{"secondary_y": True}]])
df_avg_prices = df.groupby("Country")["Price"].mean().reset_index()
df_avg_ratings = df.groupby("Country")["Rating"].mean().reset_index()

fig.add_trace(go.Bar(x=df_avg_ratings["Country"], y=df_avg_ratings["Rating"], marker=dict(color="#146c91")),secondary_y=False)

fig.add_trace(go.Scatter(x=df_avg_prices["Country"],y=df_avg_prices["Price"],marker=dict(color="#FF3358")),secondary_y=True)

min_avg_rating = df_avg_ratings["Rating"].min()
max_avg_rating = df_avg_ratings["Rating"].max()
fig.update_yaxes(title_text="Durchschnittliches Rating", range=[1,max_avg_rating + 0.2], secondary_y=False, title_font=dict(color="#146c91"),
tickfont=dict(color="#146c91"))
fig.update_yaxes(title_text="Durchschnittlicher Preis", range=[0,60],secondary_y=True, title_font=dict(color="#FF3358"),
    tickfont=dict(color="#FF3358"))
fig.update_layout(title="Durchschnitt von Rating & Preis nach Ländern", showlegend=False)

fig.show()

In [15]:
country_statistiken = df.groupby("Country").agg(Count=("Country", "size"), Avg_Rating=("Rating", "mean")).reset_index()

top10_countries = country_statistiken.sort_values(by="Count", ascending=False).head(10).round(2)

fig = px.bar(top10_countries, 
             x="Country", 
             y="Count", 
             labels={"Count": "Anzahl der Weine",  "Avg_Rating": "Avg Rating"}, 

             color_discrete_sequence=["#146c91"])
fig.update_yaxes(range=[0,4200])
fig.update_xaxes(title=None)
fig.update_traces(textposition="outside",)

fig.show()

In [16]:
# Barplot: Top 10 Länder nach Anzahl der Weine: Höhe des Balken nach Summe, Farbe nach Rating

fig = px.bar(top10_countries, 
             x="Country", 
             y="Count", 
             labels={"Count": "Anzahl der Weine"}, 
             text=top10_countries["Avg_Rating"],
             color="Avg_Rating",
             color_continuous_scale="YlGnBu")

fig.update_yaxes(range=[0,4200])
fig.update_xaxes(title=None)
fig.update_traces(textposition="outside")

fig.show()

In [17]:
# Boxplot: Preis
fig = px.box(df, x="Price", log_x=True, color_discrete_sequence=["#146c91"])

fig.update_xaxes(title="Preis (logarithmiert)")

fig.show()

In [18]:
# Average Rating per Wine Type
df_avg_winetypes = df.groupby("WineType")["Rating"].mean().reset_index()
df_avg_winetypes

Unnamed: 0,WineType,Rating
0,Red,3.890342
1,Rose,3.74131
2,Sparkling,3.880834
3,White,3.817906


## DATA PREPARATION

### Cleaning, Feature Engineering, Split

In [19]:
# Duplikate ermitteln
print("Duplikate:", df.duplicated().sum()) # -> keine zu entfernenden Duplikate 
# Sicherstellen, dass die Datensätze (Red, Wine, Rose, Sparkling) untereinander keine Duplikate aufweisen, die aufgrund der differenzierenden 
# Spalte "WineType" nicht erkannt werden
df_without_type = df.drop("WineType", axis="columns")
print("Duplikate untereinander:", df_without_type.duplicated().sum())

Duplikate: 0
Duplikate untereinander: 0


In [20]:
# NaN Values handlen
# N.V. bereits mit median ersetzt
# Ausreißer lasse ich explizit drin (sehr teure weine, sehr gut/schlecht geratete weine, weine mit sehr vielen ratings, sehr alte weine), um herauszufinden ob modell auch sehr gut/schlecht geratete weine erkennt
print("Nullen im Datensatz?",0 in df.values) # -> keine 0er im Datensatz

Nullen im Datensatz? False


In [21]:
# Entferne Jahr aus Name 
df["Name"] = df["Name"].str.rstrip(string.digits)
df["Name"] = df["Name"].str.replace("N.V.", "")

In [22]:
print("Unique Values in Spalte Name",len(df["Name"].unique()))

# Die Spalte "Name" ist nur schwierig verwendbar, da es sich um 7915 unique values handelt, also schwer Muster erkennbar sind & sie auch sehr Inkonsistent ist (Pinot noir vs Pinot-Noir) etc
varieties = pd.read_csv("data/varieties.csv")
print(varieties.head())
print(varieties.info())
print("Unique Werte in Varieties:", len(varieties["Variety"].unique()))
varieties["Variety"] = varieties["Variety"].drop_duplicates()

Unique Values in Spalte Name 7915
        Variety
0      Abouriou
1     Abrustine
2      Absinthe
3  Acadie Blanc
4        Acolon
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1512 entries, 0 to 1511
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Variety  1512 non-null   object
dtypes: object(1)
memory usage: 11.9+ KB
None
Unique Werte in Varieties: 1493


In [23]:
df["Name"] = df["Name"].astype(str)
varieties["Variety"] = varieties["Variety"].astype(str)

In [24]:
#Fuzzy Matching von Variety & Name
variety_list = varieties["Variety"].tolist()

def fuzzy_match(name, matches, threshold=82):
    if isinstance(name, str): 
        result = process.extractOne(name, matches, scorer=fuzz.partial_ratio) # fuzz.partial_ratio ist besser geeignet um matchende substrings zu finden als z. B. fuzz.ratio
        if result: 
            matched_variety = result[0]
            matching_score = result[1]
            if matching_score > threshold:
                return  matched_variety
    return np.nan  

# Hinzufügend der Variety-Spalte mit den gematchten Werten
df["Variety"] = df["Name"].apply(lambda x: fuzzy_match(x, variety_list))

In [25]:
df.head(20)

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,WineType,Variety
0,Pomerol,France,Pomerol,Château La Providence,4.2,100,95.0,2011.0,Red,
1,Lirac,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017.0,Red,
2,Erta e China Rosso di Toscana,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015.0,Red,
3,Bardolino,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019.0,Red,
4,Ried Scheibner Pinot Noir,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016.0,Red,Pinot Noir
5,Gigondas (Nobles Terrasses),France,Gigondas,Vieux Clocher,3.7,100,19.9,2017.0,Red,
6,Marion's Vineyard Pinot Noir,New Zealand,Wairarapa,Schubert,4.0,100,43.87,2016.0,Red,Pinot Noir
7,Red Blend,Chile,Itata Valley,Viña La Causa,3.9,100,17.52,2014.0,Red,Rare Red Blend
8,Chianti,Italy,Chianti,Castello Montaùto,3.6,100,10.75,2015.0,Red,Chianti Blend
9,Tradition,France,Minervois,Domaine des Aires Hautes,3.5,100,6.9,2014.0,Red,


In [26]:
print("Match gefunden für ", df.Variety.notna().sum(),"Weine")
df.Variety = df.Variety.replace(np.nan, "Unknown")
df.Variety = df.Variety.replace("nan", "Unknown")
print("Unique Varieties:", df.Variety.nunique())
df.info()

Match gefunden für  7648 Weine
Unique Varieties: 422
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13834 entries, 0 to 13833
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             13834 non-null  object 
 1   Country          13834 non-null  object 
 2   Region           13834 non-null  object 
 3   Winery           13834 non-null  object 
 4   Rating           13834 non-null  float64
 5   NumberOfRatings  13834 non-null  int64  
 6   Price            13834 non-null  float64
 7   Year             13834 non-null  float64
 8   WineType         13834 non-null  object 
 9   Variety          13834 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 1.1+ MB


In [27]:
# Entferne "NumberOfRatings"-Spalte weil nachgelagertes Feature
df = df.drop(["NumberOfRatings"], axis=1)

In [28]:
# Encoding der kategorischen Spalten 
# WineType/Country -> One Hot Encoding
df = pd.get_dummies(df, columns=["Country", "WineType"])

In [29]:
# Train/Test Split
# kein validation split weil cross validation
# Entferne Einträge mit Ratingwerten die nur 1 mal vorkommen, da sonst startify=y nicht möglich
rating_counts = df["Rating"].value_counts()
print(rating_counts)
df = df[df["Rating"].isin(rating_counts[rating_counts > 1].index)]
X = df.drop("Rating", axis="columns")
y = df["Rating"]

# random_state=1 setzt fest, dass die Daten vor dem Splitten geshuffled werden, aber dadurch dass immer dieselbe zahl für random_state genutzt wird werden die Daten immer gleich geshuffled, man erhält also immer densleben Trainings- & Test-Datensatz
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)
print(X_test.columns)
print(df.info())

Rating
3.8    2065
3.9    1792
3.7    1663
4.0    1418
3.6    1347
4.1    1339
4.2    1034
3.5     824
4.3     620
3.4     469
4.4     364
3.3     281
4.5     202
3.2     142
4.6     130
3.1      48
3.0      32
4.7      31
2.9       8
4.8       8
2.8       8
2.7       3
2.6       2
2.5       2
4.9       1
2.2       1
Name: count, dtype: int64
Index(['Name', 'Region', 'Winery', 'Price', 'Year', 'Variety',
       'Country_Argentina', 'Country_Australia', 'Country_Austria',
       'Country_Brazil', 'Country_Bulgaria', 'Country_Canada', 'Country_Chile',
       'Country_China', 'Country_Croatia', 'Country_Czech Republic',
       'Country_France', 'Country_Georgia', 'Country_Germany',
       'Country_Greece', 'Country_Hungary', 'Country_Israel', 'Country_Italy',
       'Country_Lebanon', 'Country_Luxembourg', 'Country_Mexico',
       'Country_Moldova', 'Country_New Zealand', 'Country_Portugal',
       'Country_Romania', 'Country_Slovakia', 'Country_Slovenia',
       'Country_South Africa', '

In [30]:
# Winery/Region/Variety -> Target Encoding
winery_encoder = TargetEncoder()
region_encoder = TargetEncoder()
variety_encoder = TargetEncoder()

X_train["Winery_encoded"] = winery_encoder.fit_transform(X_train["Winery"], y_train)
X_train["Region_encoded"] = region_encoder.fit_transform(X_train["Region"], y_train)
X_train["Variety_encoded"] = variety_encoder.fit_transform(X_train["Variety"], y_train)

X_test["Winery_encoded"] = winery_encoder.transform(X_test["Winery"])
X_test["Region_encoded"] = region_encoder.transform(X_test["Region"])
X_test["Variety_encoded"] = variety_encoder.transform(X_test["Variety"])

X_train = X_train.drop(["Region", "Winery", "Name", "Variety"], axis=1)
X_test = X_test.drop(["Region", "Winery", "Name", "Variety"], axis=1)

In [31]:
# Feature Selection

var_selector = VarianceThreshold(threshold=0.001)
X_train_reduced= var_selector.fit_transform(X_train)
X_test_reduced = var_selector.transform(X_test)

selected_features = X_train.columns[var_selector.get_support()]
print(len(selected_features), "Features mit Varianz > 0.001:")
print(selected_features)

31 Features mit Varianz > 0.001:
Index(['Price', 'Year', 'Country_Argentina', 'Country_Australia',
       'Country_Austria', 'Country_Brazil', 'Country_Chile', 'Country_France',
       'Country_Germany', 'Country_Greece', 'Country_Hungary',
       'Country_Israel', 'Country_Italy', 'Country_Lebanon',
       'Country_Luxembourg', 'Country_Moldova', 'Country_New Zealand',
       'Country_Portugal', 'Country_Romania', 'Country_Slovenia',
       'Country_South Africa', 'Country_Spain', 'Country_Switzerland',
       'Country_United States', 'WineType_Red', 'WineType_Rose',
       'WineType_Sparkling', 'WineType_White', 'Winery_encoded',
       'Region_encoded', 'Variety_encoded'],
      dtype='object')


In [32]:
# Scaling 
robust_scaler = RobustScaler()
X_train_reduced = robust_scaler.fit_transform(X_train_reduced)
X_test_reduced = robust_scaler.transform(X_test_reduced)

## MODEL

### Dummy Regressor

In [33]:
# Durchschnitt
dummy = DummyRegressor(strategy="mean")

dummy.fit(X_train_reduced, y_train)

y_pred_test_dummy = dummy.predict(X_test_reduced)

mape_dummy_test = mean_absolute_percentage_error(y_test, y_pred_test_dummy)
mae_dummy_test = mean_absolute_error(y_test, y_pred_test_dummy)

print("MAPE: ", mape_dummy_test)
print("MAE:", mae_dummy_test)

MAPE:  0.0616601945334654
MAE: 0.23596271726798845


### KNN Regression

#### Base Model Test

In [34]:
knn_regressor = KNeighborsRegressor()
knn_regressor.fit(X_train_reduced, y_train)

In [35]:
y_pred = knn_regressor.predict(X_test_reduced)
print(mean_absolute_error(y_pred, y_test))
print(mean_absolute_percentage_error(y_test, y_pred))

0.15235272858691723
0.04027580486857026


### Gradient Boosting Regression

#### Base Model Test

In [36]:
reg = GradientBoostingRegressor()
reg.fit(X_train_reduced, y_train)
y_pred = reg.predict(X_test_reduced)
print(mean_absolute_error(y_test, y_pred))
print(mean_absolute_percentage_error(y_test, y_pred))

0.14595006101459923
0.03852468896373109


### Random Forest Regressor

#### Base Model Test

In [37]:
rfr = RandomForestRegressor(random_state=13)
rfr.fit(X_train_reduced, y_train)
y_pred = rfr.predict(X_test_reduced)
print(mean_absolute_error(y_pred, y_test))
print(mean_absolute_percentage_error(y_test, y_pred))

0.14260836875648947
0.0376324160795318


#### Tuning

In [38]:
print(rfr.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 13, 'verbose': 0, 'warm_start': False}


In [39]:
# Hyperparamter-Suche
param_grid = {
    "n_estimators": [500, 1000, 1500],
    "max_features": [0.5, 1],
    "max_depth": [20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2 ,4],
}

search_cv = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    cv=3, 
    scoring="neg_mean_absolute_error", # neg weil gridSearchCV maximiert scoring parameter
    n_jobs=-1,
    verbose=2 # Überwachen
)
search_cv.fit(X_train_reduced, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   7.7s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   7.9s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   8.0s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   7.3s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   7.3s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  15.8s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  16.1s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; t


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   6.8s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   6.6s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=5, n_estimators=1500; total time=  21.9s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=  13.0s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=  13.0s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=5, n_estimators=1500; total time=  21.9s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=5, n_estimators=1500; total time=  22.4s
[CV] END max_depth=20, max_features=0.5, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=  13.2s
[CV] END max_depth=20, max_features=0

In [40]:
print("Beste Hyperparameter:", search_cv.best_params_)

Beste Hyperparameter: {'max_depth': 20, 'max_features': 0.5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1000}


In [41]:
joblib.dump(search_cv.best_estimator_, "best_model.pkl")

['best_model.pkl']

In [42]:
# Testen des besten Modells auf den Test-Daten
best_model = joblib.load("best_model.pkl")
y_pred = best_model.predict(X_test_reduced)
print("MAE: ", mean_absolute_error(y_pred, y_test))
print("MAPE: ",mean_absolute_percentage_error(y_pred, y_test))

MAE:  0.14119259804591439
MAPE:  0.0367839960048242


In [43]:
knn = KNeighborsRegressor()


param_grid = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski', 'chebyshev', 'cosine']
}

grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

grid_search.fit(X_train_reduced, y_train)

In [44]:
y_pred = grid_search.predict(X_test_reduced)
print("MAE: ", mean_absolute_error(y_pred, y_test))
print("MAPE: ",mean_absolute_percentage_error(y_pred, y_test))

MAE:  0.14694673917350695
MAPE:  0.03832943792582881


In [45]:
print("Beste Hyperparameter:", grid_search.best_params_)

Beste Hyperparameter: {'metric': 'manhattan', 'n_neighbors': 14, 'weights': 'distance'}


## EVALUATION

In [46]:
results_by_rating = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})

mae_by_rating= results_by_rating.groupby("Actual").apply(lambda x: mean_absolute_error(x["Actual"], x["Predicted"])).reset_index(name="MAE")
mae_by_rating.head(30)





Unnamed: 0,Actual,MAE
0,2.7,1.001558
1,2.8,0.805993
2,2.9,0.650385
3,3.0,0.570573
4,3.1,0.463861
5,3.2,0.347907
6,3.3,0.360789
7,3.4,0.250832
8,3.5,0.175482
9,3.6,0.14467


In [47]:
# Scatter Plot: Predicted vs actual Rating
y_train_pred = best_model.predict(X_train_reduced)

fig = px.scatter(x=y_test, y=y_pred, labels={
                     "y": "Predicted Rating",
                     "x": "Actual Rating"},title="Actual vs. Predicted Rating", color_discrete_sequence=["#146c91"])

start = y_test.min()
end = y_test.max()

fig.add_shape(type="line", x0=start, y0=start, x1=end, y1=end, line=dict(color="#FF3358"))

fig.show()

In [48]:
# Barplot: Top 10 Features by Feature Importance
importances = search_cv.best_estimator_.feature_importances_

feature_importances_df = pd.DataFrame({
    "feature": selected_features,
    "importance": importances
}).sort_values(by='importance', ascending=False)

fig = px.bar(feature_importances_df.head(10), x="feature", y="importance", title ="Feature Importance", color_discrete_sequence=["#146c91"])
fig.show()