In [2]:
# Step 1: Download the Dataset
import kagglehub

# Download the dataset
path = kagglehub.dataset_download("neuromusic/avocado-prices")

print("Path to dataset files:", path)

# Step 2: Read the Dataset and Handle Missing Values
import pandas as pd

# Load the dataset
file_path = f"{path}/avocado.csv"
data = pd.read_csv(file_path)

# Check for missing values
print("Missing values:\n", data.isnull().sum())

# Drop missing values
data.dropna(inplace=True)

# Step 3: Feature Selection and Preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Drop unnecessary columns
X = data.drop(columns=["region", "Date", "AveragePrice"])
y = data["AveragePrice"]

# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(X[["type"]])

# Drop original categorical columns
X = X.drop(columns=["type"])

# Scale numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)

# Combine scaled numerical features and encoded categorical features
import numpy as np
X_preprocessed = np.hstack([scaled_features, encoded_features])

# Step 4: Split the Data
from sklearn.model_selection import train_test_split

# First, split off the test set (10%)
X_train_val, X_test, y_train_val, y_test = train_test_split(X_preprocessed, y, test_size=0.1, random_state=0)

# Then, split the remaining data into training (80%) and validation (10%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.1111, random_state=0)  # 0.1111 of 90% = ~10%

# Step 5: Train KNN Regression and Choose Best k
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

best_k = None
best_score = float("-inf")

# Test different values of k
for k in range(1, 21):  # Try k from 1 to 20
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train, y_train)
    
    # Evaluate on validation set
    y_val_pred = model.predict(X_val)
    score = r2_score(y_val, y_val_pred)
    
    if score > best_score:
        best_k = k
        best_score = score

print(f"Best k: {best_k} with R-squared score: {best_score}")

# Step 6: Evaluate the Final Model on the Test Set
# Train the final model
final_model = KNeighborsRegressor(n_neighbors=best_k)
final_model.fit(X_train, y_train)

# Predict and evaluate on the test set
y_test_pred = final_model.predict(X_test)
test_score = r2_score(y_test, y_test_pred)

print(f"Final R-squared score on test set: {test_score}")


Path to dataset files: C:\Users\justino\.cache\kagglehub\datasets\neuromusic\avocado-prices\versions\1
Missing values:
 Unnamed: 0      0
Date            0
AveragePrice    0
Total Volume    0
4046            0
4225            0
4770            0
Total Bags      0
Small Bags      0
Large Bags      0
XLarge Bags     0
type            0
year            0
region          0
dtype: int64
Best k: 4 with R-squared score: 0.6928581868388315
Final R-squared score on test set: 0.6770630559161779
