In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Load the processed price list (path is relative to the notebook's working directory).
df = pd.read_csv("./data/processed/pricesList.csv")

In [3]:
# Keep only the rows that actually have a Price value.
df = df[df['Price'].notna()]

In [4]:
# Summarize the columns, dtypes and non-null counts of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1665 entries, 0 to 1699
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    1665 non-null   object 
 1   Price   1665 non-null   float64
 2   Week    1665 non-null   int64  
 3   Month   1665 non-null   int64  
 4   Year    1665 non-null   int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 78.0+ KB


In [6]:
# Distinct values of every object-dtype column, keyed by column name.
{column: values.unique() for column, values in df.select_dtypes("object").items()}

{'Name': array(['Leeks 1kg', 'Local Potatoes 1Kg', 'Imported Potatoes 1Kg',
        'Carrot 1kg', 'Tomatoe No1. 1kg'], dtype=object)}

In [7]:
def onehot_encode(df, column):
    """Return a new frame in which ``column`` is replaced by one-hot indicator columns.

    The indicator columns come from ``pd.get_dummies`` with ``column`` as the
    prefix and are appended after the remaining original columns. The frame
    passed in is not modified.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    remaining = df.drop(column, axis=1)
    return pd.concat([remaining, indicators], axis=1)

In [None]:
def preprocess_input(df):
    """Turn the price frame into scaled train/test features and targets.

    Steps: one-hot encode ``Name``, separate the ``Price`` target from the
    features, make a shuffled 70/30 train/test split (``random_state=1``), and
    standardize the features with a ``StandardScaler`` fitted on the training
    rows only.

    Returns:
        ``(X_train, X_test, Y_train, Y_test)``; the two X parts are DataFrames
        carrying the same index and column labels they had before scaling.
    """
    encoded = onehot_encode(df, "Name")

    # Separate the target from the features
    features = encoded.drop("Price", axis=1)
    target = encoded["Price"]

    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, train_size=0.7, shuffle=True, random_state=1
    )

    # Fit on the training rows only so test-set statistics never influence the scaling
    scaler = StandardScaler().fit(X_train)

    def _scaled(frame):
        # Re-wrap the transformed values so index and column labels are kept
        return pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)

    return _scaled(X_train), _scaled(X_test), Y_train, Y_test

In [None]:
# Build the scaled train/test sets from `df`, the frame loaded and cleaned above.
# (This cell previously passed `data`, a name that is never defined in this
# notebook, so it raised NameError on a fresh kernel.)
X_train, X_test, Y_train, Y_test = preprocess_input(df)

# Train

In [None]:
# Seed shared by every estimator that draws random numbers, so the R^2 scores
# reported below are reproducible across Restart & Run All. The value matches
# the random_state used for the train/test split.
MODEL_SEED = 1

# Candidate regressors, keyed by the display name used in the training and
# scoring output. All other hyperparameters are left at library defaults.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "K Nearest Neighbors": KNeighborsRegressor(),
    "Support Vector Machine": SVR(),
    "Linear SVR": LinearSVR(random_state=MODEL_SEED),
    "Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingRegressor(random_state=MODEL_SEED),
    "XGBoost": XGBRegressor(random_state=MODEL_SEED),
    "LightGBM": LGBMRegressor(random_state=MODEL_SEED),
    "CatBoost": CatBoostRegressor(verbose=0, random_seed=MODEL_SEED)
}

In [None]:
# Fit every candidate regressor on the same training split.
for name, model in models.items():
    model.fit(X_train, Y_train)
    print(f"{name} trained.")

In [None]:
# Print each fitted model's score on the held-out test split, to 5 decimals.
for name, model in models.items():
    print(f"{name} R^2: {model.score(X_test, Y_test):.5f}")