In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, mean_squared_error 
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Load the processed price list (columns: Name, Price, Week, Month, Year).
df = pd.read_csv("./data/processed/pricesList.csv")

In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    1700 non-null   object 
 1   Price   1665 non-null   float64
 2   Week    1700 non-null   int64  
 3   Month   1700 non-null   int64  
 4   Year    1700 non-null   int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 66.5+ KB


In [4]:
# Distinct values of every text (object-dtype) column, keyed by column name.
{name: df[name].unique() for name in df.select_dtypes(include="object").columns}

{'Name': array(['Leeks  1kg', 'Local Potatoes 1Kg', 'Imported Potatoes  1Kg',
        'Carrot  1kg', 'Tomatoe No1.  1kg'], dtype=object)}

In [5]:
def preprocess_input(df, test_size=0.2, random_state=42):
    """Drop incomplete rows and split the data into train and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "Price" column (the target). Every other column is
        kept as a feature. The frame passed in is not modified.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``. With the default, every call
        returns the same split; pass a different seed (or ``None``) to get
        a different one.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    # dropna() returns a new frame, so the caller's DataFrame is left untouched.
    complete_rows = df.dropna()

    # Split into features (X) and target (Y)
    X = complete_rows.drop("Price", axis=1)
    Y = complete_rows["Price"]

    # Train-test split
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)

    return X_train, X_test, Y_train, Y_test

In [6]:
# Build the train/test split that the model cells below fit and evaluate on.
X_train, X_test, Y_train, Y_test = preprocess_input(df)

# Train

In [7]:
def get_user_input():
    """Prompt for an item name plus week, month and year.

    Returns a dict with keys 'Name' (str) and 'Week', 'Month', 'Year' (int),
    in that order. A non-integer answer to a numeric prompt raises ValueError.
    """
    answers = {'Name': input("Enter the name of the item: ")}
    numeric_prompts = (
        ('Week', "Enter the week: "),
        ('Month', "Enter the month: "),
        ('Year', "Enter the year: "),
    )
    for key, prompt in numeric_prompts:
        answers[key] = int(input(prompt))
    return answers


In [8]:
# Preprocessing pipeline: the numeric date parts pass through unchanged and the
# item name is one-hot encoded.
numeric_features = ['Week', 'Month', 'Year']
categorical_features = ['Name']

numeric_transformer = Pipeline(steps=[
    ('num', 'passthrough')])  # No transformation needed for numeric features

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Seed the forest so the metrics below are reproducible on Restart & Run All.
regression_model = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', RandomForestRegressor(random_state=42))])

regression_model.fit(X_train, Y_train)
y_pred = regression_model.predict(X_test)

# Pipeline.score delegates to the regressor's score, which is the coefficient of
# determination (R^2), not a classification accuracy.
r2 = regression_model.score(X_test, Y_test)
accuracy = r2  # legacy name, kept in case other cells still read it
mse = mean_squared_error(Y_test, y_pred)
print("R^2:", r2)
print("Mean Squared Error:", mse)


Accuracy: 0.9133126104874308
Mean Squared Error: 1149.309971917808


In [9]:
user_data = get_user_input()

# The encoder uses handle_unknown='ignore', so a name that was not in the training
# data would be encoded as all zeros and still produce a (meaningless) price.
# Names must match exactly, including spacing (e.g. 'Leeks  1kg'), so reject
# anything unknown instead of predicting silently.
known_names = sorted(X_train["Name"].unique())
if user_data["Name"] not in known_names:
    raise ValueError(
        f"Unknown item {user_data['Name']!r}; expected one of {known_names}")

# The pipeline selects columns by name, so a one-row frame built from the dict works.
user_df = pd.DataFrame([user_data])
predicted_price = regression_model.predict(user_df)

# Display prediction
print("Predicted price:", predicted_price[0])


Predicted price: 133.70129999999995


In [None]:
# Candidate regressors to compare, keyed by the label used when reporting scores.
models = dict([
    ("Linear Regression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("K Nearest Neighbors", KNeighborsRegressor()),
    ("Support Vector Machine", SVR()),
    ("Linear SVR", LinearSVR()),
    ("Decision Tree", DecisionTreeRegressor()),
    ("Random Forest", RandomForestRegressor()),
    ("Gradient Boosting", GradientBoostingRegressor()),
    ("XGBoost", XGBRegressor()),
    ("LightGBM", LGBMRegressor()),
    ("CatBoost", CatBoostRegressor(verbose=0)),
])

# One (initially empty) list of scores per model label.
model_scores = {label: [] for label in models}

In [None]:
# Score every candidate model over repeated runs, appending each test-set score
# to model_scores under the model's label.
for _ in range(10):
    # NOTE: preprocess_input splits with a fixed random_state by default, so every
    # run sees the same split; the repetition only averages out the randomness of
    # the estimators themselves.
    X_train, X_test, Y_train, Y_test = preprocess_input(df)
    estimators = {
        "Linear Regression": LinearRegression(),
        "Ridge": Ridge(),
        "Lasso": Lasso(),
        "K Nearest Neighbors": KNeighborsRegressor(),
        "Support Vector Machine": SVR(),
        "Linear SVR": LinearSVR(),
        "Decision Tree": DecisionTreeRegressor(),
        "Random Forest": RandomForestRegressor(),
        "Gradient Boosting": GradientBoostingRegressor(),
        "XGBoost": XGBRegressor(),
        "LightGBM": LGBMRegressor(),
        "CatBoost": CatBoostRegressor(verbose=0)
    }
    # The features still contain the string column "Name", which these estimators
    # cannot be fitted on directly. Give each estimator its own encoding step so it
    # can be fitted and scored on the raw frames. sparse_threshold=0 forces a dense
    # matrix so every estimator receives the same kind of input.
    models = {
        name: Pipeline(steps=[
            ('preprocessor', ColumnTransformer(
                transformers=[
                    ('num', 'passthrough', numeric_features),
                    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
                sparse_threshold=0)),
            ('regressor', estimator)])
        for name, estimator in estimators.items()
    }
    for name, model in models.items():
        model.fit(X_train, Y_train)
        score = model.score(X_test, Y_test)
        model_scores[name].append(score)



In [None]:
# Mean of the collected scores for each model label.
average_scores = {
    name: sum(model_scores[name]) / len(model_scores[name])
    for name in model_scores
}
# Print average scores
for name in average_scores:
    avg_score = average_scores[name]
    print(f"{name} Average R^2: {avg_score:.5f}")

In [None]:
# Fit every model in `models` on the current training split, reporting progress.
for name in models:
    model = models[name]
    model.fit(X_train, Y_train)
    print(name + " trained.")

In [None]:
# Print each model's score on the held-out split, to five decimal places.
for name in models:
    model = models[name]
    print(name + " R^2: {:.5f}".format(model.score(X_test, Y_test)))

In [None]:
# Predict with the CatBoost model on one held-out row. The input needs the same
# columns the models are trained on (Name, Week, Month, Year); a bare list of
# eight numbers does not match that schema.
models["CatBoost"].predict(X_test.iloc[[0]])