In [None]:
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import  mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the processed price list; the path is relative to the notebook's working directory.
df = pd.read_csv("./data/processed/pricesList.csv")

In [None]:
# Summarize columns, dtypes, and non-null counts to spot missing data early.
df.info()

In [None]:
# Distinct values of every object-dtype (text) column, keyed by column name.
{
    name: values.unique()
    for name, values in df.select_dtypes("object").items()
}

In [None]:
def preprocess_input(df, test_size=0.1, random_state=None):
    """Drop incomplete rows and split the data into train and test sets.

    Args:
        df: DataFrame with a "Price" target column plus the feature columns.
            The caller's frame is not modified.
        test_size: Share (float) or absolute number (int) of rows held out
            for testing; forwarded to ``train_test_split``. Defaults to 0.1.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` gives a different shuffle on every call; pass an int
            to make the split reproducible.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.

    Raises:
        KeyError: If ``df`` has no "Price" column.
    """
    # dropna() returns a new frame, so the caller's data stays untouched
    # without needing an explicit copy followed by an in-place drop.
    complete_rows = df.dropna()

    # Separate the features from the target
    X = complete_rows.drop("Price", axis=1)
    Y = complete_rows["Price"]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, shuffle=True, random_state=random_state
    )

    return X_train, X_test, Y_train, Y_test

In [None]:
# Initial random 90/10 split used by the first training cell; later cells draw their own splits.
X_train, X_test, Y_train, Y_test = preprocess_input(df)

# Train

In [None]:
def _season_for_month(month):
    """Return the season label for a calendar month (1-12).

    Raises:
        ValueError: If ``month`` is outside 1-12.
    """
    if not 1 <= month <= 12:
        raise ValueError(f"Month must be between 1 and 12, got {month}")
    if 3 <= month <= 4:
        return "First Inter-monsoon Season"
    if 5 <= month <= 9:
        return "South-West Monsoon Season"
    if 10 <= month <= 11:
        return "Second Inter-monsoon Season"
    # Remaining months: December, January and February
    return "North-East Monsoon Season"


def get_user_input():
    """Prompt for week, month and year and build one feature record.

    Returns:
        Dict with the keys 'Week', 'Month', 'Year' (ints as entered) and
        'Season' (label derived from the month).

    Raises:
        ValueError: If an answer is not an integer, or the month is not in
            1-12. Previously an out-of-range month was silently labelled
            "North-East Monsoon Season".
    """
    week = int(input("Enter the week: "))
    month = int(input("Enter the month: "))
    year = int(input("Enter the year: "))
    season = _season_for_month(month)
    return {'Week': week, 'Month': month, 'Year': year, 'Season': season}


In [None]:
# Preprocessing pipeline
# Column groups routed through the ColumnTransformer below.
numeric_features = ['Week', 'Month', 'Year']
categorical_features = ['Season']

# Numeric columns are forwarded unchanged.
numeric_transformer = Pipeline([('num', 'passthrough')])

# Season labels are one-hot encoded; handle_unknown='ignore' means a label
# not seen during fit does not raise at transform time.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Baseline model: preprocessing followed by a default random forest.
regression_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor()),
])

In [None]:
# Fit the baseline pipeline on the current split and evaluate it on the held-out rows.
regression_model.fit(X_train, Y_train)
y_pred = regression_model.predict(X_test)

# For a regressor, score() is the coefficient of determination (R^2), not a
# classification accuracy, so it is printed as R^2. The variable keeps its
# original name because other cells assign the same name.
accuracy = regression_model.score(X_test, Y_test)
mse = mean_squared_error(Y_test, y_pred)
print("R^2:", accuracy)
print("Mean Squared Error:", mse)

In [None]:
# Compare candidate regressors by averaging R^2 over repeated random train/test splits.
N_SPLITS = 10

# Factories instead of instances: each split builds fresh, unfitted estimators,
# and nothing is constructed just to obtain the model names.
model_factories = {
    "XG BOOST": XGBRegressor,
    "Decision Tree": DecisionTreeRegressor,
    "CatBoost": lambda: CatBoostRegressor(verbose=0),
    "Random Forest": RandomForestRegressor,
}
model_scores = {name: [] for name in model_factories}
for _ in range(N_SPLITS):
    # No seed is passed, so every iteration draws a new random split.
    X_train, X_test, Y_train, Y_test = preprocess_input(df)
    models = {name: make_model() for name, make_model in model_factories.items()}
    for name, model in models.items():
        # clone() gives each pipeline its own unfitted preprocessor, so fitting
        # one pipeline cannot refit the transformer inside another.
        regression_model = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                           ('regressor', model)])
        regression_model.fit(X_train, Y_train)
        score = regression_model.score(X_test, Y_test)
        model_scores[name].append(score)

average_scores = {name: sum(scores) / len(scores) for name, scores in model_scores.items()}
# Print average scores
for name, avg_score in average_scores.items():
    print(f"{name} Average R^2: {avg_score:.5f}")

In [None]:
# train one model
# Fit a random forest on a fresh split; this pipeline is the one used for the
# interactive prediction that follows. Raise N_RUNS to average several splits.
N_RUNS = 1
model_score = []
mse_score = []
for _ in range(N_RUNS):
    X_train, X_test, Y_train, Y_test = preprocess_input(df)
    # clone() keeps this pipeline's preprocessor independent of pipelines
    # built in other cells from the same template.
    regression_model = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                                       ('regressor', RandomForestRegressor())])
    regression_model.fit(X_train, Y_train)
    y_pred = regression_model.predict(X_test)
    # Score once and reuse the value; score() runs prediction again internally.
    score = regression_model.score(X_test, Y_test)
    accuracy = score  # R^2; name kept because other cells assign the same name
    mse = mean_squared_error(Y_test, y_pred)
    model_score.append(score)
    mse_score.append(mse)

# The average R^2 is shown scaled by 100 (i.e. as a percentage).
print("Average score:", (sum(model_score)/len(model_score) * 100))
print("Average MSE:", sum(mse_score)/len(mse_score))

In [None]:
# Collect one record interactively and wrap it in a single-row frame whose
# column names match the features the pipeline selects.
user_data = get_user_input()
user_df = pd.DataFrame(data=[user_data])

# Predict with the most recently fitted pipeline.
predicted_price = regression_model.predict(user_df)

# Display prediction
print("Predicted price:", predicted_price[0])
