In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import  mean_squared_error 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Load the processed price list; the path is relative to the notebook's working directory
PRICES_CSV = "./data/processed/pricesList.csv"
df = pd.read_csv(PRICES_CSV)

In [3]:
# Print column dtypes and non-null counts so missing values are visible
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Price   333 non-null    float64
 1   Week    340 non-null    int64  
 2   Month   340 non-null    int64  
 3   Year    340 non-null    int64  
 4   Season  340 non-null    object 
dtypes: float64(1), int64(3), object(1)
memory usage: 13.4+ KB


In [4]:
# Distinct values of every object-dtype column, keyed by column name
text_columns = df.select_dtypes("object").columns
{column: df[column].unique() for column in text_columns}

{'Season': array(['North-East Monsoon Season', 'First Inter-monsoon Season',
        'South-West Monsoon Season', 'Second Inter-monsoon Season'],
       dtype=object)}

In [16]:
def preprocess_input(df, test_size=0.1, random_state=None):
    """Drop incomplete rows and split the frame into train/test features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``"Price"`` column, which becomes the target; every
        other column is used as a feature. The caller's frame is not modified.
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split``; the default keeps the original
        10% hold-out.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. Pass an integer to get the same
        split on every run; ``None`` keeps the original unseeded shuffle.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    # dropna() returns a new frame, so no defensive copy or in-place edit is needed.
    # Rows with a missing value in ANY column are removed.
    complete_rows = df.dropna()

    # Split into features and target
    X = complete_rows.drop("Price", axis=1)
    Y = complete_rows["Price"]

    # Shuffled train-test split
    X_train, X_test, Y_train, Y_test = train_test_split(
        X,
        Y,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )

    return X_train, X_test, Y_train, Y_test

In [6]:
# Initial train/test split used by the first model fit below
X_train, X_test, Y_train, Y_test = preprocess_input(df)

# Train

In [14]:
def _season_for_month(month):
    """Return the season label for a calendar month number (1-12)."""
    if 3 <= month <= 4:
        return "First Inter-monsoon Season"
    if 5 <= month <= 9:
        return "South-West Monsoon Season"
    if 10 <= month <= 11:
        return "Second Inter-monsoon Season"
    # Remaining months: December, January and February
    return "North-East Monsoon Season"


def get_user_input():
    """Prompt for week, month and year and build one feature record.

    Returns
    -------
    dict
        Keys ``'Week'``, ``'Month'``, ``'Year'`` (ints as typed by the user)
        and ``'Season'`` (label derived from the month).

    Raises
    ------
    ValueError
        If an answer is not an integer, or if the month is outside 1-12.
        Previously an out-of-range month silently fell through to
        "North-East Monsoon Season".
    """
    week = int(input("Enter the week: "))
    month = int(input("Enter the month: "))
    # Fail fast: a month such as 0 or 13 has no season and would produce a bogus record
    if not 1 <= month <= 12:
        raise ValueError(f"Month must be between 1 and 12, got {month}")
    year = int(input("Enter the year: "))
    return {'Week': week, 'Month': month, 'Year': year, 'Season': _season_for_month(month)}


In [8]:
# Preprocessing pipeline
# Column groups routed through the ColumnTransformer below
categorical_features = ['Season']
numeric_features = ['Week', 'Month', 'Year']

# Numeric columns are forwarded unchanged
numeric_transformer = Pipeline(steps=[('num', 'passthrough')])

# Season labels are one-hot encoded; categories unseen during fit are ignored
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a default random forest
regression_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor()),
])

In [9]:
# Fit on the training split, then evaluate on the held-out split
regression_model.fit(X_train, Y_train)
y_pred = regression_model.predict(X_test)

# score() on a regression pipeline returns the coefficient of determination (R^2),
# not a classification accuracy; the variable name is kept for compatibility.
accuracy = regression_model.score(X_test, Y_test)
mse = mean_squared_error(Y_test, y_pred)

print("R^2:", accuracy)
print("Mean Squared Error:", mse)

Accuracy: 0.9441508546049967
Mean Squared Error: 1629.521435080202


In [17]:
def _build_models():
    """Return a fresh, unfitted instance of every regressor under comparison."""
    return {
        "XG BOOST": XGBRegressor(),
        "Decision Tree": DecisionTreeRegressor(),
        "CatBoost": CatBoostRegressor(verbose=0),
        "Random Forest": RandomForestRegressor(),
    }


# Number of independent random splits each model is scored on
N_REPEATS = 10

# NOTE: this cell rebinds the module-level X_train/X_test/Y_train/Y_test and
# regression_model names, exactly as the original cell did.
model_scores = {}
for _ in range(N_REPEATS):
    X_train, X_test, Y_train, Y_test = preprocess_input(df)
    # New estimators every round so no fitted state carries over between splits
    models = _build_models()
    for name, model in models.items():
        regression_model = Pipeline(steps=[('preprocessor', preprocessor),
                                           ('regressor', model)])
        regression_model.fit(X_train, Y_train)
        score = regression_model.score(X_test, Y_test)  # R^2 on the held-out split
        model_scores.setdefault(name, []).append(score)

average_scores = {name: sum(scores) / len(scores) for name, scores in model_scores.items()}
# Print average scores
for name, avg_score in average_scores.items():
    print(f"{name} Average R^2: {avg_score:.5f}")

XG BOOST Average R^2: 0.76013
Decision Tree Average R^2: 0.75948
CatBoost Average R^2: 0.86491
Random Forest Average R^2: 0.79514


In [154]:
# Train one model (random forest) and report its held-out metrics
N_RUNS = 1  # number of independent split/fit/evaluate rounds to average over

model_score = []
mse_score = []
for _ in range(N_RUNS):
    X_train, X_test, Y_train, Y_test = preprocess_input(df)
    regression_model = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('regressor', RandomForestRegressor())])
    regression_model.fit(X_train, Y_train)

    y_pred = regression_model.predict(X_test)
    score = regression_model.score(X_test, Y_test)  # R^2 on the held-out split
    # Same value as `score`; reused instead of evaluating the test set a second time
    accuracy = score
    mse = mean_squared_error(Y_test, y_pred)

    model_score.append(score)
    mse_score.append(mse)

# The score is the mean R^2 across runs, scaled by 100
print("Average score:", (sum(model_score)/len(model_score) * 100))
print("Average MSE:", sum(mse_score)/len(mse_score))

Learning rate set to 0.033833
0:	learn: 171.5969477	total: 263us	remaining: 263ms
1:	learn: 168.7504899	total: 441us	remaining: 220ms
2:	learn: 166.3796260	total: 579us	remaining: 193ms
3:	learn: 163.9614541	total: 720us	remaining: 179ms
4:	learn: 161.7690302	total: 831us	remaining: 165ms
5:	learn: 159.3558296	total: 979us	remaining: 162ms
6:	learn: 157.0274505	total: 1.13ms	remaining: 160ms
7:	learn: 155.5274609	total: 1.21ms	remaining: 151ms
8:	learn: 153.0144668	total: 1.38ms	remaining: 152ms
9:	learn: 150.6859332	total: 1.53ms	remaining: 151ms
10:	learn: 149.4705989	total: 1.68ms	remaining: 151ms
11:	learn: 147.8949519	total: 1.82ms	remaining: 150ms
12:	learn: 145.8849171	total: 1.99ms	remaining: 151ms
13:	learn: 144.3598042	total: 2.15ms	remaining: 152ms
14:	learn: 142.4223231	total: 2.32ms	remaining: 153ms
15:	learn: 140.2867840	total: 2.51ms	remaining: 154ms
16:	learn: 138.3862379	total: 2.68ms	remaining: 155ms
17:	learn: 137.0460593	total: 2.85ms	remaining: 156ms
18:	learn: 135

In [34]:
# Collect week/month/year from the user and wrap the record in a one-row frame
user_data = get_user_input()
user_df = pd.DataFrame([user_data], columns=list(user_data))

# Run the fitted pipeline on that single row
predicted_price = regression_model.predict(user_df)

# Display prediction
print("Predicted price:", predicted_price[0])


Predicted price: 335.7069446898608
