In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import  mean_squared_error 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings(action='ignore')

In [129]:
df = pd.read_csv("./data/processed/pricesList.csv")
def preprocess_input(df):
    df = df.copy()

    # Drop rows with missing values if any
    df.dropna(inplace=True)

    # Split df into X and y
    X = df.drop("Price", axis=1)
    Y = df["Price"]

    return X, Y
X, Y = preprocess_input(df)

In [15]:
# Preprocessing pipeline
numeric_features = ['Week', 'Month', 'Year']
categorical_features = ['Name', 'Season']

numeric_transformer = Pipeline(steps=[
    ('num', 'passthrough')])  # No transformation needed for numeric features

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
regression_model = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', RandomForestRegressor())])

In [None]:
regression_model.fit(X, Y)
accuracy = regression_model.score(X, Y)
print("Accuracy:", accuracy*100)

In [53]:
def get_user_input():
    name = input("Enter the name of the item: ")
    week = int(input("Enter the week: "))
    month = int(input("Enter the month: "))
    year = int(input("Enter the year: "))
    if month in range(3, 5):
        season = "First Inter-monsoon Season"
    elif month in range(5, 10):
        season = "South-West Monsoon Season"
    elif month in range(10, 12):
        season = "Second Inter-monsoon Season"
    else:
        season = "North-East Monsoon Season"
    return {"Name": name, 'Week': week, 'Month': month, 'Year': year, 'Season': season}


In [156]:
user_data = get_user_input()
user_df = pd.DataFrame([user_data])
predicted_price = regression_model.predict(user_df)

# Display prediction
print("Predicted price:", predicted_price[0])

ValueError: invalid literal for int() with base 10: ''

In [None]:
import seaborn as sns
data2 = df.copy()
data2.dropna()
sns.boxplot(data2["Price"])

In [None]:
import plotly.express as px
sns.relplot(data=data2, x="Name", y="Price", hue="Season", kind="line")

In [None]:
import matplotlib.pyplot as plt
dist = data2["Name"]
distset = set(dist)
dd = list(distset)
dictOfWords = { dd[i] : i for i in range(0, len(dd) ) }
data2["Name"] = data2["Name"].map(dictOfWords)

dist = (data2["Season"])
distset = set(dist)
dd = list(distset)
dictOfWords = { dd[i] : i for i in range(0, len(dd) ) }
data2["Season"] = data2["Season"].map(dictOfWords)

dataplot = sns.heatmap(data2.corr(), cmap="YlGnBu", annot=True)

plt.show()

# Test

In [5]:
import numpy as np

In [130]:
data2 = df.copy()
data2 = data2.dropna()

In [131]:
data2.shape

(37538, 6)

In [127]:
data2["Price"]

0        135.15
1        111.11
2        112.86
3         99.39
4        228.57
          ...  
37533    292.61
37534    462.17
37535    255.91
37536    641.50
37537    145.83
Name: Price, Length: 34314, dtype: float64

In [9]:
# IQR
Q1 = np.percentile(data2["Price"], 25, interpolation = 'midpoint')
Q3 = np.percentile(data2["Price"], 75, interpolation = 'midpoint')
IQR = Q3 - Q1

In [10]:
# Upper bound
upper = np.where(data2["Price"] >= (Q3+1.5*IQR))
# Lower bound
lower = np.where(data2["Price"] <= (Q1-1.5*IQR))
print(upper[0], lower[0])

[   46    53    55 ... 37523 37530 37531] []


In [11]:
data2.reset_index(inplace=True, drop=True)

# Removing the outliers
data2.drop(upper[0], inplace=True)
data2.drop(lower[0], inplace=True)

print("New Shape: ", data2.shape)

New Shape:  (34314, 6)


In [12]:
sns.boxplot(data2["Price"])

NameError: name 'sns' is not defined

In [133]:
df = data2.copy()


X_train, X_test, Y_train, Y_test = preprocess_input(df)

In [134]:
regression_model = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', RandomForestRegressor(bootstrap=True, criterion='poisson',max_depth=None, max_leaf_nodes=None,n_estimators=250, random_state=None, n_jobs=1, verbose=0))])
regression_model.fit(X_train, Y_train)

In [148]:
regression_model = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('regressor', CatBoostRegressor(iterations=10000, depth=10, learning_rate=0.25, loss_function='RMSE'))])
regression_model.fit(X_train, Y_train)

0:	learn: 569.8299688	total: 5.77ms	remaining: 57.7s
1:	learn: 500.3894673	total: 10.8ms	remaining: 53.9s
2:	learn: 454.0095637	total: 16.2ms	remaining: 53.9s
3:	learn: 418.3909229	total: 24.6ms	remaining: 1m 1s
4:	learn: 387.9796549	total: 30.1ms	remaining: 1m
5:	learn: 370.1727586	total: 36.1ms	remaining: 1m
6:	learn: 351.1860513	total: 40.9ms	remaining: 58.4s
7:	learn: 337.8302286	total: 46ms	remaining: 57.4s
8:	learn: 324.9272544	total: 52.1ms	remaining: 57.9s
9:	learn: 311.6422097	total: 56.7ms	remaining: 56.7s
10:	learn: 301.5007873	total: 60.8ms	remaining: 55.2s
11:	learn: 291.5428053	total: 65.7ms	remaining: 54.7s
12:	learn: 282.1855805	total: 71.1ms	remaining: 54.6s
13:	learn: 273.6251749	total: 76.5ms	remaining: 54.6s
14:	learn: 266.7678734	total: 81.5ms	remaining: 54.2s
15:	learn: 260.9865563	total: 86.8ms	remaining: 54.1s
16:	learn: 255.0557181	total: 92.5ms	remaining: 54.3s
17:	learn: 249.3029717	total: 98.8ms	remaining: 54.8s
18:	learn: 244.2355052	total: 104ms	remaining:

In [149]:
accuracy = regression_model.score(X_test, Y_test)
print("Accuracy:", accuracy)
Y_pred = regression_model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
print("MSE:", mse)
print("RMSE:", rmse)

Accuracy: 0.9921513709392458
MSE: 3173.2905797398057
RMSE: 56.33196765372044


In [150]:
regression_model.fit(X_test, Y_test)
accuracy = regression_model.score(X_test, Y_test)
print("Accuracy:", accuracy)
Y_pred = regression_model.predict(X_test)
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
print("MSE:", mse)
print("RMSE:", rmse)

0:	learn: 565.4100579	total: 4.88ms	remaining: 48.8s
1:	learn: 505.2610178	total: 9.02ms	remaining: 45.1s
2:	learn: 468.6931969	total: 13.7ms	remaining: 45.6s
3:	learn: 429.9706354	total: 18.9ms	remaining: 47.2s
4:	learn: 399.5483915	total: 23.7ms	remaining: 47.5s
5:	learn: 385.5083690	total: 28ms	remaining: 46.6s
6:	learn: 365.6488936	total: 32.5ms	remaining: 46.4s
7:	learn: 351.0667955	total: 37.4ms	remaining: 46.8s
8:	learn: 338.0370314	total: 41.8ms	remaining: 46.4s
9:	learn: 325.3375470	total: 46.4ms	remaining: 46.3s
10:	learn: 316.7814713	total: 51.7ms	remaining: 46.9s
11:	learn: 305.5024725	total: 56.8ms	remaining: 47.3s
12:	learn: 296.0931417	total: 62.3ms	remaining: 47.9s
13:	learn: 288.2631529	total: 66ms	remaining: 47.1s
14:	learn: 281.3053431	total: 70.2ms	remaining: 46.7s
15:	learn: 274.1374052	total: 74.7ms	remaining: 46.6s
16:	learn: 268.4157829	total: 77.9ms	remaining: 45.8s
17:	learn: 263.7299307	total: 81.8ms	remaining: 45.4s
18:	learn: 258.0404496	total: 86.5ms	remai

In [None]:
import pickle
# Save the model as a pickle file
filename = './models/test_model.sav'
pickle.dump(regression_model, open(filename, 'wb'))

In [132]:
def preprocess_input(df):
    df = df.copy()

    # Drop rows with missing values if any
    df.dropna(inplace=True)

    # Split df into X and y
    X = df.drop("Price", axis=1)
    Y = df["Price"]

    # Train-test split
    X_train, X_test, Y_train, Y_test =  train_test_split(X, Y, test_size=0.3, shuffle=True)

    return X_train, X_test, Y_train, Y_test

In [None]:
# Take a look at the class/breed distribution
ax=pd.value_counts(data2['Name'],ascending=True).plot(kind='barh',fontsize="10",title="Veges Distribution",figsize=(20,30),color=["red","green","blue","orange","purple","pink"])
ax.set(xlabel="counts", ylabel="Vegetables")
ax.xaxis.label.set_size(10)
ax.yaxis.label.set_size(10)
ax.title.set_size(10)
plt.show()

In [None]:
data2.head()