In [1]:
import pandas as pd
import numpy as np

# Build a fully synthetic vegetable-price table and persist it as CSV.
# Every column is drawn independently of the others, so the features carry
# no information about the price column.

# Seed the legacy global NumPy RNG; the draw order below is significant
# because each call consumes from the same random stream.
np.random.seed(42)

# Category labels sampled for the 'Vegetable Type' column
vegetables = ['Tomato', 'Potato', 'Onion', 'Brinjal', 'Cabbage', 'Carrot']

# Number of rows to generate
num_rows = 10000

# Columns are filled one at a time, in the order they appear in the CSV.
data = {}
data['Vegetable Type'] = np.random.choice(vegetables, size=num_rows)
data['Price (INR/kg)'] = np.random.randint(20, 60, size=num_rows)            # integers in [20, 60)
data['Temperature (°C)'] = np.random.uniform(25, 40, size=num_rows).round(2)
data['Rainfall (mm)'] = np.random.uniform(0, 20, size=num_rows).round(2)
data['Supply (tons)'] = np.random.randint(30, 100, size=num_rows)            # integers in [30, 100)
data['Demand (tons)'] = np.random.randint(30, 100, size=num_rows)            # integers in [30, 100)

# Assemble the table and write it without the index column
df = pd.DataFrame(data)
df.to_csv('vegetable_price_prediction_10000.csv', index=False)

In [15]:
# Load the generated CSV from the same relative path the generation cell
# writes to, instead of the Colab-only absolute '/content/...' location, so
# the notebook also runs when the working directory is not /content.
dataset = pd.read_csv('vegetable_price_prediction_10000.csv')

In [3]:
# Preview the first five rows of the freshly loaded table
dataset.head(n=5)

Unnamed: 0,Vegetable Type,Price (INR/kg),Temperature (°C),Rainfall (mm),Supply (tons),Demand (tons)
0,Brinjal,43,31.07,3.18,93,38
1,Cabbage,21,27.55,14.02,50,61
2,Onion,59,28.04,15.01,61,83
3,Cabbage,39,38.77,8.86,57,74
4,Cabbage,53,37.87,15.87,41,56


In [4]:
# Row count per vegetable type, most frequent first
dataset.value_counts(subset='Vegetable Type')

Unnamed: 0_level_0,count
Vegetable Type,Unnamed: 1_level_1
Potato,1692
Cabbage,1689
Brinjal,1672
Tomato,1665
Carrot,1657
Onion,1625


In [5]:
# Number of missing values in each column
dataset.isna().sum()

Unnamed: 0,0
Vegetable Type,0
Price (INR/kg),0
Temperature (°C),0
Rainfall (mm),0
Supply (tons),0
Demand (tons),0


In [16]:
# Encode each vegetable name as an integer code.
#
# The column is assigned back explicitly rather than calling
# `.replace(..., inplace=True)` on `dataset['Vegetable Type']`: that chained
# in-place form mutates a temporary object under pandas Copy-on-Write and is
# deprecated, so the frame could silently stay un-encoded.
vegetable_codes = {
    'Potato': 0,
    'Cabbage': 1,
    'Brinjal': 2,
    'Tomato': 3,
    'Carrot': 4,
    'Onion': 5,
}

# `.get(veg, veg)` passes already-encoded integers through unchanged, so
# re-running this cell is harmless. A label missing from the mapping makes
# `astype` raise instead of leaving a string in a numeric feature column.
dataset['Vegetable Type'] = (
    dataset['Vegetable Type']
    .map(lambda veg: vegetable_codes.get(veg, veg))
    .astype('int64')
)

In [7]:
# Show the first five rows again to check the current state of 'Vegetable Type'
dataset.head(n=5)

Unnamed: 0,Vegetable Type,Price (INR/kg),Temperature (°C),Rainfall (mm),Supply (tons),Demand (tons)
0,2,43,31.07,3.18,93,38
1,1,21,27.55,14.02,50,61
2,5,59,28.04,15.01,61,83
3,1,39,38.77,8.86,57,74
4,1,53,37.87,15.87,41,56


In [9]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [17]:
# Target: the price column. Features: every remaining column.
Y = dataset['Price (INR/kg)']
X = dataset.drop(columns=['Price (INR/kg)'])

In [18]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [13]:
# Shapes of the full, training and test feature matrices on one line
print(*(frame.shape for frame in (X, X_train, X_test)))

(10000, 5) (8000, 5) (2000, 5)


In [14]:
# Candidate regressors, keyed by a display name that is left-padded so the
# printed report lines up in a single column.
#
# Every estimator that accepts a seed gets the same random_state so that the
# fitted models, and therefore the scores reported for them, are reproducible
# from one run to the next.
RANDOM_STATE = 42

models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(random_state=RANDOM_STATE),
    "Support Vector Machine (Linear Kernel)": LinearSVR(random_state=RANDOM_STATE),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "                         Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "                               XGBoost": XGBRegressor(random_state=RANDOM_STATE),
    "                              LightGBM": LGBMRegressor(random_state=RANDOM_STATE),
    "                              CatBoost": CatBoostRegressor(verbose=0, random_state=RANDOM_STATE),
}

# Fit every model on the training split, reporting progress as each finishes.
for name, model in models.items():
    model.fit(X_train, Y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.




                        Neural Network trained.




Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000953 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 5
[LightGBM] [Info] Start training from score 39.329625
                              LightGBM trained.
                              CatBoost trained.


In [15]:
# Report each fitted model's R^2 on the held-out test split (5 decimals).
for name, model in models.items():
    test_r2 = model.score(X_test, Y_test)
    print(f"{name} R^2 Score: {test_r2:.5f}")

                     Linear Regression R^2 Score: 0.00021
 Linear Regression (L2 Regularization) R^2 Score: 0.00021
 Linear Regression (L1 Regularization) R^2 Score: -0.00014
                   K-Nearest Neighbors R^2 Score: -0.23770
                        Neural Network R^2 Score: -0.01922
Support Vector Machine (Linear Kernel) R^2 Score: -0.69591
   Support Vector Machine (RBF Kernel) R^2 Score: 0.00081
                         Decision Tree R^2 Score: -1.15213
                         Random Forest R^2 Score: -0.05720
                     Gradient Boosting R^2 Score: -0.00590
                               XGBoost R^2 Score: -0.13489
                              LightGBM R^2 Score: -0.03309
                              CatBoost R^2 Score: -0.05087


In [2]:
import pandas as pd

# Load the dataset from the relative path the generation cell writes to,
# rather than the Colab-specific absolute '/content/...' path, so the cell
# also works when the working directory is not /content.
df = pd.read_csv('vegetable_price_prediction_10000.csv')


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Turn the vegetable names into integer labels (overwrites the column in df)
label_encoder = LabelEncoder().fit(df['Vegetable Type'])
df['Vegetable Type'] = label_encoder.transform(df['Vegetable Type'])

# Target is the price column; features are all remaining columns
y = df['Price (INR/kg)']
X = df.drop(columns='Price (INR/kg)')

# 80/20 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [4]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed
model = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit on the training split; the fitted estimator is the cell's displayed value
model.fit(X=X_train, y=y_train)


In [5]:
from sklearn.metrics import r2_score, mean_absolute_error

# Predict prices for the held-out rows
y_pred = model.predict(X_test)

# Compute R² and mean absolute error against the true test prices
r2, mae = (
    score_fn(y_test, y_pred)
    for score_fn in (r2_score, mean_absolute_error)
)

print(f'R² Score: {r2:.5f}')
print(f'Mean Absolute Error: {mae:.5f}')


R² Score: -0.04866
Mean Absolute Error: 9.69601


In [6]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: 3 x 4 x 3 = 36 combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Base estimator that the search clones for every candidate
model = RandomForestRegressor(random_state=42)

# Exhaustive search scored by R² with 3-fold cross-validation on all cores
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated R²
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best R² score: {grid_search.best_score_:.5f}')


Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
Best R² score: -0.00550


In [7]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor with default hyper-parameters and a fixed seed
gb_model = GradientBoostingRegressor(random_state=42)

# Fit on the training split, then predict the held-out rows
gb_y_pred = gb_model.fit(X_train, y_train).predict(X_test)

# Score the predictions against the true test prices
gb_r2 = r2_score(y_true=y_test, y_pred=gb_y_pred)
gb_mae = mean_absolute_error(y_true=y_test, y_pred=gb_y_pred)

print(f'Gradient Boosting R² Score: {gb_r2:.5f}')
print(f'Gradient Boosting Mean Absolute Error: {gb_mae:.5f}')


Gradient Boosting R² Score: -0.00447
Gradient Boosting Mean Absolute Error: 9.58388


In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_absolute_error, r2_score

# Train and evaluate a small fully connected network on the price data.
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs.
set_random_seed(42)

# NOTE(review): `df` is taken from the kernel state; this cell assumes its
# 'Vegetable Type' column has already been label-encoded by an earlier cell,
# because StandardScaler below only accepts numeric input.

# Prepare features and target variable
X = df.drop('Price (INR/kg)', axis=1)
y = df['Price (INR/kg)']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: fit on the training split only, then apply the same
# transform to the test split so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the ANN model. The input width is declared with an explicit Input
# object; passing `input_dim` to the first Dense layer is deprecated and
# emits a warning on current Keras.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),   # first hidden layer
    Dense(32, activation='relu'),   # second hidden layer
    Dense(1),                       # single linear output: predicted price
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model; 20% of the training rows are held back for validation
history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the model. predict() returns an (n, 1) array; flatten it so the
# predictions have the same 1-D shape as y_test.
y_pred = model.predict(X_test_scaled).ravel()
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'ANN R² Score: {r2:.5f}')
print(f'ANN Mean Absolute Error: {mae:.5f}')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 1326.5811 - val_loss: 170.8566
Epoch 2/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 156.7110 - val_loss: 162.6690
Epoch 3/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 153.2190 - val_loss: 158.3967
Epoch 4/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 149.6491 - val_loss: 154.3883
Epoch 5/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 145.4538 - val_loss: 150.1488
Epoch 6/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 142.3876 - val_loss: 147.9982
Epoch 7/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 139.4872 - val_loss: 145.8152
Epoch 8/100
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 137.0892 - val_loss: 144.1252
Epoch 9/100

In [11]:
# Support vector regressor; 'rbf' is the kernel SVR() uses by default,
# spelled out here for the reader.
model1 = SVR(kernel='rbf')

In [19]:
# Fit the SVR on the training split (X_train / Y_train come from kernel state)
model1.fit(X=X_train, y=Y_train)

In [20]:
# Predict on the rows the SVR was trained on
training_data_prediction = model1.predict(X_train)

# score1 is the R² score (higher is better, despite the "error" wording in
# the printed label); score2 is the mean absolute error.
score1 = metrics.r2_score(y_true=Y_train, y_pred=training_data_prediction)
score2 = metrics.mean_absolute_error(y_true=Y_train, y_pred=training_data_prediction)

print("R square error: ", score1)
print("Mean Absolute Error: ", score2)

R square error:  0.0009315264427357528
Mean Absolute Error:  9.916665749220316


In [21]:
# Predict on the held-out test rows
testing_data_prediction = model1.predict(X_test)

# score3 is the R² score (higher is better, despite the "error" wording in
# the printed label); score4 is the mean absolute error.
score3 = metrics.r2_score(y_true=Y_test, y_pred=testing_data_prediction)
score4 = metrics.mean_absolute_error(y_true=Y_test, y_pred=testing_data_prediction)

print("R square error: ", score3)
print("Mean Absolute Error: ", score4)

R square error:  0.0008104993740475352
Mean Absolute Error:  9.566763885934554


In [23]:
import pickle
# Serialise the fitted SVR to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises; the bare open() call
# used previously was never closed.
# NOTE: only unpickle this file from a trusted source — pickle.load can
# execute arbitrary code.
with open('/content/Vegetable_prices.pkl', 'wb') as model_file:
    pickle.dump(model1, model_file)