In [16]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [17]:
data = pd.read_csv("./input/Vegetable_market.csv")
data

Unnamed: 0,Vegetable,Season,Month,Temp,Deasaster Happen in last 3month,Vegetable condition,Price per kg
0,potato,winter,jan,15,no,fresh,20
1,tomato,winter,jan,15,no,fresh,50
2,peas,winter,jan,15,no,fresh,70
3,pumkin,winter,jan,15,no,fresh,25
4,cucumber,winter,jan,15,no,fresh,20
...,...,...,...,...,...,...,...
116,brinjal,winter,jan,15,yes,fresh,33
117,ginger,winter,jan,15,no,fresh,88
118,potato,summer,apr,32,no,fresh,24
119,peas,summer,apr,33,no,fresh,33


In [18]:
data.info() gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 7 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Vegetable                        121 non-null    object
 1   Season                           121 non-null    object
 2   Month                            121 non-null    object
 3   Temp                             121 non-null    int64 
 4   Deasaster Happen in last 3month  121 non-null    object
 5   Vegetable condition              121 non-null    object
 6   Price per kg                     121 non-null    int64 
dtypes: int64(2), object(5)
memory usage: 6.7+ KB


In [19]:
{column: list(data[column].unique()) for column in data.select_dtypes("object").columns}

{'Vegetable': ['potato',
  'tomato ',
  'peas',
  'pumkin',
  'cucumber',
  'pointed grourd ',
  'Raddish',
  'Bitter gourd',
  'onion',
  'garlic',
  'cabage',
  'califlower',
  'chilly',
  'okra',
  'brinjal',
  'ginger',
  'radish'],
 'Season': ['winter', 'summer', 'monsoon', 'autumn', 'spring'],
 'Month': ['jan',
  'apr',
  'july',
  'sept',
  'oct',
  'dec',
  'may',
  'aug',
  'june',
  ' ',
  'march'],
 'Deasaster Happen in last 3month': ['no', 'yes'],
 'Vegetable condition': ['fresh', 'scrap', 'avarage', 'scarp']}

In [69]:
def onehot_encode(df, column):
    df = df.copy()

    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)

    return df

In [77]:
def preprocess_input(df):
    df = df.copy()

    # Clean vegetable condition column
    df["Vegetable condition"] = df["Vegetable condition"].replace({"scarp": "scrap"})

    # Binary Encoding
    df["Deasaster Happen in last 3month"] = df["Deasaster Happen in last 3month"].replace({"yes": 1, "no": 0})

    # Ordinal Encoding
    df["Month"] = df["Month"].replace({
        ' ' : np.NaN,
        'jan' : 1,
        'feb' : 2,
        'march' : 3,
        'apr' : 4,
        'may' : 5,
        'june' : 6,
        'july' : 7,
        'aug' : 8,
        'sept' : 9,
        'oct' : 10,
        'nov' : 11,
        'dec' : 12,
    })

    # Fill missing month values with column mode
    df["Month"] = df["Month"].fillna(df["Month"].mode()[0])

    # One-hot encode categorical columns
    for column in ['Vegetable', 'Season', 'Vegetable condition']:
        df = onehot_encode(df, column)

    # Split df into X and y
    Y = df["Price per kg"]
    X = df.drop("Price per kg", axis=1)

    # Train-test split
    X_train, X_test, Y_train, Y_test =  train_test_split(X, Y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, Y_train, Y_test

In [78]:
X_train, X_test, Y_train, Y_test = preprocess_input(data)

# Train

In [79]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "K Nearest Neighbors": KNeighborsRegressor(),
    "Support Vector Machine": SVR(),
    "Linear SVR": LinearSVR(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(),
    "LightGBM": LGBMRegressor(),
    "CatBoost": CatBoostRegressor(verbose=0)
}

In [80]:
for name, model in models.items():
    model.fit(X_train, Y_train)
    print(name + " trained.")

Linear Regression trained.
Ridge trained.
Lasso trained.
K Nearest Neighbors trained.
Support Vector Machine trained.
Linear SVR trained.
Decision Tree trained.
Random Forest trained.
Gradient Boosting trained.
XGBoost trained.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000636 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 30
[LightGBM] [Info] Number of data points in the train set: 84, number of used features: 6
[LightGBM] [Info] Start training from score 55.333333
LightGBM trained.
CatBoost trained.


In [81]:
for name, model in models.items():
    print(name + " R^2: {:.5f}".format(model.score(X_test, Y_test)))

Linear Regression R^2: 0.71175
Ridge R^2: 0.71197
Lasso R^2: 0.70461
K Nearest Neighbors R^2: 0.23805
Support Vector Machine R^2: -0.12694
Linear SVR R^2: 0.41847
Decision Tree R^2: 0.60808
Random Forest R^2: 0.64102
Gradient Boosting R^2: 0.62853
XGBoost R^2: 0.63068
LightGBM R^2: 0.15834
CatBoost R^2: 0.62010
