In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv('commodities_dataset.csv')

In [3]:
Date1 = []
Date2 = []

In [4]:
for i in range(len(df['Date'])):
    if str(df["Date"].iloc[i]).count("-") == 2:
        lis = str(df["Date"].iloc[i]).split("-")
        date = lis[2][:2]
        lis[2] = date
        lis = list(map(int, lis))
        lis = list([lis[1], lis[2], lis[0]])
        Date1.append(lis)
    else:
        lis = str(df["Date"].iloc[i]).split("/")
        lis = list(map(int, lis))
        Date2.append(lis)

In [5]:
Date = Date1+Date2
Date_df = pd.DataFrame(Date)
df2 = np.c_[df, Date_df]
df = pd.DataFrame(df2, columns=list(df.columns) + ['Month', 'Day', 'Year'])

In [6]:
le_commodity = LabelEncoder()

le_commodity.fit(df["Commodity"])
df["Commodity"] = le_commodity.transform(df["Commodity"])

In [7]:
le_unit = LabelEncoder()

le_unit.fit(df["Unit"])
df["Unit"] = le_unit.transform(df["Unit"])

In [8]:
X = df.drop(['SN', 'Minimum', 'Maximum', 'Average', 'Date', 'Day', 'Year'], axis=1)
y = df[['Minimum', 'Maximum', 'Average']]

y = df['Average']

In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
linreg = LinearRegression()
ridge = Ridge()
lasso = Lasso()
svr = LinearSVR()
knn = KNeighborsRegressor()
dt = DecisionTreeRegressor()
egb = xgb.XGBRegressor(objective='reg:linear', n_estimators=10)
ad = AdaBoostRegressor(base_estimator=dt, n_estimators=10)
br = BaggingRegressor(base_estimator=dt, n_estimators=10)
gbr = GradientBoostingRegressor(n_estimators=10, max_depth=1)
rf = RandomForestRegressor(n_estimators=10)
models = [('Linear Regression', linreg), ('Ridge Regression', ridge), ('Lasso Regression', lasso),
          ('Support Vector Machine', svr), ('K Nearest Neighbours', knn), ('Decision Tree', dt),
          ('Extreme Gradient Boosting', egb),
          ('Ada Boost', ad), ('Bagging Regressor', br), ('Gradient Boosting', gbr), ('Random Forest', rf)]

mean = {}

In [11]:
for model_name, model in models:
    model.fit(X, y)
    y_pred = model.predict(X)
    print('mean squared error for {} : {}'.format(model_name, mean_squared_error(y, y_pred)))
    mean[model_name]=mean_squared_error(y, y_pred)

mean squared error for Linear Regression : 6232.721649531979
mean squared error for Ridge Regression : 6232.721649549977
mean squared error for Lasso Regression : 6237.322373773822




mean squared error for Support Vector Machine : 6715.538270139696
mean squared error for K Nearest Neighbours : 2355.9587748084055
mean squared error for Decision Tree : 1441.4876847126766


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Month: object

In [12]:
X_test = np.array([["Banana", "Doz", 6]])

X_test[:, 0] = le_commodity.transform(X_test[:, 0])
X_test[:, 1] = le_unit.transform(X_test[:, 1])
X_test = X_test.astype(float)

In [13]:
# Random Forest
rf.fit(X, y)
y_pred_rf = rf.predict(X_test)
print("RF", y_pred_rf)

RF [85.44503579]




In [15]:
data = {"model": rf, "le_commodity": le_commodity, "le_unit": le_unit}
with open("../farmflow_ML.API/model.pkl", "wb") as file:
    pickle.dump(data, file)