In [1]:
!pip install scikit-learn



In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [4]:
data = pd.read_csv('/content/drive/MyDrive/PROJECT/Dataset/train.csv')

In [5]:
data.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
0,1,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
1,2,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94
2,3,CA-2017-138688,12/06/2017,16/06/2017,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62
3,4,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775
4,5,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368


In [6]:
data['Order Date'] = pd.to_datetime(data['Order Date'], format="%d/%m/%Y")

In [7]:
def get_forecast(model_name):
  sales_by_category = data.groupby(["Category", data["Order Date"].dt.year])["Sales"].sum().reset_index()
  forcasted_sales = pd.DataFrame(columns=["Category", "Forcasted_sales_2019"])
  target_year = 2019
  result = []

  for category in sales_by_category["Category"].unique():
      category_data = sales_by_category[sales_by_category["Category"] == category]
      category_data["Order Date"] = pd.to_datetime(category_data["Order Date"])
      category_data["Year"] = category_data["Order Date"].dt.year

      train_data, valid_data = train_test_split(category_data, test_size=0.2, random_state=42, shuffle=False)

      features = ["Year"]
      target = "Sales"

      model = model_name
      model.fit(train_data[features], train_data[target])

      forecast_data = pd.DataFrame({
          "Year": [target_year]
      })
      forecast = model.predict(forecast_data)

      forcasted_sales = pd.concat([forcasted_sales, pd.DataFrame({
          "Category": category,
          "Forcasted_sales_2019": forecast
      })], ignore_index=True)

      true_values = valid_data[target]
      predicted_values = model.predict(valid_data[features])
      mae = mean_absolute_error(true_values, predicted_values)
      residuals = true_values - predicted_values
      result.append({
          "Category": category,
          'True Values': true_values.values,
          'Predicted Values': predicted_values,
          "MAE": mae,
          "Residuals": residuals.values  #Positive value means model underpredicted, Negative value means model overpredicted
      })
      for i in result:
          print(f"Category: {i['Category']}")
          print(f"True Values: {i['True Values']}")
          print(f"Predicted Values: {i['Predicted Values']}")
          print(f"MAE: {i['MAE']}")
          print(f"Residuals: {i['Residuals']}")
          print()

  forcasted_sales["Forcasted_sales_2019"] = forcasted_sales["Forcasted_sales_2019"].apply(lambda x: '{:,.2f}'.format(x))
  print("Forcasted Sales in 2019 for Each Product Category")
  print(forcasted_sales)

In [8]:
get_forecast(LinearRegression())

Category: Furniture
True Values: [212313.7872]
Predicted Values: [172114.9295]
MAE: 40198.85769999999
Residuals: [40198.8577]

Category: Furniture
True Values: [212313.7872]
Predicted Values: [172114.9295]
MAE: 40198.85769999999
Residuals: [40198.8577]

Category: Office Supplies
True Values: [240367.541]
Predicted Values: [155018.26433333]
MAE: 85349.27666666667
Residuals: [85349.27666667]

Category: Furniture
True Values: [212313.7872]
Predicted Values: [172114.9295]
MAE: 40198.85769999999
Residuals: [40198.8577]

Category: Office Supplies
True Values: [240367.541]
Predicted Values: [155018.26433333]
MAE: 85349.27666666667
Residuals: [85349.27666667]

Category: Technology
True Values: [269370.691]
Predicted Values: [186028.394]
MAE: 83342.29699999999
Residuals: [83342.297]

Forcasted Sales in 2019 for Each Product Category
          Category Forcasted_sales_2019
0        Furniture           172,114.93
1  Office Supplies           155,018.26
2       Technology           186,028.39


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Order Date"] = pd.to_datetime(category_data["Order Date"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Year"] = category_data["Order Date"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Order Date"] = pd.to_datetime(category_data["Order Date"

In [9]:
get_forecast(SVR())

Category: Furniture
True Values: [212313.7872]
Predicted Values: [164053.8674]
MAE: 48259.9198
Residuals: [48259.9198]

Category: Furniture
True Values: [212313.7872]
Predicted Values: [164053.8674]
MAE: 48259.9198
Residuals: [48259.9198]

Category: Office Supplies
True Values: [240367.541]
Predicted Values: [149512.82]
MAE: 90854.72099999999
Residuals: [90854.721]

Category: Furniture
True Values: [212313.7872]
Predicted Values: [164053.8674]
MAE: 48259.9198
Residuals: [48259.9198]

Category: Office Supplies
True Values: [240367.541]
Predicted Values: [149512.82]
MAE: 90854.72099999999
Residuals: [90854.721]

Category: Technology
True Values: [269370.691]
Predicted Values: [173865.507]
MAE: 95505.18399999998
Residuals: [95505.184]

Forcasted Sales in 2019 for Each Product Category
          Category Forcasted_sales_2019
0        Furniture           164,053.87
1  Office Supplies           149,512.82
2       Technology           173,865.51


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Order Date"] = pd.to_datetime(category_data["Order Date"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Year"] = category_data["Order Date"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Order Date"] = pd.to_datetime(category_data["Order Date"

In [10]:
get_forecast(RandomForestRegressor())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Order Date"] = pd.to_datetime(category_data["Order Date"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Year"] = category_data["Order Date"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Order Date"] = pd.to_datetime(category_data["Order Date"

Category: Furniture
True Values: [212313.7872]
Predicted Values: [169260.45522633]
MAE: 43053.33197366647
Residuals: [43053.33197367]

Category: Furniture
True Values: [212313.7872]
Predicted Values: [169260.45522633]
MAE: 43053.33197366647
Residuals: [43053.33197367]

Category: Office Supplies
True Values: [240367.541]
Predicted Values: [155729.28169667]
MAE: 84638.25930333324
Residuals: [84638.25930333]

Category: Furniture
True Values: [212313.7872]
Predicted Values: [169260.45522633]
MAE: 43053.33197366647
Residuals: [43053.33197367]

Category: Office Supplies
True Values: [240367.541]
Predicted Values: [155729.28169667]
MAE: 84638.25930333324
Residuals: [84638.25930333]

Category: Technology
True Values: [269370.691]
Predicted Values: [186271.65174]
MAE: 83099.03925999993
Residuals: [83099.03926]

Forcasted Sales in 2019 for Each Product Category
          Category Forcasted_sales_2019
0        Furniture           169,260.46
1  Office Supplies           155,729.28
2       Technolo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Order Date"] = pd.to_datetime(category_data["Order Date"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data["Year"] = category_data["Order Date"].dt.year
