In [7]:
# Import required libraries
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.linear_model import LinearRegression

In [10]:
# Read the sales data
file_path = Path("Resources/output.csv")
df_sales = pd.read_csv(file_path)

# Display sample data
df_sales.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Unit Price,Price without discount,Unit Cost,Order Year
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0%,41.9136,130.98,130.98,20.9568,2016
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0%,219.582,243.98,243.98,73.194,2016
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0%,6.8714,7.31,7.31,3.4357,2016
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,45%,-383.031,191.5155,348.21,80.0883,2015
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Storage,Eldon Fold 'N Roll Cart System,22.368,2,20%,2.5164,11.184,13.98,4.0542,2015


In [13]:
# Create the X set by using the `reshape` function to format the ads data as a single column array.
X = df_sales["Sales"].values.reshape(-1, 1)

# Display sample data
X[:5]

array([[261.96  ],
       [731.94  ],
       [ 14.62  ],
       [957.5775],
       [ 22.368 ]])

In [15]:
# Create an array for the dependent variable y with the sales data
y = df_sales["Sales"]

In [16]:
# Create a model with scikit-learn
model = LinearRegression()

In [17]:
# Fit the data into the model
model.fit(X, y)

In [18]:
# Display the slope
print(f"Model's slope: {model.coef_}")

Model's slope: [1.]


In [19]:
# Display the y-intercept
print(f"Model's y-intercept: {model.intercept_}")

Model's y-intercept: -2.5579538487363607e-13


In [20]:
# Display the model's best fit line formula
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X")

Model's formula: y = -2.5579538487363607e-13 + 1.000000000000001X


In [21]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [22]:
# Create a copy of the original data
df_sales_predicted = df_sales.copy()

# Add a column with the predicted sales values
df_sales_predicted["sales_predicted"] = predicted_y_values

# Display sample data
df_sales_predicted.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Product Name,Sales,Quantity,Discount,Profit,Unit Price,Price without discount,Unit Cost,Order Year,sales_predicted
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,Bush Somerset Collection Bookcase,261.96,2,0%,41.9136,130.98,130.98,20.9568,2016,261.96
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0%,219.582,243.98,243.98,73.194,2016,731.94
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0%,6.8714,7.31,7.31,3.4357,2016,14.62
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,45%,-383.031,191.5155,348.21,80.0883,2015,957.5775
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,Eldon Fold 'N Roll Cart System,22.368,2,20%,2.5164,11.184,13.98,4.0542,2015,22.368


In [25]:
# Create a line plot of ads versus the predicted sales values
best_fit_line = df_sales_predicted.hvplot.line(
    x = "Order Year",
    y = "sales_predicted",
    color = "red"
)
best_fit_line

In [26]:
# Superpose the original data and the best fit line
sales_plot * best_fit_line