# Load Libraries

In [145]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# Load Dataset

In [147]:
df = pd.read_csv("<DATA_SET_PATH>")

# Processing the Data

In [148]:
# Choosing required features and them to new datafram
df_model = df[['transaction_qty', 'unit_price', 'product_category', 'transaction_day', 'store_location', 'total_sales']]

In [149]:
df_model.head()

Unnamed: 0,transaction_qty,unit_price,product_category,transaction_day,store_location,total_sales
0,2,3.0,Coffee,1,Lower Manhattan,6.0
1,2,3.1,Tea,1,Lower Manhattan,6.2
2,2,4.5,Drinking Chocolate,1,Lower Manhattan,9.0
3,1,2.0,Coffee,1,Lower Manhattan,2.0
4,2,3.1,Tea,1,Lower Manhattan,6.2


## Convert Categorical Features to Numeric Features

In [150]:
# Mapping `product_category' to total_sales mean of each prodcut_category
category_means = df_model.groupby('product_category')['total_sales'].mean()
df_model['product_category'] = df_model['product_category'].map(category_means)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['product_category'] = df_model['product_category'].map(category_means)


In [151]:
# Define the mapping dictionary
location_mapping = {'Hell\'s Kitchen': 1, 'Astoria': 2, 'Lower Manhattan': 3}

# Map the values in the 'store_location' column using the mapping dictionary
df_model['store_location'] = df_model['store_location'].map(location_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_model['store_location'] = df_model['store_location'].map(location_mapping)


In [152]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149116 entries, 0 to 149115
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   transaction_qty   149116 non-null  int64  
 1   unit_price        149116 non-null  float64
 2   product_category  149116 non-null  float64
 3   transaction_day   149116 non-null  int64  
 4   store_location    149116 non-null  int64  
 5   total_sales       149116 non-null  float64
dtypes: float64(3), int64(3)
memory usage: 6.8 MB


## Scaling Features

__For feature scaling i have tried to test RobustScaler and StandardScaler. As per results , it seems that RobustScaler would have better affects on data and therefore better predictions__

In [None]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
for col in df_model.columns:
  df_model[col] = scaler.fit_transform(df_model[[col]])

In [154]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# for col in df_model.columns:
#   df_model[col] = scaler.fit_transform(df_model[[col]])

In [155]:
df_model.head()

Unnamed: 0,transaction_qty,unit_price,product_category,transaction_day,store_location,total_sales
0,1.0,0.0,1.0,-1.0,0.5,0.75
1,1.0,0.08,0.0,-1.0,0.5,0.816667
2,1.0,1.2,6.649413,-1.0,0.5,1.75
3,0.0,-0.8,1.0,-1.0,0.5,-0.583333
4,1.0,0.08,0.0,-1.0,0.5,0.816667


# Machine Learning Models

## 1. Linear Regression Model

In [156]:
# Linear regression model

X = df_model[['transaction_qty', 'unit_price', 'product_category', 'transaction_day', 'store_location']]
y = df_model['total_sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 0.07399622114798542
R-squared: 0.9271586107849344


## 2.1 Linear Regression Model - Gradient Descent
__Within this cell we have wrote GD Regression From Scratch__

In [157]:
class GradientDescentLinearRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.theta = None
        self.intercept = None

    def fit(self, X, y):
        m, n = X.shape
        self.theta = np.zeros(n)
        self.intercept = 0

        for _ in range(self.n_iterations):
            y_pred = np.dot(X, self.theta) + self.intercept
            error = y_pred - y
            gradient = np.dot(X.T, error) / m
            intercept_gradient = np.mean(error)
            self.theta -= self.learning_rate * gradient
            self.intercept -= self.learning_rate * intercept_gradient

    def predict(self, X):
        return np.dot(X, self.theta) + self.intercept


# Selecting X and y
X = df_model[['transaction_qty', 'unit_price', 'product_category', 'transaction_day', 'store_location']]
y = df_model['total_sales']

# Spliting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare data
X_train_np = X_train.values
y_train_np = y_train.values
X_test_np = X_test.values

# Instantiate and train the model
gd_lr = GradientDescentLinearRegression(learning_rate=0.01, n_iterations=1000)
gd_lr.fit(X_train_np, y_train_np)

# Make predictions
y_pred_gd = gd_lr.predict(X_test_np)

# Evaluation metrics
mse_gd = mean_squared_error(y_test, y_pred_gd)
r2_gd = r2_score(y_test, y_pred_gd)

print("Mean Squared Error (Gradient Descent):", mse_gd)
print("R-squared (Gradient Descent):", r2_gd)


Mean Squared Error (Gradient Descent): 0.06263058211964594
R-squared (Gradient Descent): 0.938346870445999


## 2.2 Linear Regression Model - Gradient Descent (Using SGDRegressor)

In [159]:
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create a pipeline with feature scaling (optional but recommended for gradient descent)
pipeline = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))

# Selection X and y
X = df_model[['transaction_qty', 'unit_price', 'product_category', 'transaction_day', 'store_location']]
y = df_model['total_sales']

# Spliting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred_sgd = pipeline.predict(X_test)

# Evaluation metrics
mse_sgd = mean_squared_error(y_test, y_pred_sgd)
r2_sgd = r2_score(y_test, y_pred_sgd)

print("Mean Squared Error (SGDRegressor):", mse_sgd)
print("R-squared (SGDRegressor):", r2_sgd)


Mean Squared Error (SGDRegressor): 0.05804683887766444
R-squared (SGDRegressor): 0.9428590768853435
