In [1]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

import sys
import os
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path: sys.path.insert(0, project_root)
requirements_path = os.path.join(project_root, 'requirements.txt')

In [2]:
import os
import torch
import numpy as np
import pandas as pd

import src.data_handling as data_handling
import src.model.torch_model as t


# paths
PRODUCTION_MODEL_FOLDER_PATH = 'models/production'
DFN_FILE_PATH = os.path.join(PRODUCTION_MODEL_FOLDER_PATH, 'dfn_best.pth')
GBM_FILE_PATH =  os.path.join(PRODUCTION_MODEL_FOLDER_PATH, 'gbm_best.pth')
EN_FILE_PATH = os.path.join(PRODUCTION_MODEL_FOLDER_PATH, 'en_best.pth')

PREPROCESSOR_PATH = 'preprocessors/column_transformer.pkl'

file_name = 'online_retail.csv'
file_path = os.path.join(project_root, 'data', 'raw', file_name)
df = pd.read_csv(file_path)
df = data_handling.scripts.sanitize_column_names(df=df)
df.head()


  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
# feature engineering + imputation
df = data_handling.scripts.structure_missing_values(df=df)
df = data_handling.scripts.handle_feature_engineering(df=df)
df.head()

Unnamed: 0,invoiceno,stockcode,quantity,invoicedate,unitprice,customerid,country,year,year_month,day_of_week,product_avg_quantity_last_month,is_registered,customer_recency_days,customer_total_spend_ltm,customer_freq_ltm,is_return
0,536365,85123A,1.94591,1291192000.0,0.85,17850.0,United Kingdom,2010,12,Wed,3168.0,1,91.0,0.0,0.0,0
1,536365,71053,1.94591,1291192000.0,1.66,17850.0,United Kingdom,2010,12,Wed,157.75,1,91.0,0.0,0.0,0
2,536365,84406B,2.197225,1291192000.0,0.85,17850.0,United Kingdom,2010,12,Wed,115.75,1,41.0,0.0,0.0,0
3,536365,84029G,1.94591,1291192000.0,0.85,17850.0,Switzerland,2010,12,Wed,275.75,1,91.0,0.0,0.0,0
4,536365,84029E,1.94591,1291192000.0,0.85,17850.0,France,2010,12,Wed,354.833333,1,91.0,0.0,0.0,0


In [4]:
df.columns

Index(['invoiceno', 'stockcode', 'quantity', 'invoicedate', 'unitprice',
       'customerid', 'country', 'year', 'year_month', 'day_of_week',
       'product_avg_quantity_last_month', 'is_registered',
       'customer_recency_days', 'customer_total_spend_ltm',
       'customer_freq_ltm', 'is_return'],
      dtype='object')

In [5]:
from sklearn.model_selection import train_test_split

# classify num and cat columns
target_col = 'quantity'
num_cols, cat_cols = data_handling.scripts.categorize_num_cat_cols(df=df, target_col=target_col)
if cat_cols: 
    for col in cat_cols: df[col] = df[col].astype('string')


# creates train, val, test datasets
y = df[target_col]
X = df.copy().drop(target_col, axis='columns')
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 15 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   invoiceno                        541909 non-null  string 
 1   stockcode                        541909 non-null  string 
 2   invoicedate                      541909 non-null  float64
 3   unitprice                        541909 non-null  float64
 4   customerid                       541909 non-null  string 
 5   country                          541909 non-null  string 
 6   year                             541909 non-null  string 
 7   year_month                       541909 non-null  string 
 8   day_of_week                      541909 non-null  string 
 9   product_avg_quantity_last_month  541909 non-null  float64
 10  is_registered                    541909 non-null  string 
 11  customer_recency_days            541909 non-null  float64
 12  cu

In [6]:
test_size, random_state = 50000, 42
X_tv, X_test, y_tv, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_tv, y_tv, test_size=test_size, random_state=random_state, shuffle=True)
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Index: 441909 entries, 325043 to 246249
Data columns (total 15 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   invoiceno                        441909 non-null  string 
 1   stockcode                        441909 non-null  string 
 2   invoicedate                      441909 non-null  float64
 3   unitprice                        441909 non-null  float64
 4   customerid                       441909 non-null  string 
 5   country                          441909 non-null  string 
 6   year                             441909 non-null  string 
 7   year_month                       441909 non-null  string 
 8   day_of_week                      441909 non-null  string 
 9   product_avg_quantity_last_month  441909 non-null  float64
 10  is_registered                    441909 non-null  string 
 11  customer_recency_days            441909 non-null  float64
 12  cu

In [7]:
X_train, X_val, X_test, preprocessor = data_handling.scripts.transform_input(X_train, X_val, X_test, num_cols=num_cols, cat_cols=cat_cols)


2025-08-11 20:49:08,284 - root - INFO - transformed input datasets: X_train: (441909, 65), X_val: (50000, 65), X_test: (50000, 65)


In [8]:
import src.model.torch_model as t

file_path = os.path.join(project_root, 'models', 'production', 'dfn_best.pth')
model = t.scripts.load_model(input_dim=X_train.shape[1], file_path=file_path)


In [9]:

stockcode = '85123A'

file_name = 'online_retail.csv'
file_path = os.path.join(project_root, 'data', 'raw', file_name)
df = pd.read_csv(file_path)
df = data_handling.scripts.sanitize_column_names(df=df)

df_stockcode = df[df['stockcode'] == stockcode]
print(df_stockcode['quantity'].unique())

df_stockcode = data_handling.scripts.structure_missing_values(df=df_stockcode)
df_stockcode = data_handling.scripts.handle_feature_engineering(df=df_stockcode)

print(df_stockcode['quantity'].unique())


[    6    64    32     4     8     3   128     9    12     1   160     2
     5     7    24    14    -1    19    10    27    13    96   500  -500
   -24   250    15  1010  1930    20    62    18    22   256    -3    48
    33   150    16    36    11    17    -8    81   100    72    -6    25
 -1930    29   192    23   -12    21   608   320  4000   -18   512   480
   400   224    30    60   992  -256   300]
[1.94591015 4.17438727 3.49650756 1.60943791 2.19722458 1.38629436
 4.8598124  2.30258509 2.56494936 0.69314718 5.08140436 1.09861229
 1.79175947 2.07944154 3.21887582 2.7080502  0.         2.99573227
 2.39789527 3.33220451 2.63905733 4.57471098 6.2166061  5.52545294
 2.77258872 6.91869522 7.56579328 3.04452244 4.14313473 2.94443898
 3.13549422 5.54907608 3.8918203  3.52636052 5.01727984 2.83321334
 3.61091791 2.48490665 2.89037176 4.40671925 4.61512052 4.29045944
 3.25809654 3.40119738 5.26269019 3.17805383 3.09104245 6.41181827
 5.77144112 8.29429961 6.24027585 6.17586727 5.99396143

In [10]:

X = df_stockcode.copy().drop(columns=target_col)
y = df_stockcode.copy()[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000, random_state=random_state, shuffle=True)

X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

batch_size = 32
train_data_loader = t.scripts.create_torch_data_loader(X=X_train, y=y_train, batch_size=batch_size)
val_data_loader = t.scripts.create_torch_data_loader(X=X_val, y=y_val, batch_size=batch_size)
# retrain the best model
model, _ = t.scripts.train_model(
    model=model,
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001),
    criterion=torch.nn.MSELoss(),
    num_epochs=50,
    min_delta=0.00001,
    patience=10,
    train_data_loader=train_data_loader,
    val_data_loader=val_data_loader,
    device_type='cpu'
)

2025-08-11 20:49:09,559 - root - INFO - ... starts epoch 1 ...
2025-08-11 20:49:09,744 - root - INFO - ... starts epoch 2 ...
2025-08-11 20:49:09,903 - root - INFO - ... starts epoch 3 ...
2025-08-11 20:49:10,062 - root - INFO - ... starts epoch 4 ...
2025-08-11 20:49:10,223 - root - INFO - ... starts epoch 5 ...
2025-08-11 20:49:10,380 - root - INFO - ... starts epoch 6 ...
2025-08-11 20:49:10,540 - root - INFO - ... starts epoch 7 ...
2025-08-11 20:49:10,696 - root - INFO - ... starts epoch 8 ...
2025-08-11 20:49:10,851 - root - INFO - ... starts epoch 9 ...
2025-08-11 20:49:11,011 - root - INFO - ... starts epoch 10 ...
2025-08-11 20:49:11,034 - root - INFO - epoch [10/50], loss: 1.3289
2025-08-11 20:49:11,169 - root - INFO - ... starts epoch 11 ...
2025-08-11 20:49:11,332 - root - INFO - early stopping at epoch 11 as validation loss did not improve for 10 epochs.


In [11]:
from src._utils import main_logger
import pandas as pd
import datetime

import src.model.torch_model as t

file_path = os.path.join(project_root, 'models', 'production', 'dfn_best.pth')
model = t.scripts.load_model(input_dim=X_train.shape[1], file_path=file_path)

min_price = 2
max_price = 6
NUM_PRICE_BINS = 1000
price_range = np.linspace(min_price, max_price, num=1000)
print(len(price_range))

# impute input data
new_data = {
    'invoicedate': [np.datetime64(datetime.datetime.now())] * NUM_PRICE_BINS,
    'invoiceno': [np.nan] * NUM_PRICE_BINS,
    'stockcode': [stockcode] * NUM_PRICE_BINS,
    'quantity': [0] * NUM_PRICE_BINS,
    'customerid': [np.nan] * NUM_PRICE_BINS,
    'country': ['United Kingdom'] * NUM_PRICE_BINS,
    'unitprice': price_range
}
new_df = pd.DataFrame(new_data)
new_df = data_handling.scripts.structure_missing_values(df=new_df)
new_df = data_handling.scripts.handle_feature_engineering(df=new_df)


# transform input data
target_col = 'quantity'
X = new_df.copy().drop(target_col, axis=1)
X = X.sample(frac=1).reset_index(drop=True)
# X = X.tail(NUM_PRICE_BINS)
print(X)


1000
      invoicedate invoiceno stockcode customerid         country  unitprice  \
0    1.754945e+09   unknown    85123A    unknown  United Kingdom   5.503504   
1    1.754945e+09   unknown    85123A    unknown  United Kingdom   4.502503   
2    1.754945e+09   unknown    85123A    unknown  United Kingdom   5.659660   
3    1.754945e+09   unknown    85123A    unknown  United Kingdom   5.531532   
4    1.754945e+09   unknown    85123A    unknown  United Kingdom   4.582583   
..            ...       ...       ...        ...             ...        ...   
995  1.754945e+09   unknown    85123A    unknown  United Kingdom   3.393393   
996  1.754945e+09   unknown    85123A    unknown  United Kingdom   3.561562   
997  1.754945e+09   unknown    85123A    unknown  United Kingdom   3.581582   
998  1.754945e+09   unknown    85123A    unknown  United Kingdom   5.971972   
999  1.754945e+09   unknown    85123A    unknown  United Kingdom   3.477477   

     year  year_month day_of_week  product_avg

In [12]:

if preprocessor: X = preprocessor.transform(X)


model.eval()
input_tensor = torch.tensor(X, dtype=torch.float32)
epsilon = 1e-10
with torch.inference_mode():
    y_pred = model(input_tensor)
    y_pred = y_pred.cpu().numpy().flatten()
    y_pred_actual = np.exp(y_pred + epsilon)
    main_logger.info(f"primary model's prediction for stockcode {stockcode} - actual quantity (units) {y_pred_actual}")



2025-08-11 20:49:11,379 - root - INFO - primary model's prediction for stockcode 85123A - actual quantity (units) [0.0048312  0.00485817 0.00482703 0.00483044 0.00485599 0.00485007
 0.00490585 0.00483295 0.00486519 0.00483454 0.00489224 0.00485828
 0.00489397 0.00483842 0.00485925 0.00490214 0.00482499 0.00486747
 0.00488277 0.00482401 0.00489169 0.00484941 0.00485709 0.004848
 0.00486758 0.00482553 0.00483972 0.00491045 0.00482413 0.00491066
 0.00489191 0.00483746 0.00485341 0.00488808 0.00486682 0.00483099
 0.00487115 0.00486367 0.00488059 0.00485967 0.00484262 0.00492578
 0.00492521 0.00482314 0.00490575 0.00486097 0.00491624 0.00483918
 0.00486294 0.00492357 0.00487951 0.00487461 0.00488886 0.00490543
 0.0048311  0.00492215 0.00483057 0.00483552 0.00483658 0.0048562
 0.00482982 0.00489082 0.0048519  0.00491514 0.00488645 0.00489202
 0.00484381 0.00490018 0.0048706  0.00482939 0.0048238  0.00487071
 0.00491712 0.00484455 0.00486617 0.00485243 0.00489975 0.00490291
 0.00487191 0.0049

In [13]:
df_ = new_df.copy()
df_['quantity'] = y_pred_actual
df_ = df_.sort_values(by='unitprice')

optimal_row = df_.loc[df_['quantity'].idxmax()]

optimal_price = optimal_row['unitprice']
best_sales = optimal_row['quantity'] * optimal_price

all_outputs = []
for _, row in df_.iterrows():
    current_output = {
        "stockcode": stockcode,
        "unit_price": float(row['unitprice']),
        "predicted_sales": float(row['quantity'] * row['unitprice']) * 30,
        "optimal_unit_price": float(optimal_price), # type: ignore
        "max_predicted_sales": float(best_sales) * 30, # type: ignore
    }
    all_outputs.append(current_output)

    # print(float(row['quantity'] * row['unitprice']))

print(optimal_price)

3.2612612612612613
