In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import deque
from tqdm import tqdm

# 1. 데이터 불러오기

In [60]:
# Load the three input tables from CSV files under ../data.
# purchases_vendor is read from the "preprocessed" subfolder.
sales = pd.read_csv('../data/sales.csv')
purchases_vendor = pd.read_csv('../data/preprocessed/purchases_vendor.csv')
inventory = pd.read_csv('../data/inventory.csv')


In [61]:
# Inspect column names, dtypes and memory usage of the freshly loaded sales table.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12825363 entries, 0 to 12825362
Data columns (total 19 columns):
 #   Column          Dtype  
---  ------          -----  
 0   inventory_id    object 
 1   store_id        int64  
 2   brand_id        int64  
 3   item            object 
 4   size            object 
 5   sales_quantity  int64  
 6   sales_dollars   float64
 7   sales_price     float64
 8   sales_date      object 
 9   item_volume     float64
 10  classification  int64  
 11  excise_tax      float64
 12  vendor_id       int64  
 13  vendor          object 
 14  month           int64  
 15  dayofweek       int64  
 16  volume          float64
 17  purchase_price  float64
 18  retail_price    float64
dtypes: float64(7), int64(7), object(5)
memory usage: 1.8+ GB


In [62]:
# Split inventory_id on "_" and use the first three parts as store_id, city and brand_id.
if 'inventory_id' in sales.columns:
    id_parts = sales['inventory_id'].str.strip().str.split('_', expand=True)
    # Only overwrite the columns when the split produced at least three parts.
    if id_parts.shape[1] >= 3:
        sales['store_id'] = id_parts[0].str.strip()
        sales['city'] = id_parts[1].str.strip()
        sales['brand_id'] = id_parts[2].str.strip()
        # Convert the ID columns to numbers when the whole column parses;
        # otherwise keep the stripped strings.
        # pd.to_numeric(errors='ignore') is deprecated, so the same fallback
        # is expressed with an explicit try/except.
        for col in ['store_id', 'brand_id']:
            try:
                sales[col] = pd.to_numeric(sales[col])
            except (ValueError, TypeError):
                pass

In [63]:
# Count missing values per column (includes the newly added city column).
sales.isnull().sum()

inventory_id      0
store_id          0
brand_id          0
item              0
size              0
sales_quantity    0
sales_dollars     0
sales_price       0
sales_date        0
item_volume       0
classification    0
excise_tax        0
vendor_id         0
vendor            0
month             0
dayofweek         0
volume            0
purchase_price    0
retail_price      0
city              0
dtype: int64

In [64]:
# List the distinct city tokens parsed out of inventory_id.
sales['city'].unique()

array(['SPARROW', 'PIGEON', 'FINCH', 'ORIOLE', 'KINGFISHER',
       'MOCKINGBIRD', 'PELICAN', 'EAGLE', 'HAWK', 'OWL', 'FALCON',
       'ROBIN', 'VULTURE', 'CONDOR', 'DUCK', 'SWAN', 'GOOSE', 'HERON',
       'ALBATROSS', 'PENGUIN', 'STORK', 'CRANE', 'FLAMINGO', 'PUFFIN',
       'TERN', 'SANDPIPER', 'EGRET', 'SWALLOW', 'PARROT', 'CROW', 'MACAW',
       'COCKATOO', 'TOUCAN', 'HUMMINGBIRD', 'NIGHTINGALE', 'CANARY',
       'LOVEBIRD', 'WOODPECKER', 'TURKEY', 'SEAGULL', 'CHICKEN',
       'PHEASANT', 'GROUSE', 'PARTRIDGE', 'CUCKOO', 'DOVE', 'THRUSH',
       'FLICKER', 'JAY', 'CARDINAL', 'SHRIKE', 'OSTRICH', 'EMU', 'KIWI',
       'CASSOWARY', 'COOT', 'MAGPIE', 'LOON', 'HORNBILL', 'HOOPOE',
       'KITE', 'DODO', 'IBIS', 'GANNET', 'AUK', 'STARLING', 'BLUEBIRD',
       '', 'SPOONBILL'], dtype=object)

In [65]:
# Which stores have rows whose parsed city token is an empty string?
sales[sales['city'] == '']['store_id'].unique()

array([46, 81])

In [66]:
# Show the sales rows of store 46.
sales[sales['store_id']==46]

Unnamed: 0,inventory_id,store_id,brand_id,item,size,sales_quantity,sales_dollars,sales_price,sales_date,item_volume,classification,excise_tax,vendor_id,vendor,month,dayofweek,volume,purchase_price,retail_price,city
411078,46_CANARY_10058,46,10058,F Coppola Dmd Ivry Cab Svgn,750mL,1,14.99,14.99,2024-01-16,750.0,2,0.11,2000,SOUTHERN WINE & SPIRITS NE,1,1,750.0,9.26,13.99,CANARY
411079,46_CANARY_10058,46,10058,F Coppola Dmd Ivry Cab Svgn,750mL,1,14.99,14.99,2024-01-31,750.0,2,0.11,2000,SOUTHERN WINE & SPIRITS NE,1,2,750.0,9.26,13.99,CANARY
411080,46_CANARY_10227,46,10227,Due Torri Pnt Nr del Venezie,750mL,3,29.97,9.99,2024-01-12,750.0,2,0.34,9165,ULTRA BEVERAGE COMPANY LLP,1,4,750.0,6.16,9.99,CANARY
411081,46_CANARY_10227,46,10227,Due Torri Pnt Nr del Venezie,750mL,2,19.98,9.99,2024-01-23,750.0,2,0.22,9165,ULTRA BEVERAGE COMPANY LLP,1,1,750.0,6.16,9.99,CANARY
411082,46_CANARY_1023,46,1023,Hennessy VSOP Cognac + 50mL,750mL,1,51.99,51.99,2024-01-25,750.0,1,0.79,8112,MOET HENNESSY USA INC,1,3,750.0,35.93,45.99,CANARY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6356128,46_CANARY_8166,46,8166,Mount Gay Eclipse,750mL,6,119.94,19.99,2024-07-27,750.0,1,4.72,7239,REMY COINTREAU USA INC,7,5,750.0,15.14,19.99,CANARY
6356129,46_CANARY_8288,46,8288,Korbel Brut,750mL,1,11.99,11.99,2024-07-07,750.0,2,0.11,1128,BROWN-FORMAN CORP,7,6,750.0,7.94,11.99,CANARY
6356130,46_CANARY_8323,46,8323,Jose Cuervo Gold w/Marg Mix,750mL,6,89.94,14.99,2024-07-27,750.0,1,4.72,7245,PROXIMO SPIRITS INC.,7,5,750.0,11.90,14.99,CANARY
6356131,46_CANARY_8339,46,8339,The Glenlivet 12 Yr Single,750mL,2,87.98,43.99,2024-07-07,750.0,1,1.57,17035,PERNOD RICARD USA,7,6,750.0,30.21,42.99,CANARY


In [67]:
# Show the sales rows of store 81.
sales[sales['store_id']==81]

Unnamed: 0,inventory_id,store_id,brand_id,item,size,sales_quantity,sales_dollars,sales_price,sales_date,item_volume,classification,excise_tax,vendor_id,vendor,month,dayofweek,volume,purchase_price,retail_price,city
8169945,81__11321,81,11321,BV GDL Private Rsv Cab Svgn,750mL,3,359.97,119.99,2024-08-30,750.0,2,0.34,1590,DIAGEO CHATEAU ESTATE WINES,8,4,750.0,77.88,114.49,
8169946,81__11774,81,11774,Diamond Crk Cab Svgn Volcanc,750mL,2,449.98,224.99,2024-08-30,750.0,2,0.22,4425,MARTIGNETTI COMPANIES,8,4,750.0,148.02,224.99,
8169947,81__11775,81,11775,Diamond Creek Red Rock Cab S,750mL,2,449.98,224.99,2024-08-30,750.0,2,0.22,4425,MARTIGNETTI COMPANIES,8,4,750.0,149.99,224.99,
8169948,81__11776,81,11776,Diamond Crk Gravelly Cab Svg,750mL,3,674.97,224.99,2024-08-30,750.0,2,0.34,4425,MARTIGNETTI COMPANIES,8,4,750.0,154.10,224.99,
8169949,81__15971,81,15971,Revana Cab Svgn Napa Vly,750mL,3,329.97,109.99,2024-08-30,750.0,2,0.34,9165,ULTRA BEVERAGE COMPANY LLP,8,4,750.0,71.89,109.99,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12789939,81_SPOONBILL_984,81,984,Patron XO Cafe Liqueur,750mL,2,51.98,25.99,2024-12-22,750.0,1,1.57,9165,ULTRA BEVERAGE COMPANY LLP,12,6,750.0,20.63,25.99,SPOONBILL
12789940,81_SPOONBILL_984,81,984,Patron XO Cafe Liqueur,750mL,2,51.98,25.99,2024-12-23,750.0,1,1.57,9165,ULTRA BEVERAGE COMPANY LLP,12,0,750.0,20.63,25.99,SPOONBILL
12789941,81_SPOONBILL_984,81,984,Patron XO Cafe Liqueur,750mL,1,25.99,25.99,2024-12-24,750.0,1,0.79,9165,ULTRA BEVERAGE COMPANY LLP,12,1,750.0,20.63,25.99,SPOONBILL
12789942,81_SPOONBILL_984,81,984,Patron XO Cafe Liqueur,750mL,1,25.99,25.99,2024-12-26,750.0,1,0.79,9165,ULTRA BEVERAGE COMPANY LLP,12,3,750.0,20.63,25.99,SPOONBILL


In [68]:
# Remove these columns from the sales frame, then display the result.
sales_cols_to_drop = [
    'store_id', 'brand_id', 'item', 'size',
    'item_volume', 'classification', 'vendor',
]
sales = sales.drop(columns=sales_cols_to_drop)
sales

Unnamed: 0,inventory_id,sales_quantity,sales_dollars,sales_price,sales_date,excise_tax,vendor_id,month,dayofweek,volume,purchase_price,retail_price,city
0,1_SPARROW_1004,1,16.49,16.49,2024-01-01,0.79,12546,1,0,750.0,10.65,16.49,SPARROW
1,1_SPARROW_1004,2,32.98,16.49,2024-01-02,1.57,12546,1,1,750.0,10.65,16.49,SPARROW
2,1_SPARROW_1004,1,16.49,16.49,2024-01-03,0.79,12546,1,2,750.0,10.65,16.49,SPARROW
3,1_SPARROW_1004,1,14.49,14.49,2024-01-08,0.79,12546,1,0,750.0,10.65,16.49,SPARROW
4,1_SPARROW_1005,2,69.98,34.99,2024-01-09,0.79,12546,1,1,750.0,27.34,34.99,SPARROW
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12825358,9_BLUEBIRD_984,1,25.99,25.99,2024-12-17,0.79,9165,12,1,750.0,20.63,25.99,BLUEBIRD
12825359,9_BLUEBIRD_984,1,25.99,25.99,2024-12-21,0.79,9165,12,5,750.0,20.63,25.99,BLUEBIRD
12825360,9_BLUEBIRD_984,3,77.97,25.99,2024-12-23,2.36,9165,12,0,750.0,20.63,25.99,BLUEBIRD
12825361,9_BLUEBIRD_984,1,25.99,25.99,2024-12-24,0.79,9165,12,1,750.0,20.63,25.99,BLUEBIRD


In [69]:
# Re-check the schema and memory usage after dropping columns.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12825363 entries, 0 to 12825362
Data columns (total 13 columns):
 #   Column          Dtype  
---  ------          -----  
 0   inventory_id    object 
 1   sales_quantity  int64  
 2   sales_dollars   float64
 3   sales_price     float64
 4   sales_date      object 
 5   excise_tax      float64
 6   vendor_id       int64  
 7   month           int64  
 8   dayofweek       int64  
 9   volume          float64
 10  purchase_price  float64
 11  retail_price    float64
 12  city            object 
dtypes: float64(6), int64(4), object(3)
memory usage: 1.2+ GB


In [70]:
# Preview the first rows of the purchases table.
purchases_vendor.head()

Unnamed: 0,inventory_id,store_id,brand_id,item,size,volume,total_volume,order_volume,classification,vendor_id,...,receiving_date,invoice_date,pay_date,dollars,quantity,purchase_price,purchase_cost,freight,freight_alloc,lead_time
0,69_PENGUIN_8412,69,8412,Tequila Ocho Plata Fresno,750mL,750.0,4500.0,4500.0,1,105,...,2024-01-02,2024-01-04,2024-02-16,214.26,6,35.71,36.288333,3.47,3.47,12
1,30_FLAMINGO_5255,30,5255,TGI Fridays Ultimte Mudslide,1.75L,1750.0,7000.0,26250.0,1,4466,...,2024-01-01,2024-01-07,2024-02-21,37.4,4,9.35,9.921333,8.57,2.285333,10
2,34_PUFFIN_5215,34,5215,TGI Fridays Long Island Iced,1.75L,1750.0,8750.0,26250.0,1,4466,...,2024-01-02,2024-01-07,2024-02-21,47.05,5,9.41,9.981333,8.57,2.856667,11
3,1_SPARROW_5255,1,5255,TGI Fridays Ultimte Mudslide,1.75L,1750.0,10500.0,26250.0,1,4466,...,2024-01-01,2024-01-07,2024-02-21,56.1,6,9.35,9.921333,8.57,3.428,10
4,76_HOOPOE_2034,76,2034,Glendalough Double Barrel,750mL,750.0,3750.0,3750.0,1,388,...,2024-01-02,2024-01-09,2024-02-16,106.6,5,21.32,22.242,4.61,4.61,9


In [71]:
# Inspect column names and dtypes of the purchases table.
purchases_vendor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2372474 entries, 0 to 2372473
Data columns (total 23 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   inventory_id           object 
 1   store_id               int64  
 2   brand_id               int64  
 3   item                   object 
 4   size                   object 
 5   volume                 float64
 6   total_volume           float64
 7   order_volume           float64
 8   classification         int64  
 9   vendor_id              int64  
 10  vendor                 object 
 11  purchase_order_number  int64  
 12  purchase_order_date    object 
 13  receiving_date         object 
 14  invoice_date           object 
 15  pay_date               object 
 16  dollars                float64
 17  quantity               int64  
 18  purchase_price         float64
 19  purchase_cost          float64
 20  freight                float64
 21  freight_alloc          float64
 22  lead_time         

In [72]:
# Remove these columns from the purchases frame, then re-check its schema.
purchase_cols_to_drop = [
    'store_id', 'brand_id', 'item', 'size',
    'total_volume', 'order_volume', 'classification', 'vendor',
]
purchases_vendor = purchases_vendor.drop(columns=purchase_cols_to_drop)
purchases_vendor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2372474 entries, 0 to 2372473
Data columns (total 15 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   inventory_id           object 
 1   volume                 float64
 2   vendor_id              int64  
 3   purchase_order_number  int64  
 4   purchase_order_date    object 
 5   receiving_date         object 
 6   invoice_date           object 
 7   pay_date               object 
 8   dollars                float64
 9   quantity               int64  
 10  purchase_price         float64
 11  purchase_cost          float64
 12  freight                float64
 13  freight_alloc          float64
 14  lead_time              int64  
dtypes: float64(6), int64(4), object(5)
memory usage: 271.5+ MB


# 2. 선입선출 재고 추적
- 매출총이익(gross profit) 계산
- 날짜 추적

In [73]:
# Step 1: keep only the columns the FIFO tracking needs, as independent copies.
purchase_cols = [
    'inventory_id',
    'purchase_order_date',
    'receiving_date',
    'invoice_date',
    'quantity',
    'purchase_cost',
]
sales_cols = ['inventory_id', 'sales_date', 'sales_quantity', 'sales_price']

purchases_df = purchases_vendor.loc[:, purchase_cols].copy()
sales_df = sales.loc[:, sales_cols].copy()


In [74]:
# Convert every date column to datetime; values that cannot be parsed become NaT.
purchase_date_cols = ['purchase_order_date', 'receiving_date', 'invoice_date']
purchases_df = purchases_df.assign(**{
    date_col: pd.to_datetime(purchases_df[date_col], errors='coerce')
    for date_col in purchase_date_cols
})
# The working copy and the full sales frame both get a datetime sales_date.
for frame in (sales_df, sales):
    frame['sales_date'] = pd.to_datetime(frame['sales_date'], errors='coerce')

In [75]:
# Reference date: January 1st of the earliest year present in the sales data.
period_start = pd.Timestamp(
    f"{pd.to_datetime(sales_df['sales_date']).min().year}-01-01"
)

# 1) Drop purchases received before the period start.
#    Rows with a missing receiving date are kept here on purpose.
# NOTE(review): the FIFO loop treats a missing receiving date as "always
# available", and undated real purchases are only removed in a later cell,
# after the simulation has run. Confirm whether they should be dropped here.
purchases_df = purchases_df[
    purchases_df['receiving_date'].isna()
    | (purchases_df['receiving_date'] >= period_start)
].copy()

# 2) Build opening-inventory lots: on_hand_begin becomes the lot quantity and
#    purchase_price its unit cost. is_opening marks these rows for tracing.
opening_inventory = (
    inventory[['inventory_id', 'on_hand_begin', 'purchase_price']]
    .rename(columns={'on_hand_begin': 'quantity', 'purchase_price': 'purchase_cost'})
    .dropna(subset=['quantity'])
    .assign(quantity=lambda x: x['quantity'].astype(float),
            purchase_cost=lambda x: x['purchase_cost'].astype(float),
            is_opening=True)
)

# Opening lots have no dates. Fill them with NaT using the dtype of the
# matching purchase column, so the concat below keeps datetime columns without
# relying on all-NaN float columns being ignored during dtype resolution.
for col in purchase_date_cols:
    opening_inventory[col] = pd.Series(
        pd.NaT, index=opening_inventory.index, dtype=purchases_df[col].dtype
    )

# 3) Put the opening lots in front of the real purchases.
#    If this cell already ran, purchases_df still contains the opening rows;
#    remove them first so a re-run cannot double the opening stock.
if 'is_opening' in purchases_df.columns:
    purchases_df = purchases_df.loc[~purchases_df['is_opening'].astype(bool)].copy()
else:
    purchases_df['is_opening'] = False

purchases_df = pd.concat(
    [opening_inventory[purchases_df.columns], purchases_df],
    ignore_index=True
)

# 4) Sort with NaT first, so opening stock is consumed before dated purchases.
purchases_df = purchases_df.sort_values(
    ['inventory_id', 'receiving_date'],
    na_position='first'
)
sales_df = sales_df.sort_values(['inventory_id', 'sales_date'])


In [76]:
# Order the purchases table by item and receiving date for side-by-side inspection.
purchases_vendor = purchases_vendor.sort_values(
    by=['inventory_id', 'receiving_date']
)

In [77]:
# Preview the sorted purchases table.
purchases_vendor.head(10)

Unnamed: 0,inventory_id,volume,vendor_id,purchase_order_number,purchase_order_date,receiving_date,invoice_date,pay_date,dollars,quantity,purchase_price,purchase_cost,freight,freight_alloc,lead_time
1204794,10_PIGEON_1001,200.0,3960,11191,2024-07-14,2024-07-19,2024-08-01,2024-09-07,385.2,90,4.28,4.294245,7041.25,1.282071,5
1413404,10_PIGEON_1001,200.0,3960,11589,2024-08-10,2024-08-16,2024-08-30,2024-10-05,128.4,30,4.28,4.295173,5855.78,0.455179,6
1750550,10_PIGEON_1003,750.0,3960,12369,2024-09-28,2024-10-04,2024-10-13,2024-11-24,99.96,6,16.66,16.667828,733.34,0.04697,6
1843187,10_PIGEON_1003,750.0,3960,12585,2024-10-13,2024-10-19,2024-10-31,2024-11-29,99.96,6,16.66,16.660201,20.93,0.001208,6
1884260,10_PIGEON_1003,750.0,3960,12676,2024-10-19,2024-10-28,2024-11-06,2024-12-05,66.64,4,16.66,16.660345,42.56,0.001378,9
1929970,10_PIGEON_1003,750.0,3960,12748,2024-10-25,2024-11-04,2024-11-09,2024-12-22,99.96,6,16.66,16.66204,262.7,0.01224,10
2019220,10_PIGEON_1003,750.0,3960,13017,2024-11-11,2024-11-17,2024-11-29,2024-12-25,999.6,60,16.66,16.660005,0.59,0.000279,6
2066090,10_PIGEON_1003,750.0,3960,13084,2024-11-16,2024-11-25,2024-12-03,2025-01-06,49.98,3,16.66,16.660096,7.21,0.000288,9
2115568,10_PIGEON_1003,750.0,3960,13226,2024-11-25,2024-12-02,2024-12-05,2025-01-10,249.9,15,16.66,16.660111,13.5,0.001672,7
2170747,10_PIGEON_1003,750.0,3960,13320,2024-12-02,2024-12-07,2024-12-19,2025-01-13,149.94,9,16.66,16.665714,552.97,0.051427,5


In [78]:
# Preview the FIFO lot table (opening lots plus purchases) for comparison.
purchases_df.head(10)

Unnamed: 0,inventory_id,purchase_order_date,receiving_date,invoice_date,quantity,purchase_cost,is_opening
0,10_PIGEON_1000,NaT,NaT,NaT,1.0,11.62,True
1,10_PIGEON_1001,NaT,NaT,NaT,11.0,4.28,True
1411323,10_PIGEON_1001,2024-07-14,2024-07-19,2024-08-01,90.0,4.294245,False
1619933,10_PIGEON_1001,2024-08-10,2024-08-16,2024-08-30,30.0,4.295173,False
1957079,10_PIGEON_1003,2024-09-28,2024-10-04,2024-10-13,6.0,16.667828,False
2049716,10_PIGEON_1003,2024-10-13,2024-10-19,2024-10-31,6.0,16.660201,False
2090789,10_PIGEON_1003,2024-10-19,2024-10-28,2024-11-06,4.0,16.660345,False
2136499,10_PIGEON_1003,2024-10-25,2024-11-04,2024-11-09,6.0,16.66204,False
2225749,10_PIGEON_1003,2024-11-11,2024-11-17,2024-11-29,60.0,16.660005,False
2272619,10_PIGEON_1003,2024-11-16,2024-11-25,2024-12-03,3.0,16.660096,False


In [79]:
# Display the sorted sales working copy.
sales_df

Unnamed: 0,inventory_id,sales_date,sales_quantity,sales_price
17790,10_PIGEON_1000,2024-01-29,1,16.99
17791,10_PIGEON_1001,2024-01-01,4,5.99
17792,10_PIGEON_1001,2024-01-02,5,5.99
17793,10_PIGEON_1001,2024-01-04,2,5.99
7060081,10_PIGEON_1001,2024-08-02,1,5.99
...,...,...,...,...
12825361,9_BLUEBIRD_984,2024-12-24,1,25.99
12825362,9_BLUEBIRD_984,2024-12-29,2,25.99
2758984,9_BLUEBIRD_999,2024-03-20,1,119.99
3698980,9_BLUEBIRD_999,2024-04-09,1,119.99


In [80]:
# Step 2: pre-group the lot table by inventory_id (dates included).
# Each value is a 2-D array whose columns follow lot_columns, in that order.
lot_columns = [
    'quantity', 'purchase_cost',
    'purchase_order_date', 'receiving_date', 'invoice_date', 'is_opening',
]
purchase_groups = {}
for inv, grp in purchases_df.groupby('inventory_id', sort=False):
    purchase_groups[inv] = grp[lot_columns].to_numpy()

sales_records = []

In [81]:
# Step 3: FIFO simulation, keyed on inventory_id only.
#
# For each sale (in sales_df order) lots are consumed front to back. One tuple
# is appended to sales_records per sale:
#   (inventory_id, sales_date, sales_quantity, average unit cost,
#    order / receiving / invoice date of the first lot used,
#    shortage flag, whether any opening lot was used)
# A sale that cannot be fully covered is flagged as a shortage and consumes
# nothing.

# Start from an empty list inside this cell, so re-running it cannot append a
# second copy of every sale to results left over from an earlier run.
sales_records = []

sales_by_inventory = sales_df.groupby('inventory_id', sort=False)
for inv, sales_grp in tqdm(sales_by_inventory, total=sales_by_inventory.ngroups):
    purch_data = purchase_groups.get(inv)
    if purch_data is None or purch_data.size == 0:
        # Sales with no lot records at all: every sale is a shortage.
        for s in sales_grp.itertuples(index=False):
            sales_records.append((
                inv,
                s.sales_date,
                s.sales_quantity,
                np.nan,
                pd.NaT,
                pd.NaT,
                pd.NaT,
                True,   # shortage
                False   # is_opening (no lot exists)
            ))
        continue

    # Unpack the lot columns (same order as when purchase_groups was built).
    qtys = purch_data[:, 0].copy().astype(float)
    costs = purch_data[:, 1].copy().astype(float)
    order_dates = purch_data[:, 2].copy()
    recv_dates = purch_data[:, 3].copy()
    inv_dates = purch_data[:, 4].copy()
    is_opening_flags = purch_data[:, 5].copy().astype(bool)

    ptr = 0  # index of the first lot that may still have stock

    for s in sales_grp.itertuples(index=False):
        qty_needed = s.sales_quantity
        total_cost = 0.0
        total_qty = 0
        order_used = recv_used = inv_used = pd.NaT
        shortage = False
        used_is_opening = False  # True when any lot used by this sale is an opening lot

        ptr_before = ptr

        while qty_needed > 0 and ptr < len(qtys):
            recv_date = recv_dates[ptr]
            # The next lot arrives after the sale date, so it cannot serve this sale.
            if pd.notna(recv_date) and recv_date > s.sales_date:
                shortage = True
                break

            available = qtys[ptr]
            if available <= 0:
                ptr += 1
                continue

            unit_cost = costs[ptr]

            # Remember the dates of the first lot that contributes to this sale.
            if pd.isna(order_used):
                order_used = order_dates[ptr]
                recv_used = recv_dates[ptr]
                inv_used = inv_dates[ptr]

            take_qty = min(available, qty_needed)
            total_cost += take_qty * unit_cost
            total_qty += take_qty
            qty_needed -= take_qty

            if take_qty > 0 and is_opening_flags[ptr]:
                used_is_opening = True

            if take_qty == available:
                # Lot fully consumed: advance without rewriting its quantity.
                ptr += 1
            else:
                # Partial take: the sale is now fully covered (qty_needed == 0).
                qtys[ptr] -= take_qty

        if qty_needed > 0:
            shortage = True

        if shortage:
            # Roll back this sale. Only the pointer needs rewinding: qtys is
            # written only on a partial take, and a partial take always
            # completes the sale, so a shortage never leaves qtys modified.
            # This avoids copying the whole quantity array for every sale.
            ptr = ptr_before
            order_used = recv_used = inv_used = pd.NaT
            avg_cost = np.nan
            used_is_opening = False
        else:
            avg_cost = total_cost / total_qty if total_qty else np.nan

        sales_records.append((
            inv,
            s.sales_date,
            s.sales_quantity,
            avg_cost,
            order_used,
            recv_used,
            inv_used,
            shortage,
            used_is_opening
        ))


100%|██████████| 267552/267552 [03:29<00:00, 1280.04it/s]


In [82]:
# Check a sample of the prepared lot table.
purchases_df.head(10)

Unnamed: 0,inventory_id,purchase_order_date,receiving_date,invoice_date,quantity,purchase_cost,is_opening
0,10_PIGEON_1000,NaT,NaT,NaT,1.0,11.62,True
1,10_PIGEON_1001,NaT,NaT,NaT,11.0,4.28,True
1411323,10_PIGEON_1001,2024-07-14,2024-07-19,2024-08-01,90.0,4.294245,False
1619933,10_PIGEON_1001,2024-08-10,2024-08-16,2024-08-30,30.0,4.295173,False
1957079,10_PIGEON_1003,2024-09-28,2024-10-04,2024-10-13,6.0,16.667828,False
2049716,10_PIGEON_1003,2024-10-13,2024-10-19,2024-10-31,6.0,16.660201,False
2090789,10_PIGEON_1003,2024-10-19,2024-10-28,2024-11-06,4.0,16.660345,False
2136499,10_PIGEON_1003,2024-10-25,2024-11-04,2024-11-09,6.0,16.66204,False
2225749,10_PIGEON_1003,2024-11-11,2024-11-17,2024-11-29,60.0,16.660005,False
2272619,10_PIGEON_1003,2024-11-16,2024-11-25,2024-12-03,3.0,16.660096,False


In [83]:
# Remove every real purchase (is_opening == False) that has no receiving date.
mask_drop = (
    purchases_df['receiving_date'].isna()
    & purchases_df['is_opening'].eq(False)
)
purchases_df = purchases_df.loc[~mask_drop].copy()

# Re-sort so that undated opening lots come first within each item.
purchases_df = purchases_df.sort_values(
    by=['inventory_id', 'receiving_date'],
    na_position='first',
)

In [84]:
# Display the cleaned lot table.
purchases_df

Unnamed: 0,inventory_id,purchase_order_date,receiving_date,invoice_date,quantity,purchase_cost,is_opening
0,10_PIGEON_1000,NaT,NaT,NaT,1.0,11.620000,True
1,10_PIGEON_1001,NaT,NaT,NaT,11.0,4.280000,True
1411323,10_PIGEON_1001,2024-07-14,2024-07-19,2024-08-01,90.0,4.294245,False
1619933,10_PIGEON_1001,2024-08-10,2024-08-16,2024-08-30,30.0,4.295173,False
1957079,10_PIGEON_1003,2024-09-28,2024-10-04,2024-10-13,6.0,16.667828,False
...,...,...,...,...,...,...,...
2122331,9_BLUEBIRD_984,2024-10-19,2024-10-26,2024-11-08,6.0,20.630039,False
2407930,9_BLUEBIRD_984,2024-11-29,2024-12-07,2024-12-18,6.0,20.630265,False
2573692,9_BLUEBIRD_984,2024-12-20,2024-12-28,2025-01-03,6.0,20.642595,False
206528,9_BLUEBIRD_999,NaT,NaT,NaT,0.0,89.840000,True


In [85]:
# Step 4: collect the FIFO result tuples into a DataFrame.
# Column order mirrors the tuple layout appended in the simulation loop.
fifo_result_columns = [
    'inventory_id',
    'sales_date',
    'sales_quantity',
    'fifo_purchase_cost',
    'fifo_purchase_order_date',
    'fifo_receiving_date',
    'fifo_invoice_date',
    'is_shortage',
    'fifo_is_opening',
]
fifo_df = pd.DataFrame(sales_records, columns=fifo_result_columns)


In [86]:
# Check the row count and dtypes of the FIFO result.
fifo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12825363 entries, 0 to 12825362
Data columns (total 9 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   inventory_id              object        
 1   sales_date                datetime64[ns]
 2   sales_quantity            int64         
 3   fifo_purchase_cost        float64       
 4   fifo_purchase_order_date  datetime64[ns]
 5   fifo_receiving_date       datetime64[ns]
 6   fifo_invoice_date         datetime64[ns]
 7   is_shortage               bool          
 8   fifo_is_opening           bool          
dtypes: bool(2), datetime64[ns](4), float64(1), int64(1), object(1)
memory usage: 709.4+ MB


In [87]:
# Display the FIFO result table.
fifo_df

Unnamed: 0,inventory_id,sales_date,sales_quantity,fifo_purchase_cost,fifo_purchase_order_date,fifo_receiving_date,fifo_invoice_date,is_shortage,fifo_is_opening
0,10_PIGEON_1000,2024-01-29,1,11.620000,NaT,NaT,NaT,False,True
1,10_PIGEON_1001,2024-01-01,4,4.280000,NaT,NaT,NaT,False,True
2,10_PIGEON_1001,2024-01-02,5,4.280000,NaT,NaT,NaT,False,True
3,10_PIGEON_1001,2024-01-04,2,4.280000,NaT,NaT,NaT,False,True
4,10_PIGEON_1001,2024-08-02,1,4.294245,2024-07-14,2024-07-19,2024-08-01,False,False
...,...,...,...,...,...,...,...,...,...
12825358,9_BLUEBIRD_984,2024-12-24,1,20.630265,2024-11-29,2024-12-07,2024-12-18,False,False
12825359,9_BLUEBIRD_984,2024-12-29,2,20.636430,2024-11-29,2024-12-07,2024-12-18,False,False
12825360,9_BLUEBIRD_999,2024-03-20,1,89.894604,2024-02-21,2024-02-29,2024-03-09,False,False
12825361,9_BLUEBIRD_999,2024-04-09,1,89.894604,2024-02-21,2024-02-29,2024-03-09,False,False


In [88]:
# Step 5: attach the FIFO results to sales_df.
# The join keys are named explicitly; they are the columns the two frames
# share, which an unkeyed merge would have used.
# validate='one_to_one' makes the merge fail loudly if a key combination is
# repeated, instead of silently multiplying rows.
fifo_merge_keys = ['inventory_id', 'sales_date', 'sales_quantity']
sales_fifo = sales_df.merge(
    fifo_df,
    on=fifo_merge_keys,
    how='left',
    validate='one_to_one'
)


In [89]:
# Confirm the merge kept the expected row count and added the fifo_* columns.
sales_fifo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12825363 entries, 0 to 12825362
Data columns (total 10 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   inventory_id              object        
 1   sales_date                datetime64[ns]
 2   sales_quantity            int64         
 3   sales_price               float64       
 4   fifo_purchase_cost        float64       
 5   fifo_purchase_order_date  datetime64[ns]
 6   fifo_receiving_date       datetime64[ns]
 7   fifo_invoice_date         datetime64[ns]
 8   is_shortage               bool          
 9   fifo_is_opening           bool          
dtypes: bool(2), datetime64[ns](4), float64(2), int64(1), object(1)
memory usage: 905.1+ MB


In [90]:
# Display the sales rows with their FIFO attributes.
sales_fifo

Unnamed: 0,inventory_id,sales_date,sales_quantity,sales_price,fifo_purchase_cost,fifo_purchase_order_date,fifo_receiving_date,fifo_invoice_date,is_shortage,fifo_is_opening
0,10_PIGEON_1000,2024-01-29,1,16.99,11.620000,NaT,NaT,NaT,False,True
1,10_PIGEON_1001,2024-01-01,4,5.99,4.280000,NaT,NaT,NaT,False,True
2,10_PIGEON_1001,2024-01-02,5,5.99,4.280000,NaT,NaT,NaT,False,True
3,10_PIGEON_1001,2024-01-04,2,5.99,4.280000,NaT,NaT,NaT,False,True
4,10_PIGEON_1001,2024-08-02,1,5.99,4.294245,2024-07-14,2024-07-19,2024-08-01,False,False
...,...,...,...,...,...,...,...,...,...,...
12825358,9_BLUEBIRD_984,2024-12-24,1,25.99,20.630265,2024-11-29,2024-12-07,2024-12-18,False,False
12825359,9_BLUEBIRD_984,2024-12-29,2,25.99,20.636430,2024-11-29,2024-12-07,2024-12-18,False,False
12825360,9_BLUEBIRD_999,2024-03-20,1,119.99,89.894604,2024-02-21,2024-02-29,2024-03-09,False,False
12825361,9_BLUEBIRD_999,2024-04-09,1,119.99,89.894604,2024-02-21,2024-02-29,2024-03-09,False,False


In [91]:
# Step 6: fill missing FIFO costs using purchases_df and its is_opening flag.

# Make sure the columns used below exist, then force is_shortage to a clean bool.
if 'fifo_purchase_cost' not in sales_fifo.columns:
    sales_fifo['fifo_purchase_cost'] = np.nan
if 'is_shortage' not in sales_fifo.columns:
    sales_fifo['is_shortage'] = False
sales_fifo['is_shortage'] = sales_fifo['is_shortage'].fillna(False).astype(bool)

# 6-1) Mean unit cost of real purchases (is_opening == False) per item.
real_purchases = purchases_df.loc[purchases_df['is_opening'] == False]
fallback_cost_by_inv = (
    real_purchases
    .dropna(subset=['purchase_cost'])
    .groupby('inventory_id', as_index=False)
    .agg(fallback_cost=('purchase_cost', 'mean'))
)

# 6-2) Mean unit cost of opening lots (is_opening == True) per item.
opening_lots = purchases_df.loc[purchases_df['is_opening'] == True]
opening_cost_by_inv = (
    opening_lots
    .dropna(subset=['purchase_cost'])
    .groupby('inventory_id', as_index=False)
    .agg(opening_cost=('purchase_cost', 'mean'))
)

# 6-3) One row per item carrying both candidate costs.
inv_costs = fallback_cost_by_inv.merge(
    opening_cost_by_inv, on='inventory_id', how='outer'
)

# 6-4) Attach the candidate costs to a new frame; sales_fifo stays untouched
#      and all fifo_* columns are kept.
sales_filled = sales_fifo.merge(
    inv_costs[['inventory_id', 'fallback_cost', 'opening_cost']],
    on='inventory_id',
    how='left'
)


def _fillable(cost_col):
    """Rows with no FIFO cost that are not shortages and have a value in cost_col."""
    return (
        sales_filled['fifo_purchase_cost'].isna()
        & ~sales_filled['is_shortage']
        & sales_filled[cost_col].notna()
    )


# 6-5) Fill order: (1) real-purchase mean, then (2) opening-lot mean.
# NOTE(review): the FIFO loop records a NaN cost for shortage rows (and when
# nothing was allocated), so requiring ~is_shortage leaves shortage rows
# unfilled. Confirm that this is the intended outcome.
mask1 = _fillable('fallback_cost')
sales_filled.loc[mask1, 'fifo_purchase_cost'] = sales_filled.loc[mask1, 'fallback_cost']
sales_filled.loc[mask1, 'fifo_is_opening'] = False

# Evaluated after the first fill, so only rows that are still missing qualify.
mask2 = _fillable('opening_cost')
sales_filled.loc[mask2, 'fifo_purchase_cost'] = sales_filled.loc[mask2, 'opening_cost']
sales_filled.loc[mask2, 'fifo_is_opening'] = True

# Clean up: boolean flag, and drop the helper cost columns.
sales_filled['fifo_is_opening'] = sales_filled['fifo_is_opening'].fillna(False).astype(bool)
sales_filled = sales_filled.drop(columns=['fallback_cost', 'opening_cost'], errors='ignore')

print("✅ cost 보정 및 is_opening 추가 완료")


✅ cost 보정 및 is_opening 추가 완료


In [92]:
# How many sales rows were served from opening stock versus not?
sales_filled['fifo_is_opening'].value_counts()

False    10631350
True      2194013
Name: fifo_is_opening, dtype: int64

In [93]:
# Confirm the helper cost columns are gone and the row count is unchanged.
sales_filled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12825363 entries, 0 to 12825362
Data columns (total 10 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   inventory_id              object        
 1   sales_date                datetime64[ns]
 2   sales_quantity            int64         
 3   sales_price               float64       
 4   fifo_purchase_cost        float64       
 5   fifo_purchase_order_date  datetime64[ns]
 6   fifo_receiving_date       datetime64[ns]
 7   fifo_invoice_date         datetime64[ns]
 8   is_shortage               bool          
 9   fifo_is_opening           bool          
dtypes: bool(2), datetime64[ns](4), float64(2), int64(1), object(1)
memory usage: 905.1+ MB


In [94]:
# Display the full sales frame before merging the FIFO attributes into it.
sales

Unnamed: 0,inventory_id,sales_quantity,sales_dollars,sales_price,sales_date,excise_tax,vendor_id,month,dayofweek,volume,purchase_price,retail_price,city
0,1_SPARROW_1004,1,16.49,16.49,2024-01-01,0.79,12546,1,0,750.0,10.65,16.49,SPARROW
1,1_SPARROW_1004,2,32.98,16.49,2024-01-02,1.57,12546,1,1,750.0,10.65,16.49,SPARROW
2,1_SPARROW_1004,1,16.49,16.49,2024-01-03,0.79,12546,1,2,750.0,10.65,16.49,SPARROW
3,1_SPARROW_1004,1,14.49,14.49,2024-01-08,0.79,12546,1,0,750.0,10.65,16.49,SPARROW
4,1_SPARROW_1005,2,69.98,34.99,2024-01-09,0.79,12546,1,1,750.0,27.34,34.99,SPARROW
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12825358,9_BLUEBIRD_984,1,25.99,25.99,2024-12-17,0.79,9165,12,1,750.0,20.63,25.99,BLUEBIRD
12825359,9_BLUEBIRD_984,1,25.99,25.99,2024-12-21,0.79,9165,12,5,750.0,20.63,25.99,BLUEBIRD
12825360,9_BLUEBIRD_984,3,77.97,25.99,2024-12-23,2.36,9165,12,0,750.0,20.63,25.99,BLUEBIRD
12825361,9_BLUEBIRD_984,1,25.99,25.99,2024-12-24,0.79,9165,12,1,750.0,20.63,25.99,BLUEBIRD


In [95]:
# Step 7: bring the filled costs and FIFO dates back onto the full sales frame.
# The join keys are named explicitly; they are the columns the two frames
# share, which an unkeyed merge would have used.
# validate='one_to_one' raises if any key combination repeats, instead of
# silently duplicating sales rows.
sales_merge_keys = ['inventory_id', 'sales_quantity', 'sales_price', 'sales_date']
sales_real = sales.merge(
    sales_filled,
    on=sales_merge_keys,
    how='left',
    validate='one_to_one'
)

In [96]:
# Check the row count and the columns gained from the merge.
sales_real.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12825363 entries, 0 to 12825362
Data columns (total 19 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   inventory_id              object        
 1   sales_quantity            int64         
 2   sales_dollars             float64       
 3   sales_price               float64       
 4   sales_date                datetime64[ns]
 5   excise_tax                float64       
 6   vendor_id                 int64         
 7   month                     int64         
 8   dayofweek                 int64         
 9   volume                    float64       
 10  purchase_price            float64       
 11  retail_price              float64       
 12  city                      object        
 13  fifo_purchase_cost        float64       
 14  fifo_purchase_order_date  datetime64[ns]
 15  fifo_receiving_date       datetime64[ns]
 16  fifo_invoice_date         datetime64[ns]
 17  is_sho

In [97]:
# Rows without a FIFO match would carry NaN flags; treat them as False and
# store both flags as plain booleans.
for flag_col in ('is_shortage', 'fifo_is_opening'):
    sales_real[flag_col] = sales_real[flag_col].fillna(False).astype(bool)
sales_real.head(3)

Unnamed: 0,inventory_id,sales_quantity,sales_dollars,sales_price,sales_date,excise_tax,vendor_id,month,dayofweek,volume,purchase_price,retail_price,city,fifo_purchase_cost,fifo_purchase_order_date,fifo_receiving_date,fifo_invoice_date,is_shortage,fifo_is_opening
0,1_SPARROW_1004,1,16.49,16.49,2024-01-01,0.79,12546,1,0,750.0,10.65,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True
1,1_SPARROW_1004,2,32.98,16.49,2024-01-02,1.57,12546,1,1,750.0,10.65,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True
2,1_SPARROW_1004,1,16.49,16.49,2024-01-03,0.79,12546,1,2,750.0,10.65,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True


In [98]:
# Step 8: gross profit and gross margin rate.
# gross_profit = (sales_price - fifo_purchase_cost) * sales_quantity,
# minus excise_tax when that column is present.
gross_profit = (
    (sales_real['sales_price'] - sales_real['fifo_purchase_cost'])
    * sales_real['sales_quantity']
)
if 'excise_tax' in sales_real.columns:
    gross_profit = gross_profit - sales_real['excise_tax']
sales_real['gross_profit'] = gross_profit

# Shortage rows have no reliable cost, so their profit is left undefined.
sales_real.loc[sales_real['is_shortage'], 'gross_profit'] = np.nan

# Margin rate = gross profit / revenue.
# Zero revenue is mapped to NaN first, so the division cannot produce ±inf
# values that would distort later aggregates.
revenue = sales_real['sales_price'] * sales_real['sales_quantity']
sales_real['gross_margin_rate'] = (
    sales_real['gross_profit'] / revenue.where(revenue != 0)
)


In [99]:
# Step 9: clear FIFO attributes on rows where the sale predates the lot's
# receiving date.
if 'fifo_receiving_date' in sales_real.columns:
    condition = (
        sales_real['sales_date'].notna()
        & sales_real['fifo_receiving_date'].notna()
        & (sales_real['sales_date'] < sales_real['fifo_receiving_date'])
    )
else:
    # Nothing to compare against: select no rows.
    condition = pd.Series(False, index=sales_real.index)
    print("경고: 'fifo_receiving_date' 컬럼이 없어 비교를 수행할 수 없습니다.")

# gross_profit and gross_margin_rate are derived from fifo_purchase_cost, so
# they are cleared together with it. Otherwise a row would keep a profit
# figure computed from a cost that has just been invalidated.
fifo_columns_to_nan = [
    'fifo_purchase_cost',
    'fifo_purchase_order_date',
    'fifo_receiving_date',
    'fifo_invoice_date',
    'lead_time',
    'gross_profit',
    'gross_margin_rate',
]
# Only touch the columns that actually exist in sales_real.
existing_columns_to_nan = [col for col in fifo_columns_to_nan if col in sales_real.columns]

if existing_columns_to_nan:
    num_changed = condition.sum()
    if num_changed > 0:
        sales_real.loc[condition, existing_columns_to_nan] = np.nan
        print("--- 날짜 불일치 데이터 처리 완료 ---")
        print(f"총 {num_changed} 건의 행에서 FIFO 및 리드타임 관련 컬럼 값이 NaN으로 변경되었습니다.")

        print("\n--- 변경된 데이터 확인 (조건 만족 행 중 상위 5건) ---")
        columns_to_check = existing_columns_to_nan + ['sales_date']
        columns_to_check = [col for col in columns_to_check if col in sales_real.columns]
        print(sales_real.loc[condition, columns_to_check].head())
    else:
        print("--- 날짜 불일치 데이터 처리 ---")
        print("sales_date가 fifo_receiving_date보다 빠른 행이 없습니다.")
else:
    print("경고: NaN으로 변경할 대상 컬럼(FIFO 관련 또는 lead_time)이 sales DataFrame에 없습니다.")

print('-' * 40)


--- 날짜 불일치 데이터 처리 ---
sales_date가 fifo_receiving_date보다 빠른 행이 없습니다.
----------------------------------------


In [100]:
# Preview the first 10 rows of sales_real.
sales_real.head(10)

Unnamed: 0,inventory_id,sales_quantity,sales_dollars,sales_price,sales_date,excise_tax,vendor_id,month,dayofweek,volume,...,retail_price,city,fifo_purchase_cost,fifo_purchase_order_date,fifo_receiving_date,fifo_invoice_date,is_shortage,fifo_is_opening,gross_profit,gross_margin_rate
0,1_SPARROW_1004,1,16.49,16.49,2024-01-01,0.79,12546,1,0,750.0,...,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True,5.05,0.306246
1,1_SPARROW_1004,2,32.98,16.49,2024-01-02,1.57,12546,1,1,750.0,...,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True,10.11,0.306549
2,1_SPARROW_1004,1,16.49,16.49,2024-01-03,0.79,12546,1,2,750.0,...,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True,5.05,0.306246
3,1_SPARROW_1004,1,14.49,14.49,2024-01-08,0.79,12546,1,0,750.0,...,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True,3.05,0.21049
4,1_SPARROW_1005,2,69.98,34.99,2024-01-09,0.79,12546,1,1,750.0,...,34.99,SPARROW,27.34,NaT,NaT,NaT,False,True,14.51,0.207345
5,1_SPARROW_1005,1,34.99,34.99,2024-01-15,0.39,12546,1,0,750.0,...,34.99,SPARROW,27.34,NaT,NaT,NaT,False,True,7.26,0.207488
6,1_SPARROW_1005,1,34.99,34.99,2024-01-22,0.39,12546,1,0,750.0,...,34.99,SPARROW,27.34,NaT,NaT,NaT,False,True,7.26,0.207488
7,1_SPARROW_1005,1,34.99,34.99,2024-01-30,0.39,12546,1,1,750.0,...,34.99,SPARROW,27.34,NaT,NaT,NaT,False,True,7.26,0.207488
8,1_SPARROW_10058,4,59.96,14.99,2024-01-05,0.45,2000,1,4,750.0,...,13.99,SPARROW,9.26,NaT,NaT,NaT,False,True,22.47,0.37475
9,1_SPARROW_10058,1,14.99,14.99,2024-01-06,0.11,2000,1,5,750.0,...,13.99,SPARROW,9.26,NaT,NaT,NaT,False,True,5.62,0.374917


In [101]:
# First 3 rows when ordered by inventory_id (the column sorts as a string, not numerically).
sales_real.sort_values(by='inventory_id').head(3)

Unnamed: 0,inventory_id,sales_quantity,sales_dollars,sales_price,sales_date,excise_tax,vendor_id,month,dayofweek,volume,...,retail_price,city,fifo_purchase_cost,fifo_purchase_order_date,fifo_receiving_date,fifo_invoice_date,is_shortage,fifo_is_opening,gross_profit,gross_margin_rate
17790,10_PIGEON_1000,1,16.99,16.99,2024-01-29,0.79,1485,1,0,750.0,...,14.99,PIGEON,11.62,NaT,NaT,NaT,False,True,4.58,0.26957
9288375,10_PIGEON_1001,1,5.99,5.99,2024-10-26,0.05,3960,10,5,200.0,...,5.99,PIGEON,4.294245,2024-07-14,2024-07-19,2024-08-01,False,False,1.645755,0.27475
8203022,10_PIGEON_1001,1,5.99,5.99,2024-09-19,0.05,3960,9,3,200.0,...,5.99,PIGEON,4.294245,2024-07-14,2024-07-19,2024-08-01,False,False,1.645755,0.27475


In [102]:
# Look up the inventory record(s) for a single inventory_id.
inventory.loc[inventory['inventory_id'] == '1_SPARROW_1004']

Unnamed: 0,inventory_id,store_id,city,brand_id,item,size,on_hand_begin,on_hand_end,on_hand_diff,price_begin,price_end,begin_date,end_date,purchase_price,retail_price
26639,1_SPARROW_1004,1,Sparrow,1004,Jim Beam w/2 Rocks Glasses,750mL,17.0,,-17.0,16.49,,2024-01-01,,10.65,16.49


In [103]:
# Rows with no FIFO purchase order date where fifo_is_opening is False.
sales_real.loc[
    sales_real['fifo_purchase_order_date'].isna() & ~sales_real['fifo_is_opening']
]

Unnamed: 0,inventory_id,sales_quantity,sales_dollars,sales_price,sales_date,excise_tax,vendor_id,month,dayofweek,volume,...,retail_price,city,fifo_purchase_cost,fifo_purchase_order_date,fifo_receiving_date,fifo_invoice_date,is_shortage,fifo_is_opening,gross_profit,gross_margin_rate


In [104]:
# Rows with no FIFO purchase order date where fifo_is_opening is True.
sales_real.loc[
    sales_real['fifo_purchase_order_date'].isna() & sales_real['fifo_is_opening']
]

Unnamed: 0,inventory_id,sales_quantity,sales_dollars,sales_price,sales_date,excise_tax,vendor_id,month,dayofweek,volume,...,retail_price,city,fifo_purchase_cost,fifo_purchase_order_date,fifo_receiving_date,fifo_invoice_date,is_shortage,fifo_is_opening,gross_profit,gross_margin_rate
0,1_SPARROW_1004,1,16.49,16.49,2024-01-01,0.79,12546,1,0,750.0,...,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True,5.05,0.306246
1,1_SPARROW_1004,2,32.98,16.49,2024-01-02,1.57,12546,1,1,750.0,...,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True,10.11,0.306549
2,1_SPARROW_1004,1,16.49,16.49,2024-01-03,0.79,12546,1,2,750.0,...,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True,5.05,0.306246
3,1_SPARROW_1004,1,14.49,14.49,2024-01-08,0.79,12546,1,0,750.0,...,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True,3.05,0.210490
4,1_SPARROW_1005,2,69.98,34.99,2024-01-09,0.79,12546,1,1,750.0,...,34.99,SPARROW,27.34,NaT,NaT,NaT,False,True,14.51,0.207345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12824860,9_BLUEBIRD_8700,1,24.95,24.95,2024-12-10,0.11,9819,12,1,750.0,...,31.99,BLUEBIRD,16.52,NaT,NaT,NaT,False,True,8.32,0.333467
12824861,9_BLUEBIRD_8700,1,24.95,24.95,2024-12-11,0.11,9819,12,2,750.0,...,31.99,BLUEBIRD,16.52,NaT,NaT,NaT,False,True,8.32,0.333467
12824862,9_BLUEBIRD_8700,1,24.95,24.95,2024-12-17,0.11,9819,12,1,750.0,...,31.99,BLUEBIRD,16.52,NaT,NaT,NaT,False,True,8.32,0.333467
12824863,9_BLUEBIRD_8700,2,49.90,24.95,2024-12-22,0.22,9819,12,6,750.0,...,31.99,BLUEBIRD,16.52,NaT,NaT,NaT,False,True,16.64,0.333467


In [105]:
# The 10 earliest rows by sales_date.
sales_real.sort_values(by='sales_date').head(10)

Unnamed: 0,inventory_id,sales_quantity,sales_dollars,sales_price,sales_date,excise_tax,vendor_id,month,dayofweek,volume,...,retail_price,city,fifo_purchase_cost,fifo_purchase_order_date,fifo_receiving_date,fifo_invoice_date,is_shortage,fifo_is_opening,gross_profit,gross_margin_rate
0,1_SPARROW_1004,1,16.49,16.49,2024-01-01,0.79,12546,1,0,750.0,...,16.49,SPARROW,10.65,NaT,NaT,NaT,False,True,5.05,0.306246
500778,53_GROUSE_5342,1,26.99,26.99,2024-01-01,0.79,8112,1,0,750.0,...,28.99,GROUSE,19.85,NaT,NaT,NaT,False,True,6.35,0.235272
500812,53_GROUSE_5356,1,9.99,9.99,2024-01-01,0.39,17035,1,0,375.0,...,9.99,GROUSE,7.14,NaT,NaT,NaT,False,True,2.46,0.246246
30868,10_PIGEON_5368,1,21.49,21.49,2024-01-01,0.79,12546,1,0,750.0,...,19.49,PIGEON,15.23,NaT,NaT,NaT,False,True,5.47,0.254537
500876,53_GROUSE_5367,1,9.99,9.99,2024-01-01,0.79,660,1,0,750.0,...,9.99,GROUSE,5.83,NaT,NaT,NaT,False,True,3.37,0.337337
500933,53_GROUSE_5391,1,15.99,15.99,2024-01-01,1.84,12546,1,0,1750.0,...,15.99,GROUSE,12.02,NaT,NaT,NaT,False,True,2.13,0.133208
501032,53_GROUSE_5404,1,5.49,5.49,2024-01-01,0.39,3252,1,0,375.0,...,5.49,GROUSE,4.32,NaT,NaT,NaT,False,True,0.78,0.142077
501095,53_GROUSE_5447,1,9.99,9.99,2024-01-01,0.39,3089,1,0,375.0,...,9.99,GROUSE,7.63,NaT,NaT,NaT,False,True,1.97,0.197197
501144,53_GROUSE_5490,1,0.99,0.99,2024-01-01,0.05,12546,1,0,50.0,...,0.99,GROUSE,0.76,NaT,NaT,NaT,False,True,0.18,0.181818
501150,53_GROUSE_5491,7,13.93,1.99,2024-01-01,0.37,3960,1,0,50.0,...,1.99,GROUSE,1.46,NaT,NaT,NaT,False,True,3.34,0.23977


In [106]:
# Count how many rows have fifo_is_opening set to True versus False.
sales_real['fifo_is_opening'].value_counts()

False    10631350
True      2194013
Name: fifo_is_opening, dtype: int64

In [107]:
# Number of missing values in each column.
sales_real.isna().sum()

inventory_id                      0
sales_quantity                    0
sales_dollars                     0
sales_price                       0
sales_date                        0
excise_tax                        0
vendor_id                         0
month                             0
dayofweek                         0
volume                            0
purchase_price                    0
retail_price                      0
city                              0
fifo_purchase_cost                0
fifo_purchase_order_date    2143895
fifo_receiving_date         2143895
fifo_invoice_date           2143895
is_shortage                       0
fifo_is_opening                   0
gross_profit                      0
gross_margin_rate                 0
dtype: int64

In [108]:
# Print the column names, dtypes and index range of sales_real.
sales_real.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12825363 entries, 0 to 12825362
Data columns (total 21 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   inventory_id              object        
 1   sales_quantity            int64         
 2   sales_dollars             float64       
 3   sales_price               float64       
 4   sales_date                datetime64[ns]
 5   excise_tax                float64       
 6   vendor_id                 int64         
 7   month                     int64         
 8   dayofweek                 int64         
 9   volume                    float64       
 10  purchase_price            float64       
 11  retail_price              float64       
 12  city                      object        
 13  fifo_purchase_cost        float64       
 14  fifo_purchase_order_date  datetime64[ns]
 15  fifo_receiving_date       datetime64[ns]
 16  fifo_invoice_date         datetime64[ns]
 17  is_sho

In [109]:
# Split inventory_id into store_id / city / brand_id, then save the result as CSV.
if 'inventory_id' in sales_real.columns:
    id_parts = sales_real['inventory_id'].str.strip().str.split('_', expand=True)
    # Only populate the columns when the id yields at least three '_'-separated parts.
    if id_parts.shape[1] >= 3:
        sales_real['store_id'] = id_parts[0].str.strip()
        sales_real['city'] = id_parts[1].str.strip()
        sales_real['brand_id'] = id_parts[2].str.strip()
        # Convert the id columns to numbers when every value parses. Otherwise keep
        # the strings: the same fallback as the deprecated errors='ignore' option,
        # without relying on it.
        for col in ['store_id', 'brand_id']:
            try:
                sales_real[col] = pd.to_numeric(sales_real[col])
            except (ValueError, TypeError):
                pass  # non-numeric ids stay as strings

sales_real.to_csv('../data/preprocessed/sales_fifo.csv', index=False)
print('저장 완료: ../data/preprocessed/sales_fifo.csv')


저장 완료: ../data/preprocessed/sales_fifo.csv
