In [4]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from datetime import timedelta

receivals = pd.read_csv('./Project_materials/data/kernel/receivals.csv')
purchase_orders = pd.read_csv('./Project_materials/data/kernel/purchase_orders.csv')
prediction_mapping = pd.read_csv('./Project_materials/data/prediction_mapping.csv')
sample_submission = pd.read_csv('./Project_materials/data/sample_submission.csv')

# Convert dates
receivals['date_arrival'] = pd.to_datetime(receivals['date_arrival'], utc=True).dt.tz_localize(None)
purchase_orders['delivery_date'] = pd.to_datetime(purchase_orders['delivery_date'], utc=True).dt.tz_localize(None)
prediction_mapping['forecast_start_date'] = pd.to_datetime(prediction_mapping['forecast_start_date'])
prediction_mapping['forecast_end_date'] = pd.to_datetime(prediction_mapping['forecast_end_date'])


# clean data
print("\n[1] DATA CLEANING")
receivals = receivals[receivals['net_weight'] > 0]
receivals = receivals[receivals['rm_id'].notna()]
receivals = receivals.sort_values('date_arrival')  # Sort for efficiency
print(f"Clean receivals: {len(receivals)}")

# bare nyere dato, tok ekstremt lang tid med alle..
train_dates = pd.date_range(start='2024-01-01', end='2024-11-30', freq='MS')  # Monthly start
forecast_horizons = [7, 30, 60, 90, 150]  # Fewer horizons

print(f"Using {len(train_dates)} training dates x {len(forecast_horizons)} horizons")
print(f"Expected samples: ~{len(train_dates) * 60 * len(forecast_horizons)} (only active rm_ids)")

training_data = []

# Pre-compute which rm_ids are active in 2024
active_rm_ids = receivals[receivals['date_arrival'] >= '2024-01-01']['rm_id'].unique()
print(f"Active rm_ids in 2024: {len(active_rm_ids)}")

for i, train_date in enumerate(train_dates):
    print(f"Processing date {i+1}/{len(train_dates)}: {train_date.date()}...")
    
    # skip inactive ones for speed
    for rm_id in active_rm_ids:
        
        # Get historical receivals before train_date
        hist = receivals[
            (receivals['rm_id'] == rm_id) &
            (receivals['date_arrival'] < train_date)
        ]
        
        if len(hist) == 0:
            continue
        
        # Pre-compute historical features
        cutoff_365 = train_date - timedelta(days=365)
        cutoff_90 = train_date - timedelta(days=90)
        
        recent_365 = hist[hist['date_arrival'] >= cutoff_365]
        recent_90 = hist[hist['date_arrival'] >= cutoff_90]
        
        # Historical features
        if len(recent_365) > 0:
            total_365 = recent_365['net_weight'].sum()
            avg_365 = recent_365['net_weight'].mean()
            count_365 = len(recent_365)
            days_since = (train_date - recent_365['date_arrival'].max()).days
        else:
            total_365 = avg_365 = count_365 = days_since = 0
        
        if len(recent_90) > 0:
            total_90 = recent_90['net_weight'].sum()
            count_90 = len(recent_90)
        else:
            total_90 = count_90 = 0
        
        daily_rate = total_365 / 365 if count_365 > 0 else 0
        
        # For each horizon
        for horizon in forecast_horizons:
            forecast_end = train_date + timedelta(days=horizon)
            
            # Get actual deliveries in window
            actual = receivals[
                (receivals['rm_id'] == rm_id) &
                (receivals['date_arrival'] >= train_date) &
                (receivals['date_arrival'] <= forecast_end)
            ]
            target = actual['net_weight'].sum()
            
            # Get future POs in window
            rm_products = hist['product_id'].unique()
            future_pos = purchase_orders[
                (purchase_orders['product_id'].isin(rm_products)) &
                (purchase_orders['delivery_date'] >= train_date) &
                (purchase_orders['delivery_date'] <= forecast_end) &
                (purchase_orders['status'] != 'Deleted')
            ]
            po_qty = future_pos['quantity'].sum() if len(future_pos) > 0 else 0
            po_count = len(future_pos)
            
            # Create sample
            training_data.append({
                'rm_id': rm_id,
                'forecast_horizon': horizon,
                'month': train_date.month,
                'quarter': train_date.quarter,
                'total_weight_365d': total_365,
                'avg_weight_365d': avg_365,
                'count_365d': count_365,
                'days_since_last': days_since,
                'total_weight_90d': total_90,
                'count_90d': count_90,
                'daily_rate_365d': daily_rate,
                'future_po_quantity': po_qty,
                'future_po_count': po_count,
                'target': target
            })

print(f"\nGenerated {len(training_data)} training samples")
train_df = pd.DataFrame(training_data)

print("\nTraining data statistics:")
print(train_df.describe())
print(f"\nSamples with target > 0: {(train_df['target'] > 0).sum()} ({(train_df['target'] > 0).sum() / len(train_df) * 100:.1f}%)")

# TRAINING

feature_cols = [c for c in train_df.columns if c != 'target']
X = train_df[feature_cols]
y = train_df['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")

# XGBoost with quantile regression
model = xgb.XGBRegressor(
    objective='reg:quantileerror',
    quantile_alpha=0.2,
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method='hist'  # raskere trening, fjern etterpå!
)

print("Training XGBoost (quantile=0.2)...")
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)


#PREDICTIOSNS
print("\n[4] MAKING PREDICTIONS")
print("-"*80)

forecast_start = pd.to_datetime('2025-01-01')

# Pre-compute features for all rm_ids
rm_features = {}

for rm_id in prediction_mapping['rm_id'].unique():
    hist = receivals[
        (receivals['rm_id'] == rm_id) &
        (receivals['date_arrival'] < forecast_start)
    ]
    
    if len(hist) == 0:
        rm_features[rm_id] = {
            'total_weight_365d': 0,
            'avg_weight_365d': 0,
            'count_365d': 0,
            'days_since_last': 999,
            'total_weight_90d': 0,
            'count_90d': 0,
            'daily_rate_365d': 0,
            'rm_products': []
        }
        continue
    
    cutoff_365 = forecast_start - timedelta(days=365)
    cutoff_90 = forecast_start - timedelta(days=90)
    
    recent_365 = hist[hist['date_arrival'] >= cutoff_365]
    recent_90 = hist[hist['date_arrival'] >= cutoff_90]
    
    if len(recent_365) > 0:
        total_365 = recent_365['net_weight'].sum()
        avg_365 = recent_365['net_weight'].mean()
        count_365 = len(recent_365)
        days_since = (forecast_start - recent_365['date_arrival'].max()).days
    else:
        total_365 = avg_365 = count_365 = days_since = 0
    
    if len(recent_90) > 0:
        total_90 = recent_90['net_weight'].sum()
        count_90 = len(recent_90)
    else:
        total_90 = count_90 = 0
    
    rm_features[rm_id] = {
        'total_weight_365d': total_365,
        'avg_weight_365d': avg_365,
        'count_365d': count_365,
        'days_since_last': days_since,
        'total_weight_90d': total_90,
        'count_90d': count_90,
        'daily_rate_365d': total_365 / 365 if count_365 > 0 else 0,
        'rm_products': hist['product_id'].unique()
    }

print(f"Pre-computed features for {len(rm_features)} rm_ids")

# Make predictions
predictions = []

for idx, row in prediction_mapping.iterrows():
    rm_id = row['rm_id']
    forecast_end = row['forecast_end_date']
    horizon = (forecast_end - forecast_start).days + 1
    
    # Get pre-computed features
    feat = rm_features[rm_id]
    
    # Get future POs
    future_pos = purchase_orders[
        (purchase_orders['product_id'].isin(feat['rm_products'])) &
        (purchase_orders['delivery_date'] >= forecast_start) &
        (purchase_orders['delivery_date'] <= forecast_end) &
        (purchase_orders['status'] != 'Deleted')
    ]
    po_qty = future_pos['quantity'].sum() if len(future_pos) > 0 else 0
    po_count = len(future_pos)
    
    # Create feature vector
    feature_dict = {
        'rm_id': rm_id,
        'forecast_horizon': horizon,
        'month': forecast_start.month,
        'quarter': forecast_start.quarter,
        'total_weight_365d': feat['total_weight_365d'],
        'avg_weight_365d': feat['avg_weight_365d'],
        'count_365d': feat['count_365d'],
        'days_since_last': feat['days_since_last'],
        'total_weight_90d': feat['total_weight_90d'],
        'count_90d': feat['count_90d'],
        'daily_rate_365d': feat['daily_rate_365d'],
        'future_po_quantity': po_qty,
        'future_po_count': po_count
    }
    
    feature_vector = pd.DataFrame([feature_dict])[feature_cols]
    pred = model.predict(feature_vector)[0]
    pred = max(0, pred)
    
    predictions.append({'ID': row['ID'], 'predicted_weight': pred})
    
    if (idx + 1) % 5000 == 0:
        print(f"Processed {idx + 1}/{len(prediction_mapping)}...")

predictions_df = pd.DataFrame(predictions)

print("\nPrediction statistics:")
print(predictions_df['predicted_weight'].describe())
print(f"Predictions > 0: {(predictions_df['predicted_weight'] > 0).sum()}")

# SUBMISSION
print("\n[5] CREATING SUBMISSION")
print("-"*80)

submission = sample_submission.copy()
submission['predicted_weight'] = predictions_df['predicted_weight'].values
submission.to_csv('xgboost_optimized_submission.csv', index=False)
print("Saved to 'xgboost_optimized_submission.csv'")

print("\n" + "="*80)
print("COMPLETE!")
print("="*80)


[1] DATA CLEANING
Clean receivals: 122383
Using 11 training dates x 5 horizons
Expected samples: ~3300 (only active rm_ids)
Active rm_ids in 2024: 60
Processing date 1/11: 2024-01-01...
Processing date 2/11: 2024-02-01...
Processing date 3/11: 2024-03-01...
Processing date 4/11: 2024-04-01...
Processing date 5/11: 2024-05-01...
Processing date 6/11: 2024-06-01...
Processing date 7/11: 2024-07-01...
Processing date 8/11: 2024-08-01...
Processing date 9/11: 2024-09-01...
Processing date 10/11: 2024-10-01...
Processing date 11/11: 2024-11-01...

Generated 2725 training samples

Training data statistics:
             rm_id  forecast_horizon        month      quarter  \
count  2725.000000       2725.000000  2725.000000  2725.000000   
mean   3082.660550         67.400000     6.264220     2.451376   
std     771.694638         49.879385     3.184245     1.077879   
min    2123.000000          7.000000     1.000000     1.000000   
25%    2143.000000         30.000000     4.000000     2.00000