In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# STEP 1: DATA LOADING AND INITIAL EXPLORATION


In [2]:
# STEP 1: DATA LOADING AND INITIAL EXPLORATION
print("\n🔍 STEP 1: Data Loading and Initial Exploration")
print("-" * 40)
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')
print("✅ Successfully loaded train.csv and test.csv!")
print(f"📈 Train dataset size: {train_df.shape}")
print(f"📈 Test dataset size: {test_df.shape}")
    
print("\n📋 First 3 rows of training data:")
print(train_df.head(3))
    
print("\n📊 Data types:")
print(train_df.dtypes)
    
print("\n📊 Training data info:")
print(train_df.info())
    
# Check if session_value exists in test data
has_target_in_test = 'session_value' in test_df.columns
print(f"\n🎯 Target variable in test set: {'Yes' if has_target_in_test else 'No'}")


🔍 STEP 1: Data Loading and Initial Exploration
----------------------------------------
✅ Successfully loaded train.csv and test.csv!
📈 Train dataset size: (141219, 7)
📈 Test dataset size: (62951, 6)

📋 First 3 rows of training data:
                  event_time event_type   product_id category_id      user_id  \
0  2025-06-19 10:23:07+00:00   ADD_CART  PROD_011223   CAT_00054  USER_097562   
1  2025-06-07 21:34:45+00:00   ADD_CART  PROD_005519   CAT_00144  USER_006535   
2  2025-06-21 21:29:09+00:00   ADD_CART  PROD_000577   CAT_00273  USER_047199   

     user_session  session_value  
0  SESSION_158779          90.29  
1  SESSION_029987          16.39  
2  SESSION_022134          64.27  

📊 Data types:
event_time        object
event_type        object
product_id        object
category_id       object
user_id           object
user_session      object
session_value    float64
dtype: object

📊 Training data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141219 entries, 0 to 14

In [3]:
print("\n📈 Session Value Statistics:")
print(train_df['session_value'].describe())
    
# Check for missing values
print(f"\n❓ Missing values in training data:")
print(train_df.isnull().sum())



📈 Session Value Statistics:
count    141219.000000
mean         75.348539
std         121.794683
min           5.380000
25%          23.780000
50%          40.950000
75%          86.440000
max        2328.660000
Name: session_value, dtype: float64

❓ Missing values in training data:
event_time       0
event_type       0
product_id       0
category_id      0
user_id          0
user_session     0
session_value    0
dtype: int64


# STEP 2: DATA CLEANING AND TRANSFORMATIONS


In [4]:
# STEP 2: DATA CLEANING AND TRANSFORMATIONS
print("\n🧹 STEP 2: Data Cleaning and Transformations")
print("-" * 40)

def clean_and_transform_data(df, is_train=True):
    """Clean and transform the dataset"""
    df_cleaned = df.copy()
    
    # Convert event_time to datetime
    df_cleaned['event_time'] = pd.to_datetime(df_cleaned['event_time'])
    print(f"✅ Converted event_time to datetime for {'train' if is_train else 'test'} set")
    
    # Extract numeric values from categorical columns
    df_cleaned['product_numeric'] = df_cleaned['product_id'].str.extract('(\d+)').astype(int)
    df_cleaned['category_numeric'] = df_cleaned['category_id'].str.extract('(\d+)').astype(int)
    df_cleaned['user_numeric'] = df_cleaned['user_id'].str.extract('(\d+)').astype(int)
    df_cleaned['session_numeric'] = df_cleaned['user_session'].str.extract('(\d+)').astype(int)
    
    print(f"✅ Extracted numeric values from categorical columns")
    
    return df_cleaned

# Clean both datasets
train_cleaned = clean_and_transform_data(train_df, is_train=True)
test_cleaned = clean_and_transform_data(test_df, is_train=False)


🧹 STEP 2: Data Cleaning and Transformations
----------------------------------------
✅ Converted event_time to datetime for train set
✅ Extracted numeric values from categorical columns
✅ Converted event_time to datetime for test set
✅ Extracted numeric values from categorical columns


# STEP 3: FEATURE ENGINEERING


In [5]:
# STEP 3: FEATURE ENGINEERING
print("\n⚙️ STEP 3: Feature Engineering")
print("-" * 40)

def create_features(df):
    """Create new features from existing data"""
    df_features = df.copy()
    
    # Time-based features
    df_features['hour'] = df_features['event_time'].dt.hour
    df_features['day_of_week'] = df_features['event_time'].dt.dayofweek
    df_features['day_of_month'] = df_features['event_time'].dt.day
    df_features['month'] = df_features['event_time'].dt.month
    df_features['is_weekend'] = (df_features['day_of_week'] >= 5).astype(int)
    
    # Time period categories
    def get_time_period(hour):
        if 6 <= hour < 12:
            return 'morning'
        elif 12 <= hour < 18:
            return 'afternoon'
        elif 18 <= hour < 22:
            return 'evening'
        else:
            return 'night'
    
    df_features['time_period'] = df_features['hour'].apply(get_time_period)
    
    # Advanced features
    # Product ID patterns (assuming higher numbers might indicate newer products)
    df_features['product_age_proxy'] = df_features['product_numeric'].max() - df_features['product_numeric']
    
    # User activity patterns (proxy based on user_numeric)
    df_features['user_activity_proxy'] = df_features['user_numeric'] % 100  # Simple proxy
    
    # Session patterns
    df_features['session_activity_proxy'] = df_features['session_numeric'] % 1000  # Simple proxy
    
    print("✅ Created new features:")
    new_features = ['hour', 'day_of_week', 'day_of_month', 'month', 'is_weekend', 
                   'time_period', 'product_age_proxy', 'user_activity_proxy', 'session_activity_proxy']
    for i, feature in enumerate(new_features, 1):
        print(f"   {i}. {feature}")
    
    return df_features

# Create features for both datasets
train_with_features = create_features(train_cleaned)
test_with_features = create_features(test_cleaned)



⚙️ STEP 3: Feature Engineering
----------------------------------------
✅ Created new features:
   1. hour
   2. day_of_week
   3. day_of_month
   4. month
   5. is_weekend
   6. time_period
   7. product_age_proxy
   8. user_activity_proxy
   9. session_activity_proxy
✅ Created new features:
   1. hour
   2. day_of_week
   3. day_of_month
   4. month
   5. is_weekend
   6. time_period
   7. product_age_proxy
   8. user_activity_proxy
   9. session_activity_proxy


# STEP 4: EXPLORATORY DATA ANALYSIS (EDA)


In [6]:
# STEP 4: EXPLORATORY DATA ANALYSIS (EDA)
print("\n📊 STEP 4: Exploratory Data Analysis (EDA)")
print("-" * 40)

if 'session_value' in train_with_features.columns:
    print("🔍 Session Value Distribution:")
    print(f"   • Minimum: {train_with_features['session_value'].min():.2f}")
    print(f"   • Maximum: {train_with_features['session_value'].max():.2f}")
    print(f"   • Mean: {train_with_features['session_value'].mean():.2f}")
    print(f"   • Median: {train_with_features['session_value'].median():.2f}")
    print(f"   • Standard deviation: {train_with_features['session_value'].std():.2f}")
    
    print("\n⏰ Hourly distribution:")
    if len(train_with_features) > 1:
        hourly_stats = train_with_features.groupby('hour')['session_value'].agg(['mean', 'count']).round(2)
        print(hourly_stats.head())
    
    print("\n📅 Day of week distribution:")
    if len(train_with_features) > 1:
        daily_stats = train_with_features.groupby('day_of_week')['session_value'].agg(['mean', 'count']).round(2)
        day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        if not daily_stats.empty:
            daily_stats.index = [day_names[i] if i < len(day_names) else f'Day_{i}' for i in daily_stats.index]
            print(daily_stats.head())
    
    print("\n🏷️ Category distribution:")
    if len(train_with_features) > 1:
        category_stats = train_with_features.groupby('category_numeric')['session_value'].agg(['mean', 'count']).round(2)
        print(category_stats.head())


📊 STEP 4: Exploratory Data Analysis (EDA)
----------------------------------------
🔍 Session Value Distribution:
   • Minimum: 5.38
   • Maximum: 2328.66
   • Mean: 75.35
   • Median: 40.95
   • Standard deviation: 121.79

⏰ Hourly distribution:
       mean  count
hour              
0     80.41   1516
1     76.73   1424
2     74.41   1709
3     72.55   2172
4     62.04   2849

📅 Day of week distribution:
            mean  count
Monday     73.54  23695
Tuesday    74.42  19619
Wednesday  78.44  18139
Thursday   70.39  15846
Friday     69.78  20348

🏷️ Category distribution:
                   mean  count
category_numeric              
1                 65.10    410
2                 90.73    415
3                 69.55    478
4                 94.23   1551
5                 26.91      9


# STEP 5: FEATURE SELECTION AND PREPARATION


In [7]:
# STEP 5: FEATURE SELECTION AND PREPARATION
print("\n🎯 STEP 5: Feature Selection and Model Preparation")
print("-" * 40)

# Define target variable
target = 'session_value'
print(f"🎯 Target variable: {target}")

# Define numerical features
numerical_features = [
    'hour', 'day_of_week', 'day_of_month', 'month', 'is_weekend',
    'product_numeric', 'category_numeric', 'user_numeric', 'session_numeric',
    'product_age_proxy', 'user_activity_proxy', 'session_activity_proxy'
]

print(f"📊 Numerical features ({len(numerical_features)} features):")
for i, feature in enumerate(numerical_features, 1):
    print(f"   {i}. {feature}")

# Encode categorical features
le_time = LabelEncoder()
train_with_features['time_period_encoded'] = le_time.fit_transform(train_with_features['time_period'])
test_with_features['time_period_encoded'] = le_time.transform(test_with_features['time_period'])
numerical_features.append('time_period_encoded')

print(f"🔤 Encoded categorical feature: time_period -> time_period_encoded")


🎯 STEP 5: Feature Selection and Model Preparation
----------------------------------------
🎯 Target variable: session_value
📊 Numerical features (12 features):
   1. hour
   2. day_of_week
   3. day_of_month
   4. month
   5. is_weekend
   6. product_numeric
   7. category_numeric
   8. user_numeric
   9. session_numeric
   10. product_age_proxy
   11. user_activity_proxy
   12. session_activity_proxy
🔤 Encoded categorical feature: time_period -> time_period_encoded


In [8]:
# Create feature matrix and target vector
X_train = train_with_features[numerical_features].copy()
y_train = train_with_features[target].copy() if target in train_with_features.columns else None

X_test = test_with_features[numerical_features].copy()
y_test = test_with_features[target].copy() if target in test_with_features.columns else None

print(f"\n✅ Training feature matrix ready: {X_train.shape}")
print(f"✅ Test feature matrix ready: {X_test.shape}")

if y_train is not None:
    print(f"✅ Training target vector ready: {y_train.shape}")
if y_test is not None:
    print(f"✅ Test target vector ready: {y_test.shape}")

# Correlation analysis
if y_train is not None and len(X_train) > 1:
    print("\n🔗 Feature correlation with target:")
    correlations = X_train.corrwith(y_train).sort_values(key=abs, ascending=False)
    print(correlations.round(3).head(10))


✅ Training feature matrix ready: (141219, 13)
✅ Test feature matrix ready: (62951, 13)
✅ Training target vector ready: (141219,)

🔗 Feature correlation with target:
user_numeric          -0.093
user_activity_proxy    0.041
category_numeric      -0.037
is_weekend             0.025
day_of_week            0.020
day_of_month           0.018
time_period_encoded   -0.018
product_numeric       -0.008
product_age_proxy      0.008
session_numeric        0.005
dtype: float64


# STEP 6: DATA NORMALIZATION


In [9]:
print("\n📏 STEP 6: Data Normalization")
print("-" * 40)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=numerical_features)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=numerical_features)

print("✅ Features scaled using StandardScaler")
print(f"📊 Scaled training data shape: {X_train_scaled_df.shape}")
print(f"📊 Scaled test data shape: {X_test_scaled_df.shape}")

print("\nSample normalized values (first 3 rows):")
print(X_train_scaled_df.head(3).round(3))


📏 STEP 6: Data Normalization
----------------------------------------
✅ Features scaled using StandardScaler
📊 Scaled training data shape: (141219, 13)
📊 Scaled test data shape: (62951, 13)

Sample normalized values (first 3 rows):
    hour  day_of_week  day_of_month  month  is_weekend  product_numeric  \
0 -0.512        0.000         1.393    0.0      -0.668           -0.357   
1  1.480        0.963        -0.582    0.0       1.497           -0.950   
2  1.480        0.963         1.722    0.0       1.497           -1.463   

   category_numeric  user_numeric  session_numeric  product_age_proxy  \
0            -0.746         1.871            1.164              0.357   
1            -0.030        -1.306           -1.185              0.950   
2             0.997         0.113           -1.328              1.463   

   user_activity_proxy  session_activity_proxy  time_period_encoded  
0                0.420                   0.976                0.693  
1               -0.513           

# STEP 7: MODEL TRAINING AND EVALUATION

In [10]:
print("\n🤖 STEP 7: Model Training and Evaluation")
print("-" * 40)

models = {}
predictions = {}

# =========================
# Model 1: Linear Regression
# =========================
print("📈 Model 1: Linear Regression")
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
models['Linear Regression'] = lr_model

# Predictions
lr_train_pred = lr_model.predict(X_train_scaled)
lr_test_pred = lr_model.predict(X_test_scaled)
predictions['Linear Regression'] = lr_test_pred

# Metrics - Train
print(f"   Training Metrics:")
print(f"   • MSE: {mean_squared_error(y_train, lr_train_pred):.2f}")
print(f"   • RMSE: {np.sqrt(mean_squared_error(y_train, lr_train_pred)):.2f}")
print(f"   • MAE: {mean_absolute_error(y_train, lr_train_pred):.2f}")
print(f"   • R²: {r2_score(y_train, lr_train_pred):.3f}")

# Metrics - Test
if y_test is not None:
    print(f"   Test Metrics:")
    print(f"   • MSE: {mean_squared_error(y_test, lr_test_pred):.2f}")
    print(f"   • RMSE: {np.sqrt(mean_squared_error(y_test, lr_test_pred)):.2f}")
    print(f"   • MAE: {mean_absolute_error(y_test, lr_test_pred):.2f}")
    print(f"   • R²: {r2_score(y_test, lr_test_pred):.3f}")




🤖 STEP 7: Model Training and Evaluation
----------------------------------------
📈 Model 1: Linear Regression
   Training Metrics:
   • MSE: 14641.81
   • RMSE: 121.00
   • MAE: 60.49
   • R²: 0.013


In [12]:
# =========================
# Model 2: Random Forest
# =========================
print("\n🌳 Model 2: Random Forest")
rf_model = RandomForestRegressor(
    n_estimators=100, 
    max_depth=7, 
    random_state=42,
    min_samples_split=3,
    min_samples_leaf=2
)
rf_model.fit(X_train_scaled, y_train)
models['Random Forest'] = rf_model

# Predictions
rf_train_pred = rf_model.predict(X_train_scaled)
rf_test_pred = rf_model.predict(X_test_scaled)
predictions['Random Forest'] = rf_test_pred

# Metrics - Train
print(f"   Training Metrics:")
print(f"   • MSE: {mean_squared_error(y_train, rf_train_pred):.2f}")
print(f"   • RMSE: {np.sqrt(mean_squared_error(y_train, rf_train_pred)):.2f}")
print(f"   • MAE: {mean_absolute_error(y_train, rf_train_pred):.2f}")
print(f"   • R²: {r2_score(y_train, rf_train_pred):.3f}")

# Metrics - Test
if y_test is not None:
    print(f"   Test Metrics:")
    print(f"   • MSE: {mean_squared_error(y_test, rf_test_pred):.2f}")
    print(f"   • RMSE: {np.sqrt(mean_squared_error(y_test, rf_test_pred)):.2f}")
    print(f"   • MAE: {mean_absolute_error(y_test, rf_test_pred):.2f}")
    print(f"   • R²: {r2_score(y_test, rf_test_pred):.3f}")

# =========================
# Random Forest Feature Importance
# =========================
print("\n🔍 Random Forest - Feature Importance:")
feature_importance = pd.DataFrame({
    'feature': numerical_features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

for idx, row in feature_importance.head(10).iterrows():
    print(f"   {row['feature']}: {row['importance']:.3f}")


🌳 Model 2: Random Forest
   Training Metrics:
   • MSE: 7230.91
   • RMSE: 85.03
   • MAE: 53.25
   • R²: 0.513

🔍 Random Forest - Feature Importance:
   user_numeric: 0.756
   session_activity_proxy: 0.088
   user_activity_proxy: 0.067
   session_numeric: 0.051
   day_of_month: 0.009
   day_of_week: 0.009
   category_numeric: 0.009
   hour: 0.005
   time_period_encoded: 0.003
   product_age_proxy: 0.002


# STEP 8: PREDICTION RESULTS

In [13]:
# STEP 8: PREDICTION RESULTS
print("\n🎯 STEP 8: Prediction Results")
print("-" * 40)

if predictions:
    print("📊 Test Set Predictions:")
    results_df = pd.DataFrame({
        'Index': range(len(X_test)),
        'Linear_Regression': predictions.get('Linear Regression', []),
        'Random_Forest': predictions.get('Random Forest', [])
    })
    
    if y_test is not None:
        results_df['Actual'] = y_test.values
        results_df['LR_Error'] = abs(results_df['Actual'] - results_df['Linear_Regression'])
        results_df['RF_Error'] = abs(results_df['Actual'] - results_df['Random_Forest'])
    
    print(results_df.round(2))
    
    # Save predictions
    print("\n💾 Saving predictions...")
    results_df.to_csv('predictions.csv', index=False)
    print("✅ Predictions saved to 'predictions.csv'")


🎯 STEP 8: Prediction Results
----------------------------------------
📊 Test Set Predictions:
       Index  Linear_Regression  Random_Forest
0          0              68.21          71.59
1          1              90.74          90.78
2          2              88.08          75.04
3          3              90.49          79.00
4          4              83.38          70.65
...      ...                ...            ...
62946  62946              61.14          69.76
62947  62947              60.92          69.97
62948  62948              90.27          74.29
62949  62949             108.19          84.26
62950  62950             108.63          79.81

[62951 rows x 3 columns]

💾 Saving predictions...
✅ Predictions saved to 'predictions.csv'
