# Purchase Value Prediction - Data Exploration

This notebook provides exploratory data analysis for the purchase value prediction dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plotting configuration shared by every figure in this notebook.
# Start from matplotlib's built-in default style sheet, then set the
# matplotlib color cycle from seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# IPython magic (not Python syntax): render matplotlib figures inline in the
# notebook output area.
%matplotlib inline

## 1. Load Data

In [None]:
# Load the datasets from a single raw-data directory.
# The path is relative to the kernel's working directory; edit RAW_DATA_DIR
# here if the notebook is run from somewhere other than its own folder.
RAW_DATA_DIR = Path('../data/raw')

train_df = pd.read_csv(RAW_DATA_DIR / 'train_data.csv')
test_df = pd.read_csv(RAW_DATA_DIR / 'test_data.csv')
sample_submission = pd.read_csv(RAW_DATA_DIR / 'sample_submission.csv')

# Print (rows, columns) of each frame as a quick sanity check on the load.
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Sample submission shape: {sample_submission.shape}")

## 2. Basic Dataset Overview

In [None]:
# Structural overview of the training frame: dtypes and non-null counts via
# DataFrame.info(), followed by a preview of the first rows.
print("Training Data Info:")
train_df.info()

separator = "=" * 50
print(f"\n{separator}\n")

print("First few rows:")
# Kept as the cell's final expression so the notebook renders the table.
train_df.head()

## 3. Target Variable Analysis

In [None]:
# Summary statistics for the target column, plus how many rows are exactly
# zero versus strictly positive.
target = train_df['purchaseValue']
is_zero = target.eq(0)
is_positive = target.gt(0)

print("Purchase Value Statistics:")
print(target.describe())
print(f"\nZeros: {is_zero.sum()} ({is_zero.mean()*100:.1f}%)")
print(f"Non-zeros: {is_positive.sum()} ({is_positive.mean()*100:.1f}%)")

In [None]:
# Four views of the target on a 2x2 grid:
#   left column  - histogram and box plot over every row
#   right column - log1p histogram and box plot over the positive values only
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_hist, ax_log_hist), (ax_box, ax_box_positive) = axes

# Top-left: histogram of the full target.
ax_hist.hist(target, bins=50, alpha=0.7, edgecolor='black')
ax_hist.set(
    title='Purchase Value Distribution (Full)',
    xlabel='Purchase Value',
    ylabel='Frequency',
)

# Bottom-left: box plot of the full target.
ax_box.boxplot(target, vert=True)
ax_box.set(title='Purchase Value Box Plot', ylabel='Purchase Value')

# Right column: drawn only when at least one positive value exists.
non_zero_target = target[target > 0]
if not non_zero_target.empty:
    # Top-right: histogram of log(1 + value) for the positive values.
    ax_log_hist.hist(np.log1p(non_zero_target), bins=50, alpha=0.7, edgecolor='black')
    ax_log_hist.set(
        title='Purchase Value Distribution (Log Scale, Non-Zero)',
        xlabel='Log(Purchase Value + 1)',
        ylabel='Frequency',
    )

    # Bottom-right: box plot of the positive values.
    ax_box_positive.boxplot(non_zero_target, vert=True)
    ax_box_positive.set(
        title='Purchase Value Box Plot (Non-Zero)',
        ylabel='Purchase Value',
    )

plt.tight_layout()
plt.show()

## 4. Missing Values Analysis

In [None]:
# Per-column missing-value counts for both splits, largest first.
missing_train = train_df.isna().sum().sort_values(ascending=False)
missing_test = test_df.isna().sum().sort_values(ascending=False)

# Same report for each split: columns with at least one missing value,
# followed by the grand total.
missing_reports = (
    ("Missing Values in Training Data:", missing_train),
    ("\nMissing Values in Test Data:", missing_test),
)
for heading, missing_counts in missing_reports:
    print(heading)
    print(missing_counts[missing_counts > 0])
    print(f"\nTotal missing values: {missing_counts.sum()}")

## 5. Feature Types Analysis

In [None]:
# Split the feature columns by dtype. The target column is left out of the
# numeric list so that list contains predictors only.
numeric_cols = [
    name
    for name in train_df.select_dtypes(include=[np.number]).columns
    if name != 'purchaseValue'
]
categorical_cols = list(train_df.select_dtypes(include=['object']).columns)

# Only the first few names of each kind are listed to keep the output short.
preview_limit = 10

print(f"Numeric columns ({len(numeric_cols)}):")
for name in numeric_cols[:preview_limit]:
    print(f"  - {name}")
hidden_numeric = len(numeric_cols) - preview_limit
if hidden_numeric > 0:
    print(f"  ... and {hidden_numeric} more")

print(f"\nCategorical columns ({len(categorical_cols)}):")
for name in categorical_cols[:preview_limit]:
    # Cardinality is shown next to each categorical column name.
    print(f"  - {name} ({train_df[name].nunique()} unique values)")
hidden_categorical = len(categorical_cols) - preview_limit
if hidden_categorical > 0:
    print(f"  ... and {hidden_categorical} more")

## 6. Key Categorical Features Analysis

In [None]:
# Bar charts of the ten most frequent values for a few key categorical
# columns, one column per panel of a 2x2 grid.
key_categorical = ['browser', 'deviceType', 'userChannel', 'locationCountry']

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

for i, col in enumerate(key_categorical):
    ax = axes[i]

    # A column may be absent from this dataset, or contain only missing
    # values (value_counts() ignores NaN and then returns an empty Series).
    if col in train_df.columns:
        value_counts = train_df[col].value_counts().head(10)
    else:
        value_counts = None

    if value_counts is None or value_counts.empty:
        # Label and blank the panel rather than leaving an empty, untitled
        # frame that looks like a plotting failure.
        ax.set_title(f'{col} (no data available)')
        ax.axis('off')
        continue

    value_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'{col} Distribution (Top 10)')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 7. Numeric Features Analysis

In [None]:
# Histograms of a few key numeric columns, one column per panel of a 2x2 grid.
key_numeric = ['pageViews', 'totalHits', 'sessionNumber', 'totals.visits']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for i, col in enumerate(key_numeric):
    ax = axes[i]

    # A column may be absent from this dataset, or contain only missing values.
    if col in train_df.columns:
        observed = train_df[col].dropna()
    else:
        observed = None

    if observed is None or observed.empty:
        # Label and blank the panel rather than leaving an empty, untitled
        # frame that looks like a plotting failure.
        ax.set_title(f'{col} (no data available)')
        ax.axis('off')
        continue

    # Drop (not clip) everything above the 99th percentile so a handful of
    # extreme values does not flatten the rest of the histogram.
    q99 = observed.quantile(0.99)
    below_q99 = observed[observed <= q99]

    ax.hist(below_q99, bins=30, alpha=0.7, edgecolor='black')
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 8. Purchase Value by Key Categories

In [None]:
# Mean purchase value per category for a few key categorical columns, one
# column per row of a single-column figure.
categorical_analysis = ['userChannel', 'deviceType', 'browser']

# Categories with fewer samples than this are hidden because their mean is
# too noisy to compare; at most `top_n` categories are drawn per panel.
min_samples = 100
top_n = 10

# squeeze=False always returns a 2-D array of axes, so a single-entry list
# needs no special case; ravel() flattens it to one axis per column.
fig, axes = plt.subplots(
    len(categorical_analysis),
    1,
    figsize=(12, 4*len(categorical_analysis)),
    squeeze=False,
)
axes = axes.ravel()

for i, col in enumerate(categorical_analysis):
    ax = axes[i]

    if col in train_df.columns:
        # Mean purchase value and sample count per category, highest mean first.
        grouped = (
            train_df.groupby(col)['purchaseValue']
            .agg(['mean', 'count'])
            .sort_values('mean', ascending=False)
        )
        grouped_filtered = grouped[grouped['count'] >= min_samples].head(top_n)
    else:
        grouped_filtered = None

    if grouped_filtered is None or grouped_filtered.empty:
        # Column missing, or no category reaches the sample threshold: label
        # and blank the panel rather than leaving an empty, untitled frame.
        ax.set_title(f'Average Purchase Value by {col} (no data available)')
        ax.axis('off')
        continue

    grouped_filtered['mean'].plot(kind='bar', ax=ax)
    ax.set_title(f'Average Purchase Value by {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Average Purchase Value')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 9. Correlation Analysis

In [None]:
# Correlation of every numeric column with the target.
numeric_features = train_df.select_dtypes(include=[np.number]).columns
correlation_matrix = train_df[numeric_features].corr()

# Target correlations ordered by absolute strength (sign is kept in the values).
target_corr = (
    correlation_matrix['purchaseValue']
    .drop('purchaseValue')
    .sort_values(key=abs, ascending=False)
)

# Undefined correlations (NaN, e.g. for a column with zero variance) carry no
# ranking information, so they are left out of the "top N" views below.
ranked_corr = target_corr.dropna()
top_corr = ranked_corr.head(20)

if top_corr.empty:
    print("No numeric feature has a defined correlation with Purchase Value.")
else:
    fig, ax = plt.subplots(figsize=(10, 8))
    top_corr.plot(kind='barh', ax=ax)
    # barh draws the first row at the bottom; flip the axis so the strongest
    # correlation is at the top of the chart.
    ax.invert_yaxis()
    ax.set_title('Top 20 Features - Correlation with Purchase Value')
    ax.set_xlabel('Correlation Coefficient')
    fig.tight_layout()
    plt.show()

print("Top 10 Correlated Features with Purchase Value:")
print(ranked_corr.head(10))

## 10. Summary and Next Steps

In [None]:
# Closing recap. The headline numbers are recomputed from train_df / test_df;
# the feature-type counts reuse numeric_cols and categorical_cols built in the
# "Feature Types Analysis" cell, so that cell must have been run first.
n_train_rows, n_train_cols = train_df.shape
purchase_values = train_df['purchaseValue']
zero_purchases = purchase_values == 0

print("=== DATASET SUMMARY ===")
print(f"Training samples: {n_train_rows:,}")
print(f"Test samples: {test_df.shape[0]:,}")
# One column is the target, so it is not counted as a feature.
print(f"Total features: {n_train_cols-1}")
print(f"Numeric features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")
print(f"Missing values: {train_df.isnull().sum().sum():,}")
print(f"Target variable range: {purchase_values.min():.2f} - {purchase_values.max():.2f}")
print(f"Zero purchase values: {zero_purchases.sum():,} ({zero_purchases.mean()*100:.1f}%)")

# Static narrative text: these lines are fixed strings, not computed from the data.
key_insights = [
    "\n=== KEY INSIGHTS ===",
    "1. This is a highly imbalanced regression problem with many zero purchase values",
    "2. Mix of numerical and categorical features from web analytics",
    "3. Some features have missing values that need handling",
    "4. Geographic, device, and traffic source features may be important predictors",
]
print("\n".join(key_insights))

next_steps = [
    "\n=== NEXT STEPS ===",
    "1. Handle missing values appropriately",
    "2. Encode categorical variables",
    "3. Consider feature engineering for web analytics data",
    "4. Try different models suitable for imbalanced regression",
    "5. Use appropriate evaluation metrics for imbalanced data",
]
print("\n".join(next_steps))