In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: seaborn grid theme plus figure size / font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

print("Libraries imported successfully!")

## 1. Load and Inspect Dataset

In [None]:
# Read the raw CSV into `df` and report its shape, column names and memory footprint
df = pd.read_csv('data/logistics_dataset.csv')

column_names = list(df.columns)
memory_mb = df.memory_usage(deep=True).sum() / 1024**2  # bytes -> MiB

print(f"Dataset Shape: {df.shape}")
print(f"\nColumns ({len(column_names)}): {column_names}")
print(f"\nMemory Usage: {memory_mb:.2f} MB")

In [None]:
# Preview the first 10 rows of the loaded frame (rendered as the cell's output)
df.head(10)

In [None]:
# Print the frame's structural summary (per-column dtype and non-null count)
df.info()

In [None]:
# Descriptive statistics for the frame (pandas summarises numeric columns by default)
df.describe()

## 2. Missing Values Analysis

In [None]:
# Tally nulls per column, then keep only the columns that actually contain some
null_counts = df.isnull().sum()
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(df) * 100).round(2),
})
has_missing = missing_data['Missing_Count'] > 0
missing_data = missing_data.loc[has_missing].sort_values('Missing_Count', ascending=False)

if missing_data.empty:
    print("✅ No missing values detected in the dataset!")
else:
    print("Missing Values Found:")
    print(missing_data)

    # Horizontal bars: one per affected column, length = share of rows missing
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(missing_data['Column'], missing_data['Missing_Percentage'])
    ax.set_xlabel('Missing Percentage (%)')
    ax.set_title('Missing Values by Feature')
    fig.tight_layout()
    plt.show()

## 3. Target Variable Analysis (KPI_score)

In [None]:
# Headline descriptive statistics for the prediction target
target = 'KPI_score'
target_series = df[target]

print(f"Target Variable: {target}")
target_stats = [
    ("Mean", target_series.mean()),
    ("Median", target_series.median()),
    ("Std Dev", target_series.std()),
    ("Min", target_series.min()),
    ("Max", target_series.max()),
    ("Skewness", target_series.skew()),
    ("Kurtosis", target_series.kurtosis()),
]
for stat_label, stat_value in target_stats:
    print(f"{stat_label}: {stat_value:.4f}")

In [None]:
# Visualize target distribution: histogram, box plot and normal Q-Q plot side by side.
# Missing KPI values are dropped once up front: NaNs break or distort these plots,
# while the mean/median shown in the legend already ignore them.
target_values = df[target].dropna()
target_mean = target_values.mean()
target_median = target_values.median()

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
hist_ax, box_ax, qq_ax = axes

# Histogram with mean / median reference lines
hist_ax.hist(target_values, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(target_mean, color='red', linestyle='--', label=f'Mean: {target_mean:.3f}')
hist_ax.axvline(target_median, color='green', linestyle='--', label=f'Median: {target_median:.3f}')
hist_ax.set_xlabel('KPI Score')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Target Distribution (Histogram)')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Box plot
box_ax.boxplot(target_values, vert=True)
box_ax.set_ylabel('KPI Score')
box_ax.set_title('Target Distribution (Box Plot)')
box_ax.grid(True, alpha=0.3)

# Q-Q plot against a normal distribution
stats.probplot(target_values, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot (Normality Check)')
qq_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Categorical Features Analysis

In [None]:
# Columns treated as categorical; each is summarised only if it exists in the frame
categorical_cols = ['category', 'zone', 'storage_location_id', 'item_id']

for col in (name for name in categorical_cols if name in df.columns):
    col_values = df[col]
    print(f"\n{col}:")
    print(f"  Unique values: {col_values.nunique()}")
    print(f"  Value counts:\n{col_values.value_counts().head(10)}")

In [None]:
# One row of panels per categorical feature: KPI box plot (left), mean KPI bars (right)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

for row, col in enumerate(['category', 'zone']):
    if col not in df.columns:
        continue
    box_ax = axes[2 * row]
    bar_ax = axes[2 * row + 1]

    # Spread of KPI within each level
    df.boxplot(column='KPI_score', by=col, ax=box_ax)
    box_ax.set_title(f'KPI Score by {col}')
    box_ax.set_xlabel(col)
    box_ax.set_ylabel('KPI Score')
    plt.sca(box_ax)
    plt.xticks(rotation=45)

    # Average KPI per level, highest first
    mean_kpi = df.groupby(col)['KPI_score'].mean().sort_values(ascending=False)
    bar_positions = range(len(mean_kpi))
    bar_ax.bar(bar_positions, mean_kpi.values)
    bar_ax.set_xticks(bar_positions)
    bar_ax.set_xticklabels(mean_kpi.index, rotation=45)
    bar_ax.set_ylabel('Mean KPI Score')
    bar_ax.set_title(f'Average KPI Score by {col}')
    bar_ax.grid(True, alpha=0.3)

# Blank out the figure-level title that the grouped box plots add
plt.suptitle('')
plt.tight_layout()
plt.show()

## 5. Numerical Features Analysis

In [None]:
# Select numerical features (excluding ID columns and target).
# Identifier columns may be stored as integers, in which case select_dtypes would
# pick them up; they are labels rather than measurements, so they are filtered out
# by name together with the target. Names that are absent or non-numeric are
# simply not matched.
id_cols = ['storage_location_id', 'item_id']
excluded_cols = set(id_cols) | {'KPI_score'}

numerical_cols = [
    col for col in df.select_dtypes(include=[np.number]).columns
    if col not in excluded_cols
]

print(f"Numerical Features ({len(numerical_cols)}):")
print(numerical_cols)

In [None]:
# Distribution of numerical features.
# The grid is sized from the number of features (4 panels per row) so that no
# feature is silently left out and no empty frames are drawn.
n_grid_cols = 4
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols,
    figsize=(20, 4 * n_grid_rows),
    squeeze=False,  # always a 2-D array, even for a single row
)
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    # Drop missing values before binning; skewness already ignores them
    ax.hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(f'{col}\nSkew: {df[col].skew():.2f}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

# Hide panels left over in the last row
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Correlation Analysis

In [None]:
# Correlation of every numerical feature with the target.
# `correlation_with_target` keeps the target itself in position 0 followed by the
# features in descending order of correlation. The target is placed there
# explicitly instead of relying on the sort, so a feature that ties at 1.0 cannot
# displace it and positional slices such as `[1:]` still mean "features only".
target_corr = df[numerical_cols + ['KPI_score']].corr()['KPI_score']
feature_corr = target_corr.drop('KPI_score').sort_values(ascending=False)
correlation_with_target = target_corr.reindex(['KPI_score'] + feature_corr.index.tolist())

print("Correlation with Target (KPI_score):")
print(correlation_with_target)

# Visualize feature correlations (the target's self-correlation is left out)
plt.figure(figsize=(10, 8))
feature_corr.plot(kind='barh')
plt.xlabel('Correlation Coefficient')
plt.title('Feature Correlation with KPI Score')
plt.axvline(x=0, color='black', linestyle='--', linewidth=0.5)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations across all numerical features plus the target, as a heatmap
correlation_matrix = df[numerical_cols + ['KPI_score']].corr()

plt.figure(figsize=(18, 14))
sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title('Feature Correlation Heatmap', fontsize=16, pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Identify highly correlated feature pairs (multicollinearity check).
# Multicollinearity concerns predictors only, so the target column is left out of
# the scan: a feature that tracks the target closely is not a redundant pair.
corr_threshold = 0.85
feature_names = [name for name in correlation_matrix.columns if name != 'KPI_score']

high_corr_pairs = []
for i, first in enumerate(feature_names):
    # Only look at features after `first` so each unordered pair is visited once
    for second in feature_names[i + 1:]:
        pair_corr = correlation_matrix.loc[first, second]
        if abs(pair_corr) > corr_threshold:  # NaN correlations compare False
            high_corr_pairs.append({
                'Feature_1': first,
                'Feature_2': second,
                'Correlation': pair_corr
            })

# Strongest relationships first, regardless of sign
high_corr_pairs.sort(key=lambda pair: abs(pair['Correlation']), reverse=True)

if high_corr_pairs:
    print(f"⚠️ Highly Correlated Feature Pairs (|corr| > {corr_threshold}):")
    high_corr_df = pd.DataFrame(high_corr_pairs)
    print(high_corr_df)
else:
    print(f"✅ No highly correlated feature pairs found (threshold: {corr_threshold})")

## 7. Outlier Detection

In [None]:
# Detect outliers using IQR method
def detect_outliers_iqr(data, column):
    """Count the values of one column that fall outside the 1.5*IQR fences.

    Args:
        data: DataFrame holding the column to inspect.
        column: Name of a numeric column in ``data``.

    Returns:
        A ``(count, percentage)`` tuple: the number of rows whose value lies below
        ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR``, and that count as a percentage
        of ``len(data)``. Missing values never count as outliers but still count
        towards the denominator. An empty frame yields ``(0, 0.0)``.
    """
    n_rows = len(data)
    if n_rows == 0:
        # Nothing to measure; avoids dividing by zero below
        return 0, 0.0

    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # A boolean mask is enough to count; no need to materialise the outlier rows
    is_outlier = (values < lower_bound) | (values > upper_bound)
    n_outliers = int(is_outlier.sum())
    return n_outliers, (n_outliers / n_rows) * 100

# One summary record per numerical feature, then rank features by outlier count
outlier_summary = [
    {
        'Feature': feature,
        'Outlier_Count': n_outliers,
        'Outlier_Percentage': round(pct_outliers, 2)
    }
    for feature, (n_outliers, pct_outliers) in (
        (name, detect_outliers_iqr(df, name)) for name in numerical_cols
    )
]

outlier_df = pd.DataFrame(outlier_summary).sort_values('Outlier_Count', ascending=False)
print("Outlier Summary (IQR Method):")
print(outlier_df.head(15))

In [None]:
# Visualize outliers for top features: box plots of the (up to) 8 features with
# the most IQR outliers, each titled with its outlier percentage.
top_outliers = outlier_df.head(8)
top_outlier_features = top_outliers['Feature'].tolist()
top_outlier_pcts = top_outliers['Outlier_Percentage'].tolist()

fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for ax, col, pct in zip(axes, top_outlier_features, top_outlier_pcts):
    # Missing values are dropped so they cannot distort the box statistics
    ax.boxplot(df[col].dropna(), vert=True)
    ax.set_ylabel(col)
    ax.set_title(f'{col}\n{pct:.1f}% outliers')
    ax.grid(True, alpha=0.3)

# Fewer than 8 features: hide the panels that were not used
for ax in axes[len(top_outlier_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 8. Date Feature Analysis

In [None]:
# Parse the restock date and derive calendar features from it
restock_dates = pd.to_datetime(df['last_restock_date'])
df['last_restock_date'] = restock_dates

# New column name -> datetime accessor attribute it is read from
calendar_parts = (
    ('restock_year', 'year'),
    ('restock_month', 'month'),
    ('restock_day_of_week', 'dayofweek'),
    ('restock_day', 'day'),
)
for new_col, dt_attr in calendar_parts:
    df[new_col] = getattr(restock_dates.dt, dt_attr)

# Age of each restock in days, measured against the most recent date in the data
earliest_date = restock_dates.min()
reference_date = restock_dates.max()
df['days_since_restock'] = (reference_date - restock_dates).dt.days

print(f"Date Range: {earliest_date} to {reference_date}")
print(f"\nDays Since Restock Statistics:")
print(df['days_since_restock'].describe())

In [None]:
# Visualize date features: restock counts by month and weekday, the distribution
# of restock age, and restock age against the KPI.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Restocks by month
df['restock_month'].value_counts().sort_index().plot(kind='bar', ax=axes[0, 0])
axes[0, 0].set_xlabel('Month')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_title('Restocks by Month')
axes[0, 0].grid(True, alpha=0.3)

# Restocks by day of week.
# Reindex onto all seven weekday codes (0=Mon ... 6=Sun) so a weekday with no
# restocks still gets a zero-height bar; otherwise the seven fixed labels below
# would not line up with the bars actually drawn.
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
weekday_counts = df['restock_day_of_week'].value_counts().reindex(range(7), fill_value=0)
weekday_counts.plot(kind='bar', ax=axes[0, 1])
axes[0, 1].set_xticklabels(day_names, rotation=45)
axes[0, 1].set_xlabel('Day of Week')
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_title('Restocks by Day of Week')
axes[0, 1].grid(True, alpha=0.3)

# Days since restock distribution (rows without a usable date are skipped)
axes[1, 0].hist(df['days_since_restock'].dropna(), bins=50, edgecolor='black', alpha=0.7)
axes[1, 0].set_xlabel('Days Since Restock')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Distribution of Days Since Restock')
axes[1, 0].grid(True, alpha=0.3)

# KPI score vs days since restock
axes[1, 1].scatter(df['days_since_restock'], df['KPI_score'], alpha=0.5)
axes[1, 1].set_xlabel('Days Since Restock')
axes[1, 1].set_ylabel('KPI Score')
axes[1, 1].set_title('KPI Score vs Days Since Restock')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Feature Relationships with Target

In [None]:
# Scatter plots for top correlated features.
# "Top" means strongest by absolute correlation, so strong negative relationships
# are shown too; the target's own entry is removed by label rather than position.
feature_corr = correlation_with_target.drop('KPI_score', errors='ignore')
top_features = feature_corr.abs().nlargest(8).index.tolist()

fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.flatten()

for ax, col in zip(axes, top_features):
    # Keep only rows where both values are present; the line fit needs complete pairs
    pair = df[[col, 'KPI_score']].dropna()
    ax.scatter(pair[col], pair['KPI_score'], alpha=0.5, s=10)
    ax.set_xlabel(col)
    ax.set_ylabel('KPI Score')
    ax.set_title(f'{col} vs KPI Score\nCorr: {correlation_with_target[col]:.3f}')
    ax.grid(True, alpha=0.3)

    # Add trend line (needs at least two distinct x values to define a slope)
    if pair[col].nunique() > 1:
        slope, intercept = np.polyfit(pair[col], pair['KPI_score'], 1)
        # A straight line only needs its two end points
        x_line = np.array([pair[col].min(), pair[col].max()])
        ax.plot(x_line, slope * x_line + intercept, "r--", alpha=0.8, linewidth=2)

# Fewer than 8 features: hide the panels that were not used
for ax in axes[len(top_features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 10. Key Insights Summary

In [None]:
# Text recap of the findings computed in the cells above.
print("="*80)
print("KEY INSIGHTS FROM EDA")
print("="*80)

# Categorical columns with the remark printed next to each one; only columns that
# exist in the frame are reported, matching the guards used in the earlier cells.
categorical_notes = {
    'category': '',
    'zone': '',
    'storage_location_id': ' (high cardinality)',
    'item_id': ' (should drop)',
}
present_categoricals = [name for name in categorical_notes if name in df.columns]

print("\n1. DATASET OVERVIEW:")
print(f"   - Total samples: {len(df):,}")
print(f"   - Total features: {len(df.columns)}")
print(f"   - Numerical features: {len(numerical_cols)}")
print(f"   - Categorical features: {len(present_categoricals)} ({', '.join(present_categoricals)})")

print("\n2. TARGET VARIABLE (KPI_score):")
print(f"   - Range: [{df['KPI_score'].min():.3f}, {df['KPI_score'].max():.3f}]")
print(f"   - Mean: {df['KPI_score'].mean():.3f}")
print(f"   - Std Dev: {df['KPI_score'].std():.3f}")
print(f"   - Skewness: {df['KPI_score'].skew():.3f}")

print("\n3. MISSING VALUES:")
total_missing = df.isnull().sum().sum()  # computed once, reused in both branches
if total_missing == 0:
    print("   ✅ No missing values in dataset")
else:
    print(f"   ⚠️ {total_missing} missing values found")

print("\n4. TOP 5 FEATURES CORRELATED WITH TARGET:")
# Rank by absolute correlation so strong negative relationships are not overlooked;
# the target's own entry is removed by label instead of by position.
feature_corr = correlation_with_target.drop('KPI_score', errors='ignore')
strongest_corr = feature_corr.loc[feature_corr.abs().nlargest(5).index]
for idx, (feature, corr) in enumerate(strongest_corr.items(), 1):
    print(f"   {idx}. {feature}: {corr:.4f}")

print("\n5. MULTICOLLINEARITY:")
if high_corr_pairs:
    print(f"   ⚠️ {len(high_corr_pairs)} highly correlated pairs found (|corr| > 0.85)")
else:
    print("   ✅ No severe multicollinearity detected")

print("\n6. OUTLIERS:")
total_outlier_count = outlier_df['Outlier_Count'].sum()
print(f"   - Total outlier instances across features: {total_outlier_count:,}")
print(f"   - Features with >5% outliers: {len(outlier_df[outlier_df['Outlier_Percentage'] > 5])}")

print("\n7. CATEGORICAL FEATURES:")
for name in present_categoricals:
    print(f"   - {name}: {df[name].nunique()} unique values{categorical_notes[name]}")

print("\n8. DATE FEATURE:")
print(f"   - Date range: {df['last_restock_date'].min().date()} to {df['last_restock_date'].max().date()}")
print(f"   - Days since restock range: {df['days_since_restock'].min()}-{df['days_since_restock'].max()} days")

print("\n" + "="*80)
print("READY FOR FEATURE ENGINEERING AND MODEL TRAINING")
print("="*80)

In [None]:
# Write the current frame to CSV (without the index) for the model-training step
output_path = 'data/logistics_dataset_with_date_features.csv'
df.to_csv(output_path, index=False)
print(f"✅ Dataset with date features saved to: {output_path}")