In [1]:
# Web3 Trading Analysis - Data Preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
import sys
from pathlib import Path
# Make the project's ../src directory importable (resolved against the current
# working directory). The membership check keeps the cell idempotent: re-running
# it no longer appends a duplicate entry to sys.path each time.
SRC_DIR = str(Path('../src').resolve())
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from preprocessor import WebTradingPreprocessor, TimestampProcessor
from utils import DataProfiler

# Notebook banner: title, a 60-character rule, and the wall-clock time of this run.
print(
    "🛠️ **DATA PREPROCESSING & TIME ALIGNMENT**",
    "=" * 60,
    f"📅 Processing Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    sep="\n",
)


🛠️ **DATA PREPROCESSING & TIME ALIGNMENT**
📅 Processing Date: 2025-08-08 18:13:15


In [2]:
# Load raw datasets
print("📥 Loading raw datasets for preprocessing...\n")

# Read the two raw CSV files (paths are relative to the current working
# directory): first the fear/greed index, then the historical trader data.
sentiment_df, trading_df = map(
    pd.read_csv,
    (
        "../data/raw/fear_greed_index.csv",
        "../data/raw/historical_trader_data.csv",
    ),
)

# Report (rows, columns) for each frame as loaded, before any cleaning.
print(
    f"✅ Raw sentiment data: {sentiment_df.shape}",
    f"✅ Raw trading data: {trading_df.shape}",
    sep="\n",
)

# Quick preview of timestamp columns
def preview_timestamp_columns(df, label, keywords=('date', 'time')):
    """Print the first value and dtype of every column whose name looks time-related.

    Args:
        df: DataFrame to inspect.
        label: Heading printed above the column list.
        keywords: Lower-case substrings that mark a column name as time-related.
            'timestamp' needs no entry of its own because it already contains 'time'.

    Returns:
        None. The preview is written to stdout.
    """
    print(f"\n{label}:")
    for col in df.columns:
        # str() keeps non-string column labels (e.g. integers) from raising AttributeError.
        col_name = str(col).lower()
        if any(keyword in col_name for keyword in keywords):
            # An empty frame has no row 0; show a placeholder instead of raising IndexError.
            first_value = df[col].iloc[0] if len(df) else "<no rows>"
            print(f"   {col}: {first_value} ({df[col].dtype})")


print("\n🕒 **TIMESTAMP COLUMNS PREVIEW:**")
preview_timestamp_columns(sentiment_df, "Sentiment Data")
preview_timestamp_columns(trading_df, "Trading Data")


📥 Loading raw datasets for preprocessing...

✅ Raw sentiment data: (2644, 4)
✅ Raw trading data: (211224, 16)

🕒 **TIMESTAMP COLUMNS PREVIEW:**

Sentiment Data:
   timestamp: 1517463000 (int64)
   date: 2018-02-01 (object)

Trading Data:
   Timestamp IST: 02-12-2024 22:50 (object)
   Timestamp: 1730000000000.0 (float64)


In [3]:
# Build the preprocessor instance that the remaining cells call into.
preprocessor = WebTradingPreprocessor()

print("🚀 **Preprocessor initialized successfully!**")
print("\nAvailable processing methods:")
# The four methods invoked by the cells below, emitted as one bulleted block.
print(
    "   • preprocess_sentiment_data()",
    "   • preprocess_trading_data()",
    "   • validate_processed_data()",
    "   • create_alignment_summary()",
    sep="\n",
)


🚀 **Preprocessor initialized successfully!**

Available processing methods:
   • preprocess_sentiment_data()
   • preprocess_trading_data()
   • validate_processed_data()
   • create_alignment_summary()


In [4]:
# Preprocess sentiment data
sentiment_processed = preprocessor.preprocess_sentiment_data(sentiment_df)

# Show processing results
print(f"\n📊 **SENTIMENT DATA PROCESSING RESULTS:**")
print(f"   • Original shape: {sentiment_df.shape}")
print(f"   • Processed shape: {sentiment_processed.shape}")
# Columns present after preprocessing but not in the raw frame. Sorted so the
# listing is identical on every run: a bare set of strings prints in an order
# that changes between kernel sessions (string hash randomisation).
new_sentiment_columns = sorted(
    set(sentiment_processed.columns) - set(sentiment_df.columns), key=str
)
print(f"   • New columns added: {new_sentiment_columns}")

# Preview processed data: the raw date next to its standardized counterpart,
# plus the sentiment label and score.
print(f"\n📋 **PROCESSED SENTIMENT DATA SAMPLE:**")
print(sentiment_processed[['date', 'classification', 'value', 'date_standardized']].head())


😰😤 **PREPROCESSING SENTIMENT DATA**
🔄 Processing timestamp column: timestamp
✅ No duplicates found
✅ Sentiment data preprocessing complete: (2644, 7)

📊 **SENTIMENT DATA PROCESSING RESULTS:**
   • Original shape: (2644, 4)
   • Processed shape: (2644, 7)
   • New columns added: {'date_parsed', 'date_standardized', 'timestamp_UTC'}

📋 **PROCESSED SENTIMENT DATA SAMPLE:**
         date classification  value date_standardized
0  2018-02-01           Fear     30        2018-02-01
1  2018-02-02   Extreme Fear     15        2018-02-02
2  2018-02-03           Fear     40        2018-02-03
3  2018-02-04   Extreme Fear     24        2018-02-04
4  2018-02-05   Extreme Fear     11        2018-02-05


In [5]:
# Preprocess trading data
# NOTE(review): the recorded run of this cell dropped 175,666 of 211,224 rows as
# duplicates (35,558 kept). Confirm that the de-duplication inside
# preprocess_trading_data keys on enough columns — the raw `Timestamp` preview
# (1730000000000.0) looks coarsely rounded, so distinct trades may share a value.
trading_processed = preprocessor.preprocess_trading_data(trading_df)

# Show processing results
print(f"\n📊 **TRADING DATA PROCESSING RESULTS:**")
print(f"   • Original shape: {trading_df.shape}")
print(f"   • Processed shape: {trading_processed.shape}")
# Columns present after preprocessing but not in the raw frame. Sorted so the
# listing is identical on every run: a bare set of strings prints in an order
# that changes between kernel sessions (string hash randomisation).
new_trading_columns = sorted(
    set(trading_processed.columns) - set(trading_df.columns), key=str
)
print(f"   • New columns added: {new_trading_columns}")

# Preview key processed columns
print(f"\n📋 **PROCESSED TRADING DATA SAMPLE:**")
key_cols = ['Account', 'Side', 'Coin', 'Closed PnL', 'Timestamp IST', 'trading_date']
# Only show the key columns that actually exist, so a missing one does not raise KeyError.
available_cols = [col for col in key_cols if col in trading_processed.columns]
print(trading_processed[available_cols].head())



📈 **PREPROCESSING TRADING DATA**
🔄 Processing timestamp column: Timestamp IST
🔄 Processing timestamp column: Timestamp
📊 Size USD: 32,661 outliers detected (>1.5×IQR)
📊 Closed PnL: 48,941 outliers detected (>1.5×IQR)
🧹 Removed 175,666 duplicate rows
✅ Trading data preprocessing complete: (35558, 21)

📊 **TRADING DATA PROCESSING RESULTS:**
   • Original shape: (211224, 16)
   • Processed shape: (35558, 21)
   • New columns added: {'Timestamp IST_UTC', 'Timestamp_UTC', 'Closed PnL_outlier_flag', 'Size USD_outlier_flag', 'trading_date'}

📋 **PROCESSED TRADING DATA SAMPLE:**
                                      Account Side  Coin  Closed PnL  \
0  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  BUY  @107         0.0   
1  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  BUY  @107         0.0   
2  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  BUY  @107         0.0   
3  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  BUY  @107         0.0   
4  0xae5eacaf9c6b9111fd53034a602c192a04e082ed  BUY  @107     

In [6]:
# Validate processed data
validation_results = preprocessor.validate_processed_data(sentiment_processed, trading_processed)

print("🔍 **DATA VALIDATION RESULTS:**")
print("=" * 50)

# Show validation summary, one section per entry in validation_results.
for dataset_name, results in validation_results.items():
    if not isinstance(results, dict):
        # No per-key breakdown to walk; show the raw value instead of dropping it.
        print(f"\n📊 **{dataset_name.upper()} VALIDATION:** {results}")
        continue

    if 'error' in results:
        # Previously skipped without a trace, which made a failed validation
        # indistinguishable from a check that was never run.
        print(f"\n❌ **{dataset_name.upper()} VALIDATION FAILED:** {results['error']}")
        continue

    print(f"\n📊 **{dataset_name.upper()} VALIDATION:**")
    for key, value in results.items():
        # 'missing_dates' is left out of the listing; only its count is reported below.
        if key != 'missing_dates':
            print(f"   • {key}: {value}")

    if results.get('missing_count', 0) > 0:
        print(f"   ⚠️ Missing {results['missing_count']} dates")



🔍 **VALIDATING PROCESSED DATA**
🔍 **DATA VALIDATION RESULTS:**

📊 **SENTIMENT VALIDATION:**
   • actual_start: 2018-02-01 00:00:00
   • actual_end: 2025-05-02 00:00:00
   • expected_start: 2018-02-01 00:00:00
   • expected_end: 2025-08-08 00:00:00
   • start_match: True
   • end_match: False
   • total_days: 2647
   • missing_count: 4
   ⚠️ Missing 4 dates

📊 **TRADING VALIDATION:**
   • actual_start: 2023-04-30 00:00:00
   • actual_end: 2025-05-01 00:00:00
   • expected_start: 2024-01-01 00:00:00
   • expected_end: 2025-08-08 00:00:00
   • start_match: False
   • end_match: False
   • total_days: 732
   • missing_count: 282
   ⚠️ Missing 282 dates

📊 **SENTIMENT_CONSISTENCY VALIDATION:**
   • classification: {'value_check': {'all_valid': True, 'invalid_values': [], 'unique_count': 5}}
   • value: {'range_check': {'min_valid': np.True_, 'max_valid': np.True_, 'actual_min': np.int64(5), 'actual_max': np.int64(95)}}

📊 **TRADING_CONSISTENCY VALIDATION:**
   • Side: {'value_check': {'all

In [7]:
# Create alignment summary
alignment_summary = preprocessor.create_alignment_summary(sentiment_processed, trading_processed)

print("📊 **DATA ALIGNMENT ANALYSIS:**")
print("=" * 50)

for key, values in alignment_summary.items():
    # Section heading: snake_case key rendered as Title Case.
    print(f"\n🗂️ **{key.replace('_', ' ').title()}:**")
    if not isinstance(values, dict):
        # A plain (non-dict) entry used to print only its heading; show the value too.
        print(f"   • {values}")
        continue

    for sub_key, sub_value in values.items():
        if isinstance(sub_value, float):
            # Floats (e.g. percentages) are rounded to one decimal place for display.
            print(f"   • {sub_key}: {sub_value:.1f}")
        else:
            print(f"   • {sub_key}: {sub_value}")



📊 **DATA ALIGNMENT SUMMARY**
📊 **DATA ALIGNMENT ANALYSIS:**

🗂️ **Sentiment Date Range:**
   • start: 2018-02-01
   • end: 2025-05-02
   • total_days: 2644
   • unique_dates: 2644

🗂️ **Trading Date Range:**
   • start: 2023-04-30
   • end: 2025-05-01
   • total_trades: 35558
   • unique_trading_dates: 451

🗂️ **Overlap Analysis:**
   • overlapping_dates: 451
   • sentiment_only_dates: 2193
   • trading_only_dates: 0
   • overlap_percentage: 17.1


In [8]:
# Save processed datasets
print("💾 **SAVING PROCESSED DATA:**")
print("=" * 40)

# Save to processed folder. Create it first: on a fresh checkout the directory
# may not exist yet, and to_csv does not create missing parent directories.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
sentiment_processed.to_csv(processed_dir / "sentiment_processed.csv", index=False)
trading_processed.to_csv(processed_dir / "trading_processed.csv", index=False)

print("✅ Processed sentiment data saved: data/processed/sentiment_processed.csv")
print("✅ Processed trading data saved: data/processed/trading_processed.csv")

print(f"\n📊 **FINAL PROCESSED DATA SUMMARY:**")
print(f"   • Sentiment records: {len(sentiment_processed):,}")
print(f"   • Trading records: {len(trading_processed):,}")
# Combined in-memory footprint of both frames in MiB; deep=True also counts
# the contents of object (string) columns.
print(f"   • Memory usage: {(sentiment_processed.memory_usage(deep=True).sum() + trading_processed.memory_usage(deep=True).sum()) / 1024**2:.2f} MB")

print(f"\n🎯 **Phase 3A Complete - Data Preprocessing Success!**")
print("✅ Ready for Phase 3B: Feature Engineering")


💾 **SAVING PROCESSED DATA:**
✅ Processed sentiment data saved: data/processed/sentiment_processed.csv
✅ Processed trading data saved: data/processed/trading_processed.csv

📊 **FINAL PROCESSED DATA SUMMARY:**
   • Sentiment records: 2,644
   • Trading records: 35,558
   • Memory usage: 22.56 MB

🎯 **Phase 3A Complete - Data Preprocessing Success!**
✅ Ready for Phase 3B: Feature Engineering
