# Task 1: Laying the Foundation for Analysis

This notebook demonstrates the comprehensive data analysis workflow for Brent oil price change point detection.

## Objectives
- Load and validate Brent oil price data
- Analyze time series properties (trend, stationarity, volatility)
- Visualize data and key events
- Generate summary statistics
- Test all implemented modules

## 1. Setup and Imports

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the whole session, which can
# hide deprecation or numerical warnings raised by later cells; consider
# narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')

# Make the project's `src` directory importable.
# NOTE(review): assumes the kernel's working directory sits one level below
# the project root (e.g. `notebooks/`) — confirm before running from elsewhere.
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root / "src"))

# Project-local modules (resolvable only after the sys.path tweak above)
from data_processing.load_data import BrentOilDataLoader, DataLoadError
from data_processing.preprocess import TimeSeriesAnalyzer, PreprocessingError
from visualization.plots import TimeSeriesVisualizer, VisualizationError
from utils.config import setup_logging

# Configure logging once for the whole notebook
logger = setup_logging(log_level="INFO")

# Global plotting defaults shared by every figure below
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 10

# Fixed: the status emoji was mis-encoded (UTF-8 bytes decoded as Mac Roman)
print("✅ All imports successful!")

✅ All imports successful!


## 2. Load Data

In [2]:
# Point the loader at the raw Brent price CSV under the project root
data_path = project_root / "data" / "raw" / "BrentOilPrices.csv"
loader = BrentOilDataLoader(data_path)

# Read the file; the result is used below as a frame with 'Date' and 'Price' columns
print("Loading Brent oil price data...")
data = loader.load()

# Headline facts about the loaded series
# Fixed: the status emoji was mis-encoded; dropped a placeholder-less f-string
print("\n✅ Data loaded successfully!")
print(f"   Total records: {len(data):,}")
print(f"   Date range: {data['Date'].min()} to {data['Date'].max()}")
print(f"   Price range: ${data['Price'].min():.2f} - ${data['Price'].max():.2f}")

# Preview the first rows with the notebook's rich display
print("\nFirst 5 rows:")
display(data.head())

Loading Brent oil price data...

✅ Data loaded successfully!
   Total records: 9,011
   Date range: 1987-05-20 00:00:00 to 2022-11-14 00:00:00
   Price range: $9.10 - $143.95

First 5 rows:


Unnamed: 0,Date,Price
0,1987-05-20,18.63
1,1987-05-21,18.45
2,1987-05-22,18.55
3,1987-05-25,18.6
4,1987-05-26,18.63


## 3. Data Overview and Summary Statistics

In [3]:
# Descriptive statistics of the price column (count, mean, std, quartiles)
# Fixed: section emoji were mis-encoded in both headers
print("📊 Summary Statistics:")
print("=" * 60)
stats = data['Price'].describe()
print(stats)

# Shape of the distribution plus relative dispersion (std / mean)
print("\n📈 Additional Statistics:")
print(f"   Skewness: {data['Price'].skew():.4f}")
print(f"   Kurtosis: {data['Price'].kurtosis():.4f}")
print(f"   Coefficient of Variation: {(data['Price'].std() / data['Price'].mean()):.4f}")

📊 Summary Statistics:
count    9011.000000
mean       48.420782
std        32.860110
min         9.100000
25%        19.050000
50%        38.570000
75%        70.090000
max       143.950000
Name: Price, dtype: float64

📈 Additional Statistics:
   Skewness: 0.7653
   Kurtosis: -0.6104
   Coefficient of Variation: 0.6786


## 4. Compute Returns

In [4]:
# Ask the loader for log returns of the price series
returns = loader.compute_returns(method='log')

# Fixed: the status emoji was mis-encoded
print(f"✅ Computed {len(returns):,} log returns")
print(f"   Mean return: {returns.mean():.6f}")
print(f"   Std return: {returns.std():.6f}")
# Scales the per-observation std by sqrt(252); assumes daily data and
# roughly 252 trading days per year — TODO confirm for this dataset
print(f"   Annualized volatility: {returns.std() * np.sqrt(252):.2%}")

# Preview the first few returns
print("\nFirst 5 returns:")
display(returns.head())

✅ Computed 9,010 log returns
   Mean return: 0.000179
   Std return: 0.025532
   Annualized volatility: 40.53%

First 5 returns:


1   -0.009709
2    0.005405
3    0.002692
4    0.001612
5   -0.001612
Name: Price, dtype: float64

## 5. Time Series Analysis

In [5]:
# Build the analyzer on the loaded price frame; the analysis cells below reuse it
analyzer = TimeSeriesAnalyzer(data)

# Fixed: the section emoji was mis-encoded
print("🔍 Performing time series analysis...")
print("=" * 60)

🔍 Performing time series analysis...


### 5.1 Trend Analysis

In [6]:
# Run the analyzer's trend fit; the result is read as a dict of summary values
trend_results = analyzer.analyze_trend()

# Fixed: the section emoji was mis-encoded
print("📈 Trend Analysis Results:")
print(f"   Trend direction: {trend_results['trend_direction']}")
print(f"   Slope: {trend_results['slope']:.6f}")
print(f"   R-squared: {trend_results['r_squared']:.4f}")
print(f"   P-value: {trend_results['p_value']:.4e}")
print(f"   Statistically significant: {trend_results['is_significant']}")
# NOTE(review): the recorded output shows slope=0.009108 alongside a yearly
# change of $2309.50, which no plausible periods-per-year factor reconciles
# (x252 gives about $2.30). Verify how analyze_trend() derives 'slope_per_year'.
print(f"   Estimated price change per year: ${trend_results['slope_per_year']:.2f}")

📈 Trend Analysis Results:
   Trend direction: increasing
   Slope: 0.009108
   R-squared: 0.5199
   P-value: 0.0000e+00
   Statistically significant: True
   Estimated price change per year: $2309.50


### 5.2 Stationarity Testing

In [7]:
# Augmented Dickey-Fuller test via the analyzer
adf_results = analyzer.test_stationarity(method='adf')

# Fixed: the section emoji was mis-encoded; dropped a placeholder-less f-string
print("🔬 Augmented Dickey-Fuller (ADF) Test:")
print(f"   Test statistic: {adf_results['test_statistic']:.4f}")
print(f"   P-value: {adf_results['p_value']:.4e}")
print(f"   Result: {adf_results['interpretation']}")
print("   Critical values:")
# One line per significance level reported by the test
for level, value in adf_results['critical_values'].items():
    print(f"      {level}: {value:.4f}")

🔬 Augmented Dickey-Fuller (ADF) Test:
   Test statistic: -1.9939
   P-value: 2.8927e-01
   Result: Non-stationary
   Critical values:
      1%: -3.4311
      5%: -2.8619
      10%: -2.5669


In [8]:
# KPSS test via the analyzer
kpss_results = analyzer.test_stationarity(method='kpss')

# Fixed: the section emoji was mis-encoded; dropped a placeholder-less f-string
print("🔬 KPSS Test:")
print(f"   Test statistic: {kpss_results['test_statistic']:.4f}")
# NOTE(review): the recorded run emitted a look-up-table warning saying the
# actual p-value is smaller than the one returned, so the value printed here
# is a bound rather than an exact p-value.
print(f"   P-value: {kpss_results['p_value']:.4e}")
print(f"   Result: {kpss_results['interpretation']}")
print("   Critical values:")
# One line per significance level reported by the test
for level, value in kpss_results['critical_values'].items():
    print(f"      {level}: {value:.4f}")

🔬 KPSS Test:
   Test statistic: 1.0840
   P-value: 1.0000e-02
   Result: Non-stationary
   Critical values:
      10%: 0.1190
      5%: 0.1460
      2.5%: 0.1760
      1%: 0.2160


look-up table. The actual p-value is smaller than the p-value returned.

  result = kpss(prices, regression='ct', nlags='auto')


### 5.3 Volatility Analysis

In [None]:
# Rolling-volatility analysis with a 30-observation window
volatility_results = analyzer.analyze_volatility(window=30)

# The clustering sub-result is read several times; look it up once
clustering = volatility_results['volatility_clustering']

# Fixed: the section emoji was mis-encoded
print("📊 Volatility Analysis:")
print(f"   Mean volatility: {volatility_results['mean_volatility']:.6f}")
print(f"   Annualized volatility: {volatility_results['annualized_volatility']:.2%}")
print(f"   Min rolling volatility: {volatility_results['min_volatility']:.6f}")
print(f"   Max rolling volatility: {volatility_results['max_volatility']:.6f}")
print(f"   Mean rolling volatility: {volatility_results['mean_rolling_volatility']:.6f}")
print(f"\n   Volatility clustering: {clustering['interpretation']}")
# The clustering p-value is optional in the result, so print it only if present
if 'p_value' in clustering:
    print(f"   Clustering test p-value: {clustering['p_value']:.4f}")

## 6. Load Event Data

In [None]:
# Read the curated key-events table and parse its date column
events_path = project_root / "data" / "external" / "key_events.csv"
events = pd.read_csv(events_path)
events['Event_Date'] = pd.to_datetime(events['Event_Date'])

# Fixed: the status emoji was mis-encoded
print(f"✅ Loaded {len(events)} key events")
print("\nEvent Summary:")
print(f"   Date range: {events['Event_Date'].min()} to {events['Event_Date'].max()}")
print(f"   Event types: {events['Event_Type'].nunique()}")
# Fixed: str.join raises TypeError on NaN or other non-string entries, so
# drop missing regions and coerce the rest to str before joining
regions = events['Region'].dropna().astype(str).unique()
print(f"   Regions: {', '.join(regions)}")

# How many events fall into each impact level
print("\nEvents by Impact Level:")
print(events['Impact_Level'].value_counts())

# Preview the first few events
print("\nFirst 5 events:")
display(events.head())

## 7. Visualizations

In [None]:
# One visualizer instance is shared by all plotting cells below
visualizer = TimeSeriesVisualizer()

# Fixed: the section emoji was mis-encoded
print("🎨 Creating visualizations...")

### 7.1 Price Series Over Time

In [None]:
# Date-indexed copy of the price frame, used by the plotting helpers below
data_indexed = data.set_index('Date')

# Draw the full price history
fig = visualizer.plot_price_series(
    data_indexed,
    title="Brent Oil Price Time Series (1987-2022)"
)
plt.show()
# NOTE(review): no save call or path argument is visible in this cell; confirm
# that plot_price_series() itself writes reports/price_series.png, otherwise
# the message below is misleading.
# Fixed: the status emoji was mis-encoded
print("✅ Price series plot saved to reports/price_series.png")

### 7.2 Log Returns

In [None]:
# Attach calendar dates to the returns.
# Fixed: the original assigned `data_indexed.index[1:]` by position, which only
# lines up if exactly the first observation was dropped. Looking the dates up
# by the returns' own index labels keeps every return on its own date, and
# raises a KeyError instead of silently misaligning if the labels do not match.
returns_indexed = returns.to_frame('Returns')
returns_indexed.index = pd.DatetimeIndex(
    data.loc[returns.index, 'Date'], name='Date'
)

fig = visualizer.plot_returns(
    returns_indexed['Returns'],
    title="Log Returns Time Series"
)
plt.show()
# NOTE(review): no save call is visible in this cell; confirm that
# plot_returns() writes the file named in the message below.
# Fixed: the status emoji was mis-encoded
print("✅ Returns plot saved to reports/returns_series.png")

### 7.3 Rolling Volatility

In [None]:
# Plot the rolling-volatility series produced by the volatility analysis cell
fig = visualizer.plot_volatility(
    volatility_results['rolling_volatility'],
    title="Rolling Volatility (30-day window)"
)
plt.show()
# NOTE(review): no save call is visible in this cell; confirm that
# plot_volatility() writes the file named in the message below.
# Fixed: the status emoji was mis-encoded
print("✅ Volatility plot saved to reports/volatility_series.png")

### 7.4 Price Distribution

In [None]:
# Distribution of price levels
fig = visualizer.plot_distribution(
    data['Price'],
    title="Price Distribution"
)
plt.show()
# NOTE(review): no save call is visible in this cell; confirm the helper
# persists the figure before claiming it was saved.
# Fixed: the status emoji was mis-encoded
print("✅ Price distribution plot saved")

### 7.5 Returns Distribution

In [None]:
# Distribution of the log returns
fig = visualizer.plot_distribution(
    returns,
    title="Returns Distribution"
)
plt.show()
# NOTE(review): no save call is visible in this cell; confirm the helper
# persists the figure before claiming it was saved.
# Fixed: the status emoji was mis-encoded
print("✅ Returns distribution plot saved")

### 7.6 Events Overlay on Price Series

In [None]:
# Price history with the key events drawn on top
fig = visualizer.plot_events_overlay(
    data_indexed,
    events,
    title="Brent Oil Prices with Key Events"
)
plt.show()
# NOTE(review): no save call is visible in this cell; confirm that
# plot_events_overlay() writes the file named in the message below.
# Fixed: the status emoji was mis-encoded
print("✅ Events overlay plot saved to reports/price_with_events.png")

## 8. Summary Statistics Export

In [None]:
# Collect the analyzer's consolidated summary table
summary_stats = analyzer.get_summary_statistics()

# Fixed: the section emoji was mis-encoded
print("📊 Comprehensive Summary Statistics:")
display(summary_stats)

# Persist the table under <project_root>/reports, creating the folder if needed.
# parents=True keeps this working even if an intermediate directory is missing.
reports_dir = project_root / "reports"
reports_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): index=False drops the index from the CSV — confirm the index
# of summary_stats carries no information (e.g. metric names).
summary_stats.to_csv(reports_dir / "summary_statistics.csv", index=False)
print("\n✅ Summary statistics saved to reports/summary_statistics.csv")

## 9. Additional Analysis: Price by Decade

In [None]:
# Decade label for every observation (1987 -> 1980, 2022 -> 2020).
# Fixed: the original wrote a 'Decade' column into data_indexed, mutating a
# frame that earlier cells already plotted; grouping by a standalone key keeps
# this cell idempotent and leaves data_indexed untouched.
decade_key = pd.Index((data_indexed.index.year // 10) * 10, name='Decade')

# Per-decade price summary
decade_stats = (
    data_indexed.groupby(decade_key)['Price']
    .agg(['mean', 'std', 'min', 'max', 'count'])
    .round(2)
)

# Fixed: the section emoji was mis-encoded
print("📊 Price Statistics by Decade:")
display(decade_stats)

# Bar chart of the mean price per decade with +/- one std as error bars
fig, ax = plt.subplots(figsize=(12, 6))
decades = decade_stats.index
means = decade_stats['mean']
stds = decade_stats['std']

# Fixed: x positions are spaced 10 apart, so the default bar width of 0.8 drew
# hair-thin bars; width=8 fills most of each decade slot.
ax.bar(decades, means, yerr=stds, width=8, capsize=5, alpha=0.7, color='#2E86AB')
# One tick per decade, labelled e.g. "1990s"
ax.set_xticks(decades)
ax.set_xticklabels([f"{decade}s" for decade in decades])
ax.set_xlabel('Decade', fontsize=12, fontweight='bold')
ax.set_ylabel('Average Price (USD/barrel)', fontsize=12, fontweight='bold')
ax.set_title('Average Brent Oil Price by Decade', fontsize=14, fontweight='bold', pad=20)
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

print("✅ Decade analysis complete")

## 10. Key Insights and Conclusions

In [None]:
# Recap of the results computed in the cells above.
# Fixed: emoji and the "R²" superscript were mis-encoded; placeholder-less
# f-strings are now plain strings.
print("🔍 Key Insights from Task 1 Analysis:")
print("=" * 60)
print("\n1. Data Coverage:")
print(f"   - {len(data):,} daily price observations")
# Difference of calendar years between the last and first observation
print(f"   - Spanning {data['Date'].max().year - data['Date'].min().year} years")
print(f"   - Price range: ${data['Price'].min():.2f} to ${data['Price'].max():.2f}")

print("\n2. Trend Analysis:")
print(f"   - Overall trend: {trend_results['trend_direction']}")
print(f"   - Statistical significance: {trend_results['is_significant']}")
print(f"   - R² = {trend_results['r_squared']:.4f}")

print("\n3. Stationarity:")
print(f"   - ADF test: {adf_results['interpretation']}")
print(f"   - KPSS test: {kpss_results['interpretation']}")
# NOTE(review): the implication is driven by the ADF flag alone; the KPSS
# result is reported above but not used in this decision.
if adf_results['is_stationary']:
    implication = 'Series is stationary'
else:
    implication = 'Non-stationary series requires change point modeling'
print(f"   - Implication: {implication}")

print("\n4. Volatility:")
print(f"   - Annualized volatility: {volatility_results['annualized_volatility']:.2%}")
print(f"   - Volatility clustering: {volatility_results['volatility_clustering']['interpretation']}")

print("\n5. Events:")
print(f"   - {len(events)} key events identified")
# Fixed: count matches with the vectorised Series.sum() instead of iterating
# the boolean Series through the builtin sum()
high_impact_count = int((events['Impact_Level'] == 'High').sum())
print(f"   - {high_impact_count} high-impact events")
print("   - Ready for change point association analysis")

print("\n" + "=" * 60)
print("✅ Task 1 Foundation Analysis Complete!")
print("   Ready to proceed with Task 2: Bayesian Change Point Modeling")