# Module 5.1 – Time Series Analysis for Semiconductor Manufacturing

This notebook demonstrates time series forecasting techniques for semiconductor manufacturing data using ARIMA and Seasonal ARIMA models.

## Setup and Data Loading

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every Python warning for the rest of the kernel session so that cell
# output stays readable.
# NOTE(review): this is a blanket filter, so any warning raised while fitting
# models later in the notebook is hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

# Configure plotting
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_style('whitegrid')

In [None]:
# Import time series modules
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Import our pipeline
# NOTE(review): `time_series_pipeline` is presumably a project-local module that
# must be importable from the notebook's working directory — confirm.
from time_series_pipeline import TimeSeriesPipeline, generate_semiconductor_time_series

# Reached only if every import in this cell succeeded.
print("Time series libraries loaded successfully!")

## 1. Data Generation and Exploration

Let's generate synthetic semiconductor manufacturing time series data to demonstrate the concepts.

In [None]:
# Generate synthetic semiconductor time series data
# (500 periods; the fixed seed is presumably what makes the frame reproducible
# across runs — confirm in time_series_pipeline).
df = generate_semiconductor_time_series(n_periods=500, seed=42)
# Quick sanity summary: row count, span of the index and the index's `freq` attribute.
print(f"Generated {len(df)} time series observations")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"Frequency: {df.index.freq}")
# Last expression of the cell, so the first rows are rendered as the cell output.
df.head()

In [None]:
# Plot the complete time series: one panel per signal on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Semiconductor Manufacturing Time Series Data', fontsize=16)

# (df column, line colour, panel title, y-axis label), listed in row-major panel order
panel_specs = [
    ('temperature', 'red', 'Chamber Temperature (°C)', 'Temperature'),
    ('pressure', 'blue', 'Chamber Pressure (Torr)', 'Pressure'),
    ('flow_rate', 'green', 'Gas Flow Rate (sccm)', 'Flow Rate'),
    ('target', 'orange', 'Yield Target (%)', 'Yield'),
]

# axes.flat walks the grid row by row, matching the order of panel_specs
for panel, (column, colour, panel_title, y_label) in zip(axes.flat, panel_specs):
    panel.plot(df.index, df[column], color=colour, alpha=0.7)
    panel.set_title(panel_title)
    panel.set_ylabel(y_label)

plt.tight_layout()
plt.show()

## 2. Stationarity Analysis

Before fitting ARIMA models, we need to check if our time series is stationary.

In [None]:
def check_stationarity(timeseries, title, window=24, alpha=0.05):
    """Plot rolling statistics and run the Augmented Dickey-Fuller (ADF) test.

    Parameters
    ----------
    timeseries : pandas.Series
        Series to test. Missing values are dropped before the ADF test only;
        the plot uses the series as given.
    title : str
        Label used in the plot title and in the printed report.
    window : int, default 24
        Number of observations in the rolling mean / standard deviation window.
    alpha : float, default 0.05
        Significance level. The series is reported as stationary when the ADF
        p-value is less than or equal to ``alpha``.

    Returns
    -------
    bool
        True when the ADF test rejects the unit-root null hypothesis at level
        ``alpha`` (i.e. the series is treated as stationary), otherwise False.
    """

    # Rolling statistics; the first `window - 1` entries are NaN and are simply
    # not drawn by matplotlib.
    rolling_mean = timeseries.rolling(window=window).mean()
    rolling_std = timeseries.rolling(window=window).std()

    # Plot
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(timeseries.index, timeseries, color='blue', label='Original', alpha=0.7)
    ax.plot(rolling_mean.index, rolling_mean, color='red', label='Rolling Mean')
    ax.plot(rolling_std.index, rolling_std, color='black', label='Rolling Std')
    ax.legend(loc='best')
    ax.set_title(f'Rolling Mean & Standard Deviation - {title}')
    plt.tight_layout()
    plt.show()

    # ADF test; the lag length is chosen automatically by AIC.
    print(f'\nAugmented Dickey-Fuller Test for {title}:')
    dftest = adfuller(timeseries.dropna(), autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    # dftest[4] maps significance levels to their critical values.
    for key, value in dftest[4].items():
        dfoutput[f'Critical Value ({key})'] = value
    print(dfoutput)

    # Decide once so the printed verdict and the return value cannot disagree.
    is_stationary = bool(dftest[1] <= alpha)
    if is_stationary:
        print("Result: Series is stationary")
    else:
        print("Result: Series is non-stationary")

    return is_stationary

In [None]:
# Check stationarity of target series
# `target_series` is reused by the decomposition and ACF/PACF cells below.
target_series = df['target']
# Draws the rolling-statistics plot, prints the ADF report and keeps the verdict.
is_stationary = check_stationarity(target_series, 'Yield Target')

## 3. Seasonal Decomposition

Let's decompose the time series to understand its components.

In [None]:
# Seasonal decomposition
# period=24 means one seasonal cycle every 24 observations (daily, if the data is hourly)
decomposition = seasonal_decompose(target_series, model='additive', period=24)  # Daily seasonality

fig, axes = plt.subplots(4, 1, figsize=(15, 12))
fig.suptitle('Seasonal Decomposition of Yield Target', fontsize=16)

# (attribute of the decomposition result, panel title), top panel first
component_panels = [
    ('observed', 'Original'),
    ('trend', 'Trend'),
    ('seasonal', 'Seasonal'),
    ('resid', 'Residual'),
]
for axis, (attribute, panel_title) in zip(axes, component_panels):
    getattr(decomposition, attribute).plot(ax=axis, title=panel_title)

plt.tight_layout()
plt.show()

## 4. Autocorrelation Analysis

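ACF and PACF plots help us choose the ARIMA orders: the lag at which the PACF cuts off suggests the autoregressive order `p`, and the lag at which the ACF cuts off suggests the moving-average order `q`.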

In [None]:
# Plot ACF and PACF side by side (first 50 lags of the target series)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# (plotting function, panel title): ACF on the left, PACF on the right
correlograms = [
    (plot_acf, 'Autocorrelation Function'),
    (plot_pacf, 'Partial Autocorrelation Function'),
]
for axis, (draw_correlogram, heading) in zip(axes, correlograms):
    draw_correlogram(target_series.dropna(), ax=axis, lags=50, title=heading)

plt.tight_layout()
plt.show()

## 5. Model Fitting and Comparison

Let's fit different ARIMA models and compare their performance.

In [None]:
# Chronological split for validation: the first 80% of rows train the models,
# the remaining rows are held out (no shuffling, so time order is preserved)
train_size = int(0.8 * len(df))
train_data, test_data = df.iloc[:train_size], df.iloc[train_size:]

print(f"Training data: {len(train_data)} observations")
print(f"Test data: {len(test_data)} observations")

In [None]:
# Candidate specifications: two non-seasonal ARIMA orders and two seasonal
# variants with a seasonal period of 24
models_to_test = [
    dict(name='ARIMA(1,1,1)', order=(1,1,1), seasonal_order=None),
    dict(name='ARIMA(2,1,2)', order=(2,1,2), seasonal_order=None),
    dict(name='SARIMA(1,1,1)(1,1,1,24)', order=(1,1,1), seasonal_order=(1,1,1,24)),
    dict(name='SARIMA(1,1,1)(0,1,1,24)', order=(1,1,1), seasonal_order=(0,1,1,24)),
]

# Report column -> key looked up in the metrics returned by pipeline.evaluate
metric_columns = {'MAE': 'mae', 'RMSE': 'rmse', 'R²': 'r2', 'MAPE': 'mape', 'PWS': 'pws'}

results = []

for model_config in models_to_test:
    print(f"\nTraining {model_config['name']}...")

    try:
        # A configuration without a seasonal order is fitted as plain ARIMA
        pipeline = TimeSeriesPipeline(
            model_type='sarima' if model_config['seasonal_order'] else 'arima',
            order=model_config['order'],
            seasonal_order=model_config['seasonal_order'],
            auto_arima=False
        )

        # Fit on the training window, then score on the held-out window
        pipeline.fit(train_data, 'target')
        metrics = pipeline.evaluate(test_data, 'target', test_size=len(test_data))

        # One row per model: its name followed by the metrics in report order
        row = {'Model': model_config['name']}
        row.update({column: metrics[key] for column, key in metric_columns.items()})
        results.append(row)

        print(f"✓ {model_config['name']} - RMSE: {metrics['rmse']:.3f}")

    except Exception as e:
        # A failing specification is reported and skipped so the others still run
        print(f"✗ {model_config['name']} - Failed: {str(e)}")

# Display results
results_df = pd.DataFrame(results)
print("\nModel Comparison Results:")
print(results_df.round(3))

## 6. Best Model Analysis and Forecasting

In [None]:
# Select best model (lowest RMSE) and refit that exact specification
if results:
    best_model_row = results_df.loc[results_df['RMSE'].idxmin()]
    print(f"Best model: {best_model_row['Model']} (RMSE: {best_model_row['RMSE']:.3f})")

    # Recover the winning configuration by name so that the refitted model is the
    # one that was actually selected, not a hard-coded specification.
    best_config = next(
        config for config in models_to_test
        if config['name'] == best_model_row['Model']
    )

    # Built exactly as in the comparison loop: seasonal order present -> SARIMA
    best_pipeline = TimeSeriesPipeline(
        model_type='sarima' if best_config['seasonal_order'] else 'arima',
        order=best_config['order'],
        seasonal_order=best_config['seasonal_order'],
        auto_arima=False
    )
else:
    # Fallback to simple ARIMA when no candidate could be evaluated
    print("Using fallback ARIMA(1,1,1) model")
    best_pipeline = TimeSeriesPipeline(
        model_type='arima',
        order=(1,1,1),
        auto_arima=False
    )

# Train the chosen model on the full training window
best_pipeline.fit(train_data, 'target')

In [None]:
# Generate forecasts for every period of the held-out window
forecast_horizon = len(test_data)
forecast_result = best_pipeline.predict(horizon=forecast_horizon, return_conf_int=True)

# Create forecast index: the timestamps of the test rows being predicted
forecast_index = test_data.index[:forecast_horizon]

# pd.Series(data, index=...) re-aligns by label when `data` is already a Series,
# which would yield NaN wherever the pipeline's own index differs from the test
# index. np.asarray drops any such index so values are assigned by position.
forecasts = pd.Series(np.asarray(forecast_result['forecasts']), index=forecast_index)
lower_bound = pd.Series(
    np.asarray(forecast_result['confidence_intervals']['lower']), index=forecast_index
)
upper_bound = pd.Series(
    np.asarray(forecast_result['confidence_intervals']['upper']), index=forecast_index
)

print(f"Generated {len(forecasts)} forecasts")

In [None]:
# Plot forecasts vs actual on a single explicit Axes
fig, ax = plt.subplots(figsize=(15, 8))

# History used for fitting, followed by the held-out observations
ax.plot(train_data.index, train_data['target'], label='Training Data', color='blue', alpha=0.7)
ax.plot(test_data.index, test_data['target'], label='Actual', color='green', linewidth=2)

# Point forecasts with their interval band
ax.plot(forecasts.index, forecasts, label='Forecast', color='red', linewidth=2)
ax.fill_between(forecasts.index, lower_bound, upper_bound,
                color='red', alpha=0.2, label='95% Confidence Interval')

# Vertical marker at the last training timestamp
ax.axvline(x=train_data.index[-1], color='black', linestyle='--', alpha=0.7, label='Train/Test Split')

ax.set_title('Time Series Forecast vs Actual - Semiconductor Yield Target')
ax.set_xlabel('Date')
ax.set_ylabel('Yield (%)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Residual Diagnostics

Let's analyze the forecast errors on the test set (actual minus forecast), referred to as residuals below, to validate our modeling assumptions.

In [None]:
from scipy import stats

# Calculate residuals: actual minus forecast over the test window, i.e. these
# are out-of-sample forecast errors
residuals = test_data['target'][:len(forecasts)] - forecasts

# Residual analysis plots on a 2x2 grid, one named Axes per diagnostic
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Residual Analysis', fontsize=16)
(time_ax, hist_ax), (qq_ax, fitted_ax) = axes

# Residuals over time, with a zero reference line
time_ax.plot(residuals.index, residuals, alpha=0.7)
time_ax.axhline(y=0, color='red', linestyle='--')
time_ax.set_title('Residuals Over Time')
time_ax.set_ylabel('Residual')

# Histogram of residuals, normalised to a density
hist_ax.hist(residuals, bins=20, alpha=0.7, density=True)
hist_ax.set_title('Residual Distribution')
hist_ax.set_xlabel('Residual')
hist_ax.set_ylabel('Density')

# Q-Q plot against the normal distribution
stats.probplot(residuals, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot')

# Residuals vs fitted (here: vs the forecast values)
fitted_ax.scatter(forecasts, residuals, alpha=0.7)
fitted_ax.axhline(y=0, color='red', linestyle='--')
fitted_ax.set_title('Residuals vs Fitted')
fitted_ax.set_xlabel('Fitted Values')
fitted_ax.set_ylabel('Residuals')

plt.tight_layout()
plt.show()

In [None]:
# Statistical tests on residuals: normality (Jarque-Bera) and remaining
# autocorrelation (Ljung-Box), each judged at the 5% level
from scipy.stats import jarque_bera, shapiro
from statsmodels.stats.diagnostic import acorr_ljungbox

print("Residual Diagnostic Tests:")
print("=" * 40)

# Normality tests: a p-value above 0.05 means normality is not rejected
jb_stat, jb_pval = jarque_bera(residuals)
print(f"Jarque-Bera Test: statistic={jb_stat:.4f}, p-value={jb_pval:.4f}")
print(
    "✓ Residuals appear normally distributed"
    if jb_pval > 0.05
    else "✗ Residuals may not be normally distributed"
)

# Autocorrelation test: the last row of the result frame is the lag-10 statistic
lb_result = acorr_ljungbox(residuals, lags=10, return_df=True)
print(f"\nLjung-Box Test (lag 10): p-value={lb_result['lb_pvalue'].iloc[-1]:.4f}")
print(
    "✓ No significant autocorrelation in residuals"
    if lb_result['lb_pvalue'].iloc[-1] > 0.05
    else "✗ Residuals show autocorrelation"
)

# Summary statistics
print(f"\nResidual Summary Statistics:")
print(f"Mean: {residuals.mean():.4f}")
print(f"Std: {residuals.std():.4f}")
print(f"Min: {residuals.min():.4f}")
print(f"Max: {residuals.max():.4f}")

## 8. Exogenous Variables Analysis

Let's explore how including process parameters as exogenous variables affects forecast performance.

In [None]:
# Train model with exogenous variables
# NOTE(review): this is a non-seasonal ARIMA(1,1,1), while `best_pipeline` (the
# baseline it is compared with below) may use a different order or a seasonal
# component; any accuracy gap is then not attributable to the exogenous inputs alone.
exog_pipeline = TimeSeriesPipeline(
    model_type='arima',
    order=(1,1,1),
    exog_features=['temperature', 'pressure'],  # presumably df columns used as regressors — confirm
    auto_arima=False
)

# Fit on training data
exog_pipeline.fit(train_data, 'target')

print("ARIMA model with exogenous variables trained successfully")

In [None]:
# Generate forecasts with exogenous variables
# Use test data exogenous variables for forecasting.
# NOTE: these are the observed test-period values; in live forecasting the future
# temperature/pressure would not be known and would have to be planned or
# forecast themselves, so this evaluation is optimistic.
test_exog = test_data[['temperature', 'pressure']].iloc[:forecast_horizon]

exog_forecast_result = exog_pipeline.predict(
    horizon=forecast_horizon,
    exog_future=test_exog,
    return_conf_int=True
)

# pd.Series(data, index=...) re-aligns by label when `data` is already a Series,
# which would yield NaN wherever the pipeline's own index differs from the test
# index. np.asarray drops any such index so values are assigned by position.
exog_forecasts = pd.Series(np.asarray(exog_forecast_result['forecasts']), index=forecast_index)
exog_lower = pd.Series(
    np.asarray(exog_forecast_result['confidence_intervals']['lower']), index=forecast_index
)
exog_upper = pd.Series(
    np.asarray(exog_forecast_result['confidence_intervals']['upper']), index=forecast_index
)

In [None]:
# Compare models with and without exogenous variables on a single explicit Axes
fig, ax = plt.subplots(figsize=(15, 8))

# Held-out observations as the reference line
ax.plot(test_data.index, test_data['target'], label='Actual', color='green', linewidth=2)

# Baseline forecasts (from best_pipeline, no exogenous inputs)
ax.plot(forecasts.index, forecasts, label='ARIMA Forecast', color='red', linewidth=2, alpha=0.7)

# Forecasts from the model that also receives temperature and pressure
ax.plot(exog_forecasts.index, exog_forecasts, label='ARIMA + Exog Forecast',
        color='purple', linewidth=2, alpha=0.7)

# Interval band is drawn for the exogenous model only
ax.fill_between(exog_forecasts.index, exog_lower, exog_upper,
                color='purple', alpha=0.2, label='95% CI (Exog Model)')

ax.set_title('Forecast Comparison: With vs Without Exogenous Variables')
ax.set_xlabel('Date')
ax.set_ylabel('Yield (%)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
def _mae_rmse(actual, predicted):
    """Return (mean absolute error, root mean squared error) of predicted vs actual."""
    errors = actual - predicted
    return np.mean(np.abs(errors)), np.sqrt(np.mean(errors**2))


# Calculate and compare metrics on the same held-out observations
actual_values = test_data['target'][:len(forecasts)]

# Baseline model (no exogenous inputs) vs the model with exogenous inputs
mae_basic, rmse_basic = _mae_rmse(actual_values, forecasts)
mae_exog, rmse_exog = _mae_rmse(actual_values, exog_forecasts)

print("Model Comparison:")
print("=" * 50)
print(f"ARIMA (basic):        MAE = {mae_basic:.3f}, RMSE = {rmse_basic:.3f}")
print(f"ARIMA + Exogenous:    MAE = {mae_exog:.3f}, RMSE = {rmse_exog:.3f}")
print("=" * 50)

# Relative RMSE change, expressed as a percentage of the baseline RMSE
if rmse_exog < rmse_basic:
    improvement = ((rmse_basic - rmse_exog) / rmse_basic) * 100
    print(f"✓ Exogenous variables improve RMSE by {improvement:.1f}%")
else:
    degradation = ((rmse_exog - rmse_basic) / rmse_basic) * 100
    print(f"✗ Exogenous variables increase RMSE by {degradation:.1f}%")

## 9. Production Deployment Example

This section demonstrates how to save, load, and use the model in a production environment.

In [None]:
# Save the best model (relative path: written to the current working directory)
model_path = Path('semiconductor_yield_forecast_model.joblib')
best_pipeline.save(model_path)
print(f"Model saved to {model_path}")

# Load the model (simulating production environment)
# NOTE(review): the .joblib suffix suggests pickle-based serialization; if so,
# only load model files from trusted sources — confirm in TimeSeriesPipeline.load.
loaded_pipeline = TimeSeriesPipeline.load(model_path)
print("Model loaded successfully")

# Verify the loaded model works by forecasting a short horizon
test_forecast = loaded_pipeline.predict(horizon=5, return_conf_int=True)
print(f"\nTest forecast (next 5 periods):")
# Walk the point forecast and its lower/upper bounds together, period by period
for i, (pred, lower, upper) in enumerate(zip(
    test_forecast['forecasts'],
    test_forecast['confidence_intervals']['lower'],
    test_forecast['confidence_intervals']['upper']
)):
    print(f"Period {i+1}: {pred:.2f} [{lower:.2f}, {upper:.2f}]")

## 10. Key Insights and Recommendations

Based on our analysis, here are the key takeaways for semiconductor time series forecasting:

In [None]:
# Static takeaways, except section 2, whose wording depends on the measured result.
print("Key Insights from Time Series Analysis:")
print("=" * 60)
print("\n1. SEASONALITY:")
print("   - Clear daily patterns in semiconductor data")
print("   - SARIMA models capture seasonality better than basic ARIMA")
print("   - Consider 24-hour cycles for hourly data")

print("\n2. EXOGENOUS VARIABLES:")
# Judge on RMSE, the same criterion used to rank the candidate models and to
# report the exogenous-variable comparison, so the two verdicts cannot disagree.
if rmse_exog < rmse_basic:
    print("   ✓ Process parameters (temperature, pressure) improve forecasts")
    print("   ✓ Include correlated process variables when available")
else:
    print("   - Process parameters may add noise in this dataset")
    print("   - Careful selection of exogenous variables is important")

print("\n3. MODEL VALIDATION:")
print("   - Always use time-ordered train/test splits")
print("   - Monitor residual autocorrelation")
print("   - Check prediction intervals for uncertainty quantification")

print("\n4. PRODUCTION RECOMMENDATIONS:")
print("   - Retrain models regularly with new data")
print("   - Monitor forecast accuracy drift over time")
print("   - Implement alerts based on prediction intervals")
print("   - Consider ensemble methods for robustness")

print("\n5. MANUFACTURING-SPECIFIC CONSIDERATIONS:")
print("   - Account for maintenance events and tool changes")
print("   - Consider forecast reconciliation with physical constraints")
print("   - Use PWS (Prediction Within Spec) for process control")
print("   - Integrate with existing SPC systems")

## Summary

This notebook demonstrated:

1. **Data Generation**: Created realistic semiconductor time series with trends and seasonality
2. **Stationarity Testing**: Used ADF tests to check stationarity requirements
3. **Seasonal Decomposition**: Analyzed trend, seasonal, and residual components
4. **Model Comparison**: Evaluated different ARIMA and SARIMA configurations
5. **Forecasting**: Generated predictions with confidence intervals
6. **Residual Analysis**: Validated model assumptions through diagnostic tests
7. **Exogenous Variables**: Demonstrated incorporating process parameters
8. **Production Workflow**: Showed model persistence and deployment patterns

The techniques shown here can be applied to various semiconductor manufacturing time series problems including tool drift detection, SPC monitoring, and yield forecasting.

**Next Steps**: 
- Explore more advanced models (VAR, state space models)
- Implement real-time forecasting systems
- Integrate with manufacturing execution systems (MES)
- Develop automated model retraining pipelines