# Semiconductor Yield Prediction - EDA & Preprocessing

This notebook covers:
1. Data Loading & Integration
2. Exploratory Data Analysis (EDA)
3. Feature Engineering
4. Data Preprocessing
5. Saving the Preprocessed Data

**Note:** Common functions are imported from `utils.py`

## 1. Setup & Data Loading

In [None]:
# Import utility functions
# NOTE(review): no other import in this notebook provides the helpers used below
# (load_data, make_dataset, gen_*_feats, clipping, prep_cate_feats, ...) or the
# pd / np / plt / sns / scikit-learn names, so they presumably all arrive through
# this star import — confirm against utils.py and consider explicit imports.
from utils import *

# Additional imports for this notebook
import warnings
# Suppresses every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')

In [None]:
# Read the three raw tables from the raw-data directory.
path = "../02_Data/raw/"
train_sensor, train_quality, predict_sensor = load_data(path)

# Report the dimensions of each table, one per line.
print(
    f"Train Sensor Shape: {train_sensor.shape}",
    f"Train Quality Shape: {train_quality.shape}",
    f"Predict Sensor Shape: {predict_sensor.shape}",
    sep="\n",
)

In [None]:
# Build the integrated tables. make_dataset receives the sensor and quality
# tables for the training set, and only the sensor table for the prediction set.
train = make_dataset(train_sensor, train_quality)
predict = make_dataset(predict_sensor)

print(
    f"\nTrain Shape: {train.shape}",
    f"Predict Shape: {predict.shape}",
    sep="\n",
)

## 2. Exploratory Data Analysis

In [None]:
# Per-column descriptive statistics, computed against the target column 'y'.
details = describe_(train, 'y')

# Show the 20 rows with the largest 'corr y' value.
details_by_corr = details.sort_values(by='corr y', ascending=False)
display(details_by_corr.head(20))

In [None]:
# EDA works on its own copy of the integrated training table.
df_eda = train.copy()

# Column groups as reported by the utils helpers.
col_sensor = get_sensor_columns(df_eda)
col_time = get_time_columns(df_eda)
print(
    f"Sensor columns: {len(col_sensor)}",
    f"Time columns: {len(col_time)}",
    sep="\n",
)

# get_sensor_names is given the sensor columns together with LST_STEPS.
sensors_nm, lst_sensors = get_sensor_names(col_sensor, LST_STEPS)
print(f"Unique sensors: {len(sensors_nm)}")

# Parse every time column as datetime, then write the result back.
time_parsed = df_eda[col_time].apply(pd.to_datetime)
df_eda[col_time] = time_parsed

### 2.1 Target Variable Distribution

In [None]:
# Distribution of the target column.
target = df_eda['y']
QQ_plot(target, 'y')

# Number of rows whose target falls below 1240.
n_low_target = int((target < 1240).sum())

print(f"\nTarget Statistics:")
print(f"Mean: {target.mean():.2f}")
print(f"Std: {target.std():.2f}")
print(f"Min: {target.min():.2f}")
print(f"Max: {target.max():.2f}")
print(f"\nOutliers (y < 1240): {n_low_target}")

### 2.2 Equipment Effect Analysis

In [None]:
# Add the equipment category feature to the EDA frame.
df_eda = gen_cate_feats(df_eda)

# Module analysis
print(
    f"Unique modules: {df_eda['module_name'].nunique()}",
    f"Equipment categories: {df_eda['module_name_eq'].unique()}",
    sep="\n",
)

# Target distribution per equipment category, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(x='module_name_eq', y='y', data=df_eda, ax=ax)
ax.set_title('Quality Index by Equipment Category')
plt.show()

### 2.3 Process Duration Analysis

In [None]:
# Generate duration features
df_eda = gen_duration_feats(df_eda, LST_STEPSGAP)

# Duration statistics: select 'gen_tmdiff' itself and the 'gen_tmdiff_<digit>...'
# columns. The pattern is a raw string so that '\d' reaches the regex engine
# as written instead of relying on Python tolerating an invalid string escape
# (a DeprecationWarning, and a SyntaxWarning from Python 3.12 on).
col_tmdiff = df_eda.filter(regex=r'gen_tmdiff($|_\d)').columns.tolist()

def tmdiff_stats(x):
    """Return ``[min, max, mean]`` of ``x``, each divided by 60 and rounded to 1 decimal.

    The division by 60 matches the '(min)' labels of the summary table below;
    presumably the durations are stored in seconds — confirm against
    gen_duration_feats.
    """
    return [round(x.min()/60, 1), round(x.max()/60, 1), round(x.mean()/60, 1)]

# After the transpose: one row per duration column, one column per statistic.
df_tmp = df_eda[col_tmdiff].apply(tmdiff_stats).T
df_tmp.columns = ['MIN (min)', 'MAX (min)', 'MEAN (min)']
display(df_tmp)

In [None]:
# Duration vs Quality visualization
# Label each row by its total duration. The cut-off value itself goes to 'L':
# with strict '<' and '>' a duration of exactly 1870 received no label (NaN)
# and was left out of the hue grouping.
df_eda.loc[df_eda['gen_tmdiff'] < 1870, 'tmdiff_speed'] = 'E'   # Early
df_eda.loc[df_eda['gen_tmdiff'] >= 1870, 'tmdiff_speed'] = 'L'  # Late

# Same scatter twice, coloured by speed label (left) and equipment (right).
fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(14, 5))
sns.scatterplot(x='gen_tmdiff', y='y', hue='tmdiff_speed', data=df_eda, ax=axes[0])
sns.scatterplot(x='gen_tmdiff', y='y', hue='module_name_eq', data=df_eda, ax=axes[1])
axes[0].set_title('Duration vs Quality (by Speed)')
axes[1].set_title('Duration vs Quality (by Equipment)')
# Move the equipment legend outside the plotting area.
axes[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

### 2.4 Sensor Statistical Features

In [None]:
# Add the statistical features to the EDA frame (gen_stats_feats, then gen_avg_feats).
df_eda = gen_stats_feats(df_eda, sensors_nm, LST_STEPS)
df_eda = gen_avg_feats(df_eda, sensors_nm, LST_STEPS)

# 2x2 grid of scatter plots: one std feature per panel against the target.
fig, axes = plt.subplots(nrows=2, ncols=2, sharey=True, figsize=(12, 8))
fig.subplots_adjust(hspace=.4, wspace=.1)

# (feature column, panel title), listed in row-major panel order.
panels = [
    ('gen_hv_para3_std', 'HV Para3 Std vs Quality'),
    ('gen_hv_para45_std', 'HV Para45 Std vs Quality'),
    ('gen_pressure_para91_std', 'Pressure Para91 Std vs Quality'),
    ('gen_time_para5_std', 'Time Para5 Std vs Quality'),
]
for panel_ax, (feature, title) in zip(axes.flat, panels):
    sns.scatterplot(x=feature, y='y', data=df_eda, ax=panel_ax)
    panel_ax.set_title(title)

plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Start preprocessing from fresh copies of the integrated tables.
df_prep_train, df_prep_predict = train.copy(), predict.copy()

# Parse the time columns as datetime in both frames (train first, then predict).
for frame in (df_prep_train, df_prep_predict):
    frame[col_time] = frame[col_time].apply(pd.to_datetime)

In [None]:
# 1. Equipment category features (train first, then predict)
df_prep_train, df_prep_predict = (
    gen_cate_feats(df_prep_train),
    gen_cate_feats(df_prep_predict),
)

# 2. Clip the training target to the closed range [1240, 1500],
#    printing its range before and after.
print(f"Before clipping - y range: [{df_prep_train['y'].min():.2f}, {df_prep_train['y'].max():.2f}]")
df_prep_train['y'] = df_prep_train['y'].clip(lower=1240, upper=1500)
print(f"After clipping - y range: [{df_prep_train['y'].min():.2f}, {df_prep_train['y'].max():.2f}]")

In [None]:
# Steps 3-5, applied in order and to train before predict at every step:
#   3. duration features            (gen_duration_feats)
#   4. statistical features (std)   (gen_stats_feats)
#   5. statistical features (mean)  (gen_avg_feats)
feature_steps = (
    (gen_duration_feats, (LST_STEPSGAP,)),
    (gen_stats_feats, (sensors_nm, LST_STEPS)),
    (gen_avg_feats, (sensors_nm, LST_STEPS)),
)
for build_feats, extra_args in feature_steps:
    df_prep_train = build_feats(df_prep_train, *extra_args)
    df_prep_predict = build_feats(df_prep_predict, *extra_args)

print(f"Features after engineering: {df_prep_train.shape[1]}")

In [None]:
# 6. Speed category
# Rows below the 1870 cut-off are labelled 'E', rows at or above it 'L'.
# The 'L' side is inclusive on purpose: with strict '<' / '>' a duration of
# exactly 1870 stayed NaN, and the column-wise NA drop in section 4.4 would then
# remove 'tmdiff_speed' before it can be one-hot encoded.
for frame in (df_prep_train, df_prep_predict):
    frame.loc[frame['gen_tmdiff'] < 1870, 'tmdiff_speed'] = 'E'
    frame.loc[frame['gen_tmdiff'] >= 1870, 'tmdiff_speed'] = 'L'

## 4. Data Preprocessing

### 4.1 Outlier Treatment

In [None]:
# Numerical columns = raw sensor columns followed by the generated 'gen_' features,
# restricted to names actually present in the training frame.
gen_cols = df_prep_train.filter(regex='^gen_').columns.tolist()
train_columns = set(df_prep_train.columns)
col_numerical = [c for c in col_sensor + gen_cols if c in train_columns]

print(f"Numerical columns to process: {len(col_numerical)}")

# Apply IQR clipping to each frame.
# NOTE(review): clipping() is called separately per frame, so the bounds are
# presumably derived from each frame's own data — confirm whether the predict
# set should reuse the bounds learned on train.
df_prep_train, df_prep_predict = (
    clipping(df_prep_train, col_numerical),
    clipping(df_prep_predict, col_numerical),
)

### 4.2 Variance Threshold

In [None]:
# Remove zero-variance features
# The threshold is fitted on the training frame only; the resulting drop list
# is then applied to both frames so they keep the same feature columns.
thresholder = VarianceThreshold(threshold=0)
numeric_cols = [c for c in col_numerical if c in df_prep_train.columns]
thresholder.fit(df_prep_train[numeric_cols])

# get_support() marks the columns that are kept; invert it to get the ones to drop.
mask = ~thresholder.get_support()
cols_var_drop = np.asarray(numeric_cols)[mask].tolist()

print(f"Features to drop (zero variance): {len(cols_var_drop)}")
if cols_var_drop:
    # Actually remove the columns: previously the list was only printed, so the
    # "Dropped" message was reported while the features stayed in the data.
    df_prep_train = df_prep_train.drop(columns=cols_var_drop)
    # errors='ignore' tolerates a column that is already absent from predict.
    df_prep_predict = df_prep_predict.drop(columns=cols_var_drop, errors='ignore')
    print(f"Dropped: {cols_var_drop[:10]}...")

### 4.3 Multicollinearity (VIF)

In [None]:
# Calculate VIF (this may take a few minutes)
# Candidate features: float columns other than the target. The target is
# excluded by exact name; the previous substring test ('y' not in col) also
# discarded every feature whose name merely contains the letter 'y'.
idx_numerical = [col for col in df_prep_train.columns
                 if df_prep_train[col].dtype == 'float' and col != 'y']

print(f"Calculating VIF for {len(idx_numerical)} features...")

# Note: VIF calculation is computationally expensive
# Uncomment below to run (takes ~10 minutes)
# vif = calculate_vif(df_prep_train, idx_numerical)
# display(vif.head(20))

### 4.4 One-Hot Encoding

In [None]:
# Drop NA columns
# Build one shared drop list so both frames end up with the same feature
# columns. Dropping independently could leave a column in train that predict
# has lost (or vice versa), and selecting the training feature list from
# df_prep_predict would then raise a KeyError.
cols_na_train = df_prep_train.columns[df_prep_train.isna().any()]
cols_na_predict = df_prep_predict.columns[df_prep_predict.isna().any()]

# The target is never removed from train because of NaNs that occur only in predict.
cols_na_for_train = cols_na_train.union(cols_na_predict.difference(['y'], sort=False), sort=False)
cols_na_for_predict = cols_na_train.union(cols_na_predict, sort=False)

# errors='ignore' because a column on the shared list may exist in only one frame.
df_prep_train = df_prep_train.drop(columns=cols_na_for_train, errors='ignore')
df_prep_predict = df_prep_predict.drop(columns=cols_na_for_predict, errors='ignore')

# One-hot encode categorical variables
df_prep_train, df_prep_predict = prep_cate_feats(df_prep_train, df_prep_predict, 'module_name_eq')
df_prep_train, df_prep_predict = prep_cate_feats(df_prep_train, df_prep_predict, 'tmdiff_speed')

print(f"Final train shape: {df_prep_train.shape}")
print(f"Final predict shape: {df_prep_predict.shape}")

### 4.5 Feature Selection

In [None]:
# Define feature columns
# Float columns other than the target. The target is excluded by exact name;
# the previous substring test ('y' not in k) also discarded every feature whose
# name merely contains the letter 'y'.
col_numeric = [k for k in df_prep_train.columns
               if df_prep_train[k].dtype == 'float' and k != 'y']

print(f"Numeric features available: {len(col_numeric)}")

# Seed for the mutual-information estimate. Without it the scores, and therefore
# the selected feature set, can change from one run of the notebook to the next.
MI_RANDOM_STATE = 42

def _seeded_mutual_info(X, y):
    """Score features with mutual_info_regression using a fixed random_state."""
    return mutual_info_regression(X, y, random_state=MI_RANDOM_STATE)

# SelectKBest with Mutual Information; k is capped at the number of available features.
skb = SelectKBest(score_func=_seeded_mutual_info, k=min(250, len(col_numeric)))
skb.fit(df_prep_train[col_numeric], df_prep_train['y'])

# get_support() is a boolean mask aligned with col_numeric.
col_selected = pd.Index(col_numeric)[skb.get_support()].tolist()
print(f"Selected features: {len(col_selected)}")

## 5. Save Preprocessed Data

In [None]:
# Define final feature set
# Columns whose name matches the equipment or speed category features.
col_cate = df_prep_train.filter(regex='module_name_eq|tmdiff_speed').columns.tolist()

# Selected numeric features first, then the category columns. A category column
# that feature selection already picked is not appended again: a repeated name
# would produce duplicate columns in X_train / X_predict. (Whether this can
# happen depends on the dtype prep_cate_feats gives the encoded columns.)
col_X = col_selected + [c for c in col_cate if c not in col_selected]

# Prepare final datasets
X_train = df_prep_train[col_X]
y_train = df_prep_train['y']
X_predict = df_prep_predict[col_X]

print(f"\nFinal Dataset Summary:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print(f"X_predict: {X_predict.shape}")

In [None]:
# Save preprocessed data for modeling notebook
import pickle
from pathlib import Path

# Everything the modeling notebook needs: the final matrices, the feature name
# lists, and the full preprocessed frames.
preprocessed_data = {
    'X_train': X_train,
    'y_train': y_train,
    'X_predict': X_predict,
    'col_X': col_X,
    'col_selected': col_selected,
    'df_prep_train': df_prep_train,
    'df_prep_predict': df_prep_predict
}

output_path = Path('../03_Results/preprocessed_data.pkl')
# Create the results directory if it is missing; otherwise open() raises
# FileNotFoundError on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, 'wb') as f:
    pickle.dump(preprocessed_data, f)

print("Preprocessed data saved to 03_Results/preprocessed_data.pkl")