# Linear Regression Training Pipeline

This notebook implements an end-to-end Linear Regression training pipeline with:
- Data loading and exploration
- Preprocessing (missing values, categorical encoding)
- Train-test split
- Feature scaling (fitted on the training split only)
- Model training
- Hyperparameter tuning with Optuna
- Model evaluation and validation

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine learning libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# Hyperparameter optimization
import optuna

# Utilities
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

print("Libraries imported successfully!")

## 2. Load and Explore Dataset

In [None]:
# Read the dataset from disk, falling back to generated data when it is absent
try:
    df = pd.read_csv("data.csv")
except FileNotFoundError:
    print("Error: data.csv file not found. Please ensure the file exists in the current directory.")
    print("For demonstration purposes, creating a synthetic dataset...")

    # Re-seed here so the generated rows are the same on every run of this cell
    np.random.seed(42)
    n_samples = 1000

    # The draw order below fixes the generated values: four numeric features,
    # then the categorical label, then the noise term added to the target
    synthetic_columns = {
        'feature_1': np.random.normal(50, 15, n_samples),
        'feature_2': np.random.normal(100, 25, n_samples),
        'feature_3': np.random.uniform(0, 100, n_samples),
        'feature_4': np.random.exponential(2, n_samples),
        'category': np.random.choice(['A', 'B', 'C'], n_samples),
    }
    target_noise = np.random.normal(0, 10, n_samples)
    df = pd.DataFrame(synthetic_columns)

    # The target is a noisy linear combination of the first three features;
    # feature_4 and category carry no signal
    linear_signal = df['feature_1'] * 0.5 + df['feature_2'] * 0.3 + df['feature_3'] * 0.2
    df['target'] = linear_signal + target_noise

    print(f"Synthetic dataset created! Shape: {df.shape}")
else:
    print(f"Dataset loaded successfully! Shape: {df.shape}")

In [None]:
# Print a quick overview of the dataset: first rows, schema, summary
# statistics and per-column missing-value counts
section_break = "\n" + "="*50 + "\n"

print("Dataset Head:")
print(df.head())
print(section_break)

print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() would emit a stray "None" line after the report
df.info()
print(section_break)

print("Dataset Description:")
print(df.describe())
print(section_break)

print("Missing Values Count:")
missing_values = df.isnull().sum()
print(missing_values)
print(f"\nTotal missing values: {missing_values.sum()}")

## 3. Data Preprocessing

In [None]:
# Work on a copy so the raw `df` stays untouched
df_processed = df.copy()

# Handle missing values (simple approach): median for numeric columns,
# most frequent value for object (string) columns
print("Handling missing values...")
if df_processed.isnull().sum().sum() > 0:
    numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df_processed[col].isnull().any():
            # Assign the result back instead of calling fillna(inplace=True) on
            # the selected column: the chained in-place form is deprecated and
            # does not update the parent frame under pandas Copy-on-Write
            df_processed[col] = df_processed[col].fillna(df_processed[col].median())

    categorical_cols = df_processed.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df_processed[col].isnull().any():
            modes = df_processed[col].mode()
            # mode() is empty when every value is missing; leave such a column
            # as-is rather than failing on modes[0]
            if not modes.empty:
                df_processed[col] = df_processed[col].fillna(modes.iloc[0])

    print("Missing values handled.")
else:
    print("No missing values found.")

# Numeric columns are the candidate features (the target is removed below)
numeric_columns = df_processed.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumeric columns found: {numeric_columns}")

if not numeric_columns:
    # Fail with a clear message instead of an IndexError on numeric_columns[-1]
    raise ValueError("No numeric columns found; cannot select a regression target.")

# Prefer an explicitly named target column; otherwise fall back to the last
# numeric column (edit `target_column` manually if that guess is wrong)
if 'target' in numeric_columns:
    target_column = 'target'
elif 'Target' in numeric_columns:
    target_column = 'Target'
else:
    target_column = numeric_columns[-1]

print(f"Target column: {target_column}")

# Create feature matrix X and target vector y
feature_columns = [col for col in numeric_columns if col != target_column]
print(f"Feature columns: {feature_columns}")

X = df_processed[feature_columns]
y = df_processed[target_column]

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

In [None]:
# Optional: one-hot encode object (string) columns and add them to the features
categorical_cols = df_processed.select_dtypes(include=['object']).columns.tolist()
if categorical_cols:
    print(f"Categorical columns found: {categorical_cols}")
    print("Encoding categorical variables using one-hot encoding...")

    # drop_first=True drops one level per column, avoiding perfectly collinear
    # dummy columns in the linear model
    categorical_data = pd.get_dummies(df_processed[categorical_cols], drop_first=True)

    # Rebuild X from the numeric feature columns instead of extending the
    # current X, so re-running this cell does not append the dummy columns a
    # second time
    X = pd.concat([df_processed[feature_columns], categorical_data], axis=1)

    print(f"Updated feature matrix shape: {X.shape}")
    print(f"New feature columns: {list(X.columns)}")
else:
    print("No categorical columns found.")

## 4. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# repeatable. No stratification is applied because the target is continuous.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=None)
X_train, X_test, y_train, y_test = split_parts

n_train_rows = len(X_train)
n_test_rows = len(X_test)

print(f"Training set shape: X_train {X_train.shape}, y_train {y_train.shape}")
print(f"Testing set shape: X_test {X_test.shape}, y_test {y_test.shape}")
print(f"Train-test split ratio: {n_train_rows}/{n_test_rows} = {n_train_rows/n_test_rows:.2f}")

## 5. Feature Scaling

In [None]:
# Standardise the features; the scaler learns its statistics from the
# training split only, so nothing about the test rows leaks into it
scaler = StandardScaler()

print("Fitting scaler on training data...")
scaler.fit(X_train)

# Apply the fitted scaler to both splits and wrap the resulting arrays back
# into DataFrames that keep the original column names and row indices
print("Scaling training and testing data...")
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

print(f"Scaled training data shape: {X_train_scaled.shape}")
print(f"Scaled testing data shape: {X_test_scaled.shape}")

# Compare column-averaged mean/std before and after scaling
print("\nScaling statistics (training data):")
for stage_label, stage_frame in (("Original", X_train), ("Scaled", X_train_scaled)):
    print(f"{stage_label} data mean: {stage_frame.mean().mean():.4f}")
    print(f"{stage_label} data std: {stage_frame.std().mean():.4f}")

## 6. Basic Linear Regression Model Training

In [None]:
# Fit an ordinary least-squares baseline on the scaled training data
print("Training basic Linear Regression model...")
lr_basic = LinearRegression().fit(X_train_scaled, y_train)

# Report the fitted parameters
print(f"\nModel Intercept: {lr_basic.intercept_:.4f}")
print(f"Number of coefficients: {len(lr_basic.coef_)}")

# Rank features by the magnitude of their coefficient
print("\nModel Coefficients:")
coefficients_df = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': lr_basic.coef_})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values('Abs_Coefficient', ascending=False)
)
print(coefficients_df)

# Baseline predictions on both splits, used later for evaluation
y_train_pred_basic, y_test_pred_basic = (
    lr_basic.predict(features) for features in (X_train_scaled, X_test_scaled)
)

print("\nBasic model predictions completed.")

## 7. Hyperparameter Tuning with Optuna

In [None]:
# Define the objective function for Optuna optimization
def objective(trial, cv_folds=5):
    """
    Objective function for Optuna hyperparameter optimization.

    Samples a model family (plain, Ridge or Lasso linear regression) and its
    hyperparameters, then scores the candidate with k-fold cross-validation
    on the training data only. The test split is deliberately not used here:
    picking hyperparameters by test-set error and then reporting that same
    test-set error would give an optimistically biased final evaluation.

    Args:
        trial: The Optuna trial used to sample hyperparameters.
        cv_folds: Number of cross-validation folds (default 5).

    Returns:
        Mean cross-validated RMSE across the folds (lower is better).

    Note:
        X_train_scaled was standardised with statistics from the whole
        training split, so the folds share scaling statistics; wrap the
        scaler and model in a Pipeline if fully isolated folds are required.
    """

    # Suggest model type
    model_type = trial.suggest_categorical('model_type', ['linear', 'ridge', 'lasso'])

    if model_type == 'linear':
        # Linear Regression parameters
        fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
        positive = trial.suggest_categorical('positive', [True, False])

        model = LinearRegression(
            fit_intercept=fit_intercept,
            positive=positive
        )

    elif model_type == 'ridge':
        # Ridge Regression parameters (regularisation strength on a log scale)
        alpha = trial.suggest_float('alpha', 0.01, 100.0, log=True)
        fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])

        model = Ridge(
            alpha=alpha,
            fit_intercept=fit_intercept,
            random_state=42
        )

    else:  # lasso
        # Lasso Regression parameters (regularisation strength on a log scale)
        alpha = trial.suggest_float('alpha', 0.01, 10.0, log=True)
        fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])

        model = Lasso(
            alpha=alpha,
            fit_intercept=fit_intercept,
            random_state=42,
            max_iter=2000
        )

    # scikit-learn scorers follow "greater is better", so RMSE is exposed as
    # its negative; flip the sign back to get the quantity to minimize
    fold_scores = cross_val_score(
        model,
        X_train_scaled,
        y_train,
        cv=cv_folds,
        scoring='neg_root_mean_squared_error',
    )

    return float(-fold_scores.mean())

# Create and run the optimization study
print("Starting hyperparameter optimization with Optuna...")
print("This may take a few minutes...")

# Seed the sampler explicitly: Optuna's sampler keeps its own random state, so
# np.random.seed() alone does not make the search reproducible
study = optuna.create_study(
    direction='minimize',
    study_name='linear_regression_optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

# Best hyperparameters found and the objective value they achieved
best_params = study.best_params
best_value = study.best_value

print("\nOptimization completed!")
print(f"Best RMSE: {best_value:.4f}")
print(f"Best parameters: {best_params}")

In [None]:
# Rebuild the winning estimator from the study's best hyperparameters
print("Training final model with best parameters...")

best_model_type = best_params['model_type']
best_fit_intercept = best_params['fit_intercept']

if best_model_type == 'linear':
    final_model = LinearRegression(
        fit_intercept=best_fit_intercept,
        positive=best_params['positive'],
    )
elif best_model_type == 'ridge':
    final_model = Ridge(
        alpha=best_params['alpha'],
        fit_intercept=best_fit_intercept,
        random_state=42,
    )
else:  # lasso
    final_model = Lasso(
        alpha=best_params['alpha'],
        fit_intercept=best_fit_intercept,
        random_state=42,
        max_iter=2000,
    )

# Fit on the scaled training data
final_model.fit(X_train_scaled, y_train)

# Predictions on both splits, used for the evaluation that follows
y_train_pred_final, y_test_pred_final = (
    final_model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

print("Final model training completed!")

## 8. Model Evaluation and Validation

In [None]:
def calculate_metrics(y_true, y_pred, model_name="Model"):
    """Build a one-row summary of regression metrics.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values.
        model_name: Label stored under the 'Model' key.

    Returns:
        dict with keys 'Model', 'MAE', 'MSE', 'RMSE' and 'R²'.
    """
    squared_error = mean_squared_error(y_true, y_pred)

    summary = {'Model': model_name}
    summary['MAE'] = mean_absolute_error(y_true, y_pred)
    summary['MSE'] = squared_error
    # RMSE is derived from the MSE rather than computed separately
    summary['RMSE'] = np.sqrt(squared_error)
    summary['R²'] = r2_score(y_true, y_pred)
    return summary

# Score the baseline and the tuned model on both splits
evaluation_runs = (
    ("Basic LR (Train)", y_train, y_train_pred_basic),
    ("Basic LR (Test)", y_test, y_test_pred_basic),
    ("Optimized (Train)", y_train, y_train_pred_final),
    ("Optimized (Test)", y_test, y_test_pred_final),
)
basic_train_metrics, basic_test_metrics, final_train_metrics, final_test_metrics = (
    calculate_metrics(actual, predicted, label)
    for label, actual, predicted in evaluation_runs
)

# One row per (model, split) combination
results_df = pd.DataFrame(
    [basic_train_metrics, basic_test_metrics, final_train_metrics, final_test_metrics]
)

print("Model Evaluation Results:")
print("="*60)
print(results_df.round(4))

In [None]:
# Detailed results summary: dataset sizes, chosen configuration, test-set
# metrics, a coarse quality label and a train-vs-test generalization check
print("\n" + "="*60)
print("DETAILED EVALUATION SUMMARY")
print("="*60)

print("\nDataset Information:")
print(f"- Total samples: {len(df)}")
# Count the columns the model was actually trained on; `feature_columns` holds
# only the numeric inputs and would miss any one-hot encoded columns in X
print(f"- Features: {X_train.shape[1]}")
print(f"- Training samples: {len(X_train)}")
print(f"- Testing samples: {len(X_test)}")

print("\nBest Model Configuration:")
print(f"- Model type: {best_params['model_type'].upper()}")
for param, value in best_params.items():
    if param != 'model_type':
        print(f"- {param}: {value}")

print("\nFinal Model Performance (Test Set):")
print(f"- Mean Absolute Error (MAE): {final_test_metrics['MAE']:.4f}")
print(f"- Mean Squared Error (MSE): {final_test_metrics['MSE']:.4f}")
print(f"- Root Mean Squared Error (RMSE): {final_test_metrics['RMSE']:.4f}")
print(f"- R² Score: {final_test_metrics['R²']:.4f}")

# Map the test R² onto a coarse qualitative label (rule-of-thumb thresholds)
r2_test = final_test_metrics['R²']
if r2_test > 0.9:
    performance = "Excellent"
elif r2_test > 0.8:
    performance = "Good"
elif r2_test > 0.6:
    performance = "Moderate"
else:
    performance = "Poor"

print(f"\nModel Performance Assessment: {performance}")
print(f"- The model explains {r2_test*100:.2f}% of the variance in the target variable")

# Overfitting check: flag a train R² that exceeds the test R² by more than 0.1
train_r2 = final_train_metrics['R²']
r2_diff = train_r2 - r2_test
if r2_diff > 0.1:
    print(f"- Warning: Potential overfitting detected (Train R²: {train_r2:.4f}, Test R²: {r2_test:.4f})")
else:
    print(f"- Good generalization (Train R²: {train_r2:.4f}, Test R²: {r2_test:.4f})")

## 9. Model Output and Summary

In [None]:
# Pair each test observation with its prediction and residual, then order the
# rows so the worst misses come first
actual_values = y_test.values
predictions_df = pd.DataFrame({
    'Actual': actual_values,
    'Predicted': y_test_pred_final,
    'Residual': actual_values - y_test_pred_final,
})
predictions_df['Abs_Residual'] = predictions_df['Residual'].abs()
predictions_df = predictions_df.sort_values('Abs_Residual', ascending=False)

print("Sample Predictions (Top 10 largest residuals):")
print(predictions_df.head(10).round(4))

residual_column = predictions_df['Residual']
print("\nPrediction Statistics:")
print(f"- Mean Residual: {residual_column.mean():.4f}")
print(f"- Std Residual: {residual_column.std():.4f}")
print(f"- Max Absolute Residual: {predictions_df['Abs_Residual'].max():.4f}")

# Closing banner listing the pipeline stages that ran
banner_rule = "="*60
print("\n" + banner_rule)
print("LINEAR REGRESSION PIPELINE COMPLETED SUCCESSFULLY!")
print(banner_rule)
completed_steps = (
    "Data loaded and preprocessed",
    "Features scaled using StandardScaler",
    "Train-test split completed (80-20)",
    "Basic Linear Regression model trained",
    "Hyperparameter optimization with Optuna completed",
    "Final optimized model evaluated",
)
for step_text in completed_steps:
    print(f"✓ {step_text}")
print(f"✓ Model performance: R² = {final_test_metrics['R²']:.4f}")

In [None]:
# Optional: save the model and predictions with joblib
import joblib

# export model using joblib