In [None]:
# -*- coding: utf-8 -*-
"""
Batch Feature Engineering for Stock Data
"""

# Mount Google Drive at /content/drive. The input/output directories configured
# later in this notebook live under this mount point, so this must run first.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime; confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
import os
import glob

Mounted at /content/drive


In [None]:
# Define input and output directories (both under the mounted Drive path).
#   input_dir  - folder that is scanned for the raw per-stock '*.csv' files.
#   output_dir - folder that receives the 'Transformed_<name>.csv' results.
input_dir = '/content/drive/MyDrive/ci/'
output_dir = '/content/drive/MyDrive/ci_fe/'

# Create output directory if it doesn't exist; exist_ok=True keeps this cell
# safe to re-run when the folder is already present.
os.makedirs(output_dir, exist_ok=True)

In [None]:

# Canonical column layout every input file is normalised to.
expected_columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']


def _standardize_ohlcv_columns(frame, expected):
    """Return a copy of ``frame`` whose leading columns are exactly ``expected``.

    Columns are matched to the canonical names by header first (exact match,
    then ignoring case and surrounding whitespace). Matching by name keeps a
    file laid out as ``Date, Open, High, Low, Close, Adj Close, Volume`` from
    having its ``Adj Close`` column silently relabelled as ``Volume``.

    Only when the headers cannot be matched by name does the function fall back
    to relabelling the first ``len(expected)`` columns by position.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw frame as loaded from the CSV file. It is not modified.
    expected : list of str
        Canonical column names, in the desired output order.

    Returns
    -------
    pandas.DataFrame or None
        A new frame with the canonical columns first and any extra columns
        after them, or ``None`` when the file has too few columns to be usable.
    """
    claimed = {}  # canonical name -> source column that provides it

    # Pass 1: exact header matches win, so 'Close' is never displaced by 'close'.
    for col in frame.columns:
        if col in expected and col not in claimed:
            claimed[col] = col

    # Pass 2: case/whitespace-insensitive matches fill whatever is still missing.
    canonical_by_key = {name.lower(): name for name in expected}
    for col in frame.columns:
        target = canonical_by_key.get(str(col).strip().lower())
        if target is not None and target not in claimed:
            claimed[target] = col

    if len(claimed) == len(expected):
        renamed = frame.rename(columns={src: name for name, src in claimed.items()})
        extras = [col for col in renamed.columns if col not in expected]
        # .copy() detaches the selection so later column assignment is warning-free.
        return renamed[list(expected) + extras].copy()

    # Headers not recognised: fall back to the positional relabelling.
    if len(frame.columns) >= len(expected):
        relabeled = frame.copy()
        relabeled.columns = list(expected) + list(frame.columns[len(expected):])
        return relabeled

    return None


def _add_technical_indicators(frame):
    """Return a copy of ``frame`` with the technical-indicator columns appended.

    Expects ``Close``, ``High`` and ``Low`` columns on rows that are already in
    chronological order. Adds ``SMA_20``, ``Stochastic_%K``, ``Stochastic_%D``,
    ``MACD`` and ``MACD_Signal``. Rows inside the rolling warm-up period carry
    NaN in the rolling-based columns; the caller decides how to handle them.
    """
    enriched = frame.copy()
    close = enriched["Close"]

    # Simple Moving Average (SMA) over 20 rows.
    enriched["SMA_20"] = close.rolling(window=20).mean()

    # Stochastic Oscillator (20-row window). A flat window (high == low) would
    # divide by zero, so %K is set to 0 there. During warm-up the range is NaN;
    # NaN != 0 is True, so the NaN quotient is kept for those rows.
    low_20 = enriched["Low"].rolling(window=20).min()
    high_20 = enriched["High"].rolling(window=20).max()
    price_range = high_20 - low_20
    enriched["Stochastic_%K"] = np.where(price_range != 0,
                                         ((close - low_20) / price_range) * 100,
                                         0)
    enriched["Stochastic_%D"] = enriched["Stochastic_%K"].rolling(window=3).mean()

    # MACD (Moving Average Convergence Divergence) with EMA spans 20/50 and a
    # 9-span signal line. These spans are kept as originally specified; they
    # are not the commonly quoted 12/26 pair.
    short_ema = close.ewm(span=20, adjust=False).mean()
    long_ema = close.ewm(span=50, adjust=False).mean()
    enriched["MACD"] = short_ema - long_ema
    enriched["MACD_Signal"] = enriched["MACD"].ewm(span=9, adjust=False).mean()

    return enriched


# Get list of all CSV files in the input directory (sorted so the processing
# order does not depend on the filesystem's listing order).
csv_files = sorted(glob.glob(os.path.join(input_dir, '*.csv')))
if not csv_files:
    print(f"No CSV files found in {input_dir}")

# Names of input files for which no output was written (skipped or failed).
unwritten_files = []

# Process each file
for file_path in csv_files:
    # Extract filename without path and extension
    filename = os.path.basename(file_path)
    stock_name = os.path.splitext(filename)[0]

    print(f"Processing {filename}...")

    try:
        # Load data and normalise the column names/layout
        data = _standardize_ohlcv_columns(pd.read_csv(file_path), expected_columns)
        if data is None:
            print(f"Skipping {filename}: Missing expected columns")
            unwritten_files.append(filename)
            continue

        # Convert Date to datetime and use it as a chronologically sorted index;
        # the rolling/EWM indicators below are only meaningful on ordered rows.
        # A stable sort keeps the file order of rows that share a timestamp.
        data['Date'] = pd.to_datetime(data['Date'])
        data = data.set_index('Date').sort_index(kind='mergesort')

        # Feature engineering
        data = _add_technical_indicators(data)

        # Drop rows containing NaN (this includes the indicator warm-up rows)
        data = data.dropna()
        if data.empty:
            # e.g. fewer rows than the 20-row window: nothing useful to save.
            print(f"Skipping {filename}: no rows left after dropping NaN values")
            unwritten_files.append(filename)
            continue

        # Save the transformed data
        output_path = os.path.join(output_dir, f"Transformed_{stock_name}.csv")
        data.to_csv(output_path)

        print(f"Successfully processed {filename}. Output saved to {output_path}")

    except Exception as e:
        # Best-effort batch: report the problem and move on to the next file.
        print(f"Error processing {filename}: {str(e)}")
        unwritten_files.append(filename)

if unwritten_files:
    print(f"No output written for {len(unwritten_files)} file(s): {', '.join(unwritten_files)}")
print("Feature engineering completed for all files.")

Processing ci_Apple_20120101_to_20241212.csv...
Successfully processed ci_Apple_20120101_to_20241212.csv. Output saved to /content/drive/MyDrive/ci_fe/Transformed_ci_Apple_20120101_to_20241212.csv
Processing ci_Microsoft_20120101_to_20241212.csv...
Successfully processed ci_Microsoft_20120101_to_20241212.csv. Output saved to /content/drive/MyDrive/ci_fe/Transformed_ci_Microsoft_20120101_to_20241212.csv
Processing ci_Amazon_20120101_to_20241212.csv...
Successfully processed ci_Amazon_20120101_to_20241212.csv. Output saved to /content/drive/MyDrive/ci_fe/Transformed_ci_Amazon_20120101_to_20241212.csv
Processing ci_Google_20120101_to_20241212.csv...
Successfully processed ci_Google_20120101_to_20241212.csv. Output saved to /content/drive/MyDrive/ci_fe/Transformed_ci_Google_20120101_to_20241212.csv
Feature engineering completed for all files.
