In [2]:
import requests
import pandas as pd

# Define SEC API URL (for example, Apple Inc.'s CIK: 0000320193)
SEC_API_URL = "https://data.sec.gov/submissions/CIK0000320193.json"

# Set your User-Agent (required by SEC)
headers = {
    'User-Agent': 'Hasibur Rashid Mahi hmahi@mtu.edu'
}

# Make the request to the SEC API
response = requests.get(SEC_API_URL, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    
    # Access filings from the data
    filings = data['filings']['recent']
    
    # Convert to DataFrame for better visualization or CSV export
    filings_df = pd.DataFrame(filings)
    
    # Display the first few rows of the filings
    print(filings_df.head())
    
    # Save the data to CSV if needed
    filings_df.to_csv('sec_filings.csv', index=False)
else:
    print(f"Error fetching data: {response.status_code}")


        accessionNumber  filingDate  reportDate        acceptanceDateTime act  \
0  0001958244-24-005135  2024-10-04              2024-10-04T17:04:02.000Z  33   
1  0000320193-24-000112  2024-10-03  2024-10-01  2024-10-03T18:31:01.000Z       
2  0000320193-24-000111  2024-10-03  2024-10-01  2024-10-03T18:30:50.000Z       
3  0000320193-24-000110  2024-10-03  2024-10-01  2024-10-03T18:30:40.000Z       
4  0000320193-24-000109  2024-10-03  2024-10-01  2024-10-03T18:30:30.000Z       

  form fileNumber filmNumber items core_type   size  isXBRL  isInlineXBRL  \
0  144  001-36743  241355561             144   5409       0             0   
1    4                                     4  15107       0             0   
2    4                                     4  10988       0             0   
3    4                                     4  20468       0             0   
4    4                                     4  18949       0             0   

                      primaryDocument primaryDocDe

In [1]:
import requests
import pandas as pd
import time

# Define SEC API URL (for example, Apple Inc.'s CIK: 0000320193)
SEC_API_URL = "https://data.sec.gov/submissions/CIK0000320193.json"

# Set your User-Agent (required by SEC)
headers = {
    'User-Agent': 'Hasibur Rashid Mahi hmahi@mtu.edu'
}

# Function to fetch filings data and save it to CSV
def fetch_sec_data(url, headers, output_csv, target_count=10000):
    # Initialize an empty DataFrame to store all filings
    all_filings_df = pd.DataFrame()

    # Keep track of how many filings have been fetched so far
    fetched_count = 0
    page = 0

    while fetched_count < target_count:
        print(f"Fetching page {page + 1}...")
        response = requests.get(url, headers=headers)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the JSON response
            data = response.json()

            # Access filings from the data
            filings = data['filings']['recent']

            # Convert to DataFrame
            filings_df = pd.DataFrame(filings)

            # Append to the full DataFrame
            all_filings_df = pd.concat([all_filings_df, filings_df], ignore_index=True)
            
            # Update the count and page for fetching data
            fetched_count = len(all_filings_df)
            page += 1

            # Break if we have enough data
            if fetched_count >= target_count:
                break
        else:
            print(f"Error fetching data: {response.status_code}")
            break

        # Sleep for a while to avoid hitting the SEC rate limit
        time.sleep(1)

    # Truncate to the target count if necessary
    all_filings_df = all_filings_df.head(target_count)

    # Save the data to CSV
    all_filings_df.to_csv(output_csv, index=False)
    print(f"Saved {len(all_filings_df)} filings to {output_csv}")

# Fetch and save the dataset
fetch_sec_data(SEC_API_URL, headers, 'sec_filings_10000.csv', target_count=10000)


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Saved 10000 filings to sec_filings_10000.csv


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from scipy.stats import zscore
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller

# Step 1: Load the SEC dataset
data = pd.read_csv('sec_filings_10000.csv')

In [5]:
# Step 2: Handle Missing Values
# Fill missing values with forward fill method initially, then use KNN imputer for critical columns
critical_columns = ['revenue', 'assets', 'netIncome']  # Example critical columns

# Forward fill for general missing values
data = data.ffill()

# Check if critical columns exist in the dataset before applying KNN Imputer
existing_critical_columns = [col for col in critical_columns if col in data.columns]
if existing_critical_columns:
    # KNN Imputer for critical columns
    imputer = KNNImputer(n_neighbors=5)
    data[existing_critical_columns] = imputer.fit_transform(data[existing_critical_columns])

In [6]:
# Step 3: Parse Dates
# Convert filingDate to datetime object if available
if 'filingDate' in data.columns:
    data['filingDate'] = pd.to_datetime(data['filingDate'], errors='coerce')

    # Drop rows with invalid dates
    data.dropna(subset=['filingDate'], inplace=True)

    # Create new time-based features
    data['year'] = data['filingDate'].dt.year
    data['month'] = data['filingDate'].dt.month
    data['quarter'] = data['filingDate'].dt.quarter
    data['day_of_week'] = data['filingDate'].dt.dayofweek

In [7]:
# Step 4: Set filingDate as Index
# Sort the data by filingDate and set it as index for time series analysis
if 'filingDate' in data.columns:
    data.set_index('filingDate', inplace=True)
    data.sort_index(inplace=True)

# Step 5: Handle Categorical Variables
# Convert form type to dummy variables if it exists in the dataset
if 'form' in data.columns:
    data = pd.get_dummies(data, columns=['form'], drop_first=True)

In [8]:
# Step 6: Outlier Detection and Treatment
# Use Z-score to identify and cap outliers for numerical features
numeric_features = data.select_dtypes(include=['float64', 'int64']).columns
for col in numeric_features:
    data[col] = np.where(np.abs(zscore(data[col])) > 3, data[col].median(), data[col])

# Step 7: Feature Engineering
# Create rolling features and lag features for time series analysis
rolling_window = 3  # Example 3-month rolling window
for col in existing_critical_columns:
    data[f'{col}_rolling_mean'] = data[col].rolling(window=rolling_window).mean()
    data[f'{col}_lag_1'] = data[col].shift(1)

In [9]:
# Step 8: Stationarity Check and Transformation
# Check for stationarity using Augmented Dickey-Fuller test if 'revenue' column exists
if 'revenue' in data.columns:
    adf_result = adfuller(data['revenue'].dropna())
    if adf_result[1] > 0.05:
        # If p-value > 0.05, the series is non-stationary; apply differencing
        data['revenue_diff'] = data['revenue'].diff().dropna()

# Step 9: Interaction Features
# Create interaction features between multiple financial metrics if columns exist
if 'revenue' in data.columns and 'assets' in data.columns:
    data['revenue_assets_ratio'] = data['revenue'] / data['assets']

In [10]:
# Step 10: Feature Scaling
# Identify numerical features for scaling
numeric_features = data.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

# Step 11: Save the Preprocessed Dataset
data.to_csv('sec_filings_10000_preprocessed.csv')
print("Preprocessing completed and saved to sec_filings_10000_preprocessed.csv")

Preprocessing completed and saved to sec_filings_10000_preprocessed.csv
