In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the data. Placeholder strings that can appear in the CSV in place of
# numbers are parsed as NaN so the price columns stay numeric.
data = pd.read_csv('data/SNP500 Historical data.csv', na_values=['#NUM!', '#DIV/0!', 'N/A'])

# Prepare data: parse the dates, index by date and order the rows chronologically
# so that a positional split below is also a split in time.
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date').sort_index()

# Select features and target variable, keeping only rows where all of them are
# present (LinearRegression rejects NaN input).
model_data = data[['Open', 'High', 'Low', 'Close']].dropna()
X = model_data[['Open', 'High', 'Low']]  # Example features
y = model_data['Close']  # Target variable

# Split the data chronologically: shuffle=False keeps the first 80% of dates for
# training and holds out the most recent 20% for testing, so the evaluation is
# not done on dates interleaved with the training period.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out (most recent) period
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

In [30]:
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np

# Strings in the CSV that stand in for missing numbers
NA_PLACEHOLDERS = ['#NUM!', '#DIV/0!', 'N/A']

# Name of the column the tree is trained to predict
TARGET_COLUMN = 'Close'

# Candidate predictor columns whose importance is ranked below
FEATURE_COLUMNS = [
    'Open', 'High', 'Low',
    'Return', 'Return%', 'Log Return', 'Log Return-3d', 'Log Return-5d',
    'SMA-5d', 'SMA-8d', 'SMA-13d', 'SMA-21d',
    'EMA-5d', 'EMA-8d', 'EMA-12d', 'EMA-26d',
    'RSI-14d', 'RSI-21d',
    'UpperBollingerBand', 'LowerBollingerBand',
    'MACD', 'Stoc%K', 'TR', 'ATR',
]

# Read the CSV (placeholders become NaN), then parse 'Date' and use it as the index
data = pd.read_csv('data/SNP500 Historical data.csv', na_values=NA_PLACEHOLDERS)
data = data.assign(Date=pd.to_datetime(data['Date'])).set_index('Date')

# Features and target, with any placeholder strings still present mapped to NaN
X = data[FEATURE_COLUMNS].replace(NA_PLACEHOLDERS, np.nan)
y = data[TARGET_COLUMN].replace(NA_PLACEHOLDERS, np.nan)

# Keep only the rows in which every feature and the target are present,
# then split back into features/target and force a float dtype
data_clean = pd.concat([X, y], axis=1).dropna()
X_clean = data_clean[FEATURE_COLUMNS].astype(float)
y_clean = data_clean[TARGET_COLUMN].astype(float)

# Fit the decision tree regressor (fit returns the fitted estimator itself)
dt = DecisionTreeRegressor(random_state=42).fit(X_clean, y_clean)

# Per-feature importance scores, in the same order as X_clean's columns
importance = dt.feature_importances_

# Tabulate the scores, most important feature first
feature_importance_df = pd.DataFrame(
    {'Feature': X_clean.columns, 'Importance': importance}
).sort_values(by='Importance', ascending=False)

print("Feature Importance:", feature_importance_df, sep="\n")

# Report the shape of the cleaned frame as a sanity check
print("\nCleaned DataFrame Shape:", data_clean.shape)

Feature Importance:
               Feature  Importance
2                  Low    0.798190
12              EMA-5d    0.127845
1                 High    0.068552
11             SMA-21d    0.005163
0                 Open    0.000098
21              Stoc%K    0.000024
22                  TR    0.000022
18  UpperBollingerBand    0.000016
19  LowerBollingerBand    0.000010
14             EMA-12d    0.000009
16             RSI-14d    0.000008
4              Return%    0.000008
23                 ATR    0.000008
17             RSI-21d    0.000007
6        Log Return-3d    0.000007
15             EMA-26d    0.000005
8               SMA-5d    0.000005
7        Log Return-5d    0.000005
5           Log Return    0.000005
20                MACD    0.000004
13              EMA-8d    0.000004
10             SMA-13d    0.000002
3               Return    0.000002
9               SMA-8d    0.000001

Cleaned DataFrame Shape: (2487, 25)
