In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the data
data = pd.read_csv('data/SNP500 Historical data.csv')

# Prepare data (cleaning, feature engineering)
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)

# Select features and target variable
X = data[['Open', 'High', 'Low']]  # Example features
y = data['Close']  # Target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
print(model.coef_)


Mean Squared Error: 8485.457280960542
[-0.69639441  0.88293785  0.80831927]


In [9]:
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np

# Load the data
data = pd.read_csv('data/SNP500 Historical data.csv', na_values=['#NUM!', '#DIV/0!', 'N/A'])

# Convert 'Date' to datetime and set as index
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)

# Prepare features and target
X = data[['Open', 'Low', 'High','Close Lag-1', 'Open Lag-1', 'High Lag-1', 'Low Lag-1',
            'SMA-5d', 'SMA-8d', 'SMA-13d', 'SMA-21d', 'EMA-5d',
          'EMA-8d', 'EMA-12d', 'EMA-26d', 'RSI-14d', 'RSI-21d', 'UpperBollingerBand',
          'LowerBollingerBand', 'MACD', 'Stoc%K', 'TR', 'ATR',
          'Return Lag-1', 'Return% Lag-1', 'Log Return Lag-1', 'Log Return-3d Lag-1',
          'Log Return-5d Lag-1', 'SMA-5d Lag-1', 'SMA-8d Lag-1', 'SMA-13d Lag-1', 'SMA-21d Lag-1', 'EMA-5d Lag-1',
          'EMA-8d Lag-1', 'EMA-12d Lag-1', 'EMA-26d Lag-1', 'RSI-14d Lag-1', 'RSI-21d Lag-1', 'UpperBollingerBand Lag-1',
          'LowerBollingerBand Lag-1', 'MACD Lag-1', 'Stoc%K Lag-1', 'TR Lag-1', 'ATR Lag-1']]
y = data['Log Return']

# Replace any remaining non-numeric values with NaN
X = X.replace(['#NUM!', '#DIV/0!', 'N/A'], np.nan)
y = y.replace(['#NUM!', '#DIV/0!', 'N/A'], np.nan)

# Remove rows with NaN values from X and y
data_clean = pd.concat([X, y], axis=1).dropna()
X_clean = data_clean[X.columns]
y_clean = data_clean['Log Return']

# Convert all features to float to ensure numeric data
X_clean = X_clean.astype(float)
y_clean = y_clean.astype(float)

# Train the decision tree regressor
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_clean, y_clean)

# Get feature importance
importance = dt.feature_importances_

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'Feature': X_clean.columns,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(feature_importance_df)

# Print the cleaned DataFrame shape to verify
print("\nCleaned DataFrame Shape:", data_clean.shape)

Feature Importance:
                     Feature  Importance
20                    Stoc%K    0.387256
41              Stoc%K Lag-1    0.340110
21                        TR    0.091194
42                  TR Lag-1    0.038234
26       Log Return-3d Lag-1    0.030521
25          Log Return Lag-1    0.016153
27       Log Return-5d Lag-1    0.013623
24             Return% Lag-1    0.013585
43                 ATR Lag-1    0.011154
22                       ATR    0.010285
40                MACD Lag-1    0.007725
23              Return Lag-1    0.005608
15                   RSI-14d    0.004374
16                   RSI-21d    0.004244
33              EMA-8d Lag-1    0.003057
29              SMA-8d Lag-1    0.002891
1                        Low    0.002177
6                  Low Lag-1    0.001767
37             RSI-21d Lag-1    0.001754
35             EMA-26d Lag-1    0.001536
19                      MACD    0.001212
9                    SMA-13d    0.001179
11                    EMA-5d    0.001

In [None]:
# Feature Importance:
#                      Feature  Importance
# 20                    Stoc%K    0.387256
# 41              Stoc%K Lag-1    0.340110
# 21                        TR    0.091194
# 42                  TR Lag-1    0.038234
# 26       Log Return-3d Lag-1    0.030521
# 25          Log Return Lag-1    0.016153
# 27       Log Return-5d Lag-1    0.013623
# 24             Return% Lag-1    0.013585
# 43                 ATR Lag-1    0.011154