In [1]:
import pandas as pd
import numpy as np

In [55]:
df = pd.read_csv('nvda_us_d_5m.csv')
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

In [56]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-07-01,123.47,124.84,118.83,124.30,284885534
2024-07-02,121.13,123.41,121.03,122.67,218373969
2024-07-03,121.66,128.28,121.36,128.28,215748955
2024-07-05,127.38,128.85,125.68,125.83,214176689
2024-07-08,127.49,130.77,127.04,128.20,237677322
...,...,...,...,...,...
2024-11-21,149.35,152.89,140.70,146.67,400946570
2024-11-22,145.93,147.16,141.10,141.95,236406154
2024-11-25,141.99,142.05,135.82,136.02,344941875
2024-11-26,137.70,139.30,135.67,136.92,190287654


# Feature Engineering

In [57]:
def calculate_rsi(prices, periods=14):
    delta = prices.diff()
    gains = delta.where(delta > 0, 0)
    losses = -delta.where(delta < 0, 0)
    avg_gains = gains.rolling(window=periods).mean()
    avg_losses = losses.rolling(window=periods).mean()
    for i in range(periods, len(prices)):
        avg_gains.iloc[i] = (avg_gains.iloc[i-1] * (periods-1) + gains.iloc[i]) / periods
        avg_losses.iloc[i] = (avg_losses.iloc[i-1] * (periods-1) + losses.iloc[i]) / periods
    
    rs = avg_gains / avg_losses
    
    rsi = 100 - (100 / (1 + rs))
    
    return rsi

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

def create_features(df):
    df_features = df.copy()
    
    df_features['Prev_Close'] = df_features['Close'].shift(1)
    df_features['Prev_High'] = df_features['High'].shift(1)
    df_features['Prev_Low'] = df_features['Low'].shift(1)
    df_features['Prev_Volume'] = df_features['Volume'].shift(1)
    
    df_features['Price_Range'] = df_features['High'] - df_features['Low']
    df_features['Prev_Price_Range'] = df_features['Price_Range'].shift(1)
    
    
    df_features['MA5'] = df_features['Close'].rolling(window=5).mean()
    df_features['MA10'] = df_features['Close'].rolling(window=10).mean()
    
    df_features['Volatility'] = df_features['Close'].rolling(window=5).std()
    
    df_features['Volume_MA5'] = df_features['Volume'].rolling(window=5).mean()

    df_features['Close_Pct_Change'] = df_features['Close'].pct_change()
    df_features['Volume_Pct_Change'] = df_features['Volume'].pct_change()

    df_features['Vol_Price_Interaction'] = df_features['Close_Pct_Change'] * df_features['Volume_Pct_Change']
    # df_features['BB_Upper'] = df_features['MA5'] + (df_features['Volatility'] * 2)
    # df_features['BB_Lower'] = df_features['MA5'] - (df_features['Volatility'] * 2)
    
    df_features['MACD'] = df_features['Close'].ewm(span=12).mean() - df_features['Close'].ewm(span=26).mean()
    df_features['RSI_14'] = calculate_rsi(df_features['Close'], periods=14)  # Standard period
    df_features['RSI_7'] = calculate_rsi(df_features['Close'], periods=7)    # Shorter period
    df_features['RSI_21'] = calculate_rsi(df_features['Close'], periods=21)  # Longer period
    df_features['RSI_MA5'] = df_features['RSI_14'].rolling(window=5).mean()
    df_features['RSI_Diff'] = df_features['RSI_14'] - df_features['RSI_MA5']
    df_features['RSI_Overbought'] = (df_features['RSI_14'] > 70).astype(int)
    df_features['RSI_Oversold'] = (df_features['RSI_14'] < 30).astype(int)
    df_features['RSI_Cross'] = ((df_features['RSI_7'] > df_features['RSI_21']).astype(int) - 
                           (df_features['RSI_7'] < df_features['RSI_21']).astype(int))
    
    df_features.dropna(inplace=True)
    
    return df_features


df_processed = create_features(df)

In [73]:
df_processed.to_csv('nvda_processed.csv')

In [74]:

feature_columns = [
    'Prev_Close', 'Prev_High', 'Prev_Low', 'Prev_Volume',
    'Prev_Price_Range', 'MA5', 'MA10', 'Volatility', 'Volume_MA5',
    'Close_Pct_Change', 'Volume_Pct_Change', 'Vol_Price_Interaction',
    'MACD','RSI_14', 'RSI_7', 'RSI_21', 'RSI_MA5', 'RSI_Diff',
    'RSI_Overbought', 'RSI_Oversold', 'RSI_Cross'
]
# 'BB_Upper', 'BB_Lower',
X = df_processed[feature_columns]
y = df_processed['High']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Linear Regression

In [75]:

model = LinearRegression()
model.fit(X_train_scaled, y_train)


y_pred = model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f'Mean Squared Error: {mse:.4f}')
print(f'R-squared Score: {r2:.4f}')

feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Coefficient': model.coef_
})
print("\nFeature Importance:")
print(feature_importance.sort_values(by='Coefficient', key=abs, ascending=False))

last_row = df_processed[feature_columns].iloc[-1:].values
last_row_scaled = scaler.transform(last_row)
next_day_pred = model.predict(last_row_scaled)[0]
print(f'\nPredicted next day high: ${next_day_pred:.2f}')

# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 6))
# plt.scatter(y_test, y_pred, alpha=0.5)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
# plt.xlabel('Actual High')
# plt.ylabel('Predicted High')
# plt.title('Actual vs Predicted High Prices')
# plt.tight_layout()
# plt.show()

Model Performance:
Mean Squared Error: 4.0590
R-squared Score: 0.9771

Feature Importance:
                  Feature  Coefficient
5                     MA5    10.170954
0              Prev_Close     6.363365
12                   MACD    -2.695590
13                 RSI_14     2.672569
14                  RSI_7    -2.437811
17               RSI_Diff     2.309148
3             Prev_Volume     1.984712
16                RSI_MA5     1.639024
10      Volume_Pct_Change     0.991839
1               Prev_High    -0.897902
4        Prev_Price_Range    -0.878052
2                Prev_Low    -0.682901
6                    MA10    -0.549863
9        Close_Pct_Change     0.251751
11  Vol_Price_Interaction     0.196035
8              Volume_MA5    -0.168604
18         RSI_Overbought    -0.093108
15                 RSI_21     0.088182
7              Volatility     0.047061
20              RSI_Cross    -0.009659
19           RSI_Oversold     0.000000

Predicted next day high: $138.43




# Time Series Cross Validation + Linear Regression

In [10]:
df_processed[feature_columns].iloc[-1:]

Unnamed: 0_level_0,Prev_Close,Prev_High,Prev_Low,Prev_Volume,Prev_Price_Range,MA5,MA10,Volatility,Volume_MA5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-11-29,135.34,137.22,131.8,226370912.0,5.42,137.696,141.018,2.615594,227973950.2


In [76]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np


pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])

tscv = TimeSeriesSplit(n_splits=5, test_size=10)

cv_scores = {
    'r2': cross_val_score(pipeline, X, y, cv=tscv, scoring='r2'),
    'neg_mean_squared_error': cross_val_score(pipeline, X, y, cv=tscv, scoring='neg_mean_squared_error'),
    'neg_root_mean_squared_error': cross_val_score(pipeline, X, y, cv=tscv, scoring='neg_root_mean_squared_error')
}

print("Cross-validation results:")
print("\nR-squared scores for each fold:")
for fold, score in enumerate(cv_scores['r2'], 1):
    print(f"Fold {fold}: {score:.4f}")
print(f"Average R-squared: {cv_scores['r2'].mean():.4f} (+/- {cv_scores['r2'].std() * 2:.4f})")

print("\nRMSE scores for each fold:")
rmse_scores = np.sqrt(-cv_scores['neg_mean_squared_error'])
for fold, score in enumerate(rmse_scores, 1):
    print(f"Fold {fold}: ${score:.2f}")
print(f"Average RMSE: ${rmse_scores.mean():.2f} (+/- ${rmse_scores.std() * 2:.2f})")

pipeline.fit(X, y)

feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Coefficient': pipeline.named_steps['model'].coef_
})

print("\nFeature Importance (from final model):")
print(feature_importance.sort_values(by='Coefficient', key=abs, ascending=False))

next_day_pred = pipeline.predict(X.iloc[-1:])[0]
print(f'\nPredicted next day high: ${next_day_pred:.2f}')

# plt.figure(figsize=(12, 6))
# plt.subplot(1, 2, 1)
# plt.boxplot(cv_scores['r2'])
# plt.title('R-squared Scores Distribution')
# plt.ylabel('R-squared')

# plt.subplot(1, 2, 2)
# plt.boxplot(rmse_scores)
# plt.title('RMSE Scores Distribution')
# plt.ylabel('RMSE ($)')

# plt.tight_layout()
# plt.show()

# y_pred_full = pipeline.predict(X)

# plt.figure(figsize=(12, 6))
# plt.scatter(y, y_pred_full, alpha=0.5)
# plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
# plt.xlabel('Actual High')
# plt.ylabel('Predicted High')
# plt.title('Actual vs Predicted High Prices (Full Dataset)')
# plt.tight_layout()
# plt.show()

Cross-validation results:

R-squared scores for each fold:
Fold 1: 0.6590
Fold 2: 0.8531
Fold 3: 0.2451
Fold 4: 0.7688
Fold 5: 0.8668
Average R-squared: 0.6785 (+/- 0.4581)

RMSE scores for each fold:
Fold 1: $1.78
Fold 2: $1.89
Fold 3: $1.47
Fold 4: $2.48
Fold 5: $1.66
Average RMSE: $1.86 (+/- $0.68)

Feature Importance (from final model):
                  Feature  Coefficient
0              Prev_Close     7.975573
5                     MA5     6.888703
13                 RSI_14     2.652683
17               RSI_Diff     2.048971
12                   MACD    -1.853659
14                  RSI_7    -1.788701
16                RSI_MA5     1.722489
15                 RSI_21    -1.486420
3             Prev_Volume     1.471984
9        Close_Pct_Change     1.127339
10      Volume_Pct_Change     1.005708
8              Volume_MA5    -0.570720
4        Prev_Price_Range    -0.395132
1               Prev_High    -0.301110
2                Prev_Low    -0.215288
6                    MA10     0.0

# Random Forest

In [77]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import numpy as np

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(
        n_estimators=100,
        max_depth=5,
        random_state=42
    ))
])

tscv = TimeSeriesSplit(n_splits=5, test_size=10)

cv_scores = {
    'r2': cross_val_score(pipeline, X, y, cv=tscv, scoring='r2'),
    'neg_mean_squared_error': cross_val_score(pipeline, X, y, cv=tscv, scoring='neg_mean_squared_error'),
    'neg_root_mean_squared_error': cross_val_score(pipeline, X, y, cv=tscv, scoring='neg_root_mean_squared_error')
}

print("Cross-validation results:")
print("\nR-squared scores for each fold:")
for fold, score in enumerate(cv_scores['r2'], 1):
    print(f"Fold {fold}: {score:.4f}")
print(f"Average R-squared: {cv_scores['r2'].mean():.4f} (+/- {cv_scores['r2'].std() * 2:.4f})")

print("\nRMSE scores for each fold:")
rmse_scores = np.sqrt(-cv_scores['neg_mean_squared_error'])
for fold, score in enumerate(rmse_scores, 1):
    print(f"Fold {fold}: ${score:.2f}")
print(f"Average RMSE: ${rmse_scores.mean():.2f} (+/- ${rmse_scores.std() * 2:.2f})")

pipeline.fit(X, y)

feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': pipeline.named_steps['model'].feature_importances_
})

print("\nFeature Importance (from final model):")
print(feature_importance.sort_values(by='Importance', key=abs, ascending=False))

next_day_pred = pipeline.predict(X.iloc[-1:])[0]
print(f'\nPredicted next day high: ${next_day_pred:.2f}')

# plt.figure(figsize=(12, 6))
# plt.subplot(1, 2, 1)
# plt.boxplot(cv_scores['r2'])
# plt.title('R-squared Scores Distribution')
# plt.ylabel('R-squared')

# plt.subplot(1, 2, 2)
# plt.boxplot(rmse_scores)
# plt.title('RMSE Scores Distribution')
# plt.ylabel('RMSE ($)')

# plt.tight_layout()
# plt.show()

# y_pred_full = pipeline.predict(X)

# plt.figure(figsize=(12, 6))
# plt.scatter(y, y_pred_full, alpha=0.5)
# plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
# plt.xlabel('Actual High')
# plt.ylabel('Predicted High')
# plt.title('Actual vs Predicted High Prices (Full Dataset)')
# plt.tight_layout()
# plt.show()

Cross-validation results:

R-squared scores for each fold:
Fold 1: 0.3282
Fold 2: -0.4861
Fold 3: -8.6454
Fold 4: -1.2319
Fold 5: 0.5449
Average R-squared: -1.8981 (+/- 6.8637)

RMSE scores for each fold:
Fold 1: $2.50
Fold 2: $6.02
Fold 3: $5.26
Fold 4: $7.71
Fold 5: $3.07
Average RMSE: $4.91 (+/- $3.83)

Feature Importance (from final model):
                  Feature    Importance
0              Prev_Close  2.874302e-01
2                Prev_Low  2.712549e-01
1               Prev_High  2.458087e-01
5                     MA5  8.731105e-02
13                 RSI_14  2.552064e-02
15                 RSI_21  2.360655e-02
6                    MA10  1.636181e-02
12                   MACD  1.530359e-02
16                RSI_MA5  9.250514e-03
14                  RSI_7  7.668759e-03
8              Volume_MA5  2.161688e-03
17               RSI_Diff  2.088929e-03
9        Close_Pct_Change  1.668907e-03
10      Volume_Pct_Change  1.010732e-03
4        Prev_Price_Range  9.917809e-04
3            

# Ridge

In [78]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=10)),  # Reduce dimensionality
    ('model', Ridge(alpha=0.1))  # Use Ridge regression to handle multicollinearity
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

next_day_pred = pipeline.predict(X.iloc[-1:])[0]
print(f'Predicted next day high: {next_day_pred:.2f}')
print(f'Mean Squared Error: {mse:.4f}')
print(f'R-squared Score: {r2:.4f}')

Predicted next day high: 137.86
Mean Squared Error: 2.4886
R-squared Score: 0.9860


In [65]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
}

ridge = Ridge()
grid_search = GridSearchCV(ridge, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

print(f"Best alpha: {grid_search.best_params_['alpha']}")

Best alpha: 0.1


In [80]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

df_processed = create_features(df)

feature_columns = [
    'Prev_Close', 'Prev_High', 'Prev_Low', 'Prev_Volume',
    'Prev_Price_Range', 'MA5', 'MA10', 'Volatility', 'Volume_MA5',
    'Close_Pct_Change', 'Volume_Pct_Change', 'Vol_Price_Interaction',
     'MACD',
    'RSI_14', 'RSI_7', 'RSI_21', 'RSI_MA5', 'RSI_Diff',
    'RSI_Overbought', 'RSI_Oversold', 'RSI_Cross'
]

X = df_processed[feature_columns]
y = df_processed['High']

tscv = TimeSeriesSplit(n_splits=5)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', ElasticNet(alpha=0.1, l1_ratio=0.9, random_state=42))
])

cv_scores = {
    'r2': cross_val_score(pipeline, X, y, cv=tscv, scoring='r2'),
    'neg_mean_squared_error': cross_val_score(pipeline, X, y, cv=tscv, scoring='neg_mean_squared_error'),
    'neg_root_mean_squared_error': cross_val_score(pipeline, X, y, cv=tscv, scoring='neg_root_mean_squared_error')
}

print("Cross-validation results with RSI features:")
print("\nR-squared scores for each fold:")
for fold, score in enumerate(cv_scores['r2'], 1):
    print(f"Fold {fold}: {score:.4f}")
print(f"Average R-squared: {cv_scores['r2'].mean():.4f} (+/- {cv_scores['r2'].std() * 2:.4f})")

print("\nRMSE scores for each fold:")
rmse_scores = np.sqrt(-cv_scores['neg_mean_squared_error'])
for fold, score in enumerate(rmse_scores, 1):
    print(f"Fold {fold}: ${score:.2f}")
print(f"Average RMSE: ${rmse_scores.mean():.2f} (+/- ${rmse_scores.std() * 2:.2f})")

pipeline.fit(X, y)

feature_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Coefficient': pipeline.named_steps['model'].coef_
})

print("\nFeature Importance (non-zero coefficients):")
print(feature_importance[feature_importance['Coefficient'] != 0].sort_values(by='Coefficient', key=abs, ascending=False))

next_day_pred = pipeline.predict(X.iloc[-1:].values)[0]
print(f'\nPredicted next day high with RSI features: ${next_day_pred:.2f}')

Cross-validation results with RSI features:

R-squared scores for each fold:
Fold 1: 0.6922
Fold 2: 0.6743
Fold 3: 0.9545
Fold 4: 0.7058
Fold 5: 0.7803
Average R-squared: 0.7614 (+/- 0.2061)

RMSE scores for each fold:
Fold 1: $5.13
Fold 2: $1.74
Fold 3: $1.44
Fold 4: $1.71
Fold 5: $2.04
Average RMSE: $2.41 (+/- $2.74)

Feature Importance (non-zero coefficients):
              Feature  Coefficient
0          Prev_Close     5.414519
5                 MA5     3.933800
1           Prev_High     2.039638
17           RSI_Diff     1.192858
9    Close_Pct_Change     0.869470
6                MA10     0.860267
2            Prev_Low     0.721547
15             RSI_21     0.495472
10  Volume_Pct_Change     0.472195
14              RSI_7     0.264715
7          Volatility     0.096204
3         Prev_Volume     0.006532

Predicted next day high with RSI features: $138.38


