In [None]:
# Import used libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.special import expit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential

In [None]:
# Read the stock table from the CSV file and preview its first rows
stock = pd.read_csv('stock_data.csv')
stock_preview = stock.head()
print(stock_preview)

In [None]:
# Keep only the columns that the modelling cells below read
selected_columns = ['Date', 'Open', 'Close', 'Volume', 'Change']
stock = stock[selected_columns]
print(stock.head())

In [None]:
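# Experiments to try beyond the plain linear baseline:
#   - logistic (sigmoid) regression
#   - higher-dimensional (polynomial) features
#   - a convolutional neural network (CNN)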

In [None]:
# Traditional step: ordinary least squares.
# The normal equation (X.T @ X) @ theta = X.T @ y characterises the solution.
# Shift the change vector by 1 so each row's features are paired with the
# following row's change (the quantity we want to predict).
# NOTE: this mutates `stock`; re-running the cell shifts the already-trimmed
# table again and drops one more row each time, so re-run from the load cell.
stock['Next Change'] = stock['Change'].shift(-1)
# Drop any rows with NaN values (will occur on last row due to shifting)
stock = stock.dropna()
# Define feature matrix X and target vector y
X = stock[['Open', 'Close', 'Volume']].values
y = stock['Next Change'].values
# Prepend a column of ones so theta_linear[0] is the intercept
X = np.c_[np.ones((X.shape[0], 1)), X]
# Solve the least-squares problem directly rather than forming inv(X.T @ X):
# the features are unscaled, so the explicit inverse of the Gram matrix is
# numerically fragile, whereas lstsq works on X itself.
theta_linear = np.linalg.lstsq(X, y, rcond=None)[0]
print(theta_linear)

In [None]:
# In-sample predictions of the linear fit
y_pred = X @ theta_linear

# Overlay fitted values on the actual targets
ax = plt.gca()
ax.plot(range(len(y)), y, label="Actual Values")
ax.plot(range(len(y_pred)), y_pred, label="Predicted Values")
ax.set_xlabel("Sample Index")
ax.set_ylabel("Percent Change")
ax.set_title("Actual vs Predicted Percent Change")
ax.legend()
plt.show()

In [None]:
# In-sample mean squared error of the current predictions
mse = mean_squared_error(y_true=y, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
# Higher dimension: augment the design matrix with squared features.
# X already carries the bias column of ones in column 0. Squaring that column
# only yields another column of ones, and prepending yet another ones column
# would leave three identical columns (a rank-deficient X_quad). Square only
# the real feature columns and reuse the existing bias column instead.
X_quad = np.hstack([X, X[:, 1:] ** 2])
# Least-squares solve on X_quad itself; avoids inverting X_quad.T @ X_quad,
# which is badly conditioned once squared, unscaled features are present.
theta_quad = np.linalg.lstsq(X_quad, y, rcond=None)[0]
print(theta_quad)

In [None]:
# In-sample predictions of the quadratic-feature fit
y_pred = X_quad @ theta_quad

# Draw the actual and fitted series in one pass
for series, series_label in ((y, "Actual Values"), (y_pred, "Predicted Values")):
    plt.plot(range(len(series)), series, label=series_label)
plt.xlabel("Sample Index")
plt.ylabel("Percent Change")
plt.title("Actual vs Predicted Percent Change")
plt.legend()
plt.show()

In [None]:
# Mean squared error between the targets and the latest in-sample predictions
mse = mean_squared_error(y_true=y, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
# Ridge regression: solve (X_quad.T @ X_quad + lambda * I) @ theta = X_quad.T @ y.
# The penalty is applied to every column of X_quad, including the bias term.
ridge_lambda = 1e-5
lambda_identity = ridge_lambda * np.eye(X_quad.shape[1])
# Solve the linear system directly instead of forming the explicit inverse;
# this is cheaper and less sensitive to the poor conditioning of the Gram matrix.
# Note: this overwrites theta_quad from the unregularised fit.
theta_quad = np.linalg.solve(X_quad.T @ X_quad + lambda_identity, X_quad.T @ y)
print(theta_quad)

In [None]:
# In-sample predictions using the current theta_quad
y_pred = X_quad @ theta_quad

# Compare fitted values with the actual targets
ax = plt.gca()
ax.plot(range(len(y)), y, label="Actual Values")
ax.plot(range(len(y_pred)), y_pred, label="Predicted Values")
ax.set(
    xlabel="Sample Index",
    ylabel="Percent Change",
    title="Actual vs Predicted Percent Change",
)
ax.legend()
plt.show()

In [None]:
# Report the in-sample mean squared error for the predictions computed above
mse = mean_squared_error(y_true=y, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training portion
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Linear predictions for the held-out rows
y_pred_linear = linear_model.predict(X_test)

# Pass the linear predictions through the logistic sigmoid.
# NOTE(review): expit maps values into (0, 1), while y_test is left
# untransformed below, so the two plotted series are on different scales.
# Confirm this comparison is intended.
y_pred_logistic = expit(y_pred_linear)

# Plot held-out targets against the transformed predictions
plt.figure(figsize=(12, 6))
ax = plt.gca()
ax.plot(range(len(y_test)), y_test, label="Actual Values")
ax.plot(
    range(len(y_pred_logistic)),
    y_pred_logistic,
    label="Predicted Values (Logistic Transformed)",
)
ax.set_xlabel("Sample Index")
ax.set_ylabel("Transformed Percent Change")
ax.set_title("Actual vs Predicted Stock Price Change (Logistic Transformed)")
ax.legend()
plt.show()

In [None]:
# Test-set mean squared error of the sigmoid-transformed predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_logistic)
print(f"Mean Squared Error (with logistic transformation): {mse}")

In [None]:
# Neural network model (multi-layer perceptron regressor)
plt.style.use('ggplot')

# Split BEFORE scaling so the held-out rows never influence the scaling
# statistics. X itself is not overwritten, so this cell can be re-run safely.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bundle the scaler and the network into one estimator. cross_val_score then
# re-fits the scaler on the training part of every fold, so validation folds
# do not leak into the standardisation either.
scaler = StandardScaler()
model = make_pipeline(
    scaler,
    MLPRegressor(
        hidden_layer_sizes=(64, 32),
        activation='relu',
        solver='adam',
        max_iter=10000,
        early_stopping=True,
        random_state=42,
    ),
)

# Perform 10-fold cross-validation on the training portion only
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kf)
cv_scores = -cv_scores  # Convert negative MSE to positive
mean_cv_score = np.mean(cv_scores)
print(f"Cross-Validation MSE Scores: {cv_scores}")
print(f"Mean CV MSE: {mean_cv_score}")


# Fit on the full training portion and predict the held-out rows
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Visualize predictions
plt.figure(figsize=(12, 6))
plt.plot(range(len(y_test)), y_test, label="Actual Values")
plt.plot(range(len(y_pred)), y_pred, label="Predicted Values (NN)")
plt.xlabel("Sample Index")
plt.ylabel("Percent Change")
plt.title("Actual vs Predicted Percent Change (Neural Network)")
plt.legend()
plt.show()

In [None]:
# Mean squared error of the current predictions on the held-out rows
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Test Set MSE: {mse_test}")

In [None]:
# 1D CNN regressor
# Seed TensorFlow's random generator so weight initialisation and dropout are
# repeatable between runs.
tf.random.set_seed(42)

# Split data into training and testing sets BEFORE scaling so the held-out
# rows do not influence the scaling statistics. X is left unmodified, which
# keeps this cell safe to re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features using statistics from the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape to (samples, timesteps, features) for Conv1D.
# Each sample is treated as a single timestep carrying all features.
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Define the CNN model
model = Sequential([
    Conv1D(filters=32, kernel_size=1, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    MaxPooling1D(pool_size=1),
    Dropout(0.2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Single output for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

# Make predictions only AFTER training; predicting first would evaluate the
# randomly initialised network. ravel() turns the (n, 1) output into a 1-D
# array matching y_test.
y_pred = model.predict(X_test).ravel()

# Plot predictions vs actual
plt.figure(figsize=(12, 6))
plt.plot(range(len(y_test)), y_test, label="Actual Values")
plt.plot(range(len(y_pred)), y_pred, label="Predicted Values (CNN)")
plt.xlabel("Sample Index")
plt.ylabel("Percent Change")
plt.title("Actual vs Predicted Percent Change (CNN)")
plt.legend()
plt.show()

In [None]:
# Mean squared error of the current predictions on the held-out rows
mse_test = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error on Test Set: {mse_test}")