In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw price history. The rest of the notebook relies on a 'Date'
# column (used as the index) and a 'Close' column being present in the file.
data = pd.read_csv('path_to_stock_data.csv')

# Display the first few rows
print(data.head())

# Check for missing values and data types.
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
data.info()
print(data.describe())

# Convert the 'Date' column to datetime format, make it the index and sort by
# it. Sorting first guarantees that every order-dependent step that follows
# (forward fill here, shift/rolling in later cells) runs in chronological
# order regardless of the row order in the CSV. A stable sort keeps rows that
# share a date in their original relative order.
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date').sort_index(kind='mergesort')

# Handling missing values: carry the last known observation forward.
# DataFrame.ffill() is the supported spelling of the deprecated
# fillna(method='ffill'). Values missing before the first observation of a
# column stay NaN, exactly as with the old call.
data = data.ffill()


In [None]:
# Plot stock prices over time
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data['Close'], label='Closing Price')
ax.set_title('Stock Price Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Closing Price')
ax.legend()
plt.show()

# Calculate moving averages of the closing price.
# rolling(window=30) counts rows, so the first 29 rows of SMA_30 are NaN.
# The exponential average uses the recursive form (adjust=False) and is
# defined from the first row onwards.
data['SMA_30'] = data['Close'].rolling(window=30).mean()
data['EMA_30'] = data['Close'].ewm(span=30, adjust=False).mean()

# Plot the closing price together with both averages. Axis labels are set
# here as well so the figure can be read on its own, like the other two.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data['Close'], label='Close Price')
ax.plot(data['SMA_30'], label='30-Day SMA')
ax.plot(data['EMA_30'], label='30-Day EMA')
ax.set_title('Moving Averages')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
plt.show()

# Calculate volatility as the rolling standard deviation of the closing
# price over 30 rows. This is a dispersion of price levels (not of returns);
# the column is reused as a model feature in a later cell.
data['Volatility'] = data['Close'].rolling(window=30).std()

# Plot volatility
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data['Volatility'])
ax.set_title('Stock Volatility Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Volatility')
plt.show()


In [None]:
# Create lag features.
# The columns are (re)assigned on `data` without removing any rows, so running
# this cell again recomputes exactly the same values. Dropping rows from
# `data` in place would make every re-run discard another 29 rows, because the
# 30-row rolling mean would be recomputed on the already-trimmed frame.
data['Previous_Close'] = data['Close'].shift(1)
data['Return'] = data['Close'].pct_change()
data['7_Day_MA'] = data['Close'].rolling(window=7).mean()
data['30_Day_MA'] = data['Close'].rolling(window=30).mean()

# Define features and target variable
features = ['Previous_Close', '7_Day_MA', '30_Day_MA', 'Volatility']

# Target is the next row's closing price. It is attached as a column before
# the NaN rows are dropped, so features and target are taken from the very
# same rows instead of being lined up by slicing afterwards. dropna() removes
# the warm-up rows of the rolling windows and the final row, which has no
# following close.
model_data = data.assign(Next_Close=data['Close'].shift(-1)).dropna()

if model_data.empty:
    raise ValueError(
        "No complete rows left after building features; the 30-row rolling "
        "windows need more than 30 observations."
    )

X = model_data[features]
y = model_data['Next_Close']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. With shuffle=False the rows keep
# their existing order, so the test set is the trailing part of X / y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split; fit() returns the fitted
# estimator, so construction and training are chained.
lr_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_lr = lr_model.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train Random Forest Model.
# The forest is built from randomised bootstrap samples, so without a fixed
# random_state each run yields different trees, predictions and metrics.
# Pinning the seed makes "Restart & Run All" reproduce the same numbers.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions for the held-out rows
y_pred_rf = rf_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score


def _plot_actual_vs_predicted(actual, predicted, label, color, title):
    """Draw the actual series and one model's predictions on a shared axis.

    Parameters
    ----------
    actual : pandas.Series
        Observed target values; its index supplies the x coordinates.
    predicted : array-like
        Model predictions, one per element of ``actual`` in the same order.
    label : str
        Legend entry for the prediction line.
    color : str
        Matplotlib colour of the prediction line.
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(actual.index, actual, label='Actual', color='blue')
    ax.plot(actual.index, predicted, label=label, color=color)
    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('Closing Price')
    ax.legend()
    plt.show()


# Evaluate Linear Regression Model.
# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as the target.
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)
# Evaluate Random Forest Model
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Linear Regression RMSE: {rmse_lr}, R²: {r2_lr}")
print(f"Random Forest RMSE: {rmse_rf}, R²: {r2_rf}")

# Plot Actual vs. Predicted Values for Linear Regression
_plot_actual_vs_predicted(
    y_test,
    y_pred_lr,
    label='Predicted (LR)',
    color='orange',
    title='Actual vs Predicted Stock Prices (Linear Regression)',
)

# Plot Actual vs. Predicted Values for Random Forest
_plot_actual_vs_predicted(
    y_test,
    y_pred_rf,
    label='Predicted (RF)',
    color='green',
    title='Actual vs Predicted Stock Prices (Random Forest)',
)
