In [1]:
# Part 1: Import Libraries and Fetch Data
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Select stock 
stock_symbol = 'TSLA'  # Tesla
# stock_symbol = 'AAPL'  # Apple
# stock_symbol = 'MSFT'  # Microsoft
# stock_symbol = 'GOOGL' # Google

print(f"Downloading data for {stock_symbol}...")

# Download historical data
stock_data = yf.download(stock_symbol, period='5y')
print(f"Downloaded {len(stock_data)} days of data")
print("\nFirst few rows of data:")
print(stock_data.head())

Downloading data for TSLA...


[*********************100%***********************]  1 of 1 completed

Downloaded 1256 days of data

First few rows of data:
Price            Close        High         Low        Open     Volume
Ticker            TSLA        TSLA        TSLA        TSLA       TSLA
Date                                                                 
2021-02-22  238.166672  256.166656  236.733337  254.213333  111809100
2021-02-23  232.946671  237.869995  206.333328  220.710007  199820700
2021-02-24  247.339996  248.333328  231.389999  237.283340  110301000
2021-02-25  227.406662  245.736664  223.526672  242.050003  117071700
2021-02-26  225.166672  235.566666  219.836670  233.333328  123267600





In [2]:
# Part 2: Create Features for Prediction
print("\nCreating features for prediction...")

# Create a copy of the data
df = stock_data.copy()

# Create target variable (next day's closing price)
df['Next_Close'] = df['Close'].shift(-1)

# Create additional features
df['High_Low_Pct'] = (df['High'] - df['Low']) / df['Close'] * 100
df['Close_Open_Pct'] = (df['Close'] - df['Open']) / df['Open'] * 100

# Create rolling averages
df['MA_5'] = df['Close'].rolling(window=5).mean()
df['MA_10'] = df['Close'].rolling(window=10).mean()
df['MA_20'] = df['Close'].rolling(window=20).mean()

# Create volatility features
df['Volatility'] = df['Close'].rolling(window=10).std()

# Create lag features (previous days' prices)
df['Prev_Close'] = df['Close'].shift(1)
df['Prev_Volume'] = df['Volume'].shift(1)

# Drop NaN values
df = df.dropna()

print(f"Dataset shape after feature engineering: {df.shape}")
print("\nFeatures created:")
print(df[['High_Low_Pct', 'Close_Open_Pct', 'MA_5', 'Volatility']].head())


Creating features for prediction...
Dataset shape after feature engineering: (1236, 14)

Features created:
Price      High_Low_Pct Close_Open_Pct        MA_5 Volatility
Ticker                                                       
Date                                                         
2021-03-19     4.979612       1.278991  226.310663  14.015235
2021-03-22     4.607460      -2.131207  223.781329   6.549333
2021-03-23     3.064215      -2.013999  222.799997   6.792841
2021-03-24     6.014886      -5.635494  218.030664   8.488300
2021-03-25     5.621574       4.468192  217.179330   8.666120


In [3]:
# Part 3: Prepare Data for Training
print("\nPreparing data for training...")

# Select features for prediction
feature_columns = ['Open', 'High', 'Low', 'Volume', 
                   'High_Low_Pct', 'Close_Open_Pct',
                   'MA_5', 'MA_10', 'MA_20', 
                   'Volatility', 'Prev_Close', 'Prev_Volume']

X = df[feature_columns]
y = df['Next_Close']

# Split the data (chronologically)
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

print(f"Training data size: {len(X_train)} days")
print(f"Testing data size: {len(X_test)} days")
print(f"\nFeatures used: {', '.join(feature_columns)}")


Preparing data for training...
Training data size: 988 days
Testing data size: 248 days

Features used: Open, High, Low, Volume, High_Low_Pct, Close_Open_Pct, MA_5, MA_10, MA_20, Volatility, Prev_Close, Prev_Volume


In [4]:
# Part 4: Train Linear Regression Model
print("\n" + "="*50)
print("TRAINING LINEAR REGRESSION MODEL")
print("="*50)

# Create and train Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions
lr_train_pred = lr_model.predict(X_train)
lr_test_pred = lr_model.predict(X_test)

# Calculate metrics
lr_train_mse = mean_squared_error(y_train, lr_train_pred)
lr_test_mse = mean_squared_error(y_test, lr_test_pred)
lr_train_r2 = r2_score(y_train, lr_train_pred)
lr_test_r2 = r2_score(y_test, lr_test_pred)
lr_mae = mean_absolute_error(y_test, lr_test_pred)

print("\nLinear Regression Results:")
print(f"Training MSE: {lr_train_mse:.2f}")
print(f"Testing MSE: {lr_test_mse:.2f}")
print(f"Training R² Score: {lr_train_r2:.4f}")
print(f"Testing R² Score: {lr_test_r2:.4f}")
print(f"Mean Absolute Error: {lr_mae:.2f}")

# Feature importance for Linear Regression
lr_coefficients = pd.DataFrame({
    'Feature': feature_columns,
    'Coefficient': lr_model.coef_
}).sort_values('Coefficient', key=abs, ascending=False)
print("\nTop 5 Important Features (Linear Regression):")
print(lr_coefficients.head())


TRAINING LINEAR REGRESSION MODEL

Linear Regression Results:
Training MSE: 93.55
Testing MSE: 162.23
Training R² Score: 0.9785
Testing R² Score: 0.9698
Mean Absolute Error: 9.90

Top 5 Important Features (Linear Regression):
          Feature  Coefficient
5  Close_Open_Pct     1.763555
4    High_Low_Pct     1.007472
2             Low     0.713098
0            Open     0.471429
9      Volatility     0.188678


In [5]:
# Part 5: Train Random Forest Model
print("\n" + "="*50)
print("TRAINING RANDOM FOREST MODEL")
print("="*50)

# Create and train Random Forest model
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(X_train, y_train)

# Make predictions
rf_train_pred = rf_model.predict(X_train)
rf_test_pred = rf_model.predict(X_test)

# Calculate metrics
rf_train_mse = mean_squared_error(y_train, rf_train_pred)
rf_test_mse = mean_squared_error(y_test, rf_test_pred)
rf_train_r2 = r2_score(y_train, rf_train_pred)
rf_test_r2 = r2_score(y_test, rf_test_pred)
rf_mae = mean_absolute_error(y_test, rf_test_pred)

print("\nRandom Forest Results:")
print(f"Training MSE: {rf_train_mse:.2f}")
print(f"Testing MSE: {rf_test_mse:.2f}")
print(f"Training R² Score: {rf_train_r2:.4f}")
print(f"Testing R² Score: {rf_test_r2:.4f}")
print(f"Mean Absolute Error: {rf_mae:.2f}")

# Feature importance for Random Forest
rf_importance = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)
print("\nTop 5 Important Features (Random Forest):")
print(rf_importance.head())


TRAINING RANDOM FOREST MODEL

Random Forest Results:
Training MSE: 18.75
Testing MSE: 290.89
Training R² Score: 0.9957
Testing R² Score: 0.9458
Mean Absolute Error: 12.52

Top 5 Important Features (Random Forest):
           Feature  Importance
1             High    0.578869
2              Low    0.390806
0             Open    0.009402
10      Prev_Close    0.004757
5   Close_Open_Pct    0.004212


In [6]:
# Part 6: Compare Models
print("\n" + "="*50)
print("MODEL COMPARISON")
print("="*50)

comparison_data = {
    'Model': ['Linear Regression', 'Random Forest'],
    'Train MSE': [lr_train_mse, rf_train_mse],
    'Test MSE': [lr_test_mse, rf_test_mse],
    'Train R²': [lr_train_r2, rf_train_r2],
    'Test R²': [lr_test_r2, rf_test_r2],
    'MAE': [lr_mae, rf_mae]
}

comparison_df = pd.DataFrame(comparison_data)
print("\nModel Performance Comparison:")
print(comparison_df.to_string(index=False))

# Determine better model
better_model = 'Random Forest' if rf_test_r2 > lr_test_r2 else 'Linear Regression'
print(f"\n✅ {better_model} performs better on test data!")


MODEL COMPARISON

Model Performance Comparison:
            Model  Train MSE   Test MSE  Train R²  Test R²       MAE
Linear Regression  93.551872 162.225815  0.978511 0.969764  9.902914
    Random Forest  18.750051 290.890834  0.995693 0.945783 12.516761

✅ Linear Regression performs better on test data!


In [None]:
# Part 7: Plot Actual vs Predicted Prices
print("\n" + "="*50)
print("PLOTTING RESULTS")
print("="*50)

# Create subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle(f'{stock_symbol} Stock Price Prediction Results', fontsize=16, fontweight='bold')

# Plot 1: Linear Regression - Training
axes[0, 0].plot(y_train.values[:200], label='Actual', alpha=0.7)
axes[0, 0].plot(lr_train_pred[:200], label='Predicted', alpha=0.7)
axes[0, 0].set_title('Linear Regression - Training (First 200 days)')
axes[0, 0].set_xlabel('Days')
axes[0, 0].set_ylabel('Price ($)')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Linear Regression - Testing
axes[0, 1].plot(y_test.values, label='Actual', alpha=0.7)
axes[0, 1].plot(lr_test_pred, label='Predicted', alpha=0.7)
axes[0, 1].set_title('Linear Regression - Testing')
axes[0, 1].set_xlabel('Days')
axes[0, 1].set_ylabel('Price ($)')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Plot 3: Random Forest - Training
axes[1, 0].plot(y_train.values[:200], label='Actual', alpha=0.7)
axes[1, 0].plot(rf_train_pred[:200], label='Predicted', alpha=0.7)
axes[1, 0].set_title('Random Forest - Training (First 200 days)')
axes[1, 0].set_xlabel('Days')
axes[1, 0].set_ylabel('Price ($)')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Plot 4: Random Forest - Testing
axes[1, 1].plot(y_test.values, label='Actual', alpha=0.7)
axes[1, 1].plot(rf_test_pred, label='Predicted', alpha=0.7)
axes[1, 1].set_title('Random Forest - Testing')
axes[1, 1].set_xlabel('Days')
axes[1, 1].set_ylabel('Price ($)')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()