In [21]:
# Import necessary libraries
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, root_mean_squared_error
import matplotlib.pyplot as plt

In [22]:
#  Data Collection or Processing.

In [25]:

stock_symbol = 'AAPL'  # Example: Apple Stock
start_date = '2015-01-01'
end_date = '2025-01-01'

# Download stock data using yfinance
stock_data = yf.download(stock_symbol, start=start_date, end=end_date)

# Display the first few rows of data
# print(dataset)
stock_data.head()

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2015-01-02,24.347172,24.817057,23.906236,24.805922,212818400
2015-01-05,23.661274,24.195741,23.474212,24.115571,257142000
2015-01-06,23.663496,23.924048,23.300503,23.72585,263188400
2015-01-07,23.995317,24.095529,23.761488,23.872835,160423600
2015-01-08,24.917269,24.97517,24.206873,24.324903,237458000


In [16]:
# Data Analyzing.
stock_data.shape

#  so we have the 2516 rows and the 5 columns.

(2516, 5)

In [18]:
#  information of the dataset.
stock_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2516 entries, 2015-01-02 to 2024-12-31
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   (Close, AAPL)   2516 non-null   float64
 1   (High, AAPL)    2516 non-null   float64
 2   (Low, AAPL)     2516 non-null   float64
 3   (Open, AAPL)    2516 non-null   float64
 4   (Volume, AAPL)  2516 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 117.9 KB


In [19]:
#  getting the statistical methods of the dataset.
stock_data.describe()

Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
count,2516.0,2516.0,2516.0,2516.0,2516.0
mean,94.283436,95.199065,93.265184,94.196218,117086700.0
std,65.737265,66.333373,65.047489,65.656689,68399570.0
min,20.697264,21.001971,20.497944,20.619367,23234700.0
25%,35.382253,35.781973,35.002487,35.402551,71056100.0
50%,64.679226,65.236481,63.87365,64.519062,100364600.0
75%,150.913479,152.669429,148.938303,150.735438,142621600.0
max,259.019989,260.100006,257.630005,258.190002,648825200.0


In [26]:
#  Checking the null values into the dataset.
stock_data.isnull().sum()

Price   Ticker
Close   AAPL      0
High    AAPL      0
Low     AAPL      0
Open    AAPL      0
Volume  AAPL      0
dtype: int64

In [30]:
# Step 2: Feature Engineering
# Calculate Moving Averages
stock_data['5_day_MA'] = stock_data['Close'].rolling(window=5).mean()
stock_data['10_day_MA'] = stock_data['Close'].rolling(window=10).mean()
stock_data['50_day_MA'] = stock_data['Close'].rolling(window=50).mean()

# # Calculate Relative Strength Index (RSI)
# delta = stock_data['Close'].diff()
# gain = delta.where(delta > 0, 0)
# loss = -delta.where(delta < 0, 0)

# avg_gain = gain.rolling(window=14).mean()
# avg_loss = loss.rolling(window=14).mean()

# rs = avg_gain / avg_loss
# stock_data['RSI'] = 100 - (100 / (1 + rs))

# # Calculate MACD (Moving Average Convergence Divergence)
# stock_data['26_day_EMA'] = stock_data['Close'].ewm(span=26, adjust=False).mean()
# stock_data['12_day_EMA'] = stock_data['Close'].ewm(span=12, adjust=False).mean()
# stock_data['MACD'] = stock_data['12_day_EMA'] - stock_data['26_day_EMA']
# stock_data['Signal_Line'] = stock_data['MACD'].ewm(span=9, adjust=False).mean()


In [31]:
stock_data.head()

Price,Close,High,Low,Open,Volume,5_day_MA,10_day_MA,50_day_MA
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2015-01-02,24.347172,24.817057,23.906236,24.805922,212818400,,,
2015-01-05,23.661274,24.195741,23.474212,24.115571,257142000,,,
2015-01-06,23.663496,23.924048,23.300503,23.72585,263188400,,,
2015-01-07,23.995317,24.095529,23.761488,23.872835,160423600,,,
2015-01-08,24.917269,24.97517,24.206873,24.324903,237458000,,,


In [None]:
# Drop rows with missing values
stock_data.dropna(inplace=True)

In [None]:
# Step 3: Define Features and Target Variable
# Use technical indicators as features
features = ['5_day_MA', '10_day_MA', '50_day_MA', 'RSI', 'MACD', 'Signal_Line', 'Volume']
X = stock_data[features]

# Target variable is the future closing price (next day's Close)
y = stock_data['Close'].shift(-1).dropna()

# Align the data (features and target)
X = X.iloc[:-1]
y = y.iloc[:-1]

In [None]:
# Step 4: Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
 # Step 5: Train a Random Forest Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Step 6: Make Predictions
predictions = model.predict(X_test)

In [None]:
# Step 7: Evaluate the Model
mae = mean_absolute_error(y_test, predictions)
print(f'Mean Absolute Error: {mae}')



In [None]:
# Step 8: Visualize the Predictions vs Actual Prices
plt.figure(figsize=(10, 6))
plt.plot(y_test.index, y_test.values, label='Actual Prices', color='blue')
plt.plot(y_test.index, predictions, label='Predicted Prices', color='red', linestyle='--')
plt.title(f'{stock_symbol} Stock Price Prediction')
plt.xlabel('Date')
plt.ylabel('Stock Price')
plt.legend()
plt.show()



In [None]:
# Step 9: Predict Future Price (for the next day)
latest_data = stock_data[features].iloc[-1].values.reshape(1, -1)
next_day_prediction = model.predict(latest_data)
print(f'Predicted Next Day Closing Price: {next_day_prediction[0]}')