# CMPS284-ML-Project
Done by: Jad Raad, Ali Younes, Ali Hamdan, Ahmad Termos
Presented to: Dr. Ahmad Elhaj
Fall 2023-2024

# Importing Libraries
requests: Used for making HTTP requests to fetch web data.
BeautifulSoup: A library for pulling data out of HTML and XML files.
pandas: A data manipulation library for data analysis.
yfinance: A library for accessing financial data from Yahoo Finance.
train_test_split: Function for splitting datasets into training and testing sets.
RandomForestRegressor: An ensemble learning method for regression tasks.
StandardScaler: Used for standardizing features by removing the mean and scaling to unit variance.
mean_squared_error and r2_score: Metrics for evaluating the performance of regression models.

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Function to Scrape Pepsi Stock Data
scrape_pepsi_stock_data: Function to scrape Pepsi stock data from Yahoo Finance.
It uses the requests library to get the HTML content of the Yahoo Finance page and BeautifulSoup to parse the HTML.
The function then uses yfinance to download historical stock data for Pepsi from June 1, 2023, to January 1, 2024.

In [None]:
def scrape_pepsi_stock_data(symbol='PEP', start='2023-06-01', end='2024-01-01', timeout=10):
    """Download historical prices for a ticker from Yahoo Finance.

    The Yahoo Finance quote page is requested first, purely as a
    reachability check; the price history itself comes from
    ``yf.download``. The page content is not parsed or used.

    Args:
        symbol: Ticker symbol to download (defaults to PepsiCo, 'PEP').
        start: Start date passed to ``yf.download``.
        end: End date passed to ``yf.download``.
        timeout: Seconds to wait for the quote page before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The DataFrame returned by ``yf.download``, or ``None`` when the quote
        page could not be fetched, did not answer with status 200, or the
        download came back empty.
    """
    url = f'https://finance.yahoo.com/quote/{symbol}?p={symbol}&.tsrc=fin-srch'

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failed
        # fetch instead of aborting the whole script with a traceback.
        print(f"Failed to fetch the webpage. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
        return None

    stock_data = yf.download(symbol, start=start, end=end)

    # An empty frame would only fail later (e.g. when fitting the scaler),
    # so it is treated as "no data" right here.
    if stock_data is None or stock_data.empty:
        print(f"No price data returned for {symbol}.")
        return None

    print("Scraped Data:")
    print(stock_data.tail())  # Display the last few rows of scraped data
    return stock_data


# Function to Add Political Factor
add_political_factor: Function to add a binary political factor to the dataset.
The function takes a DataFrame df as an argument and adds a new column 'Political_Factor' initialized with zeros.
It then sets the political factor to 1 on specific dates defined in the political_events dictionary.

In [None]:
def add_political_factor(df, political_events=None):
    """Add a 'Political_Factor' column to ``df`` in place.

    The column is initialised to 0 and then set to the configured factor on
    the row of each political event. Events that fall on a day with no row
    (weekend, market holiday) are assigned to the first row on or after the
    event date; without this, such events would never be flagged at all
    (the default event, 2023-10-07, is a Saturday).

    Args:
        df: DataFrame indexed by date. The index is assumed to be sorted in
            ascending order, which ``searchsorted`` requires.
        political_events: Optional mapping of date (anything
            ``pd.Timestamp`` accepts) to factor value. Defaults to the
            built-in example event.

    Returns:
        None. ``df`` is modified in place.
    """
    if political_events is None:
        political_events = {
            '2023-10-07': 1,  # Example: Gaza War
            # Add more dates and corresponding political factors as needed
        }

    df['Political_Factor'] = 0

    if len(df.index) == 0:
        return

    index_tz = getattr(df.index, 'tz', None)

    for date, factor in political_events.items():
        event_day = pd.Timestamp(date)

        # Align timezone awareness with the index so the comparisons below
        # never mix naive and aware timestamps.
        if index_tz is not None and event_day.tzinfo is None:
            event_day = event_day.tz_localize(index_tz)
        elif index_tz is None and event_day.tzinfo is not None:
            event_day = event_day.tz_localize(None)

        # Events before the first row are outside the data; do not let them
        # spill onto the first available row.
        if event_day < df.index[0]:
            continue

        # Position of the first row dated on or after the event.
        position = df.index.searchsorted(event_day, side='left')
        if position >= len(df.index):
            continue  # event is after the last row

        df.loc[df.index[position], 'Political_Factor'] = factor


# Scrape Pepsi Stock Data:
The function scrape_pepsi_stock_data is called to fetch historical stock data for PepsiCo from Yahoo Finance. The result is stored in the variable pepsi_stock_data.

In [None]:
# Fetch the PepsiCo price history. scrape_pepsi_stock_data() returns None
# when the Yahoo Finance page does not answer with status 200.
pepsi_stock_data = scrape_pepsi_stock_data()

# Check if Data Retrieval was Successful:
The script checks whether the stock data retrieval was successful by verifying that pepsi_stock_data is not None. If the data was retrieved, the script proceeds; otherwise, it prints an error message.

In [None]:
# Guard: only run the modelling steps when the download produced data
# (scrape_pepsi_stock_data() returns None on failure).
# NOTE(review): the statements in the following cells are presumably meant to
# be the indented body of this `if`; as exported they sit at column 0 - confirm.
if pepsi_stock_data is not None:

# Create Lag Feature (Target):
A new column 'Target' is created in pepsi_stock_data by shifting the 'Close' prices up by one row (shift(-1)), so each row's 'Target' holds the next trading day's closing price. The last row has no following day, so its 'Target' is NaN.

In [None]:
# shift(-1) moves each 'Close' value up one row, so 'Target' on a given row is
# the close of the following row; the final row has no successor and gets NaN.
pepsi_stock_data['Target'] = pepsi_stock_data['Close'].shift(-1)


# Add 'Date' Column:
A new column 'Date' is added to the dataset, containing the index values (dates) from the stock data.

In [None]:
# Copy the index into an ordinary column. 'Date' is not among the columns
# selected as model features later in this file.
# NOTE(review): presumably the index holds the trading dates from yfinance - confirm.
pepsi_stock_data['Date'] = pepsi_stock_data.index


# Add Political Factor:
The add_political_factor function is called to incorporate a binary political factor into the dataset based on predefined political events and their dates.

In [None]:
# Adds the 'Political_Factor' column to pepsi_stock_data in place; the call's
# return value is not used.
add_political_factor(pepsi_stock_data)

# Handling Missing Values:
Rows with missing values introduced by creating lag features are dropped from the dataset.

In [None]:
# Drop every row that contains a NaN. This always includes the last row, whose
# 'Target' is NaN because shift(-1) leaves it without a following close.
pepsi_stock_data = pepsi_stock_data.dropna()

# Feature Selection:
Features ('Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Political_Factor') and the target variable ('Target') are selected.

In [None]:
# Model inputs: the price/volume columns plus the political-event flag.
features = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Political_Factor']
X = pepsi_stock_data[features]
# Regression target: the following row's 'Close', computed earlier via shift(-1).
y = pepsi_stock_data['Target']

# Feature Scaling:
Standard scaling is applied to the features using StandardScaler from scikit-learn. This ensures that all features have the same scale, preventing any particular feature from dominating the others during model training.

In [None]:
# Standardize each feature column; fit_transform returns a NumPy array, so
# X_scaled is indexed by position from here on.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-row statistics influence the scaling - consider fitting on the
# training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split:
The dataset is split into training and testing sets using train_test_split from scikit-learn. The testing set comprises 20% of the data, and the random state is set for reproducibility (random_state=42).

In [None]:
# Chronological split: the rows are time-ordered, so shuffling would let the
# model train on days that come after the test days and inflate the scores.
# With shuffle=False the last 20% of rows form the test set. random_state is
# kept for backward compatibility but has no effect when shuffling is off.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, shuffle=False
)

# Train Random Forest Model:
A Random Forest regression model is instantiated and trained using the training set.

In [None]:
# Random forest with 100 trees; the fixed random_state makes training repeatable.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model Evaluation:
The model is evaluated on the testing set using Mean Squared Error (MSE) and R-squared (R2) scores.

In [None]:
# Predict on the held-out rows and score the predictions against their true targets.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)  # mean of squared prediction errors
r2 = r2_score(y_test, y_pred)  # coefficient of determination

# The evaluation metrics are printed to assess the performance of the model.

In [None]:
# Report the two test-set metrics computed above.
print("\nModel Evaluation:")
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Make Prediction for the Next Day:
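The script predicts the next day's stock price using the trained model and the last row of the scaled feature matrix. Note that the most recent trading day was removed by the earlier dropna() step (its 'Target' is NaN), so this last row is the second-to-last trading day in the downloaded data.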

In [None]:
# Take the last row of the scaled matrix and reshape it to a single-row 2-D
# array (1, n_features) before passing it to predict().
# NOTE(review): X_scaled is built after dropna(), which removed the most recent
# downloaded day (its 'Target' is NaN). This row is therefore the second-to-last
# downloaded day, so the "next day" forecast targets a day that is already in
# the download - confirm this is intended.
last_data_point = X_scaled[-1].reshape(1, -1)
next_day_prediction = model.predict(last_data_point)[0]

In [None]:
# Show the model input and its prediction. last_data_point holds the
# standardized feature values, not the raw prices.
print("\nPredicted Next Day Stock Price:")
print(f"Last Data Point:\n{last_data_point}")
print(f'Prediction: {next_day_prediction}')

# Handle Missing Data Case:
If the stock data retrieval was not successful (i.e., pepsi_stock_data is None), an error message is printed.

In [None]:
else:
    # Reached when pepsi_stock_data is None, i.e. the download step failed.
    print("No Pepsi stock data available.")