# Stock Price Web Scraping

###### In this project, I use Requests and Beautiful Soup to scrape stock price data from Yahoo Finance, then pandas and Matplotlib to load and plot it.

First, I wrote a function that scrapes the page URLs for the top 25 stocks on the "Most Active Stocks Today" page on Yahoo Finance. I put all of these URLs into a list to be used by a second function that retrieves information about each stock. <br><br> This second function takes in the list of stock URLs and, for each URL, scrapes the company name, the most recent stock price, the most recent stock price change, and the date/time the data was recorded. The function then appends the data to a CSV file. <br><br> I wrote a third function that checks the prices of the top 25 stocks for a user-specified duration at a user-specified interval. Each time the prices are checked, a new row is added to the CSV file for each of the top 25 stocks, recording the company name, current price, and current price change. This allows us to analyze how the prices of the most active stocks change over time. <br><br> Finally, I plotted the data for the top 9 most active stocks in a grid of subplots using Matplotlib.

In [None]:
# Import necessary libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
from csv import DictWriter
from datetime import datetime
import time
from time import sleep
import matplotlib.pyplot as plt
import matplotlib.dates

In [None]:
# Request headers passed to requests.get(); the user agent identifies the client as Chrome on macOS

headers = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/115.0.0.0 Safari/537.36'
    ),
}

In [None]:
# Function to get a list of URLs leading to the individual pages for the top 25 most active stocks on Yahoo Finance

def get_urls():
    """Return the stock page URLs linked from Yahoo Finance's Most Active page.

    Returns:
        A list of absolute URL strings, one per matching quote link, in the
        order the links appear in the page. The list is empty when no link
        matches the expected markup.
    """
    # Local import so this function brings its own dependency into scope
    from urllib.parse import urljoin

    base_url = 'https://finance.yahoo.com/'

    # Connect to the Most Active Stocks page; the timeout keeps a stalled
    # connection from blocking forever
    webpage = requests.get(base_url + 'most-active/', headers=headers, timeout=30)

    # Only tag attributes are read below, so a single parse is sufficient
    soup = BeautifulSoup(webpage.content, 'html.parser')

    # Find the links to the individual stock pages
    url_tags = soup.find_all(attrs={'data-test': 'quoteLink', 'class': 'Fw(600) C($linkColor)'})

    # urljoin resolves root-relative hrefs without doubling the slash;
    # tags that carry no href are skipped instead of raising TypeError
    return [urljoin(base_url, tag.get('href')) for tag in url_tags if tag.get('href')]

# Build the URL list once; check_stock_prices() passes this module-level list to get_info()
urls = get_urls()

In [None]:
# Function that iterates through list of URLs and appends the company name, price, and price change of that stock to CSV file

def get_info(urls):
    """Scrape each stock page and append one row per stock to stock_prices.csv.

    Each row holds the company name, price, price change, and the local time
    (HH:MM:SS) at which the page was read. A failure on one URL is reported
    and the remaining URLs are still processed.

    Args:
        urls: Iterable of stock page URLs to fetch.
    """
    for url in urls:
        try:
            # Connect to URL; the timeout keeps a stalled connection from blocking forever
            webpage = requests.get(url, headers=headers, timeout=30)
            soup = BeautifulSoup(webpage.content, 'html.parser')
            # Re-parse the prettified markup so the extracted text is unchanged
            # from what earlier runs wrote to the CSV
            soup_clean = BeautifulSoup(soup.prettify(), "html.parser")

            # Find the company name
            company = soup_clean.find(attrs={'class': 'D(ib) Fz(18px)'}).get_text().strip()

            # Find the stock price
            price = soup_clean.find(attrs={'class': 'Fw(b) Fz(36px) Mb(-4px) D(ib)'}).get_text().strip()

            # Find the stock price change
            price_change = soup_clean.find(attrs={'class': 'Fw(500) Pstart(8px) Fz(24px)'}).get_text().strip()

            # Record the current local time (time of day only)
            current_time = time.strftime("%H:%M:%S", time.localtime())

            new_row = [company, price, price_change, current_time]

            # newline='' lets the csv module control line endings; the with
            # block closes the file, so no explicit close() is needed
            with open('stock_prices.csv', 'a', newline='') as file:
                csv.writer(file).writerow(new_row)
        except Exception as error:
            # Best effort per URL: report which page failed and why, then move on.
            # KeyboardInterrupt is not caught, so a long run can still be stopped.
            print(f"Something went wrong for {url}: {error!r}")

In [None]:
# Function to run get_info() function on the list of URLs for a user entered amount of time at user entered time intervals

def check_stock_prices(duration_in_seconds, delay_in_seconds):
    """Run get_info() on the module-level urls list repeatedly for a fixed duration.

    Args:
        duration_in_seconds: How long to keep checking, measured from the call.
        delay_in_seconds: Pause between the end of one check and the start of
            the next.
    """
    # A monotonic clock is not affected by system clock adjustments during a long run
    deadline = time.monotonic() + duration_in_seconds

    while time.monotonic() <= deadline:
        get_info(urls)
        print("A new row of data have been added for each stock.")

        # If the next check would start after the deadline, stop now instead of
        # sleeping through one more full interval with nothing left to do
        if time.monotonic() + delay_in_seconds > deadline:
            break

        # Delay for delay_in_seconds before the next check
        sleep(delay_in_seconds)

In [None]:
# Setting up headers in CSV file

column_names = ['COMPANY', 'PRICE', 'PRICE CHANGE', 'TIME']
with open('stock_prices.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    # A file opened for appending starts positioned at its end, so position 0
    # means the file is new or empty; re-running this cell on an existing file
    # no longer inserts a second header row in the middle of the data
    if file.tell() == 0:
        writer.writerow(column_names)

# Checking stock prices every 15 minutes for 2 hours starting August 16th at 11:58 AM 

check_stock_prices(7200, 900)

In [None]:
# Read in stock_prices.csv as a DataFrame
# NOTE(review): this absolute path differs from the relative 'stock_prices.csv'
# that the scraping code writes; confirm both refer to the same file
df = pd.read_csv('/Users/kristenotten/stock_prices.csv')

# Remove unnecessary rows (positions 200 and 201); positions past the end of
# the data are ignored so a shorter file does not raise IndexError
rows_to_drop = [position for position in (200, 201) if position < len(df)]
if rows_to_drop:
    df = df.drop(df.index[rows_to_drop])
print(df)

In [None]:
# Function to get DataFrames for x and y axes in line chart for a given company's stock prices over time
# Also returns the labels for the ticks on the x axis

def get_xy(company_name, data=None):
    """Return the plotting inputs for one company's recorded prices.

    Args:
        company_name: Pattern matched against the COMPANY column with
            ``Series.str.contains``.
        data: DataFrame to read from. Defaults to the module-level ``df``.

    Returns:
        A tuple ``(prices, date_times, time_xticks)``: the PRICE values, the
        recorded timestamp values, and one HH:MM:SS label per timestamp.

    Raises:
        KeyError: If the frame lacks COMPANY, PRICE, or a timestamp column.
    """
    frame = df if data is None else data

    # na=False makes rows with a missing company name simply not match
    matches = frame['COMPANY'].str.contains(company_name, na=False)
    # reset_index returns a new frame, avoiding in-place edits on a filtered slice
    rows = frame[matches].reset_index(drop=True)

    # Stock prices for the y axis
    prices = rows['PRICE']

    # Timestamps for the x axis; files with either a DATETIME or a TIME column are accepted
    time_column = 'DATETIME' if 'DATETIME' in rows.columns else 'TIME'
    date_times = rows[time_column]

    # Exactly one label per timestamp, so labels and tick positions always line up
    time_xticks = [_time_label(value) for value in date_times]

    return prices, date_times, time_xticks


def _time_label(value):
    """Return the time of day in value as HH:MM:SS, or str(value) if it cannot be parsed."""
    known_formats = ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S', '%H:%M:%S')
    for known_format in known_formats:
        try:
            return datetime.strptime(value, known_format).time().strftime("%H:%M:%S")
        except (TypeError, ValueError):
            # Wrong format or non-string value: try the next format
            continue
    return str(value)

In [None]:
# Create a subplot to show the change in stock price over time of the top 9 most active stocks 

%matplotlib inline
plt.rcParams['figure.figsize'] = (15, 13)
plt.rcParams['figure.dpi'] = 75

plt.suptitle("Price Changes for Yahoo Finance's Most Active Stocks")


def _plot_stock(position, company_name, title, color):
    """Draw one company's price line in cell `position` of the 3x3 subplot grid.

    Args:
        position: 1-based subplot index passed to plt.subplot(3, 3, position).
        company_name: Value passed to get_xy() to select the company's rows.
        title: Title shown above the subplot.
        color: Line color.

    Returns:
        The (prices, date_times, xticks) tuple returned by get_xy().
    """
    plt.subplot(3, 3, position)
    prices, date_times, xticks = get_xy(company_name)
    plt.plot(date_times, prices, color=color)
    plt.title(title)
    # Label every recorded timestamp with its time of day, rotated to avoid overlap
    plt.xticks(date_times, xticks, rotation='vertical')
    return prices, date_times, xticks


# One subplot per stock; the returned series keep their original module-level names
tesla_prices, tesla_dts, tesla_xticks = _plot_stock(1, 'TSLA', "Tesla", 'red')
nio_prices, nio_dts, nio_xticks = _plot_stock(2, 'NIO', "NIO Inc.", 'blue')
amc_prices, amc_dts, amc_xticks = _plot_stock(3, 'AMC Entertainment Holdings', "AMC Entertainment Holdings", 'blue')
kvue_prices, kvue_dts, kvue_xticks = _plot_stock(4, 'KVUE', "Kenvue Inc.", 'blue')
nu_prices, nu_dts, nu_xticks = _plot_stock(5, 'NU', "NU Holdings", 'blue')
amd_prices, amd_dts, amd_xticks = _plot_stock(6, 'AMD', "Advanced Micro Devices, Inc.", 'blue')
dlo_prices, dlo_dts, dlo_xticks = _plot_stock(7, 'DLO', "DLocal Limited", 'blue')
nvda_prices, nvda_dts, nvda_xticks = _plot_stock(8, 'NVIDIA', "NVIDIA Corporation", 'blue')
apple_prices, apple_dts, apple_xticks = _plot_stock(9, 'Apple', "Apple, Inc.", 'blue')

plt.subplots_adjust(hspace=0.75, wspace=0.5)
plt.show()
