<a href="https://colab.research.google.com/github/jvance7-ut/Install4/blob/main/scraper_cleaner1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Please note that it is not legal to scrape every site. Please check the current terms of use for the site you intend to scrape.

Author: Jennifer Vance - Feb. 2025

BeautifulSoup Scraper tutorial

https://finance.yahoo.com/markets/stocks/most-active/



**Import packages**

In [None]:
#install packages if needed
#!pip install numpy==1.19.5
#!pip install beautifulsoup4==4.6.3
#https://finance.yahoo.com/markets/stocks/most-active/?start=25&count=100



In [None]:
from bs4 import BeautifulSoup #scraping
import requests #Open URL
import pandas as pd #dataframe
import time

**Scrape the data**

In [None]:
#making a GET request
# Keep the URL in one place so the first request and the retry hit the same page.
url = 'https://finance.yahoo.com/markets/stocks/most-active/?start=25&count=100'

# NOTE(review): the default python-requests User-Agent appears to be what gets
# answered with 429 here; a browser-style User-Agent is sent instead.
# Confirm this is acceptable under the site's terms before scraping.
headers = {
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/120.0 Safari/537.36')
}

default_wait = 100      # seconds to wait when Retry-After is missing or unusable
request_timeout = 30    # seconds before giving up on an unresponsive server

r = requests.get(url, headers=headers, timeout=request_timeout)

if r.status_code == 429:
    retry_after = r.headers.get('Retry-After')
    try:
        # Retry-After may be a number of seconds or an HTTP date; only the
        # numeric form is used, and a negative value is clamped to zero
        # because time.sleep() rejects negative arguments.
        wait_seconds = max(0, int(retry_after))
    except (TypeError, ValueError):
        # header missing (None), empty, or not a plain integer
        wait_seconds = default_wait
    time.sleep(wait_seconds)
    r = requests.get(url, headers=headers, timeout=request_timeout) # Retry the request

soup = BeautifulSoup(r.content, 'html.parser')

#check status code for response received
#success code - 200
#too many requests - 429 (error code)
print(r)


<Response [429]>


In [None]:
##parsing the HTML
# Build the parse tree from the raw response bytes with Python's built-in
# HTML parser, then print the indented markup so the page can be inspected.
soup = BeautifulSoup(markup=r.content, features='html.parser')
pretty_html = soup.prettify()
print(pretty_html)

Edge: Too Many Requests



In [None]:
#we want the table values
#the class attribute searched for on our table is 'yf-269l37 bd'
#if you need help finding what values you want:
#1. go to the website
#2. right click and select 'Inspect' from the menu
#3. hover over the HTML until ONLY THE AREA YOU WANT is blue
#4. use the HTML tags from that line

table_names = soup.find('table', class_='yf-269l37 bd')

if table_names is None:
    # NOTE(review): class names like 'yf-269l37' look auto-generated and may
    # change when the site is updated. Fall back to the first <table> on the
    # page -- this assumes the stock table is the first one; confirm with Inspect.
    table_names = soup.find('table')

print(table_names)


None


In [None]:
# Extract the column names from the table header cells
# Start with an empty list so data_names always exists, even when the table
# was not found (for example after a 429 response).
data_names = []

if table_names is not None:
    # Loop through the header cells; column names use 'th'
    for header_cell in table_names.find_all('th'):
        print(header_cell.text)
        # keep a whitespace-stripped copy to use as the dataframe column name
        data_names.append(header_cell.text.strip())
else:
    print("raw_table not found.")


print("   ")
print(data_names)

# number of column names collected (0 when the table was not found)
print(len(data_names))

raw_table not found.


AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
# Look up the table body by its class; its rows hold the cell values
tbody = soup.find('tbody', class_='body yf-paf8n5')

if not tbody:
    print("Tbody not found.")
else:
    # rows are 'tr' elements and each value inside a row is a 'td' element;
    # walk every cell of every row in document order and print its text
    all_cells = (
        cell
        for table_row in tbody.find_all('tr')
        for cell in table_row.find_all('td')
    )
    for cell in all_cells:
        print(cell.text)

**Turn the data into a dataframe using Pandas**

In [None]:
# Collect the table as a list of rows: one inner list per 'tr', holding the
# whitespace-stripped text of each 'td' in that row
data = [
    [cell.text.strip() for cell in row.find_all('td')]
    for row in tbody.find_all('tr')
]

#create a temporary dataframe for cleaning (optional)
# columns are numbered 0, 1, 2, ... until real names are assigned
df = pd.DataFrame(data)

##Our data added the values from column 4 and 5 to column 3
## we need to remove the extra information
# Function to remove everything after the first space
def remove_after_space(text):
    """Return the part of ``text`` that comes before its first space.

    A string without a space is returned whole. Values that are not strings
    are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    before_space, _, _ = text.partition(' ')
    return before_space
# Clean column 3: keep only the text before the first space in each value
column_3_text = df[3]
df[3] = column_3_text.apply(remove_after_space)
df

In [None]:
#use the scraped header text (data_names) as the column names
df.columns = data_names

# Drop the blank columns (had graphs on website)
# The positions are looked up one after the other: first the column at
# position 2, then the column at position 10 of what remains.
first_graph_column = df.columns[2]
df = df.drop(columns=first_graph_column)
second_graph_column = df.columns[10]
df = df.drop(columns=second_graph_column)


df

**Clean the data**

In [None]:
#### column by column cleaning

# Multiplier for each magnitude suffix used in the volume / market-cap columns
_MAGNITUDE_FACTORS = {'K': 10**3, 'M': 10**6, 'B': 10**9, 'T': 10**12}


def _to_number(column):
    """Convert a column of scraped text to numbers.

    Surrounding whitespace, '+' signs, thousands separators (',') and '%'
    signs are removed before conversion. A leading '-' is kept, so negative
    values stay negative. Text that still cannot be read as a number (for
    example a lone '-' placeholder) becomes NaN instead of raising an error.
    """
    text = column.astype(str).str.strip()
    for token in ('+', ',', '%'):
        text = text.str.replace(token, '', regex=False)
    return pd.to_numeric(text, errors='coerce')


def _percent_to_fraction(column):
    """Convert percent text such as '+1.25%' to decimal format (0.0125)."""
    return _to_number(column) * 0.01


def _expand_magnitude(column):
    """Convert abbreviated counts such as '45.123M' or '1.2B' to whole numbers.

    The numeric part is multiplied by the factor for its suffix (K, M, B or
    T), so the result is correct for any number of decimal places. Values
    without a suffix are kept as they are; text that does not look like a
    number becomes a missing value (<NA>). The result uses the nullable
    'Int64' dtype so whole numbers and missing values can share a column.
    """
    text = column.astype(str).str.strip().str.replace(',', '', regex=False)
    # group 0: the number, group 1: the optional magnitude suffix
    parts = text.str.extract(r'^([-+]?\d*\.?\d+)\s*([KMBT]?)$')
    number = pd.to_numeric(parts[0], errors='coerce')
    # an empty or missing suffix maps to nothing, so it gets a factor of 1
    factor = pd.to_numeric(parts[1].map(_MAGNITUDE_FACTORS), errors='coerce').fillna(1)
    # round() removes floating-point noise (e.g. 45123000.00000001) before
    # the values are stored as integers
    return (number * factor).round().astype('Int64')


##Price
df['Price'] = _to_number(df['Price'])

##Change
#the '+' is removed but the '-' is kept to show negative values
df['Change'] = _to_number(df['Change'])

##Change %
#remove '+' and '%' signs, make numeric, then put the percent in decimal
#format for possible calculations
df['Change %'] = _percent_to_fraction(df['Change %'])

###M/B/T values into numeric
##Volume, Avg Vol (3M) (average volume over 3 months), Market Cap
for magnitude_column in ('Volume', 'Avg Vol (3M)', 'Market Cap'):
    df[magnitude_column] = _expand_magnitude(df[magnitude_column])

##P/E Ratio (TTM)
#in this case, the '-' are read literally, not as a null value, so they need to
#be replaced manually.  These rows are either a zero or negative value, so the
#site does not fill them in.  They are stored as 0.0 here.  Remember this when
#using the column for computations.
df['P/E Ratio (TTM)'] = _to_number(df['P/E Ratio (TTM)']).fillna(0.0)

##52 Wk Change %
#remove '%' and ',' so the values can be made numeric, then put the percent
#value in decimal format
df['52 Wk Change %'] = _percent_to_fraction(df['52 Wk Change %'])
#put percent value in decimal format


In [None]:
##check your work
#numeric_columns = df.select_dtypes(include='number')
#numeric_columns

In [None]:
#make a permanent dataframe name
# .copy() gives the new name its own data; a plain assignment would only be a
# second name for df, so later edits to df would also change most_active_stocks
most_active_stocks = df.copy()
most_active_stocks

**Keep a copy of your dataset**

In [None]:
###most Python environments (outside Colab)

#from os import path
#print(path)
#most_active_stocks.to_csv('most_active_stocks.csv', index=False)


In [None]:
###Colab notebook

##if you want to save it to your google drive
#from google.colab import drive
#drive.mount('/content/drive')
#allow permissions

most_active_stocks.to_csv('most_active_stocks.csv', index=False)
#create CSV (written to the current working directory)

# google.colab only exists inside Colab; anywhere else the import fails, and
# the CSV written above is already the saved copy.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab: most_active_stocks.csv was saved to the current working directory.")
else:
    files.download('most_active_stocks.csv')
    #download CSV through the browser
#download CSV