# Tool: extract links from a webpage

This script: (1) extracts links from a webpage; (2) extracts the part of each link that is of interest (in this case the stock symbol); (3) adds a suffix to each symbol so that it can be looked up on Yahoo Finance; (4) saves the list of symbols to a CSV file that can then be used for bulk stock price downloads. Let's go.  

In [2]:
import requests
from lxml import html
import pandas as pd

# URL of the webpage that contains the symbols
url = 'https://simplywall.st/stocks/th/healthcare/market-cap-large'

# Send a GET request to the webpage. Without an explicit timeout, requests
# can wait indefinitely on an unresponsive server, so cap the wait (seconds).
response = requests.get(url, timeout=30)

# Fail fast on HTTP errors (4xx/5xx) rather than parsing an error page and
# silently ending up with an empty list of links.
response.raise_for_status()

# Parse the HTML content of the page
tree = html.fromstring(response.content)

# XPath selecting the href of the link in the second cell of each table row.
# NOTE(review): this absolute path is tied to the page's current layout; when
# it matches nothing, `links` is simply an empty list - verify against the
# live page if no symbols come back.
links_xpath = '//*[@id="root"]/div/div[2]/section/div[1]/div[1]/section/div/table/tbody/tr/td[2]/a/@href'

# Extract the links (one href string per matching row)
links = tree.xpath(links_xpath)

# Function to extract symbols from a link
def extract_ticker(link: str, marker: str = '/set-', suffix: str = '.BK') -> str:
    """Return the upper-cased ticker symbol embedded in *link*, plus *suffix*.

    The ticker is the text between the last occurrence of *marker* and the
    next ``/`` (or the end of the link when no ``/`` follows).

    Args:
        link: href of a stock page, e.g. ``'/stocks/th/healthcare/set-bdms/...'``.
        marker: Text that immediately precedes the ticker in the link. The
            default matches links whose ticker segment starts with ``set-``.
        suffix: Text appended to the ticker before upper-casing. The default
            ``'.BK'`` produces symbols such as ``'BDMS.BK'``.

    Returns:
        The ticker followed by *suffix*, all in upper case.

    Note:
        If *marker* does not occur in *link*, the text before the first ``/``
        of the whole link is used instead (possibly an empty string), so the
        result is not a meaningful ticker in that case.
    """
    # rpartition yields the whole link as the tail when the marker is absent,
    # which mirrors taking the last element of a split on the marker.
    after_marker = link.rpartition(marker)[2]
    # Keep only the path segment that directly follows the marker.
    ticker = after_marker.partition('/')[0]
    return (ticker + suffix).upper()

# Turn every extracted link into a suffixed, upper-cased ticker symbol
tickers = list(map(extract_ticker, links))

# Collect the tickers in a single-column DataFrame
df = pd.DataFrame(data=tickers, columns=['Symbol'])

# Show the resulting table of symbols
print(df)

       Symbol
0     BDMS.BK
1       BH.BK
2      BCH.BK
3      THG.BK
4      RAM.BK
5      CHG.BK
6     STGT.BK
7    VIBHA.BK
8      SKR.BK
9   MASTER.BK
10   PRINC.BK
11     PR9.BK
12  M-CHAI.BK
13  KLINIQ.BK
14     CMR.BK
15     RJH.BK
16     TNH.BK
17    SAFE.BK
18     NTV.BK
19     TOG.BK
20     EKH.BK
21     WPH.BK
22     TRP.BK
23     VIH.BK


In [3]:
# Save the list of tickers to a CSV file - uncomment the lines below to use
#csv_file_path = 'tickers.csv'
#df.to_csv(csv_file_path, index=False)

#print(f'Tickers have been saved to {csv_file_path}')
