In [None]:
# Install necessary libraries
!pip install schedule
!pip install pytz             #for timezones

In [67]:
# Import necessary libraries
from bs4 import BeautifulSoup as bs
import requests
import re
from dateutil.parser import parse
import csv
import schedule
import time
from pytz import all_timezones
import pandas as pd

In [68]:
# Root address of the site; every scraped page URL is built on top of it
dse_url = 'https://dsebd.org'
# Page whose links lead to the individual company pages
listing_url = dse_url + '/company_listing.php'

In [69]:
def get_trading_code(all_links):
  """Pick out the company links and the trading code each one carries.

  A link is kept when it contains the substring 'name'. Its trading code is
  whatever follows the last '=' in the link.

  Args:
    all_links: iterable of href strings.

  Returns:
    A tuple (trading_codes, links) of two lists in matching order.
  """
  # Keep only the links that mention 'name'
  links = [href for href in all_links if 'name' in href]

  # The code is the text after the final '=' of each kept link
  trading_codes = [href.split('=')[-1] for href in links]

  return trading_codes, links

In [70]:
def write_in_csv(trading_codes, company_rows, percentage_rows):
  """Write the scraped data to two CSV files in the current directory.

  'companies.csv' gets one line per entry of company_rows with the matching
  trading code appended as a last 'trading_code' column.
  'share_holding_percentage.csv' gets one line per entry of percentage_rows.
  Both files are overwritten and encoded as UTF-8.

  Args:
    trading_codes: trading codes, paired with company_rows by position.
      Codes beyond the number of rows are ignored; rows without a code get
      an empty 'trading_code' field.
    company_rows: rows of [name, scrip code, sector, url].
    percentage_rows: rows of [date, sponsor/director, govt, institute,
      foreign, public].
  """
  trading_codes = list(trading_codes)
  company_header = ["Name", "Scrip Code", "Sector", "URL", "trading_code"]

  # The file is written once, straight from the scraped values. A round trip
  # through pandas would add an unnamed index column and re-infer the dtypes
  # (e.g. dropping leading zeros from a scrip code).
  with open('companies.csv', 'w', newline='', encoding='utf-8') as company_file:
    writer = csv.writer(company_file)
    writer.writerow(company_header)

    for position, company_row in enumerate(company_rows):
      trading_code = trading_codes[position] if position < len(trading_codes) else ''
      writer.writerow(list(company_row) + [trading_code])

  with open('share_holding_percentage.csv', 'w', newline='', encoding='utf-8') as share_file:
    writer = csv.writer(share_file)
    columns = ["Date", "Sponsor/Director", "Govt", "Institute", "Foreign", "Public"]

    writer.writerow(columns)
    writer.writerows(percentage_rows)

In [71]:
def _parse_company_row(soup_company, url):
  """Build the [name, scrip code, sector, url] row from a parsed company page.

  Raises AttributeError or IndexError when an expected element is missing.
  """
  name_box = soup_company.find('h2', class_ = 'BodyHead topBodyHead')
  company_name = name_box.i.text

  codes_table = soup_company.find('table', class_ = 'table table-bordered background-white shares-table')

  # When the heading does not contain 'Company', the scrip code is read from
  # the table's 'alt' row instead of from the whole table
  if 'Company' not in name_box.text:
    codes_table = codes_table.find('tr', class_ = 'alt')

  # The scrip code is the last whitespace-separated token of that text
  scrip_code = codes_table.text.split()[-1]

  sector = soup_company.find('th', string = 'Sector').find_next_sibling().text

  return [company_name, scrip_code, sector, url]


def _parse_share_holdings(soup_company):
  """Collect one [date, percentage, ...] row per share-holding entry on a page."""
  rows = []

  for info in soup_company.find_all('td', string = re.compile('Share Holding Percentage')):
    # The date is embedded in free text, hence the fuzzy parse; entries
    # without a parseable date are skipped
    try:
      date_time = parse(info.text, fuzzy = True)
    except Exception:
      continue

    # The percentages live in the cell next to the label; skip the entry if
    # that cell is absent instead of failing the whole scrape
    values_cell = info.find_next_sibling()
    if values_cell is None:
      continue

    # Keep only the tokens that are numbers
    share_percentage = []
    for percentage_value in values_cell.text.split():
      try:
        share_percentage.append(float(percentage_value))
      except ValueError:
        continue

    rows.append([date_time.date()] + share_percentage)

  return rows


def dsebd_scrape(timeout = 30):
  """Scrape every listed company and write the results with write_in_csv.

  The listing page is fetched first. If it cannot be fetched or its company
  list is missing, a message is printed and nothing is written. Each company
  page is then fetched in turn; a company whose page cannot be fetched or
  parsed is reported and skipped. Trading codes are passed to write_in_csv
  only for the companies that produced a row, so codes and rows stay aligned.

  NOTE(review): share-holding rows carry no company identifier, so rows of
  different companies cannot be told apart in the output; confirm whether
  that is intended.

  Args:
    timeout: seconds to wait for each HTTP request.
  """
  # Get webpage content
  try:
    company_list_source = requests.get(listing_url, timeout = timeout)
    company_list_source.raise_for_status()
  except requests.RequestException as error:
    print(f'Skipping scrape, could not fetch {listing_url}: {error}')
    return

  soup_listing = bs(company_list_source.content, 'lxml')

  # Container holding the list of companies
  company_list = soup_listing.find('div', class_ = 'row al-li')
  if company_list is None:
    print(f'Skipping scrape, company list not found on {listing_url}')
    return

  all_links = [link['href'] for link in company_list.find_all('a', href = True)]
  trading_codes, links = get_trading_code(all_links)

  # Rows for the 2 csv files, plus the trading code of every company row
  scraped_codes = []
  company_rows = []
  percentage_rows = []

  for trading_code, link in zip(trading_codes, links):
    url = f'{dse_url}/{link}'
    print(url)

    try:
      company_page = requests.get(url, timeout = timeout)
      company_page.raise_for_status()
      soup_company = bs(company_page.text, 'lxml')
      company_row = _parse_company_row(soup_company, url)
    except (requests.RequestException, AttributeError, KeyError, IndexError) as error:
      print(f'Skipping {url}: {error!r}')
      continue

    print(company_row[0])

    # Record the code together with its row so a skipped company cannot
    # shift the codes of the companies that follow it
    scraped_codes.append(trading_code)
    company_rows.append(company_row)
    percentage_rows.extend(_parse_share_holdings(soup_company))

  write_in_csv(scraped_codes, company_rows, percentage_rows)

In [None]:
if __name__ == '__main__':
  # Register the scraper as a daily job at 17:00 in the Asia/Dhaka timezone
  daily_job = schedule.every().day.at("17:00", "Asia/Dhaka")
  daily_job.do(dsebd_scrape)

  # Check once per second whether the job is due; this loop has no exit
  # condition, so it runs until it is interrupted
  while True:
    schedule.run_pending()
    time.sleep(1)