In [None]:
# Install necessary libraries
!pip install schedule
!pip install pytz             #for timezones

In [2]:
# Import necessary libraries
from bs4 import BeautifulSoup as bs
import requests
import re
from dateutil.parser import parse
import csv
import schedule
import time
from pytz import all_timezones
import pandas as pd

In [3]:
# Base address of the DSE website; every scraped page is addressed relative to it
dse_url = 'https://dsebd.org'
# Page from which the links to the individual company pages are collected
listing_url = dse_url + '/company_listing.php'

In [4]:
# Method to get links of the company pages only
def get_links(all_links):
  """Filter a collection of hrefs down to the company-page links.

  A link is kept when it contains the substring 'name'. The input order is
  preserved and duplicates are not removed.

  Args:
    all_links: iterable of href strings.

  Returns:
    A new list holding the matching hrefs.
  """
  return [href for href in all_links if 'name' in href]

In [5]:
# Method to write data to 2 csv files
def write_in_csv(company_rows, percentage_rows,
                 company_path='companies.csv',
                 share_path='share_holding_percentage.csv'):
  """Write the scraped data to two CSV files, overwriting existing files.

  Args:
    company_rows: iterable of rows matching the columns
      Trading Code, Name, Scrip Code, Sector, URL.
    percentage_rows: iterable of rows matching the columns
      Trading Code, Date, Sponsor/Director, Govt, Institute, Foreign, Public.
    company_path: destination of the company file (defaults to the
      original hard-coded name).
    share_path: destination of the share holding file (defaults to the
      original hard-coded name).

  Raises:
    OSError: if either file cannot be opened for writing.
  """
  company_header = ["Trading Code", "Name", "Scrip Code", "Sector", "URL"]
  share_header = ["Trading Code", "Date", "Sponsor/Director", "Govt", "Institute", "Foreign", "Public"]

  outputs = (
    (company_path, company_header, company_rows),
    (share_path, share_header, percentage_rows),
  )

  for path, header, rows in outputs:
    # Explicit utf-8 so that non-ASCII text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as csv_file:
      writer = csv.writer(csv_file)
      writer.writerow(header)
      writer.writerows(rows)

In [6]:
# Main scraping routine: collects the data of every listed company and writes the CSV files
def _company_row(soup_company, trading_code, url):
  """Build one 'companies.csv' row from a parsed company page.

  Returns:
    [trading_code, name, scrip_code, sector, url], or None when the page has
    no company heading.

  Raises:
    AttributeError, IndexError: when an expected element is missing or empty;
      the caller treats this as "skip this company".
  """
  name_box = soup_company.find('h2', class_ = 'BodyHead topBodyHead')
  if name_box is None:
    return None

  company_name = name_box.i.text
  print(company_name)                                     # progress output, one line per company

  codes_table = soup_company.find('table', class_ = 'table table-bordered background-white shares-table')

  # When the heading does not contain the word 'Company', the scrip code is
  # read from the table's 'alt' row rather than from the table as a whole.
  if 'Company' not in name_box.text:
    codes_table = codes_table.find('tr', class_ = 'alt')

  scrip_code = codes_table.text.split()[-1]               # last whitespace-separated token
  sector = soup_company.find('th', string = 'Sector').find_next_sibling().text

  return [trading_code, company_name, scrip_code, sector, url]


def _share_holding_rows(soup_company, trading_code):
  """Collect the 'share_holding_percentage.csv' rows of one company page.

  Every <td> whose text matches 'Share Holding Percentage' and contains a
  parseable date yields one row: [trading_code, date, *float values], where
  the values are the numeric tokens of the cell that follows it.
  """
  rows = []

  for info in soup_company.find_all('td', string = re.compile('Share Holding Percentage')):
    try:
      date_time = parse(info.text, fuzzy = True)           # fuzzy: pick the date out of the label text
    except Exception:
      continue                                             # label without a usable date

    values_cell = info.find_next_sibling()
    if values_cell is None:                                # no cell holding the values: nothing to record
      continue

    row = [trading_code, date_time.date()]
    for token in values_cell.text.split():
      try:
        row.append(float(token))                           # keep the numbers, drop the category labels
      except ValueError:
        continue

    rows.append(row)

  return rows


def dsebd_scrape(timeout=30):
  """Scrape every listed company from the DSE website and write both CSV files.

  The company listing page is fetched first; each company link found there is
  then visited to extract the company details and its share holding
  percentages. A company whose page cannot be fetched or parsed is skipped.
  The CSV files are written once, after all companies were processed.

  Args:
    timeout: seconds to wait for each HTTP request before giving up, so a
      stalled connection cannot block the run indefinitely.

  Raises:
    requests.RequestException: if the listing page itself cannot be fetched.
    RuntimeError: if the listing page does not contain the company list.
  """
  company_rows = []
  percentage_rows = []

  # One session for the whole run so the connection to the site is reused.
  with requests.Session() as session:
    listing_page = session.get(listing_url, timeout = timeout)
    soup_listing = bs(listing_page.content, 'lxml')

    company_list = soup_listing.find('div', class_ = 'row al-li')
    if company_list is None:
      raise RuntimeError(
        f'Company list not found at {listing_url} (HTTP {listing_page.status_code})'
      )

    all_links = [anchor['href'] for anchor in company_list.find_all('a', href = True)]
    links = get_links(all_links)

    for link in links:
      url = f'{dse_url}/{link}'
      trading_code = link.split('=')[-1]                   # text after the last '=' of the link

      try:
        company_page = session.get(url, timeout = timeout)
        soup_company = bs(company_page.content, 'lxml')
        company_row = _company_row(soup_company, trading_code, url)
      except requests.RequestException as error:
        # A network problem on one page must not discard everything scraped so far.
        print(f'Skipping {url}: {error}')
        continue
      except (AttributeError, KeyError, IndexError):
        # Page layout differs from what is expected: skip instead of storing partial data.
        continue

      if company_row is None:
        continue

      company_rows.append(company_row)
      percentage_rows.extend(_share_holding_rows(soup_company, trading_code))

  write_in_csv(company_rows, percentage_rows)

In [None]:
if __name__ == '__main__':
  # Register the scrape as a daily job at 17:00 in the Asia/Dhaka timezone
  daily = schedule.every().day
  daily.at("17:00", "Asia/Dhaka").do(dsebd_scrape)

  # Poll once per second and run the job whenever it is due; this loop never
  # returns, so the cell keeps running until it is interrupted
  while True:
    schedule.run_pending()
    time.sleep(1)