# Scrape annual report from KLSE

-scrape quarter report information (EPS,Financial Year,DP%) from KLSE website
- https://www.klsescreener.com/v2/stocks/view/7152/jaycorp-bhd

### Import libraries

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from datetime import datetime
import pandas as pd
import time
import random
import warnings
warnings.filterwarnings("ignore")

### Get stock code

In [2]:
df = pd.read_csv('stock_code.csv')
df

Unnamed: 0.1,Unnamed: 0,stock_name,stock_code
0,0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,5250.KL
1,1,ABF MALAYSIA BOND INDEX FUND,0800EA.KL
2,2,ABLE GLOBAL BERHAD,7167.KL
3,3,ABLEGROUP BERHAD,7086.KL
4,4,ABM FUJIYA BERHAD,5198.KL
...,...,...,...
985,985,YX PRECIOUS METALS BHD,0250.KL
986,986,ZECON BERHAD,7028.KL
987,987,ZELAN BERHAD,2283.KL
988,988,ZEN TECH INTERNATIONAL BERHAD,0094.KL


In [3]:
# Remove the last three character
df['stock_code']=df['stock_code'].str.slice(0,-3)

In [4]:
# Put it into a list called stock_code
stock_code=list(df['stock_code'])
stock_code

['5250',
 '0800EA',
 '7167',
 '7086',
 '5198',
 '03028',
 '7131',
 '0218',
 '0122',
 '1481',
 '5281',
 '9148',
 '7191',
 '7146',
 '0181',
 '6599',
 '5139',
 '5185',
 '7145',
 '7315',
 '7078',
 '0209',
 '5238',
 '2658',
 '7609',
 '5116',
 '5115',
 '2674',
 '0079',
 '2488',
 '1163',
 '03051',
 '5269',
 '5127',
 '5293',
 '5120',
 '03011',
 '1015',
 '7031',
 '6351',
 '7083',
 '0048',
 '4758',
 '0226',
 '6556',
 '5082',
 '9342',
 '5568',
 '5088',
 '7090',
 '5015',
 '6432',
 '0119',
 '7214',
 '7181',
 '7007',
 '0038',
 '0068',
 '7722',
 '7129',
 '0159',
 '0105',
 '4057',
 '7162',
 '7054',
 '03032',
 '6399',
 '0072',
 '8176',
 '7048',
 '5130',
 '7099',
 '5302',
 '03037',
 '8885',
 '5204',
 '7579',
 '6888',
 '5106',
 '7120',
 '2305',
 '5021',
 '7005',
 '03012',
 '0098',
 '5258',
 '7251',
 '1899',
 '6602',
 '0187',
 '5190',
 '3239',
 '3395',
 '5196',
 '4219',
 '5248',
 '9814',
 '7668',
 '6173',
 '5932',
 '0195',
 '6998',
 '5032',
 '0179',
 '5069',
 '0168',
 '9288',
 '7036',
 '8133',
 '2771',
 '

### Start scrapping
- loop through all the stock code and use beautiful soup to get the tables
- loop through all the tables and until the column 'DP%' found ,which mean the annual report table has found
- if no annual report table found, append the stock code into no_annual list
- if annual report table found, filter the annual report table financial year from 2018 to now

In [5]:
#create an empty dataframe
annual_rep=pd.DataFrame()
no_annual=[]

In [6]:
#connect/open to Chrome webdriver 
driver = webdriver.Chrome()
driver.implicitly_wait(20)

for s in stock_code:
    try:
        url='https://www.klsescreener.com/v2/stocks/view/'+s
        driver.get(url)
        content = driver.page_source
        soup=bs(content)

        # scrap table
        tables = soup.find_all('table')
        # Read tables with Pandas read_html()
        dfs = pd.read_html(str(tables))

        # find the table that contain column 'DP%'
        i = 0
        has_found = False
        while i < len(dfs) and not has_found:
            n = 0
            while n < len(dfs[i].columns):
                if dfs[i].columns[n] == 'DP%':
                    has_found = True
                n += 1
            i += 1
            
        if has_found:
            df=dfs[i-1]
            
            # extract all the columns needed
            df2=df[["Financial Year","Revenue ('000)", "Net ('000)", 'EPS', 'DP%']]
            
            # changes data types
            df2['Financial Year'] = pd.to_datetime(df['Financial Year'])
            df2['Financial Year']= pd.to_datetime(df2['Financial Year'].dt.strftime('%Y-%m-%d'))
            
            # Filter data between two dates
            filtered_df = df2.loc[(df2['Financial Year'] >= '2018-01-01')
                          & (df2['Financial Year'] < '2023-12-31')]
            filtered_df.reset_index(drop=True, inplace=True)
            
            if filtered_df.empty:
                no_annual.append(s)
                print("Stock code :"+s+" has no  annual report between 2018 to 2022")
            else:
                # add stock code
                filtered_df['Stock Code']=s
                
                #add year column
                filtered_df['Year'] = filtered_df['Financial Year'].dt.strftime("%Y")

                # append it into annual_rep dataframe
                annual_rep = annual_rep.append(filtered_df, ignore_index=True)
                print("Stock code :"+s+" yes")
        else:
            no_annual.append(s)
            print("Stock code :"+s+" has no annual report")
    except:
        no_annual.append(s)
        print("Stock code :"+s+" cannot scrap")
        
        
    # Program to generate a random number between 5 to 30
    # importing the random module
    sec = random.randint(1, 8)

    # program scrapping waiting time
    time.sleep(sec)  # Seconds

Stock code :5250 yes
Stock code :0800EA yes
Stock code :7167 yes
Stock code :7086 yes
Stock code :5198 yes
Stock code :03028 yes
Stock code :7131 yes
Stock code :0218 yes
Stock code :0122 yes
Stock code :1481 yes
Stock code :5281 yes
Stock code :9148 yes
Stock code :7191 yes
Stock code :7146 yes
Stock code :0181 yes
Stock code :6599 yes
Stock code :5139 yes
Stock code :5185 yes
Stock code :7145 yes
Stock code :7315 yes
Stock code :7078 yes
Stock code :0209 yes
Stock code :5238 yes
Stock code :2658 yes
Stock code :7609 yes
Stock code :5116 yes
Stock code :5115 yes
Stock code :2674 yes
Stock code :0079 yes
Stock code :2488 yes
Stock code :1163 yes
Stock code :03051 yes
Stock code :5269 yes
Stock code :5127 yes
Stock code :5293 yes
Stock code :5120 yes
Stock code :03011 yes
Stock code :1015 yes
Stock code :7031 yes
Stock code :6351 yes
Stock code :7083 yes
Stock code :0048 yes
Stock code :4758 yes
Stock code :0226 yes
Stock code :6556 yes
Stock code :5082 yes
Stock code :9342 yes
Stock co

Stock code :7043 yes
Stock code :7223 yes
Stock code :0024 yes
Stock code :4723 yes
Stock code :8648 yes
Stock code :4383 yes
Stock code :7152 yes
Stock code :0058 yes
Stock code :5161 yes
Stock code :5673 yes
Stock code :8931 yes
Stock code :0146 yes
Stock code :0127 yes
Stock code :8923 yes
Stock code :03040 yes
Stock code :6769 yes
Stock code :7096 yes
Stock code :3441 yes
Stock code :5192 yes
Stock code :8672 yes
Stock code :0170 yes
Stock code :5247 yes
Stock code :0054 yes
Stock code :7216 yes
Stock code :3476 yes
Stock code :7199 yes
Stock code :0193 yes
Stock code :0151 yes
Stock code :7323 yes
Stock code :6483 yes
Stock code :0835EA yes
Stock code :0834EA yes
Stock code :7161 yes
Stock code :7077 yes
Stock code :9334 yes
Stock code :0036 yes
Stock code :0143 yes
Stock code :6203 yes
Stock code :7062 yes
Stock code :6211 yes
Stock code :5371 yes
Stock code :0210 yes
Stock code :5027 yes
Stock code :0180 yes
Stock code :5171 yes
Stock code :5280 yes
Stock code :9466 yes
Stock co

Stock code :2224 yes
Stock code :5305 yes
Stock code :9431 yes
Stock code :5213 yes
Stock code :5123 yes
Stock code :5279 yes
Stock code :5163 yes
Stock code :7180 yes
Stock code :0055 yes
Stock code :0251 yes
Stock code :5517 yes
Stock code :7412 yes
Stock code :5173 yes
Stock code :6017 yes
Stock code :0241 yes
Stock code :7246 yes
Stock code :0129 yes
Stock code :4197 yes
Stock code :5285 yes
Stock code :5288 yes
Stock code :4316 yes
Stock code :5172 yes
Stock code :9776 yes
Stock code :7115 yes
Stock code :7155 yes
Stock code :03008 yes
Stock code :7248 yes
Stock code :03023 yes
Stock code :7132 yes
Stock code :0117 yes
Stock code :0169 yes
Stock code :03054 yes
Stock code :0215 yes
Stock code :5242 yes
Stock code :0093 yes
Stock code :4375 yes
Stock code :5134 yes
Stock code :0225 yes
Stock code :5665 yes
Stock code :1562 yes
Stock code :0216 yes
Stock code :7103 yes
Stock code :6084 yes
Stock code :03049 yes
Stock code :5006 yes
Stock code :0080 yes
Stock code :6904 yes
Stock cod

### To scrape stocks that are not able to scrape at the first loop

1. Create a loop to loop through the no_annual list to scrape stocks annual report that are not able to scrape at the first loop
2. the while condition i<3 is use to set the number of time the loop run at most 3 times
3. once the stock annual report is able to scrape the stock code will be remove from the no_annual list

In [13]:
no_annual

['5298']

In [14]:
i=0
while i<3 and len(no_annual)!=0:
    print("loop: "+str(i)+"------------------------------------------")
    print()
    
    driver = webdriver.Chrome()
    driver.implicitly_wait(20)

    for s in no_annual:
        try:
            url='https://www.klsescreener.com/v2/stocks/view/'+s
            driver.get(url)
            content = driver.page_source
            soup=bs(content)

            # scrap table
            tables = soup.find_all('table')
            # Read tables with Pandas read_html()
            dfs = pd.read_html(str(tables))

            # find the table that contain column 'DP%'
            i = 0
            has_found = False
            while i < len(dfs) and not has_found:
                n = 0
                while n < len(dfs[i].columns):
                    if dfs[i].columns[n] == 'DP%':
                        has_found = True
                    n += 1
                i += 1

            if has_found:
                df=dfs[i-1]

                # extract all the columns needed
                df2=df[["Financial Year","Revenue ('000)", "Net ('000)", 'EPS', 'DP%']]

                # changes data types
                df2['Financial Year'] = pd.to_datetime(df['Financial Year'])
                df2['Financial Year']= pd.to_datetime(df2['Financial Year'].dt.strftime('%Y-%m-%d'))

                # Filter data between two dates
                filtered_df = df2.loc[(df2['Financial Year'] >= '2018-01-01')
                              & (df2['Financial Year'] < '2023-12-31')]
                filtered_df.reset_index(drop=True, inplace=True)

                if filtered_df.empty:
                    print("Stock code :"+s+" has no  annual report between 2018 to 2022")
                else:
                    # add stock code
                    filtered_df['Stock Code']=s

                    #add year column
                    filtered_df['Year'] = filtered_df['Financial Year'].dt.strftime("%Y")

                    # append it into annual_rep dataframe
                    annual_rep = annual_rep.append(filtered_df, ignore_index=True)
                    print("Stock code :"+s+" yes")
                    no_annual.remove(s)
            else:
                print("Stock code :"+s+" has no annual report")
        except:
            print("Stock code :"+s+" cannot scrap")
        
        
        # Program to generate a random number between 5 to 30
        # importing the random module
        sec = random.randint(1, 8)

        # program scrapping waiting time
        time.sleep(sec)  # Seconds
    i+=1


loop: 0------------------------------------------

Stock code :5298 has no  annual report between 2018 to 2022


In [15]:
annual_rep

Unnamed: 0,Financial Year,Revenue ('000),Net ('000),EPS,DP%,Stock Code,Year
0,2022-12-31,2771856,63803,5.66,-,5250,2022
1,2021-12-31,2809085,43483,3.86,-,5250,2021
2,2020-12-31,2537563,29767,2.60,-,5250,2020
3,2019-12-31,2359399,54059,4.74,-,5250,2019
4,2018-12-31,2216099,51307,4.56,-,5250,2018
...,...,...,...,...,...,...,...
5217,2022-11-30,136167,42487,8.33,2.04,5131,2022
5218,2021-11-30,149616,45360,9.86,1.72,5131,2021
5219,2020-11-30,170678,46773,10.17,1.67,5131,2020
5220,2019-11-30,170622,49522,10.77,1.11,5131,2019


In [16]:
annual_rep.to_csv("div_annual_report.csv")

# Clean data

In [17]:
df2 = pd.read_csv("div_annual_report.csv")

In [18]:
df2

Unnamed: 0.1,Unnamed: 0,Financial Year,Revenue ('000),Net ('000),EPS,DP%,Stock Code,Year
0,0,2022-12-31,2771856,63803,5.66,-,5250,2022
1,1,2021-12-31,2809085,43483,3.86,-,5250,2021
2,2,2020-12-31,2537563,29767,2.60,-,5250,2020
3,3,2019-12-31,2359399,54059,4.74,-,5250,2019
4,4,2018-12-31,2216099,51307,4.56,-,5250,2018
...,...,...,...,...,...,...,...,...
5217,5217,2022-11-30,136167,42487,8.33,2.04,5131,2022
5218,5218,2021-11-30,149616,45360,9.86,1.72,5131,2021
5219,5219,2020-11-30,170678,46773,10.17,1.67,5131,2020
5220,5220,2019-11-30,170622,49522,10.77,1.11,5131,2019


### Delete uneccessary column

In [19]:
del df2['Unnamed: 0']

In [20]:
df2.dtypes

Financial Year     object
Revenue ('000)      int64
Net ('000)          int64
EPS               float64
DP%                object
Stock Code         object
Year                int64
dtype: object

### Rename the columns

In [21]:
df2.rename(columns={"Financial Year": "annual_financial_year", "Revenue ('000)": "annual_revenue('000)", "Net ('000)": "annual_net_income('000)", "EPS": "annual_eps", "DP%": "annual_dp(%)",
                    "Stock Code": "annual_stock_code"}, inplace=True)

In [22]:
df2

Unnamed: 0,annual_financial_year,annual_revenue('000),annual_net_income('000),annual_eps,annual_dp(%),annual_stock_code,Year
0,2022-12-31,2771856,63803,5.66,-,5250,2022
1,2021-12-31,2809085,43483,3.86,-,5250,2021
2,2020-12-31,2537563,29767,2.60,-,5250,2020
3,2019-12-31,2359399,54059,4.74,-,5250,2019
4,2018-12-31,2216099,51307,4.56,-,5250,2018
...,...,...,...,...,...,...,...
5217,2022-11-30,136167,42487,8.33,2.04,5131,2022
5218,2021-11-30,149616,45360,9.86,1.72,5131,2021
5219,2020-11-30,170678,46773,10.17,1.67,5131,2020
5220,2019-11-30,170622,49522,10.77,1.11,5131,2019


### Change data types

In [23]:
df2.dtypes

annual_financial_year       object
annual_revenue('000)         int64
annual_net_income('000)      int64
annual_eps                 float64
annual_dp(%)                object
annual_stock_code           object
Year                         int64
dtype: object

In [24]:
df2["annual_revenue('000)"] = df2["annual_revenue('000)"].apply(pd.to_numeric, errors='coerce')
df2["annual_net_income('000)"] = df2["annual_net_income('000)"].apply(pd.to_numeric, errors='coerce')
df2["annual_dp(%)"] = df2["annual_dp(%)"].apply(pd.to_numeric, errors='coerce')

df2['annual_financial_year'] = pd.to_datetime(df2['annual_financial_year'])
df2['annual_financial_year'] = pd.to_datetime(df2['annual_financial_year'].dt.strftime('%Y-%m-%d'))

### Add div_year column

In [25]:
df2['div_year'] = df2['annual_financial_year'].dt.strftime('%Y')

In [26]:
df2

Unnamed: 0,annual_financial_year,annual_revenue('000),annual_net_income('000),annual_eps,annual_dp(%),annual_stock_code,Year,div_year
0,2022-12-31,2771856,63803,5.66,,5250,2022,2022
1,2021-12-31,2809085,43483,3.86,,5250,2021,2021
2,2020-12-31,2537563,29767,2.60,,5250,2020,2020
3,2019-12-31,2359399,54059,4.74,,5250,2019,2019
4,2018-12-31,2216099,51307,4.56,,5250,2018,2018
...,...,...,...,...,...,...,...,...
5217,2022-11-30,136167,42487,8.33,2.04,5131,2022,2022
5218,2021-11-30,149616,45360,9.86,1.72,5131,2021,2021
5219,2020-11-30,170678,46773,10.17,1.67,5131,2020,2020
5220,2019-11-30,170622,49522,10.77,1.11,5131,2019,2019


### Format eps 

In [27]:
df2['annual_eps'] = df2['annual_eps']/100

In [28]:
df2['annual_eps']=df2['annual_eps'].round(3)

In [29]:
df2

Unnamed: 0,annual_financial_year,annual_revenue('000),annual_net_income('000),annual_eps,annual_dp(%),annual_stock_code,Year,div_year
0,2022-12-31,2771856,63803,0.057,,5250,2022,2022
1,2021-12-31,2809085,43483,0.039,,5250,2021,2021
2,2020-12-31,2537563,29767,0.026,,5250,2020,2020
3,2019-12-31,2359399,54059,0.047,,5250,2019,2019
4,2018-12-31,2216099,51307,0.046,,5250,2018,2018
...,...,...,...,...,...,...,...,...
5217,2022-11-30,136167,42487,0.083,2.04,5131,2022,2022
5218,2021-11-30,149616,45360,0.099,1.72,5131,2021,2021
5219,2020-11-30,170678,46773,0.102,1.67,5131,2020,2020
5220,2019-11-30,170622,49522,0.108,1.11,5131,2019,2019


### Aggregation 

In [30]:
func={"annual_revenue('000)":'sum',"annual_net_income('000)":'sum',"annual_eps":'sum',"annual_dp(%)":'sum'}
annual_div_info=df2.groupby(['annual_stock_code','div_year']).agg(func)
annual_div_info.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,annual_revenue('000),annual_net_income('000),annual_eps,annual_dp(%)
annual_stock_code,div_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2018,88288,12173,0.021,0.0
1,2019,122969,18819,0.029,0.51
1,2020,128411,22586,0.034,0.44
1,2021,145441,25291,0.034,0.44
1,2022,120980,26910,0.035,0.57
2,2018,178476,15749,0.118,0.0
2,2019,172550,22195,0.161,0.0
2,2020,171727,29557,0.208,0.0
2,2021,159622,24390,0.165,0.0
2,2022,207917,62095,0.42,0.23


In [31]:
annual_div_info= annual_div_info.reset_index()

In [32]:
annual_div_info

Unnamed: 0,annual_stock_code,div_year,annual_revenue('000),annual_net_income('000),annual_eps,annual_dp(%)
0,0001,2018,88288,12173,0.021,0.00
1,0001,2019,122969,18819,0.029,0.51
2,0001,2020,128411,22586,0.034,0.44
3,0001,2021,145441,25291,0.034,0.44
4,0001,2022,120980,26910,0.035,0.57
...,...,...,...,...,...,...
5110,9997,2019,327539,566,0.004,0.00
5111,9997,2020,280178,3050,0.024,0.00
5112,9997,2021,339159,16317,0.126,0.18
5113,9997,2022,325005,14469,0.115,0.11


In [33]:
annual_div_info.dtypes

annual_stock_code           object
div_year                    object
annual_revenue('000)         int64
annual_net_income('000)      int64
annual_eps                 float64
annual_dp(%)               float64
dtype: object

# Load data into database

In [34]:
import psycopg2
from sqlalchemy import create_engine
from urllib.parse import quote

In [35]:
conn_string = 'postgresql://postgres:%s@localhost/dividend_investing_no2'% quote('@SMWHot4')

In [36]:
db = create_engine(conn_string)
conn = db.connect()

In [37]:
annual_div_info.to_sql('annual', con=conn, if_exists='replace',
          index=False)
conn = psycopg2.connect(conn_string
                        )
conn.autocommit = True
cursor = conn.cursor()

sql1 = '''select * from annual;'''
cursor.execute(sql1)
for i in cursor.fetchall():
    print(i)

conn.commit()
conn.close()

('0001', '2018', 88288, 12173, 0.021, 0.0)
('0001', '2019', 122969, 18819, 0.029, 0.51)
('0001', '2020', 128411, 22586, 0.034, 0.44)
('0001', '2021', 145441, 25291, 0.034, 0.44)
('0001', '2022', 120980, 26910, 0.035, 0.57)
('0002', '2018', 178476, 15749, 0.118, 0.0)
('0002', '2019', 172550, 22195, 0.161, 0.0)
('0002', '2020', 171727, 29557, 0.208, 0.0)
('0002', '2021', 159622, 24390, 0.165, 0.0)
('0002', '2022', 207917, 62095, 0.42, 0.23)
('0002', '2023', 64752, 18240, 0.123, 0.0)
('0005', '2018', 48263, 14182, 0.039, 0.0)
('0005', '2019', 28878, 7839, 0.017, 0.0)
('0005', '2020', 12302, -13669, -0.03, 0.0)
('0005', '2021', 25103, 11317, 0.02, 0.0)
('0005', '2022', 12227, -35101, -0.056, 0.0)
('0005', '2023', 2394, -4078, -0.007, 0.0)
('0006', '2018', 57851, 24, 0.0, 0.0)
('0006', '2019', 39059, -1633, -0.034, 0.0)
('0006', '2020', 44400, -1840, -0.038, 0.0)
('0006', '2021', 55629, -390, -0.008, 0.0)
('0006', '2022', 50457, -846, -0.017, 0.0)
('0006', '2023', 13161, -538, -0.011, 0.0)
