# Scrape quarter report from KLSE
- Scrape quarterly report information (EPS, DPS, Financial Year, ROE) from the KLSE Screener website
- https://www.klsescreener.com/v2/stocks/view/7152/jaycorp-bhd

In [1]:
# Import libraries
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from datetime import datetime
import pandas as pd
import time
import random
import warnings
warnings.filterwarnings("ignore")

### Get stock code

In [2]:
# Load the stock name / stock code listing and display it.
df = pd.read_csv(filepath_or_buffer='stock_code.csv')
df

Unnamed: 0.1,Unnamed: 0,stock_name,stock_code
0,0,7-ELEVEN MALAYSIA HOLDINGS BERHAD,5250.KL
1,1,ABF MALAYSIA BOND INDEX FUND,0800EA.KL
2,2,ABLE GLOBAL BERHAD,7167.KL
3,3,ABLEGROUP BERHAD,7086.KL
4,4,ABM FUJIYA BERHAD,5198.KL
...,...,...,...
985,985,YX PRECIOUS METALS BHD,0250.KL
986,986,ZECON BERHAD,7028.KL
987,987,ZELAN BERHAD,2283.KL
988,988,ZEN TECH INTERNATIONAL BERHAD,0094.KL


In [3]:
# Sanity-check the loaded stock list by inspecting its (rows, columns) dimensions
df.shape

(990, 3)

In [4]:
# Remove the trailing '.KL' exchange suffix from every stock code.
# Only an actual '.KL' ending is stripped, so re-running this cell (or a code
# that has no suffix) no longer chops three meaningful characters off the code.
df['stock_code'] = df['stock_code'].str.replace(r'\.KL$', '', regex=True)

In [5]:
# Collect the cleaned codes into a plain Python list named stock_code and show it
stock_code = df['stock_code'].tolist()
stock_code

['5250',
 '0800EA',
 '7167',
 '7086',
 '5198',
 '03028',
 '7131',
 '0218',
 '0122',
 '1481',
 '5281',
 '9148',
 '7191',
 '7146',
 '0181',
 '6599',
 '5139',
 '5185',
 '7145',
 '7315',
 '7078',
 '0209',
 '5238',
 '2658',
 '7609',
 '5116',
 '5115',
 '2674',
 '0079',
 '2488',
 '1163',
 '03051',
 '5269',
 '5127',
 '5293',
 '5120',
 '03011',
 '1015',
 '7031',
 '6351',
 '7083',
 '0048',
 '4758',
 '0226',
 '6556',
 '5082',
 '9342',
 '5568',
 '5088',
 '7090',
 '5015',
 '6432',
 '0119',
 '7214',
 '7181',
 '7007',
 '0038',
 '0068',
 '7722',
 '7129',
 '0159',
 '0105',
 '4057',
 '7162',
 '7054',
 '03032',
 '6399',
 '0072',
 '8176',
 '7048',
 '5130',
 '7099',
 '5302',
 '03037',
 '8885',
 '5204',
 '7579',
 '6888',
 '5106',
 '7120',
 '2305',
 '5021',
 '7005',
 '03012',
 '0098',
 '5258',
 '7251',
 '1899',
 '6602',
 '0187',
 '5190',
 '3239',
 '3395',
 '5196',
 '4219',
 '5248',
 '9814',
 '7668',
 '6173',
 '5932',
 '0195',
 '6998',
 '5032',
 '0179',
 '5069',
 '0168',
 '9288',
 '7036',
 '8133',
 '2771',
 '

### Start scraping
- Loop through all the stock codes and use Beautiful Soup to get the tables on each stock page
- Loop through the tables until one with the column 'ROE' is found, which means the quarter report table has been found
- If no quarter report table is found, append the stock code to the `no_scrape` list
- If the quarter report table is found, filter it to financial years from 2018 until now

In [6]:
# Accumulators filled by the scraping loops:
#   quarter_rep - one DataFrame collecting every stock's quarter report rows
#   no_scrape   - stock codes for which no usable quarter report was obtained
quarter_rep, no_scrape = pd.DataFrame(), list()

In [35]:
# Display the stock codes that could not be scraped so far.
no_scrape

['5298']

In [7]:
# Scrape the quarter report table of every stock in `stock_code`.
#
# For each stock the page is loaded in Chrome, all HTML tables are parsed with
# pandas, and the first table that has a column named 'ROE' is taken as the
# quarter report. Rows inside the reporting window are appended to
# `quarter_rep`; stocks with no table, no rows in the window, or any scraping
# error are recorded in `no_scrape` so they can be retried later.

# Reporting window: START_DATE <= Financial Year < END_DATE
START_DATE = '2018-01-01'
END_DATE = '2023-12-31'

# Columns kept from the quarter report table
REPORT_COLUMNS = ['EPS','DPS','NTA','Revenue','P/L','Quarter','Q Date','Financial Year','Announced','ROE']

# Connect/open to Chrome webdriver
driver = webdriver.Chrome()
driver.implicitly_wait(30)

try:
    for s in stock_code:
        try:
            url = 'https://www.klsescreener.com/v2/stocks/view/' + s
            driver.get(url)
            content = driver.page_source
            soup = bs(content)

            # Scrap table
            tables = soup.find_all('table')
            # Read tables with Pandas read_html()
            dfs = pd.read_html(str(tables))

            # First table that has a column named exactly 'ROE' (None if absent).
            # A dedicated name is used so the stock list in `df` is not overwritten.
            report_table = next(
                (t for t in dfs if any(col == 'ROE' for col in t.columns)),
                None,
            )

            if report_table is None:
                no_scrape.append(s)
                print("Stock code :"+s+" has no  annual report")
            else:
                # Work on an explicit copy so the column assignment below
                # modifies our own frame rather than a slice of the parsed table
                df2 = report_table[REPORT_COLUMNS].copy()

                # Parse the financial year and keep only the calendar date
                df2['Financial Year'] = pd.to_datetime(df2['Financial Year']).dt.normalize()

                # Filter data between two dates
                in_window = ((df2['Financial Year'] >= START_DATE)
                             & (df2['Financial Year'] < END_DATE))
                filtered_df = df2.loc[in_window].reset_index(drop=True)

                if filtered_df.empty:
                    no_scrape.append(s)
                    print("Stock code :"+s+" has no  quarter report between 2018 to 2022")
                else:
                    # Drop the date row (rows whose EPS cell equals the Revenue cell)
                    is_data_row = filtered_df['EPS'] != filtered_df['Revenue']
                    filtered_df = filtered_df[is_data_row].reset_index(drop=True)

                    # Add stock code
                    filtered_df['Stock Code'] = s

                    # add year column
                    filtered_df['Year'] = filtered_df['Financial Year'].dt.strftime("%Y")

                    # Append it into quarter_rep dataframe
                    # (pd.concat replaces DataFrame.append, which pandas 2.0 removed)
                    quarter_rep = pd.concat([quarter_rep, filtered_df], ignore_index=True)
                    print("Stock code :"+s+" yes")
        except Exception:
            # Best effort: remember the stock for a later retry. Catching
            # Exception (not a bare except) lets Ctrl-C / kernel interrupt
            # actually stop the run.
            no_scrape.append(s)
            print("Stock code :"+s+" cannot scrap")

        # Wait a random 1 to 5 seconds between requests
        sec = random.randint(1, 5)

        # Program scrapping waiting time
        time.sleep(sec)  # Seconds
finally:
    # Always close the browser, even if the run is interrupted
    driver.quit()


Stock code :5250 yes
Stock code :0800EA yes
Stock code :7167 yes
Stock code :7086 yes
Stock code :5198 yes
Stock code :03028 yes
Stock code :7131 yes
Stock code :0218 yes
Stock code :0122 yes
Stock code :1481 yes
Stock code :5281 yes
Stock code :9148 yes
Stock code :7191 yes
Stock code :7146 yes
Stock code :0181 yes
Stock code :6599 yes
Stock code :5139 yes
Stock code :5185 yes
Stock code :7145 yes
Stock code :7315 yes
Stock code :7078 yes
Stock code :0209 yes
Stock code :5238 yes
Stock code :2658 yes
Stock code :7609 yes
Stock code :5116 yes
Stock code :5115 yes
Stock code :2674 yes
Stock code :0079 yes
Stock code :2488 yes
Stock code :1163 yes
Stock code :03051 yes
Stock code :5269 yes
Stock code :5127 yes
Stock code :5293 yes
Stock code :5120 yes
Stock code :03011 yes
Stock code :1015 yes
Stock code :7031 yes
Stock code :6351 yes
Stock code :7083 yes
Stock code :0048 yes
Stock code :4758 yes
Stock code :0226 yes
Stock code :6556 yes
Stock code :5082 yes
Stock code :9342 yes
Stock co

Stock code :7043 yes
Stock code :7223 yes
Stock code :0024 yes
Stock code :4723 yes
Stock code :8648 yes
Stock code :4383 yes
Stock code :7152 yes
Stock code :0058 yes
Stock code :5161 yes
Stock code :5673 yes
Stock code :8931 yes
Stock code :0146 yes
Stock code :0127 yes
Stock code :8923 yes
Stock code :03040 yes
Stock code :6769 yes
Stock code :7096 yes
Stock code :3441 yes
Stock code :5192 yes
Stock code :8672 yes
Stock code :0170 yes
Stock code :5247 yes
Stock code :0054 yes
Stock code :7216 yes
Stock code :3476 yes
Stock code :7199 yes
Stock code :0193 yes
Stock code :0151 yes
Stock code :7323 yes
Stock code :6483 yes
Stock code :0835EA yes
Stock code :0834EA yes
Stock code :7161 yes
Stock code :7077 yes
Stock code :9334 yes
Stock code :0036 yes
Stock code :0143 yes
Stock code :6203 yes
Stock code :7062 yes
Stock code :6211 yes
Stock code :5371 yes
Stock code :0210 yes
Stock code :5027 yes
Stock code :0180 yes
Stock code :5171 yes
Stock code :5280 yes
Stock code :9466 yes
Stock co

Stock code :9431 yes
Stock code :5213 yes
Stock code :5123 yes
Stock code :5279 yes
Stock code :5163 yes
Stock code :7180 yes
Stock code :0055 yes
Stock code :0251 yes
Stock code :5517 yes
Stock code :7412 yes
Stock code :5173 yes
Stock code :6017 yes
Stock code :0241 yes
Stock code :7246 yes
Stock code :0129 yes
Stock code :4197 yes
Stock code :5285 yes
Stock code :5288 yes
Stock code :4316 yes
Stock code :5172 yes
Stock code :9776 yes
Stock code :7115 yes
Stock code :7155 yes
Stock code :03008 yes
Stock code :7248 yes
Stock code :03023 yes
Stock code :7132 yes
Stock code :0117 yes
Stock code :0169 yes
Stock code :03054 yes
Stock code :0215 yes
Stock code :5242 yes
Stock code :0093 yes
Stock code :4375 yes
Stock code :5134 yes
Stock code :0225 yes
Stock code :5665 yes
Stock code :1562 yes
Stock code :0216 yes
Stock code :7103 yes
Stock code :6084 yes
Stock code :03049 yes
Stock code :5006 yes
Stock code :0080 yes
Stock code :6904 yes
Stock code :7207 yes
Stock code :2569 yes
Stock cod

### Retry the stocks that could not be scraped in the first loop

1. Loop through the `no_scrape` list to scrape the quarter reports of the stocks that could not be scraped in the first loop
2. The `while` condition `i<3` limits the retry loop to at most 3 passes
3. Once a stock's quarter report has been scraped, its stock code is removed from the `no_scrape` list

In [42]:
# Retry the stocks listed in `no_scrape`, for at most 3 passes.
#
# Each pass opens a fresh Chrome session, re-scrapes every stock still in
# `no_scrape`, appends any rows found inside the reporting window to
# `quarter_rep`, and removes successfully scraped codes from `no_scrape`.
# The pass counter `i` is used ONLY for counting passes; the table search no
# longer reuses it, so the 3-pass limit is honoured.
i = 0
while i < 3 and no_scrape:
    print("loop: "+str(i)+"------------------------------------------")
    print()

    driver = webdriver.Chrome()
    driver.implicitly_wait(20)

    try:
        # Iterate over a snapshot: codes are removed from no_scrape inside the
        # loop, and removing from the list being iterated would skip entries.
        for s in list(no_scrape):
            try:
                url = 'https://www.klsescreener.com/v2/stocks/view/' + s
                driver.get(url)
                content = driver.page_source
                soup = bs(content)

                # Scrap table
                tables = soup.find_all('table')
                # Read tables with Pandas read_html()
                dfs = pd.read_html(str(tables))

                # First table that has a column named exactly 'ROE' (None if absent)
                report_table = next(
                    (t for t in dfs if any(col == 'ROE' for col in t.columns)),
                    None,
                )

                if report_table is None:
                    print("Stock code :"+s+" has no  annual report")
                else:
                    # Extract all the columns needed (explicit copy so the
                    # assignment below does not target a slice)
                    df2 = report_table[['EPS','DPS','NTA','Revenue','P/L','Quarter','Q Date','Financial Year','Announced','ROE']].copy()

                    # Parse the financial year and keep only the calendar date
                    df2['Financial Year'] = pd.to_datetime(df2['Financial Year']).dt.normalize()

                    # Filter data between two dates
                    in_window = ((df2['Financial Year'] >= '2018-01-01')
                                 & (df2['Financial Year'] < '2023-12-31'))
                    filtered_df = df2.loc[in_window].reset_index(drop=True)

                    if filtered_df.empty:
                        print("Stock code :"+s+" has no  annual report between 2018 to 2022")
                    else:
                        # Drop the date row (rows whose EPS cell equals the Revenue cell)
                        is_data_row = filtered_df['EPS'] != filtered_df['Revenue']
                        filtered_df = filtered_df[is_data_row].reset_index(drop=True)

                        # Add stock code
                        filtered_df['Stock Code'] = s

                        # Get the year
                        filtered_df['Year'] = filtered_df['Financial Year'].dt.strftime("%Y")

                        # Append it into quarter_rep dataframe
                        # (pd.concat replaces DataFrame.append, which pandas 2.0 removed)
                        quarter_rep = pd.concat([quarter_rep, filtered_df], ignore_index=True)
                        print("Stock code :"+s+" yes")
                        no_scrape.remove(s)
            except Exception:
                # Best effort: leave the code in no_scrape for the next pass.
                # Catching Exception (not a bare except) keeps Ctrl-C working.
                print("Stock code :"+s+" cannot scrap")

            # Wait a random 1 to 5 seconds between requests
            sec = random.randint(1, 5)

            # Program scrapping waiting time
            time.sleep(sec)  # Seconds
    finally:
        # Close this pass's browser so retries do not pile up Chrome processes
        driver.quit()
    i += 1


Stock code :5298 cannot scrap


In [43]:
# Display the combined quarter report rows collected by the scraping loops.
quarter_rep

Unnamed: 0,EPS,DPS,NTA,Revenue,P/L,Quarter,Q Date,Financial Year,Announced,ROE,Stock Code,Year
0,1.21,0.000,0.1106,988.21m,13.66m,3,2022-09-30,2022-12-31,2022-11-29,10.9%,5250,2022
1,2.29,0.000,0.0984,943.67m,25.77m,2,2022-06-30,2022-12-31,2022-08-25,23.3%,5250,2022
2,2.16,0.000,0.1015,839.98m,24.38m,1,2022-03-31,2022-12-31,2022-05-26,21.3%,5250,2022
3,2.60,0.000,0.0799,795.06m,29.24m,4,2021-12-31,2021-12-31,2022-02-24,32.5%,5250,2021
4,0.13,0.000,0.0522,680.22m,1.44m,3,2021-09-30,2021-12-31,2021-11-25,2.5%,5250,2021
...,...,...,...,...,...,...,...,...,...,...,...,...
18424,2.35,2.000,1.2851,39.45m,10.79m,1,2019-02-28,2019-11-30,2019-04-17,1.8%,5131,2019
18425,3.44,4.000,1.2934,45.03m,15.83m,4,2018-11-30,2018-11-30,2019-01-23,2.7%,5131,2018
18426,3.36,2.000,1.2731,51.02m,17.11m,3,2018-08-31,2018-11-30,2018-10-17,2.6%,5131,2018
18427,2.62,2.000,1.2559,44.02m,12.07m,2,2018-05-31,2018-11-30,2018-07-18,2.1%,5131,2018


In [44]:
# Persist the scraped rows to disk; the row index is written as the first,
# unnamed CSV column.
quarter_rep.to_csv(path_or_buf="div_quarter_report.csv")

# Clean data

In [45]:
# Reload the scraped data. 'Stock Code' is forced to str so codes with leading
# zeros (e.g. '0001') are never re-inferred as integers by the CSV parser.
quarter = pd.read_csv("div_quarter_report.csv", dtype={'Stock Code': str})

In [46]:
# Display the reloaded quarter report data.
quarter

Unnamed: 0.1,Unnamed: 0,EPS,DPS,NTA,Revenue,P/L,Quarter,Q Date,Financial Year,Announced,ROE,Stock Code,Year
0,0,1.21,0.0,0.1106,988.21m,13.66m,3,2022-09-30,2022-12-31,2022-11-29,10.9%,5250,2022
1,1,2.29,0.0,0.0984,943.67m,25.77m,2,2022-06-30,2022-12-31,2022-08-25,23.3%,5250,2022
2,2,2.16,0.0,0.1015,839.98m,24.38m,1,2022-03-31,2022-12-31,2022-05-26,21.3%,5250,2022
3,3,2.60,0.0,0.0799,795.06m,29.24m,4,2021-12-31,2021-12-31,2022-02-24,32.5%,5250,2021
4,4,0.13,0.0,0.0522,680.22m,1.44m,3,2021-09-30,2021-12-31,2021-11-25,2.5%,5250,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18424,18424,2.35,2.0,1.2851,39.45m,10.79m,1,2019-02-28,2019-11-30,2019-04-17,1.8%,5131,2019
18425,18425,3.44,4.0,1.2934,45.03m,15.83m,4,2018-11-30,2018-11-30,2019-01-23,2.7%,5131,2018
18426,18426,3.36,2.0,1.2731,51.02m,17.11m,3,2018-08-31,2018-11-30,2018-10-17,2.6%,5131,2018
18427,18427,2.62,2.0,1.2559,44.02m,12.07m,2,2018-05-31,2018-11-30,2018-07-18,2.1%,5131,2018


### Delete unnecessary columns

In [47]:
# Remove, in place, the saved index column and every report field that is not
# used by the later aggregation.
quarter.drop(
    columns=[
        "Unnamed: 0",
        "NTA",
        "Revenue",
        "P/L",
        "Quarter",
        "Q Date",
        "Financial Year",
        "Announced",
    ],
    inplace=True,
)

### Rename columns

In [48]:
# Map the scraped column names to the quar_* names used from here on.
# Labels that are not present in the frame are simply ignored by rename().
column_names = {
    'EPS': 'quar_eps',
    'DPS': 'quar_dps',
    'NTA': 'quar_nta',
    'ROE': 'quar_roe(%)',
    'Stock Code': 'quar_stock_code',
    'Year': 'quar_year',
}
quarter = quarter.rename(columns=column_names)

In [49]:
# Display the frame after dropping and renaming columns.
quarter

Unnamed: 0,quar_eps,quar_dps,quar_roe(%),quar_stock_code,quar_year
0,1.21,0.0,10.9%,5250,2022
1,2.29,0.0,23.3%,5250,2022
2,2.16,0.0,21.3%,5250,2022
3,2.60,0.0,32.5%,5250,2021
4,0.13,0.0,2.5%,5250,2021
...,...,...,...,...,...
18424,2.35,2.0,1.8%,5131,2019
18425,3.44,4.0,2.7%,5131,2018
18426,3.36,2.0,2.6%,5131,2018
18427,2.62,2.0,2.1%,5131,2018


In [50]:
# Remove the trailing % symbol in column ROE.
# rstrip('%') only removes an actual percent sign, so re-running the cell does
# not eat digits, and a bare '-' placeholder stays '-' for the replacement
# step that follows instead of becoming an empty string.
quarter['quar_roe(%)'] = quarter['quar_roe(%)'].str.rstrip('%')

In [51]:
# ROE cells that consist solely of "-" are treated as zero
quarter['quar_roe(%)'] = quarter['quar_roe(%)'].replace({'-': '0'})

In [52]:
# Inspect column types before converting ROE to a number.
quarter.dtypes

quar_eps           float64
quar_dps           float64
quar_roe(%)         object
quar_stock_code     object
quar_year            int64
dtype: object

In [53]:
# Strip the thousands separator ',' (literal match) so every ROE value has the same plain numeric format
quarter['quar_roe(%)'] = quarter['quar_roe(%)'].str.replace(',', '', regex=False)

In [54]:
# Cast the cleaned quar_roe(%) strings to floating point numbers
quarter['quar_roe(%)'] = quarter['quar_roe(%)'].astype('float64')

In [55]:
# Confirm that quar_roe(%) is now a float column.
quarter.dtypes

quar_eps           float64
quar_dps           float64
quar_roe(%)        float64
quar_stock_code     object
quar_year            int64
dtype: object

In [57]:
# Display the cleaned quarter data.
quarter

Unnamed: 0,quar_eps,quar_dps,quar_roe(%),quar_stock_code,quar_year
0,1.21,0.0,10.9,5250,2022
1,2.29,0.0,23.3,5250,2022
2,2.16,0.0,21.3,5250,2022
3,2.60,0.0,32.5,5250,2021
4,0.13,0.0,2.5,5250,2021
...,...,...,...,...,...
18424,2.35,2.0,1.8,5131,2019
18425,3.44,4.0,2.7,5131,2018
18426,3.36,2.0,2.6,5131,2018
18427,2.62,2.0,2.1,5131,2018


### Perform aggregation on columns

In [58]:
# Add up the quarterly EPS, DPS and ROE figures into one row per
# (stock code, year) pair, then preview the first ten groups.
fun = dict.fromkeys(['quar_eps', 'quar_dps', 'quar_roe(%)'], 'sum')
roe = quarter.groupby(['quar_stock_code', 'quar_year']).agg(fun)
roe.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,quar_eps,quar_dps,quar_roe(%)
quar_stock_code,quar_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2018,2.14,0.0,10.9
1,2019,2.93,1.5,9.5
1,2020,3.43,1.5,10.1
1,2021,3.44,1.5,8.7
1,2022,3.53,2.0,8.3
2,2018,11.85,0.02,10.5
2,2019,16.11,0.03,13.7
2,2020,20.76,0.04,16.5
2,2021,16.53,0.02,12.2
2,2022,41.96,9.5,27.0


In [59]:
# Turn the (stock code, year) group index back into ordinary columns
roe.reset_index(inplace=True)

In [60]:
# Display the yearly aggregates.
roe

Unnamed: 0,quar_stock_code,quar_year,quar_eps,quar_dps,quar_roe(%)
0,0001,2018,2.14,0.00,10.9
1,0001,2019,2.93,1.50,9.5
2,0001,2020,3.43,1.50,10.1
3,0001,2021,3.44,1.50,8.7
4,0001,2022,3.53,2.00,8.3
...,...,...,...,...,...
5110,9997,2019,0.43,0.00,0.4
5111,9997,2020,2.36,0.00,2.5
5112,9997,2021,12.59,2.25,12.9
5113,9997,2022,11.47,1.25,10.4


### Data formatting

In [61]:
# Divide EPS and DPS by 100 and keep three decimals
# (presumably converting sen to ringgit - TODO confirm the unit)
for col in ('quar_eps', 'quar_dps'):
    roe[col] = roe[col].div(100).round(3)

In [62]:
# Display the aggregates after scaling EPS and DPS.
roe

Unnamed: 0,quar_stock_code,quar_year,quar_eps,quar_dps,quar_roe(%)
0,0001,2018,0.021,0.000,10.9
1,0001,2019,0.029,0.015,9.5
2,0001,2020,0.034,0.015,10.1
3,0001,2021,0.034,0.015,8.7
4,0001,2022,0.035,0.020,8.3
...,...,...,...,...,...
5110,9997,2019,0.004,0.000,0.4
5111,9997,2020,0.024,0.000,2.5
5112,9997,2021,0.126,0.022,12.9
5113,9997,2022,0.115,0.012,10.4


### Change data types

In [63]:
# Inspect column types before converting the year to text.
roe.dtypes

quar_stock_code     object
quar_year            int64
quar_eps           float64
quar_dps           float64
quar_roe(%)        float64
dtype: object

In [64]:
# Convert the quar_year column to string type
roe['quar_year'] = roe['quar_year'].apply(str)

In [65]:
# Confirm that quar_year is now stored as an object (string) column.
roe.dtypes

quar_stock_code     object
quar_year           object
quar_eps           float64
quar_dps           float64
quar_roe(%)        float64
dtype: object

# Load data into database

In [66]:
import psycopg2
from sqlalchemy import create_engine
from urllib.parse import quote 
from datetime import timedelta

In [67]:
import os

# Build the PostgreSQL connection URL and open a SQLAlchemy connection.
#
# NOTE(review): a database password is committed in this notebook. The literal
# is kept only as a fallback so existing runs keep working; prefer setting the
# DIVIDEND_DB_PASSWORD environment variable, then rotate and remove the literal.
db_password = os.environ.get('DIVIDEND_DB_PASSWORD', '@SMWHot4')

# safe='' percent-encodes every reserved character (including '/'), so no part
# of the password can be mistaken for URL syntax.
conn_string = 'postgresql://postgres:%s@localhost/dividend_investing_no2' % quote(db_password, safe='')
db = create_engine(conn_string)
conn = db.connect()

In [68]:
# Write the yearly aggregates to the stock_performance table (replacing any
# existing table), then read the table back with psycopg2 and print each row.

# Passing the engine lets pandas open, commit and close its own connection.
roe.to_sql('stock_performance', con=db, if_exists='replace',
          index=False)

# Release the connection currently bound to `conn` before the name is reused
# below; otherwise it stays open for the lifetime of the kernel.
conn.close()

conn = psycopg2.connect(conn_string)
try:
    conn.autocommit = True  # every statement is committed immediately
    with conn.cursor() as cursor:  # the cursor is closed when the block exits
        sql1 = '''select * from stock_performance;'''
        cursor.execute(sql1)
        for row in cursor.fetchall():
            print(row)
finally:
    # Close the connection even if the query or printing fails
    conn.close()

('0001', '2018', 0.021, 0.0, 10.9)
('0001', '2019', 0.029, 0.015, 9.5)
('0001', '2020', 0.034, 0.015, 10.1)
('0001', '2021', 0.034, 0.015, 8.7)
('0001', '2022', 0.035, 0.02, 8.3)
('0002', '2018', 0.118, 0.0, 10.5)
('0002', '2019', 0.161, 0.0, 13.7)
('0002', '2020', 0.208, 0.0, 16.5)
('0002', '2021', 0.165, 0.0, 12.2)
('0002', '2022', 0.42, 0.095, 27.0)
('0002', '2023', 0.123, 0.0, 7.0)
('0005', '2018', 0.039, 0.0, 66.80000000000001)
('0005', '2019', 0.017, 0.0, 20.3)
('0005', '2020', -0.03, 0.0, -63.599999999999994)
('0005', '2021', 0.02, 0.0, 28.299999999999997)
('0005', '2022', -0.056, 0.0, -124.1)
('0005', '2023', -0.007, 0.0, -16.6)
('0006', '2018', 0.001, 0.0, 0.1)
('0006', '2019', -0.034, 0.0, -6.5)
('0006', '2020', -0.038, 0.0, -7.8999999999999995)
('0006', '2021', -0.008, 0.0, -1.8)
('0006', '2022', -0.017, 0.0, -3.8999999999999995)
('0006', '2023', -0.011, 0.0, -2.6)
('0007', '2018', 0.004, 0.0, 3.2)
('0007', '2019', -0.027, 0.0, -31.500000000000004)
('0007', '2020', -0.096, 0