# Scrape quarter report from KLSE
- Scrape quarter report information (EPS, DPS, Financial Year, ROE) from the KLSE Screener website
- https://www.klsescreener.com/v2/stocks/view/7152/jaycorp-bhd

### Import libraries


In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from datetime import datetime
import pandas as pd
import time
import random
import warnings
warnings.filterwarnings("ignore")

### Get stock code

In [None]:
# Load the list of stock codes from the CSV file and show it
stock_code_file = 'stock_code.csv'
df = pd.read_csv(stock_code_file)
df

In [None]:
# Sanity check: look at the (rows, columns) shape of the loaded stock code
# table to confirm it was read as expected.
df.shape

In [None]:
# Drop the last three characters of every stock code
raw_codes = df['stock_code']
df['stock_code'] = raw_codes.str[:-3]

In [None]:
# Collect the cleaned codes into a plain Python list named stock_code
stock_code = df['stock_code'].tolist()
stock_code

### Start scraping
- Loop through all the stock codes and use Beautiful Soup to get the tables on each page
- Loop through the tables until one with the column 'ROE' is found, which means the quarter report table has been found
- If no quarter report table is found, append the stock code to the no_quarter list
- If the quarter report table is found, filter it to financial years from 2018 to now

In [None]:
# Scrape the quarter report table of every stock in `stock_code`.
#
# Results:
#   quarter_rep - one DataFrame holding the filtered quarter report rows of
#                 all stocks, with the extra columns 'Stock Code' and 'Year'
#   no_quarter  - stock codes whose page has no table with an 'ROE' column
#
# Each stock is handled inside its own try/except so that one bad page does
# not abort the whole run; the reason for a failure is printed.

# pd.read_html() expects a file-like object for literal HTML in recent pandas
from io import StringIO

# columns kept from the quarter report table
REPORT_COLUMNS = ['EPS','DPS','NTA','Revenue','P/L','Quarter','Q Date','Financial Year','Announced','ROE']
# financial-year window: start is inclusive, end is exclusive
FY_START = '2018-01-01'
FY_END = '2023-12-31'

quarter_rep = pd.DataFrame()
no_quarter = []
# per-stock frames are collected here and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0 and copies on every call)
scraped_frames = []

# connect/open to Chrome webdriver
driver = webdriver.Chrome()
driver.implicitly_wait(60)

try:
    for s in stock_code:
        try:
            url = f'https://www.klsescreener.com/v2/stocks/view/{s}'
            driver.get(url)
            # name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed
            soup = bs(driver.page_source, 'html.parser')

            # read every <table> of the page into a DataFrame
            tables = soup.find_all('table')
            dfs = pd.read_html(StringIO(str(tables)))

            # the quarter report is the first table that has a column 'ROE'
            report = next(
                (t for t in dfs if any(col == 'ROE' for col in t.columns)),
                None,
            )

            if report is None:
                no_quarter.append(s)
                print(f"Stock code :{s} has no  annual report")
            else:
                # work on a copy so the assignments below do not write
                # into a slice of the parsed table
                report = report[REPORT_COLUMNS].copy()

                # parse the financial year and drop any time-of-day part
                report['Financial Year'] = pd.to_datetime(report['Financial Year']).dt.normalize()

                # keep only the rows inside the financial-year window
                in_window = (report['Financial Year'] >= FY_START) & (report['Financial Year'] < FY_END)
                report = report.loc[in_window]

                # drop the date rows, recognised by EPS and Revenue holding
                # the same value
                report = report.loc[report['EPS'] != report['Revenue']].reset_index(drop=True)

                # add stock code and the year of the financial year
                report['Stock Code'] = s
                report['Year'] = report['Financial Year'].dt.strftime("%Y")

                scraped_frames.append(report)
                print(f"Stock code :{s} yes")
        except Exception as exc:
            # Exception (not a bare except) keeps Ctrl-C / kernel interrupt
            # working; the reason is shown instead of being swallowed
            print(f"Stock code :{s} cannot scrap")
            print(f"    reason: {exc!r}")

        # wait a random 3 to 8 seconds between requests
        sec = random.randint(3, 8)
        time.sleep(sec)  # Seconds
finally:
    # keep whatever was scraped, even when the run is interrupted
    if scraped_frames:
        quarter_rep = pd.concat(scraped_frames, ignore_index=True)
    # always close the browser that was opened above
    driver.quit()


In [None]:
# Save the scraped quarter reports. The row index is written too (pandas
# default), so it shows up as an extra "Unnamed: 0" column when the file is
# read back.
quarter_rep.to_csv("div_quarter_report.csv")

# Clean data

In [2]:
# Reload the scraped quarter reports. 'Stock Code' is read as text explicitly
# so that zero-padded codes such as 0001 keep their leading zeros regardless
# of what dtype pandas would otherwise infer for the column.
quarter = pd.read_csv("div_quarter_report.csv", dtype={"Stock Code": str})

In [3]:
# Display the data as loaded from the CSV file
quarter

Unnamed: 0.1,Unnamed: 0,EPS,DPS,NTA,Revenue,P/L,Quarter,Q Date,Financial Year,Announced,ROE,Stock Code,Year
0,0,1.21,0.0,0.1106,988.21m,13.66m,3,2022-09-30,2022-12-31,2022-11-29,10.9%,5250,2022
1,1,2.29,0.0,0.0984,943.67m,25.77m,2,2022-06-30,2022-12-31,2022-08-25,23.3%,5250,2022
2,2,2.16,0.0,0.1015,839.98m,24.38m,1,2022-03-31,2022-12-31,2022-05-26,21.3%,5250,2022
3,3,2.60,0.0,0.0799,795.06m,29.24m,4,2021-12-31,2021-12-31,2022-02-24,32.5%,5250,2021
4,4,0.13,0.0,0.0522,680.22m,1.44m,3,2021-09-30,2021-12-31,2021-11-25,2.5%,5250,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11760,11760,2.35,2.0,1.2851,39.45m,10.79m,1,2019-02-28,2019-11-30,2019-04-17,1.8%,5131,2019
11761,11761,3.44,4.0,1.2934,45.03m,15.83m,4,2018-11-30,2018-11-30,2019-01-23,2.7%,5131,2018
11762,11762,3.36,2.0,1.2731,51.02m,17.11m,3,2018-08-31,2018-11-30,2018-10-17,2.6%,5131,2018
11763,11763,2.62,2.0,1.2559,44.02m,12.07m,2,2018-05-31,2018-11-30,2018-07-18,2.1%,5131,2018


### Delete unnecessary columns

In [4]:
# Remove the saved row index column in place
quarter.drop(columns=["Unnamed: 0"], inplace=True)

In [5]:
# Remove the NTA column in place
quarter.drop(columns=["NTA"], inplace=True)

In [6]:
# Remove the Revenue column in place
quarter.drop(columns=["Revenue"], inplace=True)

In [7]:
# Remove the P/L column in place
quarter.drop(columns=["P/L"], inplace=True)

In [8]:
# Remove the Quarter column in place
quarter.drop(columns=["Quarter"], inplace=True)

In [9]:
# Remove the Q Date column in place
quarter.drop(columns=["Q Date"], inplace=True)

In [10]:
# Remove the Financial Year column in place
quarter.drop(columns=["Financial Year"], inplace=True)

In [11]:
# Remove the Announced column in place
quarter.drop(columns=["Announced"], inplace=True)

### Rename columns

In [12]:
# Give the remaining columns their quar_* names. Labels that are not present
# in the frame are ignored by rename(), so the 'NTA' entry has no effect once
# that column has been deleted.
quar_column_names = {
    'EPS': 'quar_eps',
    'DPS': 'quar_dps',
    'NTA': 'quar_nta',
    'ROE': 'quar_roe(%)',
    'Stock Code': 'quar_stock_code',
    'Year': 'quar_year',
}
quarter = quarter.rename(columns=quar_column_names)

In [13]:
# Display the frame after dropping and renaming columns
quarter

Unnamed: 0,quar_eps,quar_dps,quar_roe(%),quar_stock_code,quar_year
0,1.21,0.0,10.9%,5250,2022
1,2.29,0.0,23.3%,5250,2022
2,2.16,0.0,21.3%,5250,2022
3,2.60,0.0,32.5%,5250,2021
4,0.13,0.0,2.5%,5250,2021
...,...,...,...,...,...
11760,2.35,2.0,1.8%,5131,2019
11761,3.44,4.0,2.7%,5131,2018
11762,3.36,2.0,2.6%,5131,2018
11763,2.62,2.0,2.1%,5131,2018


In [15]:
# Remove the trailing % symbol in column ROE.
# rstrip('%') only removes a '%' that is actually there; slicing off the last
# character unconditionally would also eat the last digit (or a lone "-")
# of any value that carries no '%'.
quarter['quar_roe(%)'] = quarter['quar_roe(%)'].str.rstrip('%')

In [16]:
# A value that is exactly "-" in column ROE becomes "0". This matches whole
# values only, so the minus sign of a negative number is left alone.
dash_to_zero = {'-': '0'}
quarter['quar_roe(%)'] = quarter['quar_roe(%)'].replace(dash_to_zero)

In [17]:
# Check the column data types before converting ROE to a number
quarter.dtypes

quar_eps           float64
quar_dps           float64
quar_roe(%)         object
quar_stock_code     object
quar_year            int64
dtype: object

In [18]:
# Strip the thousands separator ',' so every ROE value has the same format
roe_text = quarter['quar_roe(%)']
quarter['quar_roe(%)'] = roe_text.str.replace(',', '', regex=False)

In [19]:
# Turn the cleaned ROE strings of column quar_roe(%) into floats
quarter['quar_roe(%)'] = quarter['quar_roe(%)'].astype('float64')

In [20]:
# Confirm that quar_roe(%) is now a float column
quarter.dtypes

quar_eps           float64
quar_dps           float64
quar_roe(%)        float64
quar_stock_code     object
quar_year            int64
dtype: object

### Perform aggregation on columns

In [21]:
# Sum EPS, DPS and ROE over the rows of each (stock code, year) pair
summed_columns = ['quar_eps', 'quar_dps', 'quar_roe(%)']
roe = quarter.groupby(['quar_stock_code', 'quar_year'])[summed_columns].sum()
roe.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,quar_eps,quar_dps,quar_roe(%)
quar_stock_code,quar_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2018,2.14,0.0,10.9
1,2019,2.93,1.5,9.5
1,2020,3.43,1.5,10.1
1,2021,3.44,1.5,8.7
1,2022,3.53,2.0,8.3
2,2018,11.85,0.02,10.5
2,2019,16.11,0.03,13.7
2,2020,20.76,0.04,16.5
2,2021,16.53,0.02,12.2
2,2022,41.96,9.5,27.0


In [22]:
# Move the group keys (stock code, year) out of the index into columns
roe.reset_index(inplace=True)

In [23]:
# Display the aggregated frame
roe

Unnamed: 0,quar_stock_code,quar_year,quar_eps,quar_dps,quar_roe(%)
0,0001,2018,2.14,0.00,10.9
1,0001,2019,2.93,1.50,9.5
2,0001,2020,3.43,1.50,10.1
3,0001,2021,3.44,1.50,8.7
4,0001,2022,3.53,2.00,8.3
...,...,...,...,...,...
3254,9997,2019,0.43,0.00,0.4
3255,9997,2020,2.36,0.00,2.5
3256,9997,2021,12.59,2.25,12.9
3257,9997,2022,11.47,1.25,10.4


### Data formatting

In [24]:
# Divide EPS and DPS by 100 and round to three decimals
# (presumably a sen -> ringgit conversion; confirm against the data source)
for amount_col in ('quar_eps', 'quar_dps'):
    roe[amount_col] = roe[amount_col].div(100).round(3)

In [25]:
# Display the frame after rescaling EPS and DPS
roe

Unnamed: 0,quar_stock_code,quar_year,quar_eps,quar_dps,quar_roe(%)
0,0001,2018,0.021,0.000,10.9
1,0001,2019,0.029,0.015,9.5
2,0001,2020,0.034,0.015,10.1
3,0001,2021,0.034,0.015,8.7
4,0001,2022,0.035,0.020,8.3
...,...,...,...,...,...
3254,9997,2019,0.004,0.000,0.4
3255,9997,2020,0.024,0.000,2.5
3256,9997,2021,0.126,0.022,12.9
3257,9997,2022,0.115,0.012,10.4


### Change data types

In [26]:
# Check the column data types before converting the year
roe.dtypes

quar_stock_code     object
quar_year            int64
quar_eps           float64
quar_dps           float64
quar_roe(%)        float64
dtype: object

In [27]:
# Store quar_year as text instead of an integer
roe['quar_year'] = roe['quar_year'].astype(str)

In [28]:
# Confirm that quar_year is now an object (string) column
roe.dtypes

quar_stock_code     object
quar_year           object
quar_eps           float64
quar_dps           float64
quar_roe(%)        float64
dtype: object

# Load data into database

In [None]:
import psycopg2
from sqlalchemy import create_engine
from urllib.parse import quote 
from datetime import timedelta

In [None]:
# Build the PostgreSQL connection URL, create the SQLAlchemy engine and open
# a connection.
#
# The password is taken from the PGPASSWORD environment variable, or asked
# for interactively when that is not set, so that no credential is stored in
# the notebook. NOTE(review): a password was previously committed in this
# cell; it should be rotated.
import os
from getpass import getpass

db_password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')

# safe='' escapes every reserved character, including '/', which quote()
# leaves untouched by default and which would break the URL
conn_string = 'postgresql://postgres:%s@localhost/dividend_investing_no2' % quote(db_password, safe='')
db = create_engine(conn_string)
conn = db.connect()

In [None]:
# Write the aggregated data to the table stock_performance (replacing any
# existing table) and print the stored rows as a check.

# Engine.begin() commits when the block succeeds and rolls back on error, so
# the new table is guaranteed to be visible to the psycopg2 connection below.
with db.begin() as sa_conn:
    roe.to_sql('stock_performance', con=sa_conn, if_exists='replace',
               index=False)

# release the connection that is about to be rebound, instead of leaking it
conn.close()

conn = psycopg2.connect(conn_string)
try:
    conn.autocommit = True  # every statement is committed immediately
    sql1 = '''select * from stock_performance;'''
    with conn.cursor() as cursor:
        cursor.execute(sql1)
        for row in cursor.fetchall():
            print(row)
finally:
    # close the connection even when the query fails
    conn.close()