# Business results and asset structure

## Importing libraries

In [32]:
import re

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
from bs4 import BeautifulSoup
from plotly.subplots import make_subplots


In [33]:
# Stock ticker analyzed by this notebook; it is sent as the `symbol` query
# parameter, substituted into the report URL and shown in both chart titles.
SYMBOL = 'VCB'

## Business results

### Initialize request

In [34]:
# Endpoint, query string, headers and HTTP verb used by the business-results crawl.
url = 'https://s.cafef.vn/Ajax/HoSoCongTy.aspx'

# 'PageIndex' is only a starting value here: the crawl loop supplies the page
# index for every request. NOTE(review): 'PageSize': 4 presumably is the number
# of quarter columns returned per page - confirm against the endpoint.
data = dict(symbol=SYMBOL, Type=1, PageIndex=0, PageSize=4)

# Ask intermediaries not to serve a cached copy of the response.
headers = dict.fromkeys(("Cache-Control", "Pragma"), "no-cache")

method = "GET"


### Crawl data

In [35]:
# Crawl the business-results table page by page. From every page the cell at
# index 4 of the header row (quarter label), of row 2 (revenue) and of the
# profit row is collected as one record.
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given

page_index = 0
crawl_result = []
profit_index = None
while True:
    # Build this page's query from a copy so the shared `data` dict is not mutated
    params = {**data, 'PageIndex': page_index}
    response = requests.request(method, url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of trying to parse an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Row 0 is the header carrying the quarter labels; row 2 is read as the revenue row
    table = soup.find('table')
    rows = table.find_all('tr') if table is not None else []
    if len(rows) < 3:
        raise ValueError(
            f'Unexpected response for {SYMBOL} at PageIndex={page_index}: result table not found'
        )

    header_cells = [th.text.strip() for th in rows[0].find_all('th')]
    quarter = header_cells[4]
    revenue = rows[2].find_all('td')[4].text.strip()

    # Locate the profit row once (on the first page) by its Vietnamese label
    # and reuse that row index for all following pages
    if profit_index is None:
        profit_index = next(
            (
                index for index, row in enumerate(rows)
                if ("Lợi nhuận sau thuế" in row.text) or ("Lợi nhuận ròng" in row.text)
            ),
            None,
        )
        if profit_index is None:
            raise ValueError(f'No profit-after-tax row found in the result table for {SYMBOL}')

    profit = rows[profit_index].find_all('td')[4].text.strip()

    crawl_result.append({
        'quarter': quarter,
        'revenue': revenue,
        'profit': profit
    })

    # Stop once the header cell at index 3 no longer holds a quarter ("Quý ...")
    # label; otherwise move on to the next page
    if not header_cells[3].startswith('Quý'):
        break
    page_index += 1

# Pages were collected in the order served; reverse so the series runs oldest -> newest
crawl_result.reverse()

# Preview of the crawled records
crawl_result[:5]


[{'quarter': 'Quý 1-2006',
  'revenue': '2,100,312,212',
  'profit': '740,728,368'},
 {'quarter': 'Quý 2-2006',
  'revenue': '2,342,642,243',
  'profit': '581,164,117'},
 {'quarter': 'Quý 1-2007',
  'revenue': '2,273,476,595',
  'profit': '296,886,310'},
 {'quarter': 'Quý 2-2007',
  'revenue': '2,995,895,239',
  'profit': '726,305,638'},
 {'quarter': 'Quý 3-2008',
  'revenue': '5,021,677,000',
  'profit': '457,233,000'}]

### Clean data

In [36]:
def _to_vnd(text):
    """Convert a crawled amount such as '2,100,312,212' to an integer.

    Surrounding whitespace (including non-breaking spaces) is removed first, so
    a blank table cell counts as 0 instead of crashing ``int()``. The parsed
    number is multiplied by 1000, matching the 'VND' labels used in the chart.
    NOTE(review): this presumes the source reports thousands of VND - confirm.

    Args:
        text: Raw cell text with ',' as thousands separator, possibly empty.

    Returns:
        The amount as an ``int``; 0 for an empty cell.

    Raises:
        ValueError: If the text is neither empty nor an integer.
    """
    digits = text.strip().replace(',', '')
    return int(digits) * 1000 if digits else 0


# One cleaned record per crawled quarter: the 'Quý ' label prefix is dropped and
# both amounts become integers.
cleaned_data = [
    {
        'quarter': record['quarter'].replace("Quý ", "").strip(),
        'revenue': _to_vnd(record['revenue']),
        'profit': _to_vnd(record['profit']),
    }
    for record in crawl_result
]

cleaned_data[:5]


[{'quarter': '1-2006', 'revenue': 2100312212000, 'profit': 740728368000},
 {'quarter': '2-2006', 'revenue': 2342642243000, 'profit': 581164117000},
 {'quarter': '1-2007', 'revenue': 2273476595000, 'profit': 296886310000},
 {'quarter': '2-2007', 'revenue': 2995895239000, 'profit': 726305638000},
 {'quarter': '3-2008', 'revenue': 5021677000000, 'profit': 457233000000}]

### Visualize data

In [37]:
# Split the cleaned records into parallel lists, one per plotted field
fields = {
    name: [record[name] for record in cleaned_data]
    for name in ('quarter', 'revenue', 'profit')
}
quarter, revenue, profit = fields['quarter'], fields['revenue'], fields['profit']

# Revenue as bars on the primary y-axis, profit as a smoothed line on the secondary one
revenue_bars = go.Bar(x=quarter, y=revenue, name='Revenue (VND)')
profit_line = go.Scatter(
    x=quarter,
    y=profit,
    name='Profit after taxes (VND)',
    mode='lines+markers',
    line=dict(shape='spline', smoothing=1.3),
)

fig = make_subplots(specs=[[dict(secondary_y=True)]])
fig.add_trace(revenue_bars)
fig.add_trace(profit_line, secondary_y=True)

# Title, size, tick format and theme of the figure
business_layout = dict(
    title_text='Business results ' + SYMBOL,
    title_x=0.5,
    title_font_size=20,
    width=800,
    height=600,
    yaxis_tickformat='0.2s',
    template='plotly_white',
)
fig.update_layout(**business_layout)

# Axis captions: shared x-axis, then the primary and the secondary y-axis
fig.update_xaxes(title_text='Quarter')
for on_secondary, caption in ((False, 'Revenue (VND)'), (True, 'Profit after taxes (VND)')):
    fig.update_yaxes(title_text=caption, secondary_y=on_secondary)

fig.show()


## Asset structure

### Initialize request

In [38]:
# Most recent year to request; the asset crawl works backwards from this year
year = 2023

# Balance-sheet page template with {stock_symbol} and {year} placeholders.
# NOTE(review): the trailing slug looks specific to one company while the ticker
# is a placeholder - confirm the URL still resolves for other symbols.
url = (
    'https://s.cafef.vn/bao-cao-tai-chinh/{stock_symbol}/BSheet/{year}/4/0/0/'
    'bao-cao-tai-chinh-ngan-hang-thuong-mai-co-phan-ngoai-thuong-viet-nam.chn'
)


### Crawl data

In [39]:
# Crawl the balance-sheet page year by year, going backwards from `year`, and
# collect one record per quarter: {'quarter': label, <row title>: <cell text>, ...}.
REQUEST_TIMEOUT = 30      # seconds; requests waits indefinitely when no timeout is given
EMPTY_CELLS_TO_STOP = 10  # this many blank values in a year's first quarter ends the crawl
OLDEST_YEAR = 1990        # hard lower bound so the loop ends even if the stop rule never fires

crawled_data = []
# Iterate on a copy: `year` keeps its configured value, so re-running this cell
# on its own starts from the same year again.
current_year = year
while current_year >= OLDEST_YEAR:
    # request and parse bs4
    response = requests.get(
        url.format(year=current_year, stock_symbol=SYMBOL),
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Both tables live inside <body>: one carries the quarter labels, the other the figures
    body = soup.find('body')
    table_title = body.find('table', {'id': 'tblGridData'}) if body is not None else None
    table_content = body.find('table', {'id': 'tableContent'}) if body is not None else None
    if table_title is None or table_content is None:
        raise ValueError(f'Balance-sheet tables not found for {SYMBOL} in {current_year}')

    # Quarter labels are the header cells at positions 1..4
    header_cells = table_title.find('tr').find_all('td')
    quarters = [td.text.strip() for td in header_cells[1:5]]

    # Direct child rows whose style is either absent or exactly 'cursor:pointer'
    trS = table_content.find_all(
        'tr',
        recursive=False,
        style=lambda x: x == 'cursor:pointer' or x is None,
    )

    # Parse every row once and keep only the rows that are plotted later
    asset_rows = []
    for tr in trS:
        cells = [td.text.strip() for td in tr.find_all('td')]
        title = cells[0]
        lowered = title.lower()

        # section headings -> skip
        if "i - tài sản ngắn hạn" in lowered or "ii - tài sản dài hạn" in lowered or "i. tài sản" in lowered:
            continue
        # total-assets row -> nothing of interest follows -> stop reading rows
        if "tổng tài sản" in lowered or "tổng cộng tài sản" in lowered:
            break
        asset_rows.append((title, cells))

    # One record per quarter column, from column 4 down to column 1, so that the
    # final reverse() below leaves the quarters in ascending column order
    for i in range(3, -1, -1):
        current_data = {'quarter': quarters[i]}
        for title, cells in asset_rows:
            current_data[title] = cells[i + 1]
        crawled_data.append(current_data)

    # `current_data` is now the record of the first quarter column; a mostly blank
    # record is taken as "no more data".
    # NOTE(review): 2022 is exempt from this stop rule - presumably that year's page
    # has blank cells even though older data exists; confirm.
    empty_cells = sum(1 for value in current_data.values() if value == '')
    if empty_cells >= EMPTY_CELLS_TO_STOP and current_year != 2022:
        break
    current_year -= 1

# Records were collected newest first; reverse so they run oldest -> newest
crawled_data.reverse()


### Clean data

In [40]:
# Leading enumeration of a row title: a number, a Roman numeral or a single
# letter, followed by '.', ')' or '-' and whitespace (e.g. "1. ", "10. ", "II. ").
_TITLE_PREFIX = re.compile(r'^\s*(?:\d+|[IVXLCDM]+|[A-Za-z])\s*[.)\-]\s+')


def process_data(x):
    """Turn one crawled balance-sheet record into a cleaned record.

    The 'quarter' label loses its 'Quý ' prefix. Every other entry has the
    enumeration prefix removed from its title and its value parsed as a float;
    an empty value becomes 0.0. The prefix is matched by pattern rather than by
    a fixed three-character slice, so "10. Title" yields "Title" and a title
    without any prefix is left intact.

    Args:
        x: Mapping with a 'quarter' key plus one ``title -> text`` entry per
           balance-sheet row; amounts use ',' as thousands separator.

    Returns:
        A new dict with 'quarter' first, followed by ``clean title -> float``.

    Raises:
        KeyError: If 'quarter' is missing.
        ValueError: If a non-empty value is not a number.
    """
    cleaned = {'quarter': x['quarter'].replace('Quý ', '')}
    for key, value in x.items():
        if key == 'quarter':
            continue

        amount = float(value.replace(',', '')) if value != '' else 0.0
        cleaned[_TITLE_PREFIX.sub('', key)] = amount

    return cleaned


# Clean every crawled record, then tabulate the result (one row per quarter)
cleaned_data = [process_data(record) for record in crawled_data]
cleaned_data_df = pd.DataFrame(data=cleaned_data)
cleaned_data_df.head()


Unnamed: 0,quarter,Tiền mặt và các khoản tương đương tại quỹ,Tiền gửi tại NHNN,"Tiền, vàng gửi tại các TCTD khác và cho vay các TCTD khác",Cho vay khách hàng,Chứng khoán kinh doanh,Chứng khoán đầu tư,Các công cụ tài chính phái sinh và các tài sản tài chính khác,"Góp vốn, đầu tư dài hạn",Tài sản cố định,Bất động sản đầu tư,Tài sản Có khác
0,1-2008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2-2008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3-2008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4-2008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1-2009,3511584000000.0,27925460000000.0,25216080000000.0,113136200000000.0,255949500000.0,42581000000000.0,0.0,2996928000000.0,1287366000000.0,3582878000000.0,0.0


### Visualize data

In [41]:
# Every column except the quarter label is one asset category to plot
asset_columns = cleaned_data_df.columns.drop('quarter')
fig = px.bar(cleaned_data_df, x='quarter', y=asset_columns, log_y=True)

# Title, size, legend caption and theme of the figure
asset_layout = {
    'title_text': f'Asset structure of {SYMBOL} (log scale)',
    'title_x': 0.5,
    'width': 1000,
    'height': 600,
    'legend_title_text': 'Asset',
    'template': 'plotly_white',
}
fig.update_layout(**asset_layout)

fig.update_xaxes(title_text='Quarter')
fig.update_yaxes(title_text='Value (VND)')
fig.show()
