# Data Crawling
## Install and import packages

In [932]:
#!pip install vnstock pandas requests bs4

In [933]:
# NOTE: the wildcard import stays first so that the explicit imports below take
# precedence over any same-named objects that vnstock happens to re-export.
from vnstock import *

import logging
import os
import re
import time
from datetime import datetime

import pandas as pd
from requests.exceptions import ConnectTimeout

# Dedicated logger for the crawler. INFO is used so that the start/end progress
# messages are recorded in addition to errors.
logger = logging.getLogger('server_logger')
logger.setLevel(logging.INFO)

# Attach the file handler exactly once. The check inspects this logger's own
# handlers (not the root logger's): the handler is added to `logger`, so a
# root-level check would skip it whenever the root logger is already configured
# and would add a duplicate every time this cell is re-run otherwise.
if not any(isinstance(handler, logging.FileHandler) for handler in logger.handlers):
    file_handler = logging.FileHandler('logs.log')

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)


## Crawl data

Data to be crawled:
- Quarterly:
    - Thông tin tổng quan (Company overview)
    - Hồ sơ công ty (Company profile)
    - Danh sách cổ đông (Company large shareholders)
    - Các chỉ số tài chính cơ bản (Company fundamental ratio)
    
    - Danh sách công ty con, công ty liên kết (Company subsidiaries listing)
    - Ban lãnh đạo công ty (Company officers)
    - Chỉ số tài chính cơ bản (Financial ratio)
    - Báo cáo kinh doanh (Income statement)
    - Bảng cân đối kế toán (Balance sheet)
    - Báo cáo lưu chuyển tiền tệ (Cash flow)
    - Chỉ số tài chính (Financial ratio)


- Daily:
    - Công ty (Companies):
        - Danh sách công ty (Company listing)
        - Mức biến động giá cổ phiếu (Ticker price volatility)
        - Thông tin giao dịch nội bộ (Company insider deals)
        - Thông tin sự kiện quyền (Company events)
        - Tin tức công ty (Company news)
        - Giá cổ phiếu (Stock history)

        - Dữ liệu khớp lệnh trong ngày giao dịch (Stock intraday)
        - Định giá cổ phiếu (Stock evaluation)
        - Đánh giá cổ phiếu (Stock rating)
    - Quỹ (Funds):
        - Danh sách quỹ (Funds listing)
        - Các mã quỹ nắm giữ (Top holding list details)
        - Ngành mà quỹ đang đầu tư (Industry holding list details)
        - Báo cáo NAV (Nav report)
        
        - Tỉ trọng tài sản nắm giữ (Asset holding list)

## Daily

### Companies

#### Danh sách công ty (Company listing)

In [934]:
def capture_company_listing():
    """Fetch the company listing via ``listing_companies(live=True)``.

    Returns:
        Whatever ``listing_companies`` returns on success. If the call raises,
        the error is logged and an empty ``pd.DataFrame`` is returned instead.
    """
    try:
        return listing_companies(live=True)
    except Exception as err:
        logger.error(f"Error capturing company listing data: {err}")
        return pd.DataFrame()

#### Mức biến động giá cổ phiếu (Ticker price volatility)

In [935]:
def capture_ticker_volatility(*args):
    """Fetch price-volatility data for one symbol via ``ticker_price_volatility``.

    Args:
        *args: ``args[0]`` is the symbol; any further positional arguments are
            accepted but ignored.

    Returns:
        The result of ``ticker_price_volatility``; an empty ``pd.DataFrame``
        (with the error logged) if the call raises.
    """
    symbol = args[0]
    try:
        return ticker_price_volatility(symbol=symbol)
    except Exception as err:
        logger.error(f"Error capturing ticker price volatility data for symbol {symbol}: {err}")
        return pd.DataFrame()

#### Thông tin giao dịch nội bộ (Company insider deals)

In [936]:
def capture_insider_deals(*args, page_size=30):
    """Fetch insider deals for a symbol and keep those announced on one date.

    Args:
        *args: ``args[0]`` is the symbol, ``args[1]`` the capture date (an
            object with ``strftime``).
        page_size: Number of records requested from ``company_insider_deals``
            (only page 0 is requested).

    Returns:
        Rows whose ``dealAnnounceDate`` equals the capture date formatted as
        ``YYYY-MM-DD``; an empty ``pd.DataFrame`` (with the error logged) if
        the fetch raises.
    """
    symbol, capture_date = args[0], args[1]
    date_string = capture_date.strftime("%Y-%m-%d")

    try:
        deals = company_insider_deals(symbol=symbol, page_size=page_size, page=0)
    except Exception as err:
        logger.error(f"Error capturing company insider deals data for symbol {symbol}: {err}")
        return pd.DataFrame()

    announced_on_date = deals['dealAnnounceDate'] == date_string
    return deals[announced_on_date]

#### Thông tin sự kiện quyền (Company events)

In [937]:
def capture_company_event(*args, page_size=10):
    """Fetch company events for a symbol and keep those matching one date.

    Args:
        *args: ``args[0]`` is the symbol, ``args[1]`` the capture date that
            ``exerDate`` is compared against.
        page_size: Number of records requested from ``company_events`` (only
            page 0 is requested).

    Returns:
        Rows whose parsed ``exerDate`` equals the capture date, with an added
        ``exerDateFormatted`` column; an empty ``pd.DataFrame`` (with the error
        logged) if the fetch raises.
    """
    symbol, capture_date = args[0], args[1]

    try:
        events = company_events(symbol=symbol, page_size=page_size, page=0)
    except Exception as err:
        logger.error(f"Error capturing company events data for symbol {symbol}: {err}")
        return pd.DataFrame()

    # Parse the textual timestamp so it can be compared with the capture date.
    events["exerDateFormatted"] = pd.to_datetime(events['exerDate'], format="%Y-%m-%d %H:%M:%S")
    on_capture_date = events['exerDateFormatted'] == capture_date
    return events[on_capture_date]

#### Tin tức công ty (Company news)

In [938]:
def capture_company_news(*args, page_size=10):
    """Fetch company news for a symbol and keep items published on one day.

    Args:
        *args: ``args[0]`` is the symbol, ``args[1]`` the capture date.
        page_size: Number of records requested from ``company_news`` (only
            page 0 is requested).

    Returns:
        Rows whose ``publishDate`` falls on the same calendar day as the
        capture date, with an added ``publishDateFormatted`` column. An empty
        ``pd.DataFrame`` is returned if the fetch raises (the error is logged)
        or if the result has no ``publishDate`` column.
    """
    symbol = args[0]
    capture_date = args[1]

    try:
        df = company_news(
            symbol=symbol,
            page_size=page_size,
            page=0)
    except Exception as e:
        error_message = f"Error capturing company news for symbol {symbol}: {e}"
        logger.error(error_message)
        return pd.DataFrame()

    if 'publishDate' not in df.columns:
        # Nothing to filter on (e.g. a column-less frame for a symbol with no news).
        return pd.DataFrame()

    df["publishDateFormatted"] = pd.to_datetime(df['publishDate'], format="%Y-%m-%d %H:%M:%S")

    # Compare calendar days rather than exact timestamps: publishDate is parsed
    # with a time-of-day component, so an exact match against the capture date
    # would drop every item not published at precisely that instant.
    capture_day = pd.Timestamp(capture_date).normalize()
    return df[df['publishDateFormatted'].dt.normalize() == capture_day]

#### Giá cổ phiếu (Stock history) 

In [939]:
def capture_stock_history(*args):
    """Fetch the daily price record of a symbol for a single date.

    Args:
        *args: ``args[0]`` is the symbol, ``args[1]`` the capture date (an
            object with ``strftime``); the date is used as both start and end
            of the requested range.

    Returns:
        The frame returned by ``stock_historical_data`` with the ``ticker``
        column (when present) moved to the first position; an empty
        ``pd.DataFrame`` (with the error logged) if the fetch raises.
    """
    symbol, capture_date = args[0], args[1]
    date_string = capture_date.strftime("%Y-%m-%d")

    request = dict(
        symbol=symbol,
        start_date=date_string,
        end_date=date_string,
        resolution='1D',
        type="stock",
        beautify=True,
        decor=False,
        source='DNSE',
    )
    try:
        history = stock_historical_data(**request)
    except Exception as err:
        logger.error(f"Error capturing historical data for symbol {symbol}: {err}")
        return pd.DataFrame()

    # Move the ticker column to the front so each row starts with its symbol.
    if 'ticker' in history.columns:
        history.insert(0, 'ticker', history.pop('ticker'))

    return history


#### Dữ liệu khớp lệnh trong ngày giao dịch (Stock intraday)


In [940]:
def capture_stock_intraday(*args, page_size=1000):
    """Fetch intraday matching data for one symbol via ``stock_intraday_data``.

    Args:
        *args: ``args[0]`` is the symbol; further positional arguments are
            accepted but ignored.
        page_size: Number of records requested.

    Returns:
        The result of ``stock_intraday_data`` (called with
        ``investor_segment=True``); an empty ``pd.DataFrame`` (with the error
        logged) if the call raises.
    """
    symbol = args[0]

    try:
        return stock_intraday_data(symbol=symbol, page_size=page_size, investor_segment=True)
    except Exception as err:
        logger.error(f"Error capturing stock intraday data for symbol {symbol}: {err}")
        return pd.DataFrame()

#### Định giá cổ phiếu (Stock evaluation)

In [941]:
def capture_stock_evaluation(*args):
    """Fetch evaluation data for a symbol and keep rows for one date.

    Args:
        *args: ``args[0]`` is the symbol, ``args[1]`` the capture date (an
            object with ``strftime``).

    Returns:
        Rows whose ``fromDate`` equals the capture date formatted as
        ``YYYY-MM-DD``; an empty ``pd.DataFrame`` (with the error logged) if
        the fetch raises.
    """
    symbol, capture_date = args[0], args[1]
    date_string = capture_date.strftime("%Y-%m-%d")

    try:
        evaluation = stock_evaluation(symbol=symbol, period=1, time_window='D')
    except Exception as err:
        logger.error(f"Error capturing evaluation data for symbol {symbol}: {err}")
        return pd.DataFrame()

    starts_on_date = evaluation['fromDate'] == date_string
    return evaluation[starts_on_date]

#### Đánh giá cổ phiếu (Stock rating)


In [942]:
def capture_stock_rating(*args):
    """Collect all rating tables for one symbol and merge them side by side.

    Args:
        *args: ``args[0]`` is the symbol; further positional arguments are
            accepted but ignored.

    Returns:
        One frame joining the six rating tables on ``ticker`` (outer join),
        with duplicated column names reduced to their first occurrence; an
        empty ``pd.DataFrame`` (with the error logged) if any step raises.
    """
    symbol = args[0]

    try:
        rating_sources = (
            general_rating,
            biz_model_rating,
            biz_operation_rating,
            financial_health_rating,
            valuation_rating,
            industry_financial_health,
        )
        # Fetch every table first, then index them, so all requests are made
        # before any reshaping happens.
        tables = [fetch(symbol) for fetch in rating_sources]
        indexed = [table.set_index('ticker') for table in tables]

        merged = pd.concat(indexed, axis=1, join='outer').reset_index()

        # Keep only the first occurrence of each column name.
        first_occurrence = ~merged.columns.duplicated()
        merged = merged.loc[:, first_occurrence]
    except Exception as err:
        logger.error(f"Error capturing stock rating data for symbol {symbol}: {err}")
        return pd.DataFrame()

    return merged

### Crawl all daily companies data and store

In [943]:
def retry_request(func, *args, max_retries=3, retry_delay=20):
    """Call ``func(*args)``, retrying when it raises ``ConnectTimeout``.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The return value of the first successful call, or ``None`` when every
        attempt timed out. Exceptions other than ``ConnectTimeout`` propagate.

    Note:
        Only timeouts that actually escape ``func`` trigger a retry; a callable
        that catches its own exceptions is invoked exactly once.
    """
    for attempt in range(1, max_retries + 1):
        try:
            return func(*args)
        except ConnectTimeout as e:
            logger.error(f"Connection timeout: {e}")
            if attempt == max_retries:
                # No further attempt will be made, so waiting would only
                # delay the caller.
                break
            logger.error(f"Retrying in {retry_delay} seconds...")
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
    logger.error("Max retries exceeded. Skipping.")
    print("Max retries exceeded. Skipping.")
    return None

def process_symbols(symbols, capture_function, *args):
    """Run a capture function for every symbol and collect non-empty results.

    Args:
        symbols: Sized iterable of symbols to process.
        capture_function: Callable invoked (through ``retry_request``) as
            ``capture_function(symbol, *args)``.
        *args: Extra positional arguments forwarded to ``capture_function``.

    Returns:
        Dict mapping each symbol to its result frame, with column names passed
        through ``convert_column_name``. Symbols whose result is missing or
        empty are left out.
    """
    data_dict = {}

    for i, symbol in enumerate(symbols, start=1):
        data = retry_request(capture_function, symbol, *args)
        # retry_request returns None once its retries are exhausted; treat that
        # like an empty result instead of failing on `None.empty`.
        if data is not None and not data.empty:
            data.columns = [convert_column_name(column) for column in data.columns]
            data_dict[symbol] = data
        print(f"Processed {i}/{len(symbols)} symbols of {capture_function.__name__}")

    return data_dict

def concat_if_not_empty(data_dict):
    """Stack the non-empty frames of ``data_dict`` into one frame.

    Args:
        data_dict: Mapping of key (symbol) to DataFrame.

    Returns:
        The concatenation of all non-empty frames, with the originating key as
        an outer index level named ``Symbol``; ``None`` when there is nothing
        to concatenate.
    """
    # Filter keys and frames together so every frame stays paired with its own
    # key even when some entries are empty.
    non_empty = {
        key: frame
        for key, frame in data_dict.items()
        if frame is not None and not frame.empty
    }
    if not non_empty:
        return None
    return pd.concat(list(non_empty.values()), keys=list(non_empty.keys()), names=['Symbol'])

def get_number_of_rows(data_dict):
    """Return the number of data rows held by ``data_dict``.

    Args:
        data_dict: ``None``, a single sized object such as a DataFrame, or a
            dict mapping keys to such objects.

    Returns:
        ``0`` for ``None``; for a dict, the total length of all its values
        (i.e. the row count of their concatenation); otherwise
        ``len(data_dict)``.
    """
    if data_dict is None:
        return 0
    if isinstance(data_dict, dict):
        # A dict of frames is written out as one concatenated sheet, so the
        # row count is the sum over its frames, not the number of keys.
        return sum(len(frame) for frame in data_dict.values() if frame is not None)
    return len(data_dict)

def convert_column_name(column_name):
    """Turn a camelCase or snake_case identifier into a spaced, title-cased label.

    A space is inserted before every uppercase letter that directly follows a
    lowercase letter, the result is title-cased, and underscores become spaces.
    """
    return (
        re.sub(r'(?<=[a-z])([A-Z])', r' \1', column_name)
        .title()
        .replace("_", " ")
    )

In [944]:
def export_data_to_excel(capture_date, data_dict, listing_df, list_name, file_path):
    """Write a summary sheet, the listing and one sheet per data set to Excel.

    Args:
        capture_date: Accepted for interface compatibility; not used here.
        data_dict: Mapping of data-set name to a dict of per-symbol frames.
        listing_df: Listing frame; its columns are renamed in place through
            ``convert_column_name`` before it is written.
        list_name: Label used in the summary and listing sheet names.
        file_path: Target workbook. An existing file is opened in append mode
            (same-named sheets are replaced); otherwise a new file is created.
    """
    if os.path.exists(file_path):
        excel_writer = pd.ExcelWriter(file_path, mode='a', if_sheet_exists='replace')
    else:
        excel_writer = pd.ExcelWriter(file_path, mode='w')

    with excel_writer as writer:
        # One summary row per data set, preceded by a row for the listing.
        sheet_labels = [convert_column_name(sheet_name) for sheet_name in data_dict.keys()]
        sheet_counts = [get_number_of_rows(data) for data in data_dict.values()]
        data_summary = pd.DataFrame({
            'Sheet Name': sheet_labels,
            'Number of Rows': sheet_counts,
        })
        listing_summary = pd.DataFrame({
            'Sheet Name': [f'{list_name} Listing'],
            'Number of Rows': [get_number_of_rows(listing_df)],
        })
        summary = pd.concat([listing_summary, data_summary], ignore_index=True)
        summary.to_excel(writer, sheet_name=f'Daily {list_name} Info Summary', index=False)

        listing_df.columns = [convert_column_name(column) for column in listing_df.columns]
        listing_df.to_excel(writer, sheet_name=f'{list_name} Listing', index=False)

        for function_name, per_symbol in data_dict.items():
            combined = concat_if_not_empty(per_symbol)
            if combined is None:
                # No data captured for this data set: no sheet is written.
                continue
            combined.to_excel(writer, sheet_name=convert_column_name(function_name), index=False)

In [945]:
def capture_all_stock_data(capture_date, file_path, limit=None):
    """Capture every daily company data set and export it to an Excel file.

    Args:
        capture_date: Date forwarded to each capture function.
        file_path: Workbook passed to ``export_data_to_excel``.
        limit: If given, only the first ``limit`` tickers of the listing are
            processed.

    Returns:
        Tuple of the company listing frame followed by one per-symbol result
        dict for each capture function, in the order they were run.
    """
    logger.info("Capture all daily stock data started.")
    company_listing_df = capture_company_listing()

    tickers = company_listing_df["ticker"]
    company_symbols = tickers if limit is None else tickers[:limit]

    capture_functions = (
        capture_ticker_volatility,
        capture_insider_deals,
        capture_company_event,
        capture_company_news,
        capture_stock_history,
        capture_stock_intraday,
        capture_stock_evaluation,
        capture_stock_rating,
    )

    # Results are keyed by the capture function's name.
    data_dict = {}
    for capture in capture_functions:
        data_dict[capture.__name__] = process_symbols(company_symbols, capture, capture_date)

    export_data_to_excel(capture_date, data_dict, company_listing_df, 'Company', file_path)

    logger.info("Capture all daily stock data ended.")
    return (company_listing_df, *data_dict.values())

### Funds

#### Danh sách quỹ (Funds listing)

In [946]:
def capture_fund_listing():
    """Fetch the fund listing via ``funds_listing``.

    Returns:
        The result of ``funds_listing`` called with ``lang='en'``, an empty
        ``fund_type``, ``mode="full"`` and ``decor=False``; an empty
        ``pd.DataFrame`` (with the error logged) if the call raises.
    """
    try:
        return funds_listing(lang='en', fund_type="", mode="full", decor=False)
    except Exception as err:
        logger.error(f"Error capturing fund listing data: {err}")
        return pd.DataFrame()

#### Fund Details
- Các mã quỹ nắm giữ (Top holding list details)
- Ngành mà quỹ đang đầu tư (Industry holding list details)
- Báo cáo NAV (Nav report)
- Tỉ trọng tài sản nắm giữ (Asset holding list)

In [947]:
# Detail categories accepted by capture_fund_details.
FUND_DETAILS_CATE = ["top_holding_list", "industry_holding_list", "nav_report", "asset_holding_list"]


def capture_fund_details(*args):
    """Fetch one category of fund details for a fund symbol.

    Args:
        *args: ``args[0]`` is the fund symbol, ``args[1]`` the detail category
            (one of ``FUND_DETAILS_CATE``) and the optional ``args[2]`` a
            capture date (an object with ``strftime``).

    Returns:
        The frame returned by ``fund_details`` with the ``symbol`` column (when
        present) moved to the first position. When a capture date was given and
        the frame has a ``Ngày`` column, only rows for that date are returned.
        An empty ``pd.DataFrame`` is returned, with the error logged, if the
        category is invalid or the fetch raises.
    """
    symbol = args[0]
    detail_type = args[1]  # named to avoid shadowing the builtin `type`

    # The capture date is optional; without it no date filtering is applied.
    date_string = args[2].strftime("%Y-%m-%d") if len(args) >= 3 else None

    try:
        if detail_type not in FUND_DETAILS_CATE:
            raise ValueError(f"Invalid type '{detail_type}'. Type must be one of {FUND_DETAILS_CATE}")

        df = fund_details(
            symbol=symbol,
            type=detail_type
        )
    except Exception as e:
        error_message = f"Error capturing fund details data of {detail_type} for {symbol}: {e}"
        logger.error(error_message)
        return pd.DataFrame()

    # Move the symbol column to the front so each row starts with its fund.
    if 'symbol' in df.columns:
        df.insert(0, 'symbol', df.pop('symbol'))

    if date_string is not None and 'Ngày' in df.columns:
        return df[df['Ngày'] == date_string]

    return df

### Crawl all funds daily data and store

In [948]:
def capture_all_fund_data(capture_date, file_path, limit=None):
    """Capture every daily fund data set and export it to an Excel file.

    Args:
        capture_date: Date forwarded to ``capture_fund_details``.
        file_path: Workbook passed to ``export_data_to_excel``.
        limit: If given, only the first ``limit`` funds of the listing are
            processed.

    Returns:
        Tuple of the fund listing frame followed by one per-fund result dict
        for each category in ``FUND_DETAILS_CATE``, in that order.
    """
    logger.info("Capture all daily fund data started.")
    fund_listing_df = capture_fund_listing()
    if limit is not None:
        fund_symbols = fund_listing_df["shortName"][:limit]
    else:
        fund_symbols = fund_listing_df["shortName"]

    data_dict = {s: process_symbols(fund_symbols, capture_fund_details, s, capture_date) for s in FUND_DETAILS_CATE}

    # Label the sheets as fund data; 'Stock' would name the fund listing and
    # summary sheets as if they held stock data.
    export_data_to_excel(capture_date, data_dict, fund_listing_df, 'Fund', file_path)

    logger.info("Capture all daily fund data ended.")
    return fund_listing_df, *data_dict.values()
    

In [950]:
def capture_all_daily(capture_date, limit=None):
    """Run the full daily capture (companies, then funds) into a fresh workbook.

    Args:
        capture_date: Date to capture; also used to name the output file
            ``output_<YYYY-MM-DD>.xlsx``.
        limit: Optional cap on the number of symbols processed per listing.

    Returns:
        Tuple ``(all_stock_data, all_fund_data)`` holding the return values of
        ``capture_all_stock_data`` and ``capture_all_fund_data``.
    """
    date_string = capture_date.strftime("%Y-%m-%d")
    file_path = f'output_{date_string}.xlsx'

    # Start from a clean file so sheets from an earlier run do not linger.
    output_already_exists = os.path.exists(file_path)
    if output_already_exists:
        os.remove(file_path)

    results = (
        capture_all_stock_data(capture_date, file_path, limit),
        capture_all_fund_data(capture_date, file_path, limit),
    )

    logger.info("Capture all daily data ended.")
    return results

# Example run: capture 25 December 2023, limited to the first 10 symbols of
# each listing.
my_date = datetime(year=2023, month=12, day=25)
capture_all_daily(my_date, limit=10)

INFO:server_logger:Capture all daily stock data started.


Processed 1/10 symbols of capture_ticker_volatility
Processed 2/10 symbols of capture_ticker_volatility
Processed 3/10 symbols of capture_ticker_volatility
Processed 4/10 symbols of capture_ticker_volatility
Processed 5/10 symbols of capture_ticker_volatility
Processed 6/10 symbols of capture_ticker_volatility
Processed 7/10 symbols of capture_ticker_volatility
Processed 8/10 symbols of capture_ticker_volatility
Processed 9/10 symbols of capture_ticker_volatility
Processed 10/10 symbols of capture_ticker_volatility
Processed 1/10 symbols of capture_insider_deals
Processed 2/10 symbols of capture_insider_deals
Processed 3/10 symbols of capture_insider_deals
Processed 4/10 symbols of capture_insider_deals
Processed 5/10 symbols of capture_insider_deals
Processed 6/10 symbols of capture_insider_deals
Processed 7/10 symbols of capture_insider_deals
Processed 8/10 symbols of capture_insider_deals
Processed 9/10 symbols of capture_insider_deals
Processed 10/10 symbols of capture_insider_deal

ERROR:server_logger:Error capturing stock intraday data for symbol A32: "['cp', 'rcp'] not found in axis"


Processed 1/10 symbols of capture_stock_intraday
Processed 2/10 symbols of capture_stock_intraday
Processed 3/10 symbols of capture_stock_intraday
Processed 4/10 symbols of capture_stock_intraday
Processed 5/10 symbols of capture_stock_intraday
Processed 6/10 symbols of capture_stock_intraday
Processed 7/10 symbols of capture_stock_intraday
Processed 8/10 symbols of capture_stock_intraday
Processed 9/10 symbols of capture_stock_intraday
Processed 10/10 symbols of capture_stock_intraday
Processed 1/10 symbols of capture_stock_evaluation
Processed 2/10 symbols of capture_stock_evaluation
Processed 3/10 symbols of capture_stock_evaluation
Processed 4/10 symbols of capture_stock_evaluation
Processed 5/10 symbols of capture_stock_evaluation
Processed 6/10 symbols of capture_stock_evaluation
Processed 7/10 symbols of capture_stock_evaluation
Processed 8/10 symbols of capture_stock_evaluation
Processed 9/10 symbols of capture_stock_evaluation
Processed 10/10 symbols of capture_stock_evaluatio

  return pd.concat(df_list, keys=data_dict.keys(), names=['Symbol']) if df_list else None
INFO:server_logger:Capture all daily stock data ended.
INFO:server_logger:Capture all daily fund data started.


Total number of funds currently listed on Fmarket:  42
Getting data for VESAF
Processed 1/10 symbols of capture_fund_details
Getting data for VEOF
Processed 2/10 symbols of capture_fund_details
Getting data for SSISCA
Processed 3/10 symbols of capture_fund_details
Getting data for VIBF
Processed 4/10 symbols of capture_fund_details
Getting data for VCBF-BCF
Processed 5/10 symbols of capture_fund_details
Getting data for DCDS
Processed 6/10 symbols of capture_fund_details
Getting data for VCBF-TBF
Processed 7/10 symbols of capture_fund_details
Getting data for BVPF
Processed 8/10 symbols of capture_fund_details
Getting data for VCAMBF
Processed 9/10 symbols of capture_fund_details
Getting data for VNDAF
Processed 10/10 symbols of capture_fund_details
Getting data for VESAF
Processed 1/10 symbols of capture_fund_details
Getting data for VEOF
Processed 2/10 symbols of capture_fund_details
Getting data for SSISCA
Processed 3/10 symbols of capture_fund_details
Getting data for VIBF
Processe

INFO:server_logger:Capture all daily fund data ended.
INFO:server_logger:Capture all daily data ended.


((     Ticker                                      Organ Name  Organ Type Code  \
  0       A32                                         CTCP 32                1   
  1       AAA                          CTCP Nhựa An Phát Xanh                1   
  2       AAM                            CTCP Thủy sản MeKong                1   
  3       AAS                    CTCP Chứng khoán SmartInvest                4   
  4       AAT                CTCP Tập Đoàn Tiên Sơn Thanh Hóa                1   
  ...     ...                                             ...              ...   
  1581    XPH                            CTCP Xà phòng Hà Nội                1   
  1582    YBC              CTCP Xi măng và Khoáng sản Yên Bái                1   
  1583    YBM             CTCP Khoáng sản Công nghiệp Yên Bái                1   
  1584    YEG                             CTCP Tập đoàn Yeah1                1   
  1585    YTC  CTCP Xuất nhập khẩu Y tế Thành phố Hồ Chí Minh                1   
  
       Com Gr