<a href="https://colab.research.google.com/github/linhvien/Stock-analysis/blob/main/Vietnamese_stock_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import logging as logging
import re
import time
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Notebook-wide logging: timestamp, level and message at INFO verbosity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# URL prefix of the price-history page; the symbol and a "-1.chn" suffix are appended per request.
URL_CAFE = "http://s.cafef.vn/Lich-su-giao-dich-"

# HTTP headers sent with every request (a desktop Chrome User-Agent string).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

In [28]:
def convert_date(text, date_type = '%Y-%m-%d'):
  """Parse a date string into a ``datetime``.

  Args:
    text: The date string to parse.
    date_type: ``strptime`` pattern describing ``text`` (default ``'%Y-%m-%d'``).

  Returns:
    The ``datetime`` produced by ``datetime.strptime``.

  Raises:
    ValueError: If ``text`` does not match ``date_type``.
  """
  parsed = datetime.strptime(text, date_type)
  return parsed
  
def convert_text_dateformat(text, origin_type = '%Y-%m-%d', new_type = '%Y-%m-%d'):
  """Re-format a date string from one ``strptime``/``strftime`` pattern to another.

  Args:
    text: The date string to convert.
    origin_type: Pattern that ``text`` is currently written in.
    new_type: Pattern the result should be written in.

  Returns:
    ``text`` parsed with ``origin_type`` and rendered with ``new_type``.
  """
  parsed = convert_date(text, origin_type)
  return parsed.strftime(new_type)


class DataLoadProto():
    """Base class for loaders: stores the symbols and the requested date range.

    ``start`` and ``end`` are given as ``'%Y-%m-%d'`` strings and stored as
    ``'%d/%m/%Y'`` strings.
    """

    def __init__(self, symbols, start, end, *arg, **karg):
        """Store ``symbols`` and the re-formatted ``start`` / ``end`` dates.

        Extra positional and keyword arguments are accepted and ignored.
        """
        stored_format = '%d/%m/%Y'
        self.symbols = symbols
        self.start = convert_text_dateformat(start, new_type=stored_format)
        self.end = convert_text_dateformat(end, new_type=stored_format)

class DataLoaderCAFE(DataLoadProto):
    """Scrape daily price history for one or more symbols from s.cafef.vn.

    The site is queried page by page through its pager form. Each page yields
    ``date`` / ``adjust`` / ``close`` rows, which are merged into one
    date-indexed frame per symbol.
    """

    # Hard cap on pager requests per symbol, so the paging loop always terminates.
    MAX_PAGES = 1000
    # Seconds to wait on each HTTP request before requests raises a timeout.
    REQUEST_TIMEOUT = 30

    def __init__(self, symbols, start, end, *arg, **karg):
        """Create a loader.

        Args:
            symbols: A single symbol string or an iterable of symbol strings.
            start: First date of the range, as a ``'%Y-%m-%d'`` string.
            end: Last date of the range, as a ``'%Y-%m-%d'`` string.

        Extra positional and keyword arguments are accepted and ignored.
        """
        # The base class stores symbols and converts both dates to '%d/%m/%Y',
        # so nothing needs to be assigned here first.
        super(DataLoaderCAFE, self).__init__(symbols, start, end)

    def download(self):
        """Download every requested symbol and join the results column-wise.

        Returns:
            A DataFrame indexed by date whose columns are a MultiIndex with
            levels ``Attributes`` (``close``, ``adjust``) and ``Symbols``.

        Raises:
            ValueError: If there are no symbols to download.
        """
        # A bare string is one symbol; any other iterable is a collection of symbols.
        if isinstance(self.symbols, str):
            symbols = [self.symbols]
        else:
            symbols = list(self.symbols)
        if not symbols:
            raise ValueError('no symbols to download')

        stock_datas = [self.download_one(symbol) for symbol in symbols]
        return pd.concat(stock_datas, axis=1)

    def download_one(self, symbol):
        """Download the whole date range for one symbol.

        Pages are requested until one comes back empty, the last row of a page
        reaches the start date, or ``MAX_PAGES`` pages have been fetched.

        Args:
            symbol: The symbol to download.

        Returns:
            A DataFrame sorted by date with MultiIndex columns
            ``(close, symbol)`` and ``(adjust, symbol)``; values that cannot be
            parsed as numbers are stored as 0.
        """
        start_date = convert_date(self.start, '%d/%m/%Y')

        batches = []
        for page in range(1, self.MAX_PAGES + 1):
            stock_slice_batch = self.download_batch(page, symbol)
            if stock_slice_batch.empty:
                # No rows on this page, e.g. the start date is a holiday or weekend.
                break
            batches.append(stock_slice_batch)
            date_end_batch = stock_slice_batch['date'].values[-1]
            # "<=" rather than "==": stop as soon as the last row is at or before the
            # start date, even if the start date itself is not in the data.
            # Presumably rows arrive newest first - confirm against the site.
            if convert_date(date_end_batch, '%d/%m/%Y') <= start_date:
                break

        # Concatenate once after the loop; the column order matches the previous output.
        columns = ['date', 'close', 'adjust']
        if batches:
            stock_data = pd.concat(batches, axis=0)[columns]
        else:
            stock_data = pd.DataFrame(columns=columns)

        stock_data = stock_data.set_index('date').apply(pd.to_numeric, errors='coerce')
        stock_data.index = [convert_date(text, date_type='%d/%m/%Y') for text in stock_data.index]
        stock_data.index.name = 'date'
        # NOTE(review): unparseable prices become 0, not NaN - confirm that is wanted downstream.
        stock_data = stock_data.sort_index().fillna(0)

        # Create multiple columns
        iterables = [stock_data.columns.tolist(), [symbol]]
        mulindex = pd.MultiIndex.from_product(iterables, names=['Attributes', 'Symbols'])
        stock_data.columns = mulindex

        logging.info('data {} from {} to {} have already cloned!' \
                     .format(symbol,
                             convert_text_dateformat(self.start, origin_type = '%d/%m/%Y', new_type = '%Y-%m-%d'),
                             convert_text_dateformat(self.end, origin_type='%d/%m/%Y', new_type='%Y-%m-%d')))

        return stock_data

    def download_batch(self, id_batch, symbol):
        """Fetch one result page for ``symbol``.

        Args:
            id_batch: 1-based page number, sent as the pager's event argument.
            symbol: The symbol to query.

        Returns:
            A DataFrame with columns ``date``, ``adjust`` and ``close`` built
            from the first three columns of the first table in the response,
            skipping its first two rows.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the response contains no table.
        """
        form_data = {'ctl00$ContentPlaceHolder1$scriptmanager':'ctl00$ContentPlaceHolder1$ctl03$panelAjax|ctl00$ContentPlaceHolder1$ctl03$pager2',
                       'ctl00$ContentPlaceHolder1$ctl03$txtKeyword':symbol,
                       'ctl00$ContentPlaceHolder1$ctl03$dpkTradeDate1$txtDatePicker':self.start,
                       'ctl00$ContentPlaceHolder1$ctl03$dpkTradeDate2$txtDatePicker':self.end,
                       '__EVENTTARGET':'ctl00$ContentPlaceHolder1$ctl03$pager2',
                       '__EVENTARGUMENT':id_batch,
                       '__ASYNCPOST':'true'}
        url = URL_CAFE+symbol+"-1.chn"
        # A timeout keeps a stalled connection from hanging the notebook.
        # verify=False is kept from the original call.
        r = requests.post(url, data = form_data, headers = headers, verify=False,
                          timeout=self.REQUEST_TIMEOUT)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')

        table = soup.find('table')
        if table is None:
            # Fail with a clear message instead of handing the text 'None' to read_html.
            raise ValueError('no table found in response for {} (page {})'.format(symbol, id_batch))
        # read_html expects a file-like object; literal HTML strings are deprecated.
        stock_slice_batch = pd.read_html(StringIO(str(table)))[0].iloc[2:, :3]
        stock_slice_batch.columns = ['date', 'adjust', 'close']

        return stock_slice_batch


In [29]:
# Build a loader for the VNM symbol over 2021-02-02 .. 2021-09-02 (dates given as YYYY-MM-DD).
loader = DataLoaderCAFE('VNM', '2021-02-02','2021-09-02')
# Fetch the pages over HTTP and assemble them into a frame with (Attributes, Symbols) columns.
data = loader.download()
# Display only the first rows.
data.head()

2021-09-26 16:07:22,234 : INFO : data VNM from 2021-02-02 to 2021-09-02 have already cloned!


Attributes,close,adjust
Symbols,VNM,VNM
date,Unnamed: 1_level_2,Unnamed: 2_level_2
2021-02-02,106.4,103.3
2021-02-03,109.7,106.51
2021-02-04,108.8,105.63
2021-02-05,109.6,106.41
2021-02-08,105.3,102.24


In [25]:
# Debug peek at the raw rows of result page 2 for VNM.
# NOTE(review): this rebinds `data`, replacing the frame returned by loader.download() in the
# previous cell. The cell's execution count (25) is also lower than the preceding cell's (29),
# so the recorded output may come from an older version of download_batch - confirm, and
# consider binding the preview to a separate name.
data = loader.download_batch(2,'VNM')
data

Unnamed: 0,date,adjust,close
1,Ngày,Giá điều chỉnh,Giá đóng cửa
2,04/08/2021,85.62,87.10
3,03/08/2021,85.62,87.10
4,02/08/2021,85.03,86.50
5,30/07/2021,84.64,86.10
6,29/07/2021,85.32,86.80
7,28/07/2021,85.42,86.90
8,27/07/2021,86.01,87.50
9,26/07/2021,87.39,88.90
10,23/07/2021,86.11,87.60
