# Chapter 1: Scraping news from [AASTOCKS](http://www.aastocks.com/)

## Specify the location to store the data

In [94]:
# Directory that holds the scraped articles: save_news() is called with it to
# write one JSON file per article, and the post-processing cells list it to
# read those files back.
DATA_PATH = "../data/aastocks"

## Parse information from a news article

In [95]:
import re
import requests
from bs4 import BeautifulSoup

def download_news(news_id: str):
    """Download one AASTOCKS news article and extract its fields with regexes.

    Args:
        news_id: Article identifier; it is interpolated into the article URL.

    Returns:
        A dict with the keys "Title", "URL", "Time_Published", "Body_Text",
        "Company", "Symbol", "Positive", "Negative", "Polarity" and "News_Id"
        (in that order). "Body_Text", "Company" and "Symbol" are "" when
        their pattern is not found in the page.

    Raises:
        AttributeError: If the title, the publication time or either vote
            count cannot be found in the page.
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    url = f"http://www.aastocks.com/en/stocks/news/aafn-con/{news_id}/"
    # A timeout keeps a single stalled connection from hanging the whole crawl.
    html = requests.get(url, timeout=30).text
    # The patterns below are matched against BeautifulSoup's serialisation of
    # the page (e.g. "<br/>"), not against the raw response text.
    html = str(BeautifulSoup(html))

    # All patterns are raw strings so that "\(" and "\)" reach the regex
    # engine without relying on invalid string escape sequences.
    body_match = re.search(
        r'<div class="newscontent5 fLevel3" id="spanContent"> <span> <p>(.*?)<br/>',
        html,
    )
    text = body_match.group(1) if body_match else ""

    news = {}
    news["Title"] = re.search(r'<div class="NewsShare_share_title">(.*?)</div>', html).group(1)
    news["URL"] = url
    news["Time_Published"] = re.search(r'<div class="float_l newstime5">(.*?)</div>', html).group(1)

    # Strip the inline quote widget and stock links from the first paragraph.
    body_text = re.sub(r' \(.*?<span class="jssc inline_block">(.*?)</a>', "", text)
    news["Body_Text"] = re.sub(r'<a.*?class="jsSS".*?</a>', "", body_text)

    # Company is whatever precedes the first "(" of the paragraph.
    company_match = re.search(r"(.*?)\(", text)
    news["Company"] = company_match.group(1).strip() if company_match else ""

    # Symbol is the content of the first parenthesised group of the paragraph.
    # NOTE: an earlier variant read it from the page's `rscData` JS variable
    # instead ('var rscData = [{"s":"..."').
    symbol_match = re.search(r"\((.*?)\)", text)
    news["Symbol"] = symbol_match.group(1) if symbol_match else ""

    positive = int(re.search(r'Positive</div><div class="value">([0-9]+)', html).group(1))
    negative = int(re.search(r'Negative</div><div class="value">([0-9]+)', html).group(1))
    total_votes = positive + negative
    news["Positive"] = positive
    news["Negative"] = negative
    # Normalised vote balance in [-1, 1]; 0 when nobody has voted.
    news["Polarity"] = (positive - negative) / total_votes if total_votes > 0 else 0

    news["News_Id"] = news_id

    return news

## Dig out some articles to scrape
Only articles published in the past 14 days can be downloaded

In [101]:
import json
import os
from datetime import datetime, timedelta

def save_news(path: str, news: dict):
    """Write one article to ``<path>/<News_Id>.json`` as JSON.

    Args:
        path: Directory to write into; it is created (including parents)
            when it does not exist yet.
        news: Article dict; must contain the key "News_Id", which becomes
            the file name.

    Raises:
        KeyError: If ``news`` has no "News_Id" entry.
        OSError: If the directory or file cannot be created.
    """
    # Create the target directory on first use so a fresh checkout does not
    # fail with FileNotFoundError. An empty path means the current directory.
    if path:
        os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, f"{news['News_Id']}.json"), "w") as f:
        json.dump(news, f)
        
# Paging cursor: the "dtd" value of the most recently saved feed item.
dtd = None


def search_news(timestamp: str, from_date: datetime):
    """Download and save every latest-news item published on or after ``from_date``.

    The feed is fetched page by page. Each page is requested with the
    current cursor as ``newstime``; after a page is processed, the cursor
    becomes the "dtd" value of the last item saved.

    Args:
        timestamp: Initial ``newstime`` cursor for the first feed request.
        from_date: Processing stops at the first item whose "dt" field is
            earlier than this datetime.

    Side effects:
        Saves each article under ``DATA_PATH`` via ``save_news`` and updates
        the module-level ``dtd`` cursor.
    """
    global dtd

    # Iterate instead of recursing once per page, so that a long crawl cannot
    # exhaust the interpreter's recursion limit.
    while True:
        news_items = requests.get(
            f"http://www.aastocks.com/en/resources/datafeed/getmorenews.ashx?cat=latest-news&newstime={timestamp}&newsid=NOW.1050948&period=0&key=",
            timeout=30,
        ).json()  # Latest News

        # An empty page would leave the cursor unchanged and repeat the same
        # request forever.
        if not news_items:
            return

        for x in news_items:
            date_published = datetime.strptime(x["dt"], "%Y/%m/%d %H:%M")
            if date_published < from_date:
                return
            save_news(DATA_PATH, download_news(x["id"]))
            dtd = x["dtd"]

        # Stop when the cursor did not advance; the next request would return
        # this same page again.
        if dtd == timestamp:
            return
        timestamp = dtd

In [102]:
import time
# Start the crawl with the current Unix time as the feed cursor; search_news
# stops at the first item published before midnight 30 days ago.
# NOTE(review): the text above says only the past 14 days can be downloaded,
# yet the window here is 30 days — confirm which one is intended.
# NOTE(review): time.time() returns a float while search_news annotates
# `timestamp` as str; the value is only interpolated into the request URL.
search_news(time.time(), (datetime.now() - timedelta(days=30)).replace(hour=0, minute=0, second=0, microsecond=0))

## Postprocessing

1. Remove all articles with title "Disclaimer by AAStocks Financial News"
2. Remove all articles with title "《HIBOR》Latest HIBOR"

In [89]:
import os
# Group article ids by a normalised title (alphabetic characters only,
# upper-cased), so that titles differing only in digits, spaces or
# punctuation end up under the same key.
titles = {}
for file in os.listdir(DATA_PATH):
    # Only the "<News_Id>.json" article files are relevant; skip anything
    # else in the directory (sub-directories, OS metadata files, ...).
    if not file.endswith(".json"):
        continue
    with open(os.path.join(DATA_PATH, file), "r") as f:
        d = json.load(f)
    t = "".join(c for c in d["Title"] if c.isalpha()).upper()
    titles.setdefault(t, []).append(d["News_Id"])

In [90]:
# Report every normalised title that is shared by more than one article,
# followed by the ids of those articles.
for t, shared_ids in titles.items():
    if len(shared_ids) <= 1:
        continue
    print(t, "\n", shared_ids, "\n\n")

HKADDSCONFIRMEDCOVIDCASESTODAYHAVEUNKNOWNORIGIN 
 ['NOW.1047443', 'NOW.1049072', 'NOW.1049057'] 


SHORTSELLINGTURNOVERBOROFELIGIBLESECURITIESTURNOVERATCLOSE 
 ['NOW.1048324', 'NOW.1048783', 'NOW.1047037', 'NOW.1050662', 'NOW.1050053', 'NOW.1047453', 'NOW.1047260', 'NOW.1049756', 'NOW.1050350', 'NOW.1047686', 'NOW.1047984', 'NOW.1049415', 'NOW.1050976', 'NOW.1049070'] 


HKMAINJECTSHKBINTOMKT 
 ['NOW.1049770', 'NOW.1050992', 'NOW.1050066', 'NOW.1048824', 'NOW.1050675', 'NOW.1047703', 'NOW.1049437', 'NOW.1049086', 'NOW.1050365'] 


HKADRPROJECTSHSITOOPENDOWNPTSTO 
 ['NOW.1047490', 'NOW.1049818', 'NOW.1048835', 'NOW.1050409', 'NOW.1047302'] 


HKADRPROJECTSHSITOOPENUPPTSTONEXTMON 
 ['NOW.1049458', 'NOW.1046694', 'NOW.1048013'] 


HKADRPROJECTSHSITOOPENUPPTSTO 
 ['NOW.1050743', 'NOW.1047085', 'NOW.1049152', 'NOW.1047755', 'NOW.1050118', 'NOW.1048534'] 


HKMAINJECTSHKBINTOMKTONHKDSTRENGTH 
 ['NOW.1048489', 'NOW.1047266', 'NOW.1051019', 'NOW.1050713', 'NOW.1050399', 'NOW.1047457', 'NOW.104

In [91]:
# for file in titles["DISCLAIMERBYAASTOCKSFINANCIALNEWS"]:
#     os.remove(os.path.join(DATA_PATH, f"{file}.json"))

# for file in titles["HIBORLATESTHIBOR"]:
#     os.remove(os.path.join(DATA_PATH, f"{file}.json"))