# Chapter 1: Scraping news from [AASTOCKS](http://www.aastocks.com/)

## Specify the location to store the data

In [1]:
import configparser
from getpass import getuser
# Try the per-user settings file first; when it yields no sections, read the
# shared default file instead. DATA_PATH is where the scraped articles are stored.
config = configparser.ConfigParser()
user_config_file = f"config/{getuser()}.ini"
config.read(user_config_file)
if not config.sections():
    config.read("config/default.ini")
DATA_PATH = config["Data"]["path"]

## Parse information from a news article

In [2]:
import re
import requests
from bs4 import BeautifulSoup

def _optional_group(pattern: str, text: str) -> str:
    """Return capture group 1 of the first match of ``pattern`` in ``text``, or "" if there is no match."""
    match = re.search(pattern, text)
    return match.group(1) if match else ""


def download_news(news_id: str, timeout: float = 30.0):
    """Download one AASTOCKS news article and pull its fields out of the HTML.

    Args:
        news_id: Identifier inserted into the article URL.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up; prevents a stalled connection from hanging the scrape.

    Returns:
        A dict with the keys ``Title``, ``URL``, ``Time_Published``,
        ``Body_Text``, ``Company``, ``Symbol``, ``Positive``, ``Negative``,
        ``Polarity`` and ``News_Id`` (in that order). ``Body_Text``,
        ``Company`` and ``Symbol`` are empty strings when the corresponding
        markup is not found. ``Polarity`` is
        ``(Positive - Negative) / (Positive + Negative)``, or 0 when there
        are no votes.

    Raises:
        AttributeError: If the title, publication time or vote counters
            cannot be found in the page.
    """
    url = f"http://www.aastocks.com/en/stocks/news/aafn-con/{news_id}/"
    html = requests.get(url, timeout=timeout).text
    # The regexes below run against BeautifulSoup's re-serialised markup.
    # NOTE(review): no parser is named, so bs4 picks whichever one is installed
    # (and warns about it); the patterns presumably match that parser's output,
    # so confirm which one is in use before pinning it here.
    html = str(BeautifulSoup(html))

    # First paragraph of the article body, up to the first <br/>.
    text = _optional_group('<div class="newscontent5 fLevel3" id="spanContent"> <span> <p>(.*?)<br/>', html)

    news = {}
    news["Title"] = re.search('<div class="NewsShare_share_title">(.*?)</div>', html).group(1)
    news["URL"] = url
    news["Time_Published"] = re.search('<div class="float_l newstime5">(.*?)</div>', html).group(1)

    # Strip the inline quote markup from the body: first everything from " ("
    # through the "jssc inline_block" span's closing </a>, then any remaining
    # "jsSS" anchors.
    body = re.sub(r' \(.*?<span class="jssc inline_block">(.*?)</a>', "", text)
    news["Body_Text"] = re.sub('<a.*?class="jsSS".*?</a>', "", body)

    # Company is the text before the first "(", Symbol the first "(...)" that is
    # immediately followed by a tag.
    news["Company"] = _optional_group(r"(.*?)\(", text).strip()
    news["Symbol"] = _optional_group(r"\((.*?)\)<", text)

    positive = int(re.search('Positive</div><div class="value">([0-9]+)', html).group(1))
    negative = int(re.search('Negative</div><div class="value">([0-9]+)', html).group(1))
    total_votes = positive + negative
    news["Positive"] = positive
    news["Negative"] = negative
    news["Polarity"] = (positive - negative) / total_votes if total_votes > 0 else 0

    news["News_Id"] = news_id

    return news

## Dig out some articles to scrape
Only articles published within the past 3 weeks can be downloaded.

In [3]:
import json
import os
from datetime import datetime, timedelta

def save_news(path: str, news: dict):
    """Serialise ``news`` as JSON into ``<path>/<News_Id>.json``.

    The file name comes from ``news["News_Id"]``; an existing file with the
    same name is overwritten.
    """
    filename = "{}.json".format(news["News_Id"])
    destination = os.path.join(path, filename)
    with open(destination, "w") as out:
        json.dump(news, out)
        
# "dtd" value of the most recently saved feed item. Kept as a module-level
# global so code that inspects it after a run keeps working.
dtd = None


def search_news(timestamp: str, from_date: datetime, timeout: float = 30.0):
    """Page backwards through the AASTOCKS "latest news" feed and save every article.

    Each feed page is requested with ``newstime=<timestamp>``. Every item on the
    page is downloaded with ``download_news`` and written with ``save_news``
    into ``DATA_PATH``; the ``dtd`` field of the last saved item becomes the
    timestamp for the next page.

    The walk stops when:
      * an item's ``dt`` is older than ``from_date`` (returns silently), or
      * the feed answers with a bare integer instead of a list of items, or a
        page yields no new timestamp (prints "Scrapping Finished!").

    Pages are fetched in a loop rather than by recursion, so a long backlog
    cannot exhaust Python's recursion limit.

    Args:
        timestamp: Value for the feed's ``newstime`` query parameter.
        from_date: Articles published before this moment are not downloaded.
        timeout: Seconds to wait for each feed response.

    Raises:
        Exception: Any error while fetching or processing a page is re-raised
            after printing the timestamp of that page, which can be passed
            back in as ``timestamp`` to resume.
    """
    global dtd
    while True:
        next_timestamp = None
        try:
            news_items = requests.get(
                f"http://www.aastocks.com/en/resources/datafeed/getmorenews.ashx?cat=latest-news&newstime={timestamp}&newsid=NOW.1050948&period=0&key=",
                headers={
                    "Referer": "http://www.aastocks.com/en/stocks/news/aafn/latest-news",
                },
                timeout=timeout,
            ).json()  # Latest News

            # A bare integer payload is the end-of-feed signal. Checking the
            # type up front (rather than matching an exception message) keeps
            # an unrelated TypeError from being mistaken for a normal finish.
            if isinstance(news_items, int):
                print("Scrapping Finished!")
                return

            for x in news_items:
                date_published = datetime.strptime(x["dt"], "%Y/%m/%d %H:%M")
                if date_published < from_date:
                    return
                save_news(DATA_PATH, download_news(x["id"]))
                dtd = next_timestamp = x["dtd"]
        except Exception:
            # In case of error, re-run the cell below by passing the printed timestamp as the timestamp argument
            print(timestamp)
            raise

        # An empty page or a timestamp that does not advance would request the
        # same page forever, so treat it as the end of the feed.
        if next_timestamp is None or next_timestamp == timestamp:
            print("Scrapping Finished!")
            return
        timestamp = next_timestamp

In [4]:
import time
# Start from "now" and walk back to midnight 21 days ago.
# To resume after a failure, pass the printed timestamp instead of time.time():
# search_news(timestamp, (datetime.now() - timedelta(days=21)).replace(hour=0, minute=0, second=0, microsecond=0))
start_timestamp = time.time()
oldest_allowed = (datetime.now() - timedelta(days=21)).replace(hour=0, minute=0, second=0, microsecond=0)
search_news(start_timestamp, oldest_allowed)

Scrapping Finished!


## Postprocessing

1. Remove all articles with title "Disclaimer by AAStocks Financial News"
2. Remove all articles with title "《HIBOR》Latest HIBOR"

In [5]:
import os
# Group article ids by a normalised title (alphabetic characters only,
# upper-cased) so articles sharing a headline end up under the same key.
titles = {}
for filename in os.listdir(DATA_PATH):
    with open(os.path.join(DATA_PATH, filename), "r") as handle:
        article = json.load(handle)
    normalised_title = "".join(ch for ch in article["Title"] if ch.isalpha()).upper()
    titles.setdefault(normalised_title, []).append(article["News_Id"])

In [6]:
# Show every normalised title that is shared by more than one article.
for normalised_title, news_ids in titles.items():
    if len(news_ids) > 1:
        print(normalised_title, "\n", news_ids, "\n\n")

DISCLAIMERBYAASTOCKSFINANCIALNEWS 
 ['NOW.1094086', 'NOW.1094423', 'NOW.1094810', 'NOW.1095230', 'NOW.1095658', 'NOW.1095990', 'NOW.1096235', 'NOW.1096483', 'NOW.1096802', 'NOW.1097117', 'NOW.1097166', 'NOW.1097175', 'NOW.1097494', 'NOW.1097869', 'NOW.1098207', 'NOW.1098554', 'NOW.1098910', 'NOW.1098964', 'NOW.1098989'] 


HKADRPROJECTSHSITOOPENUPPTSTO 
 ['NOW.1094133', 'NOW.1094466', 'NOW.1094875', 'NOW.1095295', 'NOW.1096021', 'NOW.1096521', 'NOW.1097910', 'NOW.1098582'] 


UBSUPGRADESABCHKTOBUYWITHTPADDEDTO 
 ['NOW.1094194', 'NOW.1094238'] 


UBSUPGRADESBANKOFCHINAHKTOBUYTPADDEDTO 
 ['NOW.1094199', 'NOW.1094258'] 


CSUISSELIFTSYONGDAAUTOHKTPTOQRESULTSSLIGHTLYBEAT 
 ['NOW.1094204', 'NOW.1094242'] 


MSTANLEYRAISESSINOPHARMHKTPTORATEDOVERWEIGHT 
 ['NOW.1094205', 'NOW.1094230'] 


JPMLIFTSLEEAMPMANPAPERHKTPTORATEDOVERWEIGHT 
 ['NOW.1094211', 'NOW.1094233'] 


HSBCHOLDINGSQREPORTEDPBTUPYOYTOUSBWELLBEYONDESTIMATE 
 ['NOW.1094286', 'NOW.1094306'] 


HSBCHOLDINGSNOTINTENDTOPAYQOQDIVTOMULL

In [7]:
# Normalised titles of the boilerplate articles to drop:
# "Disclaimer by AAStocks Financial News" and "《HIBOR》Latest HIBOR".
UNWANTED_TITLES = ("DISCLAIMERBYAASTOCKSFINANCIALNEWS", "HIBORLATESTHIBOR")

for unwanted_title in UNWANTED_TITLES:
    # .get(): a title that was never scraped simply has nothing to delete.
    for news_id in titles.get(unwanted_title, []):
        try:
            os.remove(os.path.join(DATA_PATH, f"{news_id}.json"))
        except FileNotFoundError:
            # Already deleted by an earlier run of this cell; `titles` still
            # lists the id, so re-running must not fail.
            pass