### Introduction
This notebook aims to demonstrate how to collect data related to the COVID-19 pandemic in China from Weibo.

In [34]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
import numpy as np
import re
import time
import datetime

### Helper functions

In [16]:
def preprocess_mobile_post_datetime(data):
    """
    Preprocess the issue datetime of a post for mobile version website

    Recognised inputs (after stripping surrounding whitespace):

    * text containing '刚刚' or '秒' -> the current time, truncated to the minute
    * '昨天 HH:MM'                   -> yesterday's date at HH:MM
    * 'N分钟前' / 'N小时前'          -> now minus N minutes / hours, truncated to the minute
    * 'MM-DD' or 'MM-DD HH:MM'       -> the current year is prepended
    * 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM'

    Parameters
    -----------
    data: str
        datetime in string format

    Returns
    ----------
    datetime.datetime
        post issue datetime as a datetime object (every branch returns a
        full datetime, never a bare date)

    Raises
    ----------
    ValueError
        if the text matches none of the recognised formats
    """
    data = data.strip()
    now = datetime.datetime.now()

    # Posts only seconds old: sub-minute precision is dropped, like the
    # minute/hour branches below.
    if '刚刚' in data or '秒' in data:
        return now.replace(second=0, microsecond=0)

    if '昨天' in data:
        yesterday = (now - datetime.timedelta(hours=24)).date()
        clock = datetime.datetime.strptime(data.split(' ')[-1], '%H:%M').time()
        return datetime.datetime.combine(yesterday, clock)

    if '分钟前' in data:
        # everything before the 3-character suffix is the count
        elapsed = datetime.timedelta(minutes=int(data[:-3]))
        return (now - elapsed).replace(second=0, microsecond=0)

    if '小时前' in data:
        elapsed = datetime.timedelta(hours=int(data[:-3]))
        return (now - elapsed).replace(second=0, microsecond=0)

    # A date part with at most two '-'-separated fields has no year,
    # so the post is taken to be from the current year.
    date_part = data.split(' ')[0]
    if len(date_part.split('-')) <= 2:
        data = '{}-{}'.format(now.year, data)

    try:
        return datetime.datetime.strptime(data, '%Y-%m-%d %H:%M')
    except ValueError:
        # no time-of-day component
        return datetime.datetime.strptime(data, '%Y-%m-%d')


In [35]:
def preprocess_PC_post_datetime(data):
    """
    Preprocess the issue datetime of a post for PC version website

    Recognised inputs (after stripping surrounding whitespace):

    * text containing '秒'        -> the current time, truncated to the minute
    * 'N分钟前' / 'N小时前'       -> now minus N minutes / hours, truncated to the minute
    * '今天 HH:MM'                -> today's date at HH:MM
    * 'YY-MM-DD HH:MM'            -> used when the text contains no '日'
    * 'MM月DD日 HH:MM'            -> the current year is prepended
    * 'YYYY年MM月DD日 HH:MM'

    Parameters
    -----------
    data: str
        datetime in string format

    Returns
    ----------
    datetime.datetime
        post issue datetime as a datetime object

    Raises
    ----------
    ValueError
        if the text matches none of the recognised formats
    """
    data = data.strip()
    now = datetime.datetime.now()

    if '秒' in data:
        return now.replace(second=0, microsecond=0)

    if '分钟前' in data:
        # everything before the 3-character suffix is the count
        elapsed = datetime.timedelta(minutes=int(data[:-3]))
        return (now - elapsed).replace(second=0, microsecond=0)

    if '小时前' in data:
        elapsed = datetime.timedelta(hours=int(data[:-3]))
        return (now - elapsed).replace(second=0, microsecond=0)

    # Same-day posts shown as '今天 HH:MM'; without this branch they would
    # fall into the 'YY-MM-DD' parser below and raise ValueError.
    if '今天' in data:
        clock_text = data.replace('今天', '').strip()
        clock = datetime.datetime.strptime(clock_text, '%H:%M').time()
        return datetime.datetime.combine(now.date(), clock)

    if '日' not in data:
        return datetime.datetime.strptime(data, '%y-%m-%d %H:%M')

    if '年' not in data:   # no year shown: the post is from this year
        data = '{}年{}'.format(now.year, data)
    return datetime.datetime.strptime(data, '%Y年%m月%d日 %H:%M')

In [26]:
def preprocess_post_content(data):
    """
    Preprocess the content of a post

    Removes zero-width spaces (U+200B) and the private-use character
    U+E627, strips surrounding whitespace, then replaces every '<...>' tag
    and every newline with a single space.

    Parameters
    -----------
    data: str
        content of the post

    Returns
    ----------
    str
        preprocessed post content after removing all strings in '<>'.
        Because tags are replaced after stripping, the result can start or
        end with a space when the content starts or ends with a tag.
    """
    # NOTE(review): U+E627 is presumably an icon-font glyph in the page markup — confirm.
    cleaned = data.replace('\u200b', '').replace('\ue627', '').strip()
    # Raw string: '\w' / '\W' are not valid escapes in a normal string literal.
    return re.sub(r'<[\w\W]*?>|\n', ' ', cleaned)

In [19]:
def preprocess_post_buttons(data):
    """
    Preprocess the content in the 'like', 'repost' and 'comment' buttons of a post

    Parameters
    -----------
    data: str
        content in the button

    Returns
    ----------
    int
        number shown in the button. Plain integers are returned as-is;
        counts abbreviated with '万' (x10,000) or '亿' (x100,000,000),
        optionally followed by '+', are expanded; any other text (for
        example a bare button label) yields 0.
    """
    data = data.strip()
    try:
        return int(data)
    except ValueError:
        pass

    # Abbreviated counts such as '1.2万' or '100万+' would otherwise be
    # silently recorded as 0.
    abbreviated = re.fullmatch(r'(\d+(?:\.\d+)?)\s*([万亿])\+?', data)
    if abbreviated:
        multiplier = {'万': 10 ** 4, '亿': 10 ** 8}[abbreviated.group(2)]
        return int(round(float(abbreviated.group(1)) * multiplier))

    # No number in the button (e.g. only its label).
    return 0


### Obtain post from Weibo (Advanced Search from PC website)
The advanced search lets us restrict results by topic and by the datetime range in which the posts were issued.

In [20]:
# Search topics, one "<region>疫情" hashtag per region.
# The order is significant: it is the order in which the regions are scraped.
topics = [
    '北京疫情',
    # provinces
    '河北疫情', '山西疫情', '辽宁疫情', '吉林疫情', '黑龙江疫情',
    '江苏疫情', '浙江疫情', '安徽疫情', '福建疫情', '江西疫情',
    '山东疫情', '河南疫情', '湖北疫情', '湖南疫情', '广东疫情',
    '海南疫情', '四川疫情', '贵州疫情', '云南疫情', '陕西疫情',
    '甘肃疫情', '青海疫情', '台湾疫情',
    # autonomous regions
    '内蒙古疫情', '广西疫情', '西藏疫情', '宁夏疫情', '新疆疫情',
    # remaining municipalities
    '天津疫情', '上海疫情', '重庆疫情',
    # special administrative regions
    '香港疫情', '澳门疫情',
]

In [27]:
def get_data_between_datetime(topic, start, end, driver, first_url = False):
    """
    Obtain data related to the given topic from weibo. The issue datetime is specified by 'start' and 'end'.

    The search page for '#topic#' is loaded, every '[action-type="fl_unfold"]'
    element is clicked, and the resulting page source is parsed with a
    regular expression, one match per post card.

    Parameters
    -----------
    topic: str
        A string specifying the topic to be searched on Weibo.

    start: str
        A string specifying the start datetime of search range.
        Format: YYYY-MM-DD-HH

    end: str
        A string specifying the end datetime of search range.
        Format: YYYY-MM-DD-HH

    driver: selenium.webdriver.Chrome
        webdriver object from selenium module

    first_url: bool
        Set to True on the first call of 'driver.get(url)' for this driver,
        so that more time is given to load the page (5 s instead of 2.5 s).

    Returns
    ----------
    pd.DataFrame
        DataFrame storing the result, with columns Username, Post_datetime,
        Content, Repost_Count, Comment_Count and Like_Count. A DataFrame
        without any columns is returned when the page reports that nothing
        was found for the topic.
    """
    url = 'https://s.weibo.com/weibo?q=%23{}%23&timescope=custom:{}:{}&Refer=g'.format(topic, start, end)

    # One match per post card. findall() tuple layout:
    #   0 user name, 1 issue datetime text, 2 (possibly truncated) content,
    #   3 whole optional full-content <p>, 4 full content ('' when absent),
    #   5 repost button, 6 comment button, 7 like count.
    # Raw strings: '\W' / '\w' are not valid escapes in a normal string literal.
    pattern = re.compile(
        r'<div class="card-feed"><div class="avator">[\W\w]*?'
        r'<a href="[\W\w]*?"[\W\w]*?nick-name="([\W\w]*?)" suda-data="[\W\w]*?" class="name">[\W\w]*?</a>[\W\w]*?'
        r'<p class="from"><a href="[\W\w]*?" target="_blank" suda-data="[\W\w]*?">([\W\w]*?)</a>[\W\w]*?'
        r'<p node-type="feed_list_content"[\W\w]*?class="txt"[\W\w]*?>([\W\w]*?)</p>'
        r'( <p node-type="feed_list_content_full"[\W\w]*?class="txt"[\W\w]*?>([\W\w]*?)</p>){0,1}[\W\w]*?'
        r'<div class="card-act">[\W\w]*?'
        r'<i class="woo-font woo-font--retweet toolbar_icon"></i></span>([\W\w]*?)</a></li>[\W\w]*?'
        r'<i class="woo-font woo-font--comment toolbar_icon"></i></span>([\W\w]*?)</a></li>[\W\w]*?'
        r'</span><span class="woo-like-count">([\W\w]*?)</span>'
    )

    driver.get(url)
    # Fixed waits for the page to render; the first load gets twice as long.
    if first_url:
        time.sleep(2.5)
    time.sleep(2.5)

    # find_elements(by, value) exists in Selenium 3 and 4; the
    # find_elements_by_* helpers were removed in Selenium 4.3.
    # 'css selector' is the value of selenium's By.CSS_SELECTOR.
    # NOTE(review): these look like the "expand" links of truncated posts — confirm.
    for unfold_link in driver.find_elements('css selector', '[action-type="fl_unfold"]'):
        unfold_link.click()
        time.sleep(0.5)

    html = driver.page_source
    if '抱歉，未找到“#{}#”相关结果。'.format(topic) in html:
        return pd.DataFrame()

    cards = pattern.findall(html)
    return pd.DataFrame({
        'Username': [card[0].strip() for card in cards],
        'Post_datetime': [preprocess_PC_post_datetime(card[1]) for card in cards],
        # prefer the full content when the optional group matched
        'Content': [preprocess_post_content(card[4] or card[2]) for card in cards],
        'Repost_Count': [preprocess_post_buttons(card[5]) for card in cards],
        'Comment_Count': [preprocess_post_buttons(card[6]) for card in cards],
        'Like_Count': [preprocess_post_buttons(card[7]) for card in cards],
    })

In [28]:
def get_all_data_between_datetime(topic, start, end, freq, driver):
    """
    Split the period between start date and end date by 'freq'.
    Repeatedly call 'get_data_between_datetime' for each 'splited period' and join all returned DataFrame.

    Parameters
    -----------
    topic: str
        A string specifying the topic to be searched on Weibo.

    start: str
        A string specifying the start datetime of search range.
        Format: YYYY-MM-DD

    end: str
        A string specifying the end datetime of search range.
        Format: YYYY-MM-DD

    freq: str
        A string specifying parameter in pd.date_range

    driver: selenium.webdriver.Chrome
        webdriver object from selenium module

    Returns
    ----------
    pd.DataFrame
        DataFrame storing the result. An empty DataFrame is returned when
        the range yields fewer than two boundaries (e.g. start == end), i.e.
        when there is no sub-period to query.
    """
    # Consecutive boundaries delimit the sub-periods, formatted the way
    # get_data_between_datetime expects them (YYYY-MM-DD-HH).
    boundaries = [stamp.strftime('%Y-%m-%d-%H') for stamp in pd.date_range(start, end, freq=freq)]

    # NOTE(review): the per-period frames are published as a module-level
    # global, presumably so partial results can be inspected after an
    # interrupted run — confirm this is still wanted.
    global dfs
    dfs = []
    for idx, (period_start, period_end) in enumerate(zip(boundaries, boundaries[1:])):
        print('topic: {}    start: {}      end: {}'.format(topic, period_start, period_end))
        # only the very first page load is given the longer wait
        dfs.append(get_data_between_datetime(topic, period_start, period_end, driver, idx == 0))

    if not dfs:
        # pd.concat raises ValueError on an empty list
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)

In [41]:
# Scrape every topic hour by hour with a headless Chrome and write one
# Excel file per topic, named '<topic>_<start>_<end>.xlsx'.
# NOTE(review): `start` and `end` (YYYY-MM-DD strings) are not defined in any
# visible cell; they must be set before this cell runs, otherwise it fails
# with NameError on a fresh kernel.
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"')
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(options = options)
try:
    for topic in topics:
        df = get_all_data_between_datetime(topic, start, end, 'H', driver)
        df.to_excel('{}_{}_{}.xlsx'.format(topic, start, end), index = False)
finally:
    # quit() ends the whole browser session and the chromedriver process,
    # even when scraping fails; close() would only close the current window.
    driver.quit()