In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import re
import pandas as pd
import requests
import datetime

### Helper functions for preprocessing data

In [2]:
def to_datetime(s):
    '''
    Parse a post timestamp string into a datetime object.

    Parameters
    -----------
    s: str
        Timestamp text in the form '%Y年%m月%d日 %H:%M'. When the year part
        ('年') is absent, the current year is prepended before parsing.

    Returns
    ----------
    datetime.datetime
        The parsed timestamp
    '''
    fmt = '%Y年%m月%d日 %H:%M'
    if '年' in s:
        stamp = s
    else:
        # No year in the text: treat the post as written in the current year.
        stamp = '{}年{}'.format(datetime.datetime.now().year, s)
    return datetime.datetime.strptime(stamp, fmt)

In [3]:
### modify post content
def preprocess_content(s):
    '''
    Clean the scraped content of a post and extract its hashtags.

    Parameters
    -----------
    s: str
        Content of a post obtained from web scraping

    Returns
    ----------
    tuple (str, list of str)
        The cleaned post content, and the list of '#...#' hashtags found
        in the cleaned content
    '''
    # Drop zero-width spaces, then trim the surrounding whitespace.
    s = s.replace('\u200b', '').strip()
    # Replace every HTML tag with a single space. Raw strings keep the regex
    # escapes (\w, \W) from being parsed as (invalid) string escapes.
    s = re.sub(r'<[\w\W]*?>', ' ', s)
    # Hashtags are delimited by a pair of '#' characters.
    tags = re.findall(r'#[\w\W]*?#', s)
    return s, tags

In [4]:
# modify video link
def preprocess_video_link(link):
    '''
    Turn the video link scraped from the page source into a usable URL.

    Parameters
    -----------
    link: str
        Video link as found in the page source (HTML-escaped, typically
        without the URL scheme)

    Returns
    ----------
    str
        Preprocessed video link
    '''
    # Undo the HTML escaping of '&' ('&amp;' -> '&').
    link = link.replace('amp;', '')
    # Only add the scheme when it is missing; prefixing an already absolute
    # URL would produce an invalid 'https:https://...' link.
    if link.startswith(('http://', 'https://')):
        return link
    return 'https:' + link

In [5]:
# convert str to int
def to_int(s):
    '''
    Convert an integer string to an integer. If s is invalid, return 0.

    Parameters
    -----------
    s: str
        Integer in string form; surrounding whitespace is ignored

    Returns
    ----------
    int
        Corresponding integer, or 0 when s cannot be converted
        (e.g. empty string, non-numeric text, None)
    '''
    if isinstance(s, str):
        s = s.strip()
    try:
        return int(s)
    except (ValueError, TypeError):
        # Only conversion failures map to 0; anything else (such as
        # KeyboardInterrupt) is allowed to propagate.
        return 0

In [6]:
def download_video(url, filename):
    '''
    Download the video from url and save the video to 'filename'.

    The destination folder is created when it does not exist. If the server
    answers with an HTTP error status, a message is printed and no file is
    written.

    Parameters
    -----------
    url: str
        Video link

    filename: str
        Filepath for saving the video

    Returns
    ----------
    None
    '''
    import os  # local import: os is not part of the notebook's import cell

    print('Downloading: {}'.format(filename))
    url_headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}

    # Make sure the target folder exists before opening the file.
    folder = os.path.dirname(filename)
    if folder:
        os.makedirs(folder, exist_ok=True)

    # The context manager releases the streamed connection; the timeout keeps
    # a stalled server from blocking the whole run.
    with requests.get(url, headers=url_headers, stream=True, timeout=30) as r:
        if not r.ok:
            # Do not save an error page under a .mp4 name.
            print('Failed to download {}: HTTP {}'.format(filename, r.status_code))
            return
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
    print('\n')

In [7]:
# preprocess a whole tuple of data
def preprocess_all(tup):
    '''
    Run every helper on the matching field of one scraped post.

    Parameters
    -----------
    tup: tuple
        Collection of target information of a post. Index 1 is the post
        datetime string, index 2 the content, index 3 the video link and
        indices 4-6 the count strings.

    Returns
    ----------
    List
        Preprocessed information of a post, with the space-joined hashtags
        appended as the last element
    '''
    fields = list(tup)
    fields[1] = to_datetime(fields[1])
    cleaned, hashtags = preprocess_content(fields[2])
    fields[2] = cleaned
    fields[3] = preprocess_video_link(fields[3])
    # The three count fields are all converted the same way.
    for pos in (4, 5, 6):
        fields[pos] = to_int(fields[pos])
    fields.append(' '.join(hashtags))
    return fields

### Obtain the post content and video for specified topics

In [8]:
# Search keywords; one result page is scraped for each of them.
topic = ['北京2022冬奥会', '冬奥会2022冰壺']

url_template = 'https://s.weibo.com/video?q={}&xsort=hot&hasvideo=1&tw=video&Refer=weibo_video&page=1'

# Pattern for one result card. The seven groups are, in order: user name,
# post datetime, post content, video source link, and the texts next to the
# retweet, comment and like icons.
# Compiled once here (it does not change between topics) and written as a raw
# string so that \w, \W and \d are not parsed as string escapes.
target = re.compile(r'<div class="card-feed"><div class="avator">.*?<a href=".*?" target="_blank" nick-name="(.*?)" suda-data=".*?" class="name">.*?</a>[\w\W]*?(\d\d\d\d年\d\d月\d\d日 \d\d:\d\d|\d\d月\d\d日 \d\d:\d\d)[\W\w]*?</a>[\w\W]*?<p node-type="feed_list_content" nick-name=".*?" class="txt">([\w\W]*?)</p>[\w\W]*?<video data-v-b2582c8a="" class="wbpv-tech"[\w\W]*?src="([\w\W]*?)"></video>[\w\W]*?<li><a href="[\w\W]*?" action-type="login" suda-data="[\w\W]*?" class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter"><span class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter toolbar_iconWrap"><i class="woo-font woo-font--retweet toolbar_icon"></i></span>(.*?)</a></li>[\W\w]*?<li><a href="[\w\W]*?" action-data="[\w\W]*?" suda-data="[\w\W]*?" action-type="login" class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter"><span class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter toolbar_iconWrap"><i class="woo-font woo-font--comment toolbar_icon"></i></span>(.*?)</a></li>[\w\W]*?<li><a title="[\w\W]*?action-data="[\w\W]*?" action-type="login" href="[\w\W]*?" suda-data="[\w\W]*?" class="woo-box-flex woo-box-alignCenter woo-box-justifyCenter"><button class="woo-like-main toolbar_btn"><span class="woo-like-iconWrap"><svg class="woo-like-icon"><use xlink:href="[\w\W]*?"><svg xmlns="[\w\W]*?" viewBox="[\w\W]*?" id="[\w\W]*?" width="[\w\W]*?" height="[\w\W]*?"><path fill="[\w\W]*?" d="[\w\W]*?"></path></svg></use></svg></span><span class="woo-like-count">([\w\W]*?)</span></button></a></li>')

driver = webdriver.Chrome()
failed_topic = []      # topics whose result page could not be loaded
preprocess_res = []    # one preprocessed record (list) per downloaded video
hist = dict()          # records already seen, used to skip duplicates
video_idx = 1
video_file = r'./video/{}.mp4'
try:
    for i in topic:
        print('topic = {}'.format(i))
        url = url_template.format(i)
        driver.get(url)
        time.sleep(5)   # give the page time to render
        try:
            # 'tag name' is the locator string behind By.TAG_NAME; this call
            # form works on Selenium 3 and 4 alike.
            tag_html = driver.find_element('tag name', 'html')
            tag_html.send_keys(Keys.END)
        except Exception:
            print('Fail to load topic: {}'.format(i))
            failed_topic.append(i)
            continue    # skip parsing a page that failed to load
        html = driver.page_source
        # scrape the data
        res = re.findall(target, html)
        print('Preprocessing data')
        for post in res:
            item = preprocess_all(post)
            hist_item = tuple(item)
            if hist_item in hist:
                continue    # same post already collected
            hist[hist_item] = 1
            print('Video index = {}'.format(video_idx))
            filename = video_file.format(video_idx)
            item.append(filename)
            download_video(item[3], filename)
            preprocess_res.append(item)
            video_idx += 1
            time.sleep(1)   # pause between downloads
finally:
    # Always end the browser session, even when scraping stops on an error.
    driver.quit()

topic = 北京2022冬奥会
Preprocessing data
Video index = 1
Downloading: ./video/1.mp4


Video index = 2
Downloading: ./video/2.mp4


Video index = 3
Downloading: ./video/3.mp4


Video index = 4
Downloading: ./video/4.mp4


Video index = 5
Downloading: ./video/5.mp4


Video index = 6
Downloading: ./video/6.mp4


Video index = 7
Downloading: ./video/7.mp4


Video index = 8
Downloading: ./video/8.mp4


Video index = 9
Downloading: ./video/9.mp4


Video index = 10
Downloading: ./video/10.mp4


Video index = 11
Downloading: ./video/11.mp4


Video index = 12
Downloading: ./video/12.mp4


Video index = 13
Downloading: ./video/13.mp4


Video index = 14
Downloading: ./video/14.mp4


topic = 冬奥会2022冰壺
Preprocessing data
Video index = 15
Downloading: ./video/15.mp4


Video index = 16
Downloading: ./video/16.mp4


Video index = 17
Downloading: ./video/17.mp4


Video index = 18
Downloading: ./video/18.mp4


Video index = 19
Downloading: ./video/19.mp4


Video index = 20
Downloading: ./video/20.mp4


Vide

### Save the data as an Excel file

In [9]:
# Column names, in the same order as the fields of each record in preprocess_res.
columns = ['Username', 'Post_datetime', 'Content', 'Video_link(expired)','Repost_Count','Comment_Count','Like_Count','Keywords','Video_file']
# Building the frame straight from the list of records also works when the
# list is empty (it yields an empty frame with these columns).
df = pd.DataFrame(preprocess_res, columns=columns)
# No 'encoding' argument: it was deprecated in pandas 1.5 and removed in 2.0.
df.to_excel(r'weibo_data.xlsx', index=False)

In [10]:
# Preview the first rows of the saved table.
df.head()

Unnamed: 0,Username,Post_datetime,Content,Video_link(expired),Repost_Count,Comment_Count,Like_Count,Keywords,Video_file
0,奥林匹克运动会,2022-04-02 12:30:00,天旋？地转？你擅长哪一种 #奥运会# ｜ #北京2022年冬奥会# L 奥林匹克...,https://f.video.weibocdn.com/o0/KEzcwhe7lx07UU...,3,5,52,#奥运会# #北京2022年冬奥会#,./video/1.mp4
1,奥林匹克运动会,2022-04-02 08:00:00,"""我和你，心连心"" 一起回顾北京2022年冬奥会闭幕式上，童声合唱《我和你》 #奥运会...",https://f.video.weibocdn.com/o0/mzZakMyJlx07UU...,41,14,373,#奥运会# #北京2022年冬奥会#,./video/2.mp4
2,奥林匹克运动会,2022-04-02 15:01:00,QandA 😎☺️请欣赏这个视频  欧班永利 @北京昆仑鸿星冰球俱乐部 ...,https://f.video.weibocdn.com/o0/ncaq2pHylx07UX...,6,47,131,#健身打卡# #北京2022年冬奥会#,./video/3.mp4
3,环球时报,2022-04-01 21:17:00,2022北京冬奥会JOC Team Japan官方写真集预览 （羽生选手在0:16-0:18...,https://f.video.weibocdn.com/o0/zqQz3PhZlx07UW...,126,237,3812,#羽生结弦#,./video/4.mp4
4,北京卫视,2022-04-01 22:02:00,#曲春雨给父母买房并亲自设计家居# 2022北京冬奥会短道速滑混合团体接力冠军 @曲春雨_...,https://f.video.weibocdn.com/o0/hQxP8xjslx07UW...,13,8,72,#曲春雨给父母买房并亲自设计家居#,./video/5.mp4


In [6]:
# Record the pandas version used for this run.
# NOTE(review): this cell's execution count (In [6]) is out of sequence with
# the cells above, so it was presumably run in a different session - confirm.
import pandas
pandas.__version__

'1.3.5'