In [69]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import urllib
import codecs
import time
import random
import sqlite3
from datetime import datetime
import pandas as pd
import os
from df2gspread import gspread2df as g2d

In [None]:
### SETTINGS

# Path of the SQLite database file that results are written to and read from.
sqlite_file = 'results.sqlite'
# When True, an existing file at `sqlite_file` is deleted and a fresh, empty
# `results` table is created in its place (previous results are lost).
new_database = True
# When True, run() prints the index, keyword and result of every keyword tested.
verbose = True
# When True, run() uses `sample_test_keywords` rather than the keyword list
# taken from the downloaded spreadsheet.
use_sample_keywords = False

In [94]:
# Phrase whose presence in the search response makes has_censorship() report
# the keyword as censored.
censorship_phrase_utf8 = '根据相关法律法规和政策'
# The phrase is matched in its backslash-escaped (\uXXXX) ASCII form rather than
# as raw UTF-8; presumably the response embeds its text that way -- TODO confirm.
# NOTE(review): calling .decode() on a plain string literal only works on Python 2.
censorship_phrase_decoded = codecs.encode(censorship_phrase_utf8.decode('utf8'), 'unicode_escape')

def has_censorship(keyword):
    """Search Weibo for `keyword` and classify the response.

    Args:
        keyword: search term, either a unicode object or a UTF-8 encoded
            byte string.

    Returns:
        True  if the response body contains the censorship notice
              (`censorship_phrase_decoded`),
        None  if it contains the "no results" notice instead,
        False if it contains neither.

    Performs one HTTP request and then sleeps 1-3 seconds before returning.
    Network errors raised by urllib propagate to the caller.
    """
    no_results_phrase_utf8 = '抱歉，未找到'
    no_results_phrase_decoded = codecs.encode(no_results_phrase_utf8.decode('utf8'), 'unicode_escape')

    url = 'http://s.weibo.com/weibo/%s&Refer=index' % keyword
    # Only a unicode URL needs encoding. Calling .encode() on a byte string
    # makes Python 2 decode it as ASCII first, which raises UnicodeDecodeError
    # for any keyword that is already UTF-8 encoded (e.g. plain '...' literals
    # containing Chinese characters).
    if isinstance(url, unicode):
        url = url.encode('utf-8')

    response = urllib.urlopen(url)
    try:
        r = response.read()
    finally:
        # Release the socket even if reading the body fails.
        response.close()
    time.sleep(random.randint(1, 3))

    if censorship_phrase_decoded in r:
        return True
    elif no_results_phrase_decoded in r:
        return None
    else:
        return False

In [77]:
def create_table(sqlite_file):
    """Create the `results` table in the SQLite database at `sqlite_file`.

    The database file is created by sqlite3 if it does not exist yet. The
    table's primary key is (id, date).

    Args:
        sqlite_file: path of the SQLite database file.

    Raises:
        sqlite3.OperationalError: if the `results` table already exists.
    """
    conn = sqlite3.connect(sqlite_file)
    try:
        conn.execute('CREATE TABLE results (id int, date date, datetime datetime, keyword string, censored bool, no_results bool,PRIMARY KEY(id,date))')
        conn.commit()
    finally:
        # Close the connection even when the CREATE fails, so the file handle
        # is not leaked.
        conn.close()
    
# Decide whether the `results` table has to be (re)created.
if os.path.isfile(sqlite_file):
    if new_database:
        # A fresh database was requested: discard the old file and start over.
        os.remove(sqlite_file)
        create_table(sqlite_file)
else:
    # No database file yet, so there is no `results` table either.
    create_table(sqlite_file)

In [66]:
def insert_into_table(record_id,keyword,result):
    """Store one keyword test result in the `results` table of `sqlite_file`.

    The row is stamped with the current local date and datetime.

    Args:
        record_id: value for the `id` column.
        keyword: the keyword that was tested.
        result: outcome returned by has_censorship(): True is stored as
            censored, None as no_results, anything else as neither.

    Raises:
        sqlite3.IntegrityError: if a row with the same (id, date) primary key
            already exists, e.g. when the scan is repeated on the same day
            without recreating the database.
    """
    dt = datetime.now()
    d = dt.date()
    # Identity checks on purpose: only the exact True / None sentinels count.
    censored = result is True
    no_results = result is None

    query = """INSERT INTO results (id, date, datetime, keyword, censored, no_results) VALUES (?,?,?,?,?,?);"""

    conn = sqlite3.connect(sqlite_file)
    try:
        # Lets Python 2 sqlite3 accept UTF-8 encoded byte strings as text.
        conn.text_factory = str
        conn.execute(query, (record_id, d, dt, keyword, censored, no_results))
        conn.commit()
    finally:
        # Always release the connection, including when the INSERT fails.
        conn.close()

In [64]:
def sqlite_to_df(sqlite_file):
    """Load every row of the `results` table into a pandas DataFrame.

    Args:
        sqlite_file: path of the SQLite database file to read.

    Returns:
        A DataFrame with one row per record and one column per table column.
    """
    conn = sqlite3.connect(sqlite_file)
    try:
        return pd.read_sql_query("select * from results;", conn)
    finally:
        # The original never closed the connection; release it once the frame
        # has been materialised (or the query has failed).
        conn.close()

In [51]:
# Small built-in keyword list, used in place of the spreadsheet keywords when
# `use_sample_keywords` is enabled.
sample_test_keywords = ['hello', 'lxb', '习胞子']

In [72]:
# Download the "wechat_keywords" Google spreadsheet into a DataFrame.
# NOTE(review): col_names=True presumably takes the sheet's first row as the
# column headers -- confirm against the df2gspread documentation.
test_df = g2d.download("wechat_keywords",col_names=True)

In [73]:
# Preview the first rows of the downloaded keyword sheet.
test_df.head()

Unnamed: 0,Date Tested,Keyword,Translation,Theme,Category,# of Components,Language,Retested Date,Status,Note,Media Source,Reference,Article Publish Date,Block in One-to-one Chat?,Tested Date (One-on-One)
0,07/2016,零八宪章,Charter 08,Political,Charter 08,1,simplified Chinese,2017-05-09,Blocked,Reference to the manifesto initially signed by...,,,,Not blocked,2017-05-18
1,07/2016,08宪章,Charter 08,Political,Charter 08,1,simplified Chinese,2017-05-09,Blocked,Reference to the manifesto initially signed by...,,,,Not blocked,2017-05-18
2,2017-06-19,王岐山家族,Wang Qishan family,People,Govt Official,1,simplified Chinese,,,,,http://cn.rfi.fr/中国/20170616-郭文贵爆料王岐山家族身份及财产资料,2017-06-16T14:22:37,,
3,2017-06-20,中纪委批评14所大学意识形态工作不力,The Discipline Inspection Commission criticize...,Political,Censorship,1,simplified Chinese,,,,,http://www.ftchinese.com/story/001073078,2017-06-20T00:31:01,,
4,2017-06-21,19大人事,19 big personnel,Event,19th Party Congress,1,simplified Chinese,,,,,http://www.bbc.com/zhongwen/simp/chinese-news-...,2017-06-20T10:24:02,,


In [75]:
# Keywords to test: the `Keyword` column of the downloaded sheet.
test_keywords = test_df.Keyword

In [95]:
def run():
    """Test every keyword and store each result in the database.

    Iterates over `sample_test_keywords` when `use_sample_keywords` is set,
    otherwise over the module-level `test_keywords`. Each keyword is checked
    with has_censorship() and recorded with insert_into_table(), using its
    position in the list as the record id. When `verbose` is set, the index,
    keyword and result are printed for every keyword.
    """
    # Bind the choice to a new local name. Assigning to `test_keywords` inside
    # the function would make that name local, so reading it when
    # use_sample_keywords is False would raise UnboundLocalError.
    keywords = sample_test_keywords if use_sample_keywords else test_keywords
    for i, k in enumerate(keywords):
        result = has_censorship(k)
        if verbose:
            # Single pre-formatted argument: same output as `print i, k, result`.
            print('%s %s %s' % (i, k, result))
        insert_into_table(i, k, result)

In [None]:
# Run the scan: one HTTP request per keyword, each followed by a 1-3 second
# pause, with every result written to the database.
run()

0 零八宪章 True
1 08宪章 True
2 王岐山家族 None
3 中纪委批评14所大学意识形态工作不力 None
4 19大人事 False
5 北戴河会议 False
6 刘晓波病情+鲍彤被公安登门禁言 True
7 香港团体+中联办示威+习近平+停止酷刑 None
8 当局+释放+刘晓波+软禁 True
9 刘晓波+肝癌晚期+保外就医 True
10 国家主席+刘晓波 True
11 劉曉波+律師 True
12 薄熙来+患肝癌+保外就医 None
13 鲍彤谴+责当局+延误刘晓波 True
14 习访港+恐袭风险高+全面提升保安级别 None
15 刘晓波+出国就医+死也要死在西方 True
16 刘晓波+律师 True
17 刘晓波+习近平 True
18 刘晓波+虐待+狱中异议+堪忧 True
19 刘晓波+呼吁 True
20 京城乱象+聚餐必谈+郭文贵 False
21 习近平访港3日万余名警察维护安保 False
22 香港主权移交20年+习近平赴港+保安有多猛 False
23 劉曉波+習 False
24 国际间积极活动+刘晓波有望获准出国求医 False
25 艾未未谈刘晓波+理性的改革者+不尽情理 False
26 习近平访港+占领金紫荆雕塑抗议 False
27 习近平+访问香港将+胡萝卜和大棒 False
28 刘晓波+狱中生活+中共+洗刷罪责 False
29 愿接纳刘晓波+提供最佳医疗 False
30 习总大驾光临+香港街头+沦陷20年标语 False
31 拒绝刘晓波出国+德美大使争取探视 False
32 习近平到港+彩旗飘飘+抗议消音 False
33 社交媒体突现+刘晓波+狱中生活 False
34 港警逮捕黄之锋+抗议习近平七一访港 False
35 港人占领金紫荆+多人遭扣+无法抗习 False
36 乒乓球男隊+兵變+國家和粉絲+之間的風暴 False
37 泛民函習+重啟政改+香港不可分離 False
38 西隧口橫額迎習+鴨寮街現+淪陷 False
39 眾志社民連佔金紫荊+26人被捕+花瓣示威 False
40 泛民採溫和策略+爭取習近平+對話空間 False
41 地方換屆結束+習近平大豐收 False
42 中国不准刘晓波出国接受癌症治疗 False
43 习近平抵达+香港人抗议 False
44

In [78]:
# Read the stored results back from the database and preview the first rows.
df = sqlite_to_df(sqlite_file)
df.head()

Unnamed: 0,id,date,datetime,keyword,censored,no_results
