In [135]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import urllib
import codecs
import time
import random
import sqlite3
from datetime import datetime
import pandas as pd
import os
from df2gspread import gspread2df as g2d

In [136]:
### SETTINGS

# Path of the SQLite database file that stores one row per keyword check.
sqlite_file = 'results.sqlite'
# When True, an existing database file is deleted and recreated before the run.
new_database = True
# Progress logging in run(): "true" prints every keyword, "semi" prints every
# 100th processed keyword; any other value prints nothing.
verbose = "semi"
# When True, the short built-in sample list is checked instead of the
# downloaded spreadsheet keywords.
use_sample_keywords = False

In [137]:
# Marker phrase ("according to relevant laws, regulations and policies") whose
# presence in the response makes has_censorship() report "censored".
censorship_phrase_utf8 = '根据相关法律法规和政策'
# Re-express the phrase as ASCII backslash-u escapes so it can be matched
# against the raw response body.
# NOTE(review): presumably the page embeds its text in escaped form -- confirm.
censorship_phrase_decoded = codecs.encode(censorship_phrase_utf8.decode('utf8'), 'unicode_escape')

def has_censorship(keyword):
    """Search Weibo for ``keyword`` and classify the response page.

    Args:
        keyword: search term, either a unicode object or a UTF-8 byte string.

    Returns:
        One of the strings:
        "reset"       -- the request failed with an IOError (after backing off),
        "censored"    -- the censorship notice phrase was found in the page,
        "no_results"  -- the "sorry, nothing found" phrase was found,
        "has_results" -- neither phrase was found.
    """
    no_results_phrase_utf8 = '抱歉，未找到'
    no_results_phrase_decoded = codecs.encode(no_results_phrase_utf8.decode('utf8'), 'unicode_escape')

    # Normalise to UTF-8 bytes first: calling .encode('utf-8') on a byte string
    # that already holds non-ASCII data raises UnicodeDecodeError in Python 2.
    if isinstance(keyword, unicode):
        keyword = keyword.encode('utf-8')
    # Percent-encode the term so spaces, '&', '#', '/' and non-ASCII bytes
    # cannot corrupt the request URL.
    url = 'http://s.weibo.com/weibo/%s&Refer=index' % urllib.quote(str(keyword), safe='')

    try:
        response = urllib.urlopen(url)
        try:
            r = response.read()
        finally:
            # Always release the socket, even if reading fails part-way.
            response.close()
    except IOError:
        # Back off for a while before the caller moves on to the next keyword.
        wait_seconds = random.randint(90, 100)
        print("connection reset, waiting %s" % wait_seconds)
        time.sleep(wait_seconds)
        return "reset"

    # Short random pause between successful requests.
    time.sleep(random.randint(1, 3))

    if censorship_phrase_decoded in r:
        return "censored"
    elif no_results_phrase_decoded in r:
        return "no_results"
    else:
        return "has_results"

In [138]:
def create_table(sqlite_file):
    """Create the ``results`` table in the SQLite database at ``sqlite_file``.

    Each row is keyed by (id, date, source).

    Args:
        sqlite_file: path of the SQLite database file; it is created by
            sqlite3 if it does not exist yet.

    Raises:
        sqlite3.OperationalError: if the ``results`` table already exists.
    """
    conn = sqlite3.connect(sqlite_file)
    try:
        conn.execute('CREATE TABLE results (id int, date date, datetime datetime, keyword string, censored bool, no_results bool, reset bool, result string, source string, PRIMARY KEY(id,date,source))')
        conn.commit()
    finally:
        # Close even when the statement fails so the connection is not leaked.
        conn.close()
    
# Start from a clean slate when requested: drop any previous database file.
if new_database and os.path.isfile(sqlite_file):
    os.remove(sqlite_file)
# Create the schema whenever no database file is present (fresh or just removed).
if not os.path.isfile(sqlite_file):
    create_table(sqlite_file)

In [139]:
def insert_into_table(record_id,keyword,result,source):
    """Store the outcome of one keyword check in the ``results`` table.

    The database is the one named by the module-level ``sqlite_file``. The row
    is stamped with the current local date and datetime.

    Args:
        record_id: value for the ``id`` column.
        keyword: the search term that was checked.
        result: outcome string; "censored", "no_results" and "reset" set the
            matching boolean column, any other value leaves all three False.
        source: label describing where the keyword came from.
    """
    dt = datetime.now()
    d = dt.date()

    # Compare by value. ``is`` tests object identity and only appeared to work
    # because CPython happens to intern short string literals.
    censored = result == "censored"
    no_results = result == "no_results"
    reset = result == "reset"

    query = """INSERT INTO results (id, date, datetime, keyword, censored, no_results, reset, result, source) VALUES (?,?,?,?,?,?,?,?,?);"""

    conn = sqlite3.connect(sqlite_file)
    try:
        conn.text_factory = str
        conn.execute(query,(record_id, d, dt, keyword, censored, no_results, reset, result, source))
        conn.commit()
    finally:
        # Release the connection even if the insert fails (e.g. duplicate key).
        conn.close()

In [140]:
def sqlite_to_df(sqlite_file):
    """Load the whole ``results`` table into a pandas DataFrame.

    Args:
        sqlite_file: path of the SQLite database file to read.

    Returns:
        A DataFrame with one row per stored record and one column per table
        column.
    """
    conn = sqlite3.connect(sqlite_file)
    try:
        return pd.read_sql_query("select * from results;", conn)
    finally:
        # The original never closed the connection; release it on every path.
        conn.close()

In [141]:
# Short keyword list that replaces the spreadsheet keywords when
# use_sample_keywords is True.
sample_test_keywords = [
    'hello',
    'lxb',
    '习胞子',
]

In [142]:
# Download the "wechat_keywords" spreadsheet as a DataFrame.
# NOTE(review): col_names=True presumably takes the header from the first
# row -- confirm against the df2gspread documentation.
test_df = g2d.download("wechat_keywords",col_names=True)
# The Keyword column holds the terms that will be checked.
test_keywords = test_df.Keyword
print test_keywords.head()
print len(test_keywords)
# Label written to the "source" column of every row stored for this run.
source = 'wechat spreadsheet'

0     退出中共
1     中革中央
2    共产党下台
3     突破中共
4     废除中共
Name: Keyword, dtype: object
1055


In [145]:
def run(test_keywords,source):
    """Check each keyword that is not yet stored and record its outcome.

    The number of rows already in the results table is read once up front, and
    that many leading keywords are skipped, so an interrupted run picks up at
    the next position.

    Progress output depends on the module-level ``verbose`` setting: "true"
    prints every checked keyword, "semi" prints every 100th keyword checked in
    this call (starting with the first).

    Args:
        test_keywords: iterable of search terms.
        source: label stored alongside every row written.
    """
    already_stored = len(sqlite_to_df(sqlite_file))
    processed = 0
    for position, keyword in enumerate(test_keywords):
        if position >= already_stored:
            outcome = has_censorship(keyword)
            log_all = verbose == "true"
            log_sampled = verbose == "semi" and processed % 100 == 0
            if log_all or log_sampled:
                print("%s %s %s" % (position, keyword, outcome))
            insert_into_table(position, keyword, outcome, source)
            processed += 1

In [None]:
# Swap in the small built-in list when the sample switch is on.
if use_sample_keywords:
    test_keywords = sample_test_keywords
# Check every pending keyword and store the outcomes under the current source.
run(test_keywords,source)

0 退出中共 has_results


In [119]:
# Reload everything stored so far and display it as the cell result.
# NOTE(review): the saved output below stops at the no_results column, so it
# appears to predate the current table schema -- re-run to refresh.
df = sqlite_to_df(sqlite_file)
df

Unnamed: 0,id,date,datetime,keyword,censored,no_results
0,0,2017-07-13,2017-07-13 19:44:14.142870,退出中共,0,0
1,1,2017-07-13,2017-07-13 19:44:16.650751,中革中央,0,0
2,2,2017-07-13,2017-07-13 19:44:18.163456,共产党下台,0,0
3,3,2017-07-13,2017-07-13 19:44:19.691267,突破中共,0,0
4,4,2017-07-13,2017-07-13 19:44:21.212216,废除中共,0,0
5,5,2017-07-13,2017-07-13 19:44:23.724056,打倒中共,0,0
6,6,2017-07-13,2017-07-13 19:44:26.233822,打倒共产党,0,0
7,7,2017-07-13,2017-07-13 19:44:28.744529,推翻中共,0,0
8,8,2017-07-13,2017-07-13 19:44:30.268134,灭中共,0,0
9,9,2017-07-13,2017-07-13 19:44:33.792338,胡温暴政,0,0
