In [4]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import urllib
import codecs
import time
import random
import sqlite3
from datetime import datetime
import pandas as pd
import os
from df2gspread import gspread2df as g2d
import io
import requests
import ast
import weibo_credentials

pd.set_option('display.max_rows', 500)

In [78]:
### SETTINGS

sqlite_file = 'results.sqlite' # name of sqlite file to read from/write to
new_database = True # erases any existing sqlite file and generates an empty one to write to
verbose = "some" # 'none',some','all'
load_cookies = True # load cookies from disk (can load cookies without having to do fresh_log_in if cookies already exist)
fresh_log_in = False # perform a log in
write_cookies = False # save cookies and overwrite any existing cookies during log in
cookie_file = weibo_credentials.Creds().username + "_cookie.txt" # name of cookie file in case you want to specify

censorship_phrase_utf8 = '根据相关法律法规和政策'
censorship_phrase_decoded = codecs.encode(censorship_phrase_utf8.decode('utf8'), 'unicode_escape')

captcha_phrase_utf8 = '你的行为有些异常'
captcha_phrase_decoded = codecs.encode(captcha_phrase_utf8.decode('utf8'), 'unicode_escape')

no_results_phrase_utf8 = '抱歉，未找到'
no_results_phrase_decoded = codecs.encode(no_results_phrase_utf8.decode('utf8'), 'unicode_escape')

In [6]:
# code from https://www.zhihu.com/question/29666539 with minor modifications
import base64  
import re
try:
    from urllib.parse import urlparse
except ImportError:
     from urlparse import urlparse
import rsa  
import json  
import binascii  
from bs4 import BeautifulSoup
import weibo_credentials
  
class Userlogin:  
    def userlogin(self,username,password,write_cookie=True):  
        session = requests.Session()  
        url_prelogin = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod&client=ssologin.js(v1.4.5)&_=1364875106625'  
        url_login = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)'  
  
        #get servertime,nonce, pubkey,rsakv  
        resp = session.get(url_prelogin)  
        json_data  = re.findall(r'(?<=\().*(?=\))', resp.text)[0]
        data       = json.loads(json_data)  


        servertime = data['servertime']  
        nonce      = data['nonce']  
        pubkey     = data['pubkey']  
        rsakv      = data['rsakv']  
  
        # calculate su  
        #print(urlparse.quote(username))
        su  = base64.b64encode(username.encode(encoding="utf-8"))  
  
        #calculate sp  
        rsaPublickey= int(pubkey,16)  
        key = rsa.PublicKey(rsaPublickey,65537)  
        message = str(servertime) +'\t' + str(nonce) + '\n' + str(password)  
        sp = binascii.b2a_hex(rsa.encrypt(message.encode(encoding="utf-8"),key))  
        postdata = {  
                            'entry': 'weibo',  
                            'gateway': '1',  
                            'from': '',  
                            'savestate': '7',  
                            'userticket': '1',  
                            'ssosimplelogin': '1',  
                            'vsnf': '1',  
                            'vsnval': '',  
                            'su': su,  
                            'service': 'miniblog',  
                            'servertime': servertime,  
                            'nonce': nonce,  
                            'pwencode': 'rsa2',  
                            'sp': sp,  
                            'encoding': 'UTF-8',  
                           'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',  
                            'returntype': 'META',  
                            'rsakv' : rsakv,  
                            }  
        resp = session.post(url_login,data=postdata)  
        # print resp.headers 
        #print(resp.content)
        login_url = re.findall(r'http://weibo.*&retcode=0',resp.text)  
        #  
        #print(login_url)
        respo = session.get(login_url[0])  
        uid = re.findall('"uniqueid":"(\d+)",',respo.text)[0]  
        url = "http://weibo.com/u/"+uid  
        respo = session.get(url)
        if write_cookie:
            cookie_dict = session.cookies.get_dict()
            with open(username + "_cookie.txt", 'w') as f:
                f.write(cookie)
        return session
    
if fresh_log_in:
    session = Userlogin().userlogin(weibo_credentials.Creds().username,weibo_credentials.Creds().password)

if load_cookies:
    with open(cookie_file, 'r') as f:
        cookie = ast.literal_eval(f.read())
else:
    cookie = None

In [45]:
def verify_cookies_work(cookie=cookie):
    r = requests.get('http://level.account.weibo.com/level/mylevel?from=profile1',cookies=cookie).text
    if "W_face_radius" in r:
        return True
    else:
        return False

In [20]:
def has_censorship(keyword,cookies=None):
    if isinstance(keyword, str):
        url = 'http://s.weibo.com/weibo/%s&Refer=index' % keyword
    elif isinstance(keyword, unicode):
        url = ('http://s.weibo.com/weibo/%s&Refer=index' % keyword).encode('utf-8')    
    
    try:
        r = requests.get(url,cookies=cookie).text
        if captcha_phrase_decoded in r:
            print "CAPTCHA", keyword
    except IOError:
        wait_seconds = random.randint(90, 100)
        print "connection reset, waiting %s" % wait_seconds
        time.sleep(wait_seconds)
        return "reset"
    
    if censorship_phrase_decoded in r:
        return "censored"
    elif no_results_phrase_decoded in r:
        return "no_results"
    else:
        return "has_results"
    #soup = BeautifulSoup(r, 'html.parser')

In [79]:
def create_table(sqlite_file):
    conn = sqlite3.connect(sqlite_file)
    c = conn.cursor()
    c.execute('CREATE TABLE results (id int, date date, datetime datetime, keyword string, censored bool, no_results bool, reset bool, result string, source string, PRIMARY KEY(id,date,source))')
    conn.commit()
    conn.close()
    
if new_database and os.path.isfile(sqlite_file):
    os.remove(sqlite_file)
if not os.path.isfile(sqlite_file):
    create_table(sqlite_file)

In [10]:
def insert_into_table(record_id,keyword,result,source):
    conn = sqlite3.connect(sqlite_file)
    conn.text_factory = str
    c = conn.cursor()
    
    dt = datetime.now()
    d = dt.date()
    
    if result is "censored":
        censored = True
    else:
        censored = False
        
    if result is "no_results":
        no_results = True
    else:
        no_results = False
        
    if result is "reset":
        reset = True
    else:
        reset = False

    query = """INSERT INTO results (id, date, datetime, keyword, censored, no_results, reset, result, source) VALUES (?,?,?,?,?,?,?,?,?);"""
    c.execute(query,(record_id, d, dt, keyword, censored, no_results, reset, result, source))

    conn.commit()
    conn.close()
    
def sqlite_to_df(sqlite_file):
    conn = sqlite3.connect(sqlite_file)
    df = pd.read_sql_query("select * from results;", conn)
    return df

In [71]:
def get_keywords_from_source(location,keyword_col_name,source_name,lxb_categories=None):
    test_keywords = pd.DataFrame()
    if '.csv' in location:
        s=requests.get(location).content
        test_df=pd.read_csv(io.StringIO(s.decode('utf-8')))
    else:
        test_df = g2d.download(location,col_names=True)
    if lxb_categories:
        mask = test_df.category.isin(lxb_categories)
        test_df = test_df[mask]
    test_keywords['keyword'] = test_df[keyword_col_name]
    test_keywords['source'] = source_name
    return test_keywords

In [37]:
def run(test_keywords,verbose='some',insert=True,return_df=False,sleep=True):
    
    count=0
    if return_df:
        results_df = pd.DataFrame()
    
    for r in test_keywords.itertuples():
        if insert and r.Index < len(sqlite_to_df(sqlite_file)):
            continue
        result = has_censorship(r.keyword)
        if verbose=="all":
            print r.Index,r.keyword, result
        if verbose=="some" and (count%100==0 or count==0):
            print r.Index,r.keyword, result
        if insert:
            insert_into_table(r.Index,r.keyword,result,r.source)
        if return_df:
            results_df = pd.concat([results_df,
                                    pd.DataFrame([{"date":datetime.now().date(),
                                                  "datetime":datetime.now(),
                                                  "keyword":r.keyword,
                                                  "result":result,
                                                  "source":r.source
                                                 }])
                                   ])
        count+=1
        if sleep:
            time.sleep(random.randint(15, 20))
    if return_df:
        return results_df

In [46]:
verify_cookies_work()

True

In [38]:
sample_keywords = pd.DataFrame(
    [{'keyword':'hello','Index':0,'source':'test'},
     {'keyword':'lxb','Index':1,'source':'test'},
     {'keyword':u'习胞子','Index':2,'source':'unicode'},
     {'keyword':'自由亚洲电台','Index':3,'source':'should reset'},
     {'keyword':'刘晓波','Index':4,'source':'string'},
     {'keyword':'dhfjkdashfjkasdhfsadsf87sadfhjfasdnf'}])

run(sample_keywords,verbose='none',insert=False,return_df=True)

connection reset, waiting 90


Unnamed: 0,date,datetime,keyword,result,source
0,2017-07-14,2017-07-14 06:12:48.502143,hello,has_results,test
0,2017-07-14,2017-07-14 06:13:05.141758,lxb,censored,test
0,2017-07-14,2017-07-14 06:13:23.329775,习胞子,no_results,unicode
0,2017-07-14,2017-07-14 06:15:09.901741,自由亚洲电台,reset,should reset
0,2017-07-14,2017-07-14 06:15:31.526905,刘晓波,censored,string
0,2017-07-14,2017-07-14 06:15:52.814367,dhfjkdashfjkasdhfsadsf87sadfhjfasdnf,no_results,


In [74]:
lxb_categories = ["Cultural Revolution","Dissident","charter 08","human rights","Charter 08","Dissident / Activist","Detention","Human Rights","Tiananmen Square (June 4 1989)"]
df1 = get_keywords_from_source("wechat_keywords","Keyword","wechat spreadsheet")
df2 = get_keywords_from_source("https://raw.githubusercontent.com/citizenlab/chat-censorship/master/livestream/livestream_keywords_05_15_09_16.csv",'word','livestream_05_15_09_16',lxb_categories=lxb_categories)
df3 = get_keywords_from_source("https://raw.githubusercontent.com/citizenlab/chat-censorship/master/TOM-Skype--Sina-UC/blocked-words.csv","word","tom-skype-sina",lxb_categories=lxb_categories)
test_keywords = pd.concat([df1,df2,df3]).reset_index(drop=True)

In [77]:
print len(test_keywords)
test_keywords.head()

1612


Unnamed: 0,keyword,source
0,退出中共,wechat spreadsheet
1,中革中央,wechat spreadsheet
2,共产党下台,wechat spreadsheet
3,突破中共,wechat spreadsheet
4,废除中共,wechat spreadsheet


In [None]:
run(test_keywords)

0 退出中共 censored


In [25]:
sample_df = sqlite_to_df(sqlite_file)
sample_df

Unnamed: 0,id,date,datetime,keyword,censored,no_results,reset,result,source
0,0,2017-07-14,2017-07-14 05:51:35.469646,hello,0,0,0,has_results,test
1,1,2017-07-14,2017-07-14 05:51:36.469346,lxb,1,0,0,censored,test
2,2,2017-07-14,2017-07-14 05:51:37.290245,习胞子,0,1,0,no_results,unicode
3,3,2017-07-14,2017-07-14 05:53:17.141411,自由亚洲电台,0,0,1,reset,should reset
4,4,2017-07-14,2017-07-14 05:53:19.224088,刘晓波,1,0,0,censored,string
5,5,2017-07-14,2017-07-14 05:53:20.061392,dhfjkdashfjkasdhfsadsf87sadfhjfasdnf,0,1,0,no_results,
