# Web认证跨越

## 跨越HTTP Basic认证

目前使用HTTP Basic认证的系统较为少见了，一些单位的内部系统可能还存在这类认证。

我们以httpbin.org中提供的这一功能为基础，进行跨越此类认证的尝试。

In [None]:
import requests

def getHtmlwithHttpBasicAuth(url,username,password):
    """一个跨越HTTP Basic认证的示例
    注意：使用前，需要到http://httpbin.org设置auth为basic的页面，并设置用户名为：'wang123'；密码为'12345678'
    
    """
    try:
        headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
                   'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
                  }

        s = requests.Session()
        r = s.get(url,headers = headers,auth=requests.auth.HTTPBasicAuth(username, password))
        r.raise_for_status()
        print(r.headers)
        return r.text
    except Exception as e:
        print(e)
    

url = 'https://www.httpbin.org/basic-auth/wang123/12345678'
getHtmlwithHttpBasicAuth(url,'wang123','12345678')

## 跨越Http Digest 认证

我们仍以httpbin.org中提供的这一功能为基础，进行跨越此类认证的尝试。

In [None]:
import requests

def getHtmlwithHttpDigestAuth(url,username,password):
    """一个跨越HTTP Basic认证的示例
    注意：使用前，需要到http://httpbin.org设置auth为basic的页面，并设置用户名为：'wang123'；密码为'12345678'
    
    """
    try:
        headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
                   'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
                  }

        s = requests.Session()
        r = s.get(url,headers = headers,auth=requests.auth.HTTPDigestAuth(username, password))
        r.raise_for_status()
        print(r.headers)
        return r.text
    except Exception as e:
        print(e)
    

url = 'https://www.httpbin.org/digest-auth/auth/wang456/1234567890'
getHtmlwithHttpDigestAuth(url,'wang456','1234567890')

## 尝试登录

In [None]:
# JSEngrypt 前端加密部分内容
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
  <title></title>
  <script src="http://code.jquery.com/jquery-1.8.3.min.js"></script>
  <script src="http://passport.cnblogs.com/scripts/jsencrypt.min.js"></script>
  <script type="text/javascript">
    // 使用jsencrypt类库加密js方法，
    function encryptRequest(reqUrl, data, publicKey) {
      var encrypt = new JSEncrypt();
      encrypt.setPublicKey(publicKey);
      // ajax请求发送的数据对象
      var sendData = new Object();
      // 将data数组赋给ajax对象
      for (var key in data) {
        sendData[key] = encrypt.encrypt(data[key]);
      }
      console.info(sendData);
      $.ajax({
        url: reqUrl,
        type: 'post',
        data: sendData,
        dataType: 'json',
        //contentType: 'application/json; charset=utf-8',
        success: function (data) {
          console.info(data);
        },
        error: function (xhr) {
          //console.error('出错了');
        }
      });
    }
    // Call this code when the page is done loading.
    $(function () {
      $('#testme').click(function () {
        var data = [];
        data['username'] = $('#username').val();
        data['passwd'] = $('#passwd').val();
        var pkey = $('#pubkey').val();
        encryptRequest('/', data, pkey);
      });
    });
  </script>
</head>
<body>
<form id="form1" runat="server">
  <div>
    <label for="pubkey">Public Key</label><br/>
    <textarea id="pubkey" rows="15" cols="65">
        {{ public_key }}
      </textarea><br/>
    <label for="input">Text to encrypt:</label><br/>
    name:<input id="username" name="username" type="text" value="user"></input><br/>
    password:<input id="passwd" name="passwd" type="password" value="123"></input><br/>
    <input id="testme" type="button" value="submit"/><br/>
  </div>
</form>
</body>
</html>

In [10]:
 # 加载公钥
from Cryptodome.PublicKey import RSA
from Cryptodome.Cipher import PKCS1_OAEP, PKCS1_v1_5

publicKey="-----BEGIN PUBLIC KEY-----MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDHVvBLvQ4tMRlni0/GmgmdqObTKFjrYxH008pPzy5hBn9hDY0D4J+ETlAA86W2lGSmEysp9bK33qTN3jJzyzyq7Q7RaShUuFqpXOJkpkwAgSATGg5AjAdXysKqkZZpeV6eHHfKE68oowb43ZjPfY98s7yHSRDvBvX3S/6QdohBHQIDAQAB-----END PUBLIC KEY-----";
  
    

recipient_key = RSA.import_key(extern_key = publicKey)
#cipher_rsa = PKCS1_v1_5.new(recipient_key)
#en_data = cipher_rsa.encrypt("tdzjpyzc".encode('utf-8'))
#print(len(en_data), en_data)
    
help(RSA.import_key)

Help on function import_key in module Cryptodome.PublicKey.RSA:

import_key(extern_key, passphrase=None)
    Import an RSA key (public or private).
    
    Args:
      extern_key (string or byte string):
        The RSA key to import.
    
        The following formats are supported for an RSA **public key**:
    
        - X.509 certificate (binary or PEM format)
        - X.509 ``subjectPublicKeyInfo`` DER SEQUENCE (binary or PEM
          encoding)
        - `PKCS#1`_ ``RSAPublicKey`` DER SEQUENCE (binary or PEM encoding)
        - An OpenSSH line (e.g. the content of ``~/.ssh/id_ecdsa``, ASCII)
    
        The following formats are supported for an RSA **private key**:
    
        - PKCS#1 ``RSAPrivateKey`` DER SEQUENCE (binary or PEM encoding)
        - `PKCS#8`_ ``PrivateKeyInfo`` or ``EncryptedPrivateKeyInfo``
          DER SEQUENCE (binary or PEM encoding)
        - OpenSSH (text format, introduced in `OpenSSH 6.5`_)
    
        For details about the PEM encoding, see `RFC1

In [6]:
import rsa

def xtzxLogin(url,username,password):
    """一个登录学堂在线的示例
   
    """
    try:
        headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
                   'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
                  }
        
        payload = dict(type= "E",
                       name= "hhhparty@163.com",
                       psw= "dzmELTlZV16VLpcHkPuKt59HYpfIpVRbteV+MmwTDFhZPvjmdukAi7v/GRga3N/q6ScvMUwcck03Ep/eNHMgU5cWgULeeAaP0n0QNw9BJtCKWCyw//sCI1xViD/zlirllq8RxdvJXkLyxF1a/zex48rmR/+uDtFV+O6wBY5Yx6g=",
                      )

        s = requests.Session()
        r = s.get(url,headers = headers,auth=requests.auth.HTTPDigestAuth(username, password))
        r.raise_for_status()
        print(r.headers)
        return r.text
    except Exception as e:
        print(e)
def getPubkFun(password):

    """
    var publicKey="-----BEGIN PUBLIC KEY-----MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDHVvBLvQ4tMRlni0/GmgmdqObTKFjrYxH008pPzy5hBn9hDY0D4J+ETlAA86W2lGSmEysp9bK33qTN3jJzyzyq7Q7RaShUuFqpXOJkpkwAgSATGg5AjAdXysKqkZZpeV6eHHfKE68oowb43ZjPfY98s7yHSRDvBvX3S/6QdohBHQIDAQAB-----END PUBLIC KEY-----";
    var encryptor = new JsEncrypt();
    encryptor.setPublicKey(publicKey);
    var rsaPassword= encryptor.encrypt(password);
    resolve(rsaPassword);
    
    
    if(this.emailCheckRight===true&&this.passwordCheckRight===true){
        let pwd=await this.getPubkFun(this.mailLoginForm.password);
        this.judgePostLogin("E",this.mailLoginForm.mail,pwd);
    """        
    """获取将密码加密后用于登录的字符串"""
    publicKey="-----BEGIN PUBLIC KEY-----MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDHVvBLvQ4tMRlni0/GmgmdqObTKFjrYxH008pPzy5hBn9hDY0D4J+ETlAA86W2lGSmEysp9bK33qTN3jJzyzyq7Q7RaShUuFqpXOJkpkwAgSATGg5AjAdXysKqkZZpeV6eHHfKE68oowb43ZjPfY98s7yHSRDvBvX3S/6QdohBHQIDAQAB-----END PUBLIC KEY-----";
    #encode_password = (str(server_time) + "\t" + str(nonce) + "\n" + str(self.password)).encode("utf-8")
    #public_key = rsa.PublicKey(int(pubkey, 16), int('10001', 16))
    password.encode("utf-8")
    #public_key = rsa.PublicKey(publicKey)
    encry_password = rsa.encrypt(password.encode('utf-8'), publicKey)
    print(encry_password)
    password1 = binascii.b2a_hex(encry_password)
    print(password1)
    return password.decode()

# "xPY7mn/NogFokoiyYLY5IGPpXPT1XCbbM1Nvfx19oZ+/nyAeWcdpARh5Nf7Q7Ci5C7MN8sdhKJ2kAV1tOecgWwiWVcoeHsq5qw5lau/Ncpf4aRZHUQgDhPOr/fY8stKhF3jKYHHkM3NHAXFq5H07pTZQ0akrJGe+MEPJbvMxAAM="
# "cjroGZnnl5PlVqQbY+H4Q8dVriJI1UpbjfHJPXPCwOc+B/uK540w4wvQQuz5gzVCVsxmBhirDfj0m7mu7fnYcpkbG8BQomjIGPtIBN/10POsZS23gpMU+sUILpjLnPDR3r7D/xiI9nG7HcTeiDk1iE3dgdLHjOWo5kD04RSZPX8="
url = 'https://next.xuetangx.com/api/v1/u/login/e_p/'
#username = input('邮箱：')
#userpwd = input('密码：')
#xtzxLogin(url,username,userpwd)
getPubkFun('tdzjpyzc')

AttributeError: 'str' object has no attribute 'n'

In [4]:
import rsa


# public_key = rsa.PublicKey(int(publicKey, 16), int('10001', 16))

publick_key = rsa.PublicKey(,65537)

Help on class PublicKey in module rsa.key:

class PublicKey(AbstractKey)
 |  PublicKey(n, e)
 |  
 |  Represents a public RSA key.
 |  
 |  This key is also known as the 'encryption key'. It contains the 'n' and 'e'
 |  values.
 |  
 |  Supports attributes as well as dictionary-like access. Attribute access is
 |  faster, though.
 |  
 |  >>> PublicKey(5, 3)
 |  PublicKey(5, 3)
 |  
 |  >>> key = PublicKey(5, 3)
 |  >>> key.n
 |  5
 |  >>> key['n']
 |  5
 |  >>> key.e
 |  3
 |  >>> key['e']
 |  3
 |  
 |  Method resolution order:
 |      PublicKey
 |      AbstractKey
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __eq__(self, other)
 |      Return self==value.
 |  
 |  __getitem__(self, key)
 |  
 |  __getstate__(self)
 |      Returns the key as tuple for pickling.
 |  
 |  __hash__(self)
 |      Return hash(self).
 |  
 |  __ne__(self, other)
 |      Return self!=value.
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  __setstate__(self, state)
 |      Sets 

In [8]:
#需要安装 pip install pycryptodomex

from Cryptodome.PublicKey import RSA
from Cryptodome.Cipher import PKCS1_OAEP, PKCS1_v1_5
import base64
from urllib import parse

help(RSA)

Help on module Cryptodome.PublicKey.RSA in Cryptodome.PublicKey:

NAME
    Cryptodome.PublicKey.RSA

DESCRIPTION
    #
    # Copyright (c) 2016, Legrandin <helderijs@gmail.com>
    # All rights reserved.
    #
    # Redistribution and use in source and binary forms, with or without
    # modification, are permitted provided that the following conditions
    # are met:
    #
    # 1. Redistributions of source code must retain the above copyright
    #    notice, this list of conditions and the following disclaimer.
    # 2. Redistributions in binary form must reproduce the above copyright
    #    notice, this list of conditions and the following disclaimer in
    #    the documentation and/or other materials provided with the
    #    distribution.
    #
    # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    # FOR A PARTI

In [None]:
getPubkFun: function(e) {
    var t, n = this;
    return new j.a((t = N()(P.a.mark(function t(i, s) {
                        var o, a, r;
                        return P.a.wrap(function(t) {
                            for (; ; )
                                switch (t.prev = t.next) {
                                case 0:
                                    o = "-----BEGIN PUBLIC KEY-----MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDHVvBLvQ4tMRlni0/GmgmdqObTKFjrYxH008pPzy5hBn9hDY0D4J+ETlAA86W2lGSmEysp9bK33qTN3jJzyzyq7Q7RaShUuFqpXOJkpkwAgSATGg5AjAdXysKqkZZpeV6eHHfKE68oowb43ZjPfY98s7yHSRDvBvX3S/6QdohBHQIDAQAB-----END PUBLIC KEY-----",
                                    (a = new H.a).setPublicKey(o),
                                    r = a.encrypt(e),
                                    i(r);
                                case 5:
                                case "end":
                                    return t.stop()
                                }
                        }, t, n)
                    })),
                    function(e, n) {
                        return t.apply(this, arguments)
                    }
                    ))
                },

## 跨越OAuth认证

1.手动从OAuth提供商处获取凭据。至少你需要一个client_id但很可能也需要一个client_secret。在此过程中，您可能还需要注册应用程序使用的默认重定向URI。将这些内容保存在Python脚本中：

In [None]:
import requests
client_id = input("输入您的用户名：")
client_secret = input("输入您的密码：")
redirect_uri = 'http://www.xuetangx.com/complete/weibo/'
response_type='code'
client_id='2021069109'

In [11]:

import os
import rsa
import time
import base64
import requests
import binascii
from urllib.parse import quote


class LoginSinaWeibo:
    """
    新浪微博登陆
    - 用户名和密码均加密后提交，其中密码采用rsa加密
    """
    # 创建session会话
    session = requests.session()

    def __init__(self, username, password):
        self.username = username
        self.password = password
        # ssologin.js版本
        self.ssologin_version = 'v1.4.18'
        # 微博登陆首页地址
        self.login_home_url = 'https://weibo.com/login.php'
        # 预登陆参数获取接口
        self.pre_login_params_url = 'https://login.sina.com.cn/sso/prelogin.php'
        # 提交正式登陆数据的链接
        self.real_login_url = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js({js_version})&_={ts}'.format(
            js_version=self.ssologin_version, ts=int(time.time()))
        # 验证码图片地址
        self.captcha_url = "http://login.sina.com.cn/cgi/pin.php?r={ts}&s=0&p={pcid}"
        self.session.headers.update(
            {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0"})
        self._init_session()
        # 设置是否登录成功标志
        self.login_success = False
        # 个人信息
        self.user_info = None

    def login(self):
        """开始登陆"""
        login_status_info = {'code': 1, 'state': 'failed', 'message': ''}
        login_form_data = self._pre_login()
        login_resp = self.session.post(url=self.real_login_url, data=login_form_data)
        login_resp_json = login_resp.json()
        # 登录成功
        if login_resp_json.get('retcode') == '0':
            self.login_success = True
            self.user_info = {'username': login_resp_json.get('uid'), 'nickname': login_resp_json.get('nick')}
            login_status_info['code'] = 0
            login_status_info['state'] = 'success'
            login_status_info['message'] = '登录成功，获取到的用户名: %s' % login_resp_json.get('nick')
        # 验证码不正确
        elif login_resp_json.get('retcode') == '2070':
            login_status_info['message'] = '登录失败，%s' % login_resp_json.get('reason')
        elif login_resp_json.get('retcode') == '101':
            login_status_info['message'] = '登录失败，%s' % login_resp_json.get('reason')
        else:
            login_status_info['message'] = '登录失败，登录返回结果 %s' % login_resp.text
        return login_status_info

    def get_user_info(self):
        """获取用户信息
        :return: 登录成功`rtype:dict`, 登录失败`rtype:None`
        """
        return self.user_info if self.login_success else None

    def get_login_cookies(self) -> dict:
        """获取用户登录后的cookies"""
        return requests.utils.dict_from_cookiejar(self.session.cookies)

    def _init_session(self):
        """初始化请求会话"""
        try:
            self.session.get(url=self.login_home_url)
        except requests.exceptions.RequestException:
            pass

    def _pre_login(self):
        """预登陆操作，获取相关参数"""
        # 获取提交的用户名
        s_username = self._get_su()
        # 获取提交登陆时需要的参数
        json_data = self._get_login_form_data(su=s_username)
        # # 获取提交的密码字符串
        # s_password = self._get_s_password(json_data=json_data)
        s_password = self._get_s_password(server_time=json_data.get('servertime'),
                                          nonce=json_data.get('nonce'),
                                          pubkey=json_data.get('pubkey'))
        # 设置提交登陆操作时
        login_form_data = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'savestate': '7',
            'userticket': '1',
            'vsnf': '1',
            'service': 'miniblog',
            'encoding': 'UTF-8',
            'pwencode': 'rsa2',
            'sr': '1280*800',
            'prelt': '529',
            'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'rsakv': json_data.get('rsakv'),
            'servertime': json_data.get('servertime'),
            'nonce': json_data.get('nonce'),
            'su': s_username,
            'sp': s_password,
            'returntype': 'TEXT'
        }
        # print(json.dumps(login_form_data, ensure_ascii=False))
        # 需要验证码时
        if json_data.get('pcid'):
            current_captcha_url = self.captcha_url.format(ts=int(time.time()), pcid=json_data.get('pcid'))
            captcha_resp = self.session.get(url=current_captcha_url)
            temp_captcha_file = 'weibo_captcha.jpg'
            open(temp_captcha_file, 'wb').write(captcha_resp.content)
            # TODO 2019-06-24 开放验证码识别接口 http://captcha.faceme.site/sina
            # 使用方式见README-IDEAs.md文档
            captcha_text = input('验证码保存路径为 %s\n验证码的值为> ' % os.path.abspath(temp_captcha_file))
            login_form_data['pcid'] = json_data.get('pcid')
            login_form_data['door'] = captcha_text
        return login_form_data

    def _get_su(self):
        """获取真实的执行登陆操作时提交的用户名"""
        # 用户名先进行url编码
        username_quote = quote(self.username)
        # 再经过base64进行编码
        username_base64 = base64.b64encode(username_quote.encode('utf-8'))
        return username_base64.decode('utf-8')

    def _get_s_password(self, server_time, nonce, pubkey):
        """获取将密码加密后用于登录的字符串"""
        encode_password = (str(server_time) + "\t" + str(nonce) + "\n" + str(self.password)).encode("utf-8")
        public_key = rsa.PublicKey(int(pubkey, 16), int('10001', 16))
        encry_password = rsa.encrypt(encode_password, public_key)
        password = binascii.b2a_hex(encry_password)
        return password.decode()

    def _get_login_form_data(self, su):
        """获取登陆form-data提交的参数`servertime`,`nonce`,`pubkey`,`rsakv`,`showpin`,etc"""
        pre_login_params = {
            'entry': 'weibo',
            'rsakt': 'mod',
            'checkpin': '1',
            'client': 'ssologin.js({js_version})'.format(js_version=self.ssologin_version),
            'su': su,
            '_': int(time.time() * 1000)}
        try:
            resp = self.session.get(url=self.pre_login_params_url, params=pre_login_params)
            if resp.status_code == 200 and resp.json().get('retcode') == 0:
                json_data = resp.json()
                return json_data
            else:
                raise ValueError('请求获取的数据无效')
        except (requests.exceptions.RequestException, ValueError):
            raise Exception('获取form-data参数出错')


if __name__ == '__main__':
    test_username = input('输入你的weibo账号名：')
    test_password = input('输入你的密码：')
    loginer = LoginSinaWeibo(username=test_username, password=test_password)
    # 开始执行登录操作
    login_result = loginer.login()
    print('登录结果：', login_result)
    # 获取用户信息
    user_info = loginer.get_user_info()
    print('用户信息：', user_info)
    # 获取登录状态cookies
    cookies = loginer.get_login_cookies()
    print('登录Cookies：', cookies)

输入你的weibo账号名：13141055789
输入你的密码：*TDzjPYzc*
验证码保存路径为 C:\Users\leo\Documents\GitRepo\security\courses\webcrawler\experiment_instruction\weibo_captcha.jpg
验证码的值为> ewvu9
登录结果： {'code': 1, 'state': 'failed', 'message': '登录失败，登录返回结果 {"retcode":"2071","reason":"\\u60a8\\u5df2\\u5f00\\u542f\\u767b\\u5f55\\u4fdd\\u62a4\\uff0c\\u8bf7\\u626b\\u7801\\u767b\\u5f55","protection_url":"https%3A%2F%2Flogin.sina.com.cn%2Fprotection%2Findex%3Ftoken%3D2NDZd13QvAASaSyLyTSXc12jgvpblErMWCnByb3RlY3Rpb24."}'}
用户信息： None
登录Cookies： {'ULOGIN_IMG': 'tc-5426b37249090eee72697592e1ddb39288cd', 'cross_origin_proto': 'SSL', 'login_sid_t': '75925c3b2a02a36a359d4644c68dac6f', 'login': '609423641c81693ee710ee69b0d0e34c', 'Ugrow-G0': 'cf25a00b541269674d0feadd72dce35f'}


In [None]:
"""weibo login"""

import requests
import base64
import urllib

class WeiboLogin:
    def __init__(self):
        session = requests.Session()
        self.userName = input("Input your weibo userid：")
        self.userPwd = input("Input your weibo password:")
    
    def encodeUsername(self):
        return base64.b64encode(urllib.parse.quote(self.userName).encode('utf-8')).decode('utf-8')
    
    def getPrelogin(self):
        
    
    

In [14]:
# -*- coding: utf-8 -*-
# file: sentence_similarity.py
# author: JinTian
# time: 24/03/2017 6:46 PM
# Copyright 2017 JinTian. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ------------------------------------------------------------------------
"""
using guide:
setting accounts first:
under: weibo_terminator/settings/accounts.py
you can set more than one accounts, WT will using all accounts one by one,
if one banned, another will move on.
if you care about security, using subsidiary accounts instead.
"""
import re
import sys
import os
import requests
from lxml import etree
import traceback
from pprint import pprint

import pickle
import time


################

# please set this to your own, this is fake accounts
accounts = [
    {
        "id": '99567',
        "password": '968077',
    },
    {
        "id": '4675879',
        "password": '54675486',
    },
]



##################

"""
all configurations set here, follow the instructions
"""

# you should not change this properly
DEFAULT_USER_ID = 'realangelababy'
LOGIN_URL = 'https://passport.weibo.cn/signin/login'
ID_FILE_PATH = './settings/id_file'

# change this to your PhantomJS unzip path, point to bin/phantomjs executable file, full path
PHANTOM_JS_PATH = '/Users/jintian/phantomjs-2.1.1-macosx/bin/phantomjs'
COOKIES_SAVE_PATH = 'settings/cookies.pkl'



def is_valid_id(s):
    try:
        a = float(s)
        return True
    except ValueError as e:
        return False


def is_number(s):
    try:
        a = float(s)
        return True
    except ValueError as e:
        return False



################

class WeiBoScraper(object):

    def __init__(self, using_account, uuid, filter_flag=0):
        """
        uuid user id, filter flag indicates weibo type
        :param uuid:
        :param filter_flag:
        """
        self.using_account = using_account
        self._init_cookies()
        self._init_headers()

        self.user_id = uuid
        self.filter = filter_flag
        self.user_name = ''
        self.weibo_num = 0
        self.weibo_scraped = 0
        self.following = 0
        self.followers = 0
        self.weibo_content = []
        self.num_zan = []
        self.num_forwarding = []
        self.num_comment = []
        self.weibo_detail_urls = []

    def _init_cookies(self):
        try:
            with open(COOKIES_SAVE_PATH, 'rb') as f:
                cookies_dict = pickle.load(f)
            cookies_string = cookies_dict[self.using_account]
            cookie = {
                "Cookie": cookies_string
            }
            print('setting cookies..')
            self.cookie = cookie
        except FileNotFoundError:
            print('have not get cookies yet.')

    def _init_headers(self):
        """
        avoid span
        :return:
        """
        headers = requests.utils.default_headers()
        user_agent = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0'
        }
        headers.update(user_agent)
        print('headers: ', headers)
        self.headers = headers

    def crawl(self):
        # this is the most time-cost part, we have to catch errors, return to dispatch center
        try:
            self._get_html()
            self._get_user_name()
            self._get_user_info()
            self._get_weibo_info()
            self._get_weibo_detail_comment()
            print('weibo scrap done!')
            print('-' * 30)
            return True
        except Exception as e:
            print(e)
            print('current account being banned, return to dispatch center, resign for new account..')
            return False

    def _get_html(self):
        try:
            if is_number(self.user_id):
                url = 'http://weibo.cn/u/%s?filter=%s&page=1' % (self.user_id, self.filter)
                print(url)
            else:
                url = 'http://weibo.cn/%s?filter=%s&page=1' % (self.user_id, self.filter)
                print(url)
            self.html = requests.get(url, cookies=self.cookie, headers=self.headers).content
            print('success load html..')
        except Exception as e:
            print(e)

    def _get_user_name(self):
        print('-- getting user name')
        try:
            selector = etree.HTML(self.html)
            self.user_name = selector.xpath('//table//div[@class="ut"]/span[1]/text()')[0]
            print('current user name is: {}'.format(self.user_name))
        except Exception as e:
            print(e)
            print('html not properly loaded, maybe cookies out of date or account being banned. '
                  'change an account please')
            exit()

    def _get_user_info(self):
        print('-- getting user info')
        selector = etree.HTML(self.html)
        pattern = r"\d+\.?\d*"
        str_wb = selector.xpath('//span[@class="tc"]/text()')[0]
        guid = re.findall(pattern, str_wb, re.S | re.M)
        for value in guid:
            num_wb = int(value)
            break
        self.weibo_num = num_wb

        str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
        guid = re.findall(pattern, str_gz, re.M)
        self.following = int(guid[0])

        str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
        guid = re.findall(pattern, str_fs, re.M)
        self.followers = int(guid[0])
        print('current user all weibo num {}, following {}, followers {}'.format(self.weibo_num, self.following,
                                                                                 self.followers))

    def _get_weibo_info(self):
        print('-- getting weibo info')
        selector = etree.HTML(self.html)
        try:
            if selector.xpath('//input[@name="mp"]') is None:
                page_num = 1
            else:
                page_num = int(selector.xpath('//input[@name="mp"]')[0].attrib['value'])
            pattern = r"\d+\.?\d*"
            print('--- all weibo page {}'.format(page_num))

            try:
                # traverse all weibo, and we will got weibo detail urls
                # TODO: inside for loop must set sleep avoid banned by official.
                for page in range(1, page_num):
                    url2 = 'http://weibo.cn/%s?filter=%s&page=%s' % (self.user_id, self.filter, page)
                    html2 = requests.get(url2, cookies=self.cookie, headers=self.headers).content
                    selector2 = etree.HTML(html2)
                    info = selector2.xpath("//div[@class='c']")
                    print('---- current solving page {}'.format(page))

                    if page % 10 == 0:
                        print('[ATTEMPTING] rest for 5 minutes to cheat weibo site, avoid being banned.')
                        time.sleep(60*5)

                    if len(info) > 3:
                        for i in range(0, len(info) - 2):
                            detail = info[i].xpath("@id")[0]
                            self.weibo_detail_urls.append('http://weibo.cn/comment/{}?uid={}&rl=0'.
                                                          format(detail.split('_')[-1], self.user_id))

                            self.weibo_scraped += 1
                            str_t = info[i].xpath("div/span[@class='ctt']")
                            weibos = str_t[0].xpath('string(.)')
                            self.weibo_content.append(weibos)
                            print(weibos)

                            str_zan = info[i].xpath("div/a/text()")[-4]
                            guid = re.findall(pattern, str_zan, re.M)
                            num_zan = int(guid[0])
                            self.num_zan.append(num_zan)

                            forwarding = info[i].xpath("div/a/text()")[-3]
                            guid = re.findall(pattern, forwarding, re.M)
                            num_forwarding = int(guid[0])
                            self.num_forwarding.append(num_forwarding)

                            comment = info[i].xpath("div/a/text()")[-2]
                            guid = re.findall(pattern, comment, re.M)
                            num_comment = int(guid[0])
                            self.num_comment.append(num_comment)
            except etree.XMLSyntaxError as e:
                print('get weibo info finished.')
            if self.filter == 0:
                print('共' + str(self.weibo_scraped) + '条微博')

            else:
                print('共' + str(self.weibo_num) + '条微博，其中' + str(self.weibo_scraped) + '条为原创微博')
        except IndexError as e:
            print('get weibo info done, current user {} has no weibo yet.'.format(self.user_id))

    def _get_weibo_detail_comment(self):
        """
        this is the core method, we will using self.weibo_detail_urls
        to get all weibo details and get all comments.
        :return:
        """
        weibo_comments_save_path = './weibo_detail/{}.txt'.format(self.user_id)
        if not os.path.exists(weibo_comments_save_path):
            os.makedirs(os.path.dirname(weibo_comments_save_path))
        with open(weibo_comments_save_path, 'w+') as f:
            for i, url in enumerate(self.weibo_detail_urls):
                print('solving weibo detail from {}'.format(url))
                html_detail = requests.get(url, cookies=self.cookie, headers=self.headers).content
                selector_detail = etree.HTML(html_detail)
                all_comment_pages = selector_detail.xpath('//*[@id="pagelist"]/form/div/input[1]/@value')[0]
                print('\n这是 {} 的微博：'.format(self.user_name))
                print('微博内容： {}'.format(self.weibo_content[i]))
                print('接下来是下面的评论：\n\n')

                # write weibo content
                f.writelines('E\n')
                f.writelines(self.weibo_content[i] + '\n')
                f.writelines('E\n')
                f.writelines('F\n')
                for page in range(int(all_comment_pages) - 2):

                    if page % 10 == 0:
                        print('[ATTEMPTING] rest for 5 minutes to cheat weibo site, avoid being banned.')
                        time.sleep(60*5)

                    # we crawl from page 2, cause front pages have some noise
                    detail_comment_url = url + '&page=' + str(page + 2)
                    try:
                        # from every detail comment url we will got all comment
                        html_detail_page = requests.get(detail_comment_url, cookies=self.cookie).content
                        selector_comment = etree.HTML(html_detail_page)

                        comment_div_element = selector_comment.xpath('//div[starts-with(@id, "C_")]')

                        for child in comment_div_element:
                            single_comment_user_name = child.xpath('a[1]/text()')[0]
                            if child.xpath('span[1][count(*)=0]'):
                                single_comment_content = child.xpath('span[1][count(*)=0]/text()')[0]
                            else:
                                span_element = child.xpath('span[1]')[0]
                                at_user_name = span_element.xpath('a/text()')[0]
                                at_user_name = '$' + at_user_name.split('@')[-1] + '$'
                                single_comment_content = span_element.xpath('/text()')
                                single_comment_content.insert(1, at_user_name)
                                single_comment_content = ' '.join(single_comment_content)

                            full_single_comment = '<' + single_comment_user_name + '>' + ': ' + single_comment_content
                            print(full_single_comment)
                            f.writelines(full_single_comment + '\n')
                    except etree.XMLSyntaxError as e:
                        print('-*20')
                        print('user id {} all done!'.format(self.user_id))
                        print('all weibo content and comments saved into {}'.format(weibo_comments_save_path))
                f.writelines('F\n')

    def switch_account(self, new_account):
        assert new_account.isinstance(str), 'account must be string'
        self.using_account = new_account

    def write_text(self):
        try:
            if self.filter == 1:
                result_header = '\n\n原创微博内容：\n'
            else:
                result_header = '\n\n微博内容：\n'
            result = '用户信息\n用户昵称：' + self.user_name + '\n用户id：' + str(self.user_id) + '\n微博数：' + str(
                self.weibo_num) + '\n关注数：' + str(self.following) + '\n粉丝数：' + str(self.followers) + result_header
            for i in range(1, self.weibo_scraped + 1):
                text = str(i) + ':' + self.weibo_content[i - 1] + '\n' + '点赞数：' + str(self.num_zan[i - 1]) + '	 转发数：' + str(
                    self.num_forwarding[i - 1]) + '	 评论数：' + str(self.num_comment[i - 1]) + '\n\n'
                result += text
            if not os.path.isdir('weibo'):
                os.mkdir('weibo')
            f = open("weibo/%s.txt" % self.user_id, "w")
            f.write(result)
            f.close()
            file_path = os.getcwd() + "\weibo" + "\%s" % self.user_id + ".txt"
            print('微博写入文件完毕，保存路径%s' % file_path)

        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()


def main():
    user_id = input("输入weibo用户名：")
    filter_flag = 1
    wb = WeiBoScraper(user_id, filter_flag)
    wb.crawl()
    print('用户名：', wb.user_name)
    print('全部微博数：', str(wb.weibo_num))
    print('关注数：', str(wb.following))
    print('粉丝数：', str(wb.followers))
    print('最新一条微博为：', wb.weibo_content[0])
    print('最新一条微博获得的点赞数：', wb.num_zan[0])
    print('最新一条微博获得的转发数：', str(wb.num_forwarding[0]))
    print('最新一条微博获得的评论数：', str(wb.num_comment[0]))
    wb.write_text()


if __name__ == '__main__':
    main()

输入weibo用户名：13141055789
have not get cookies yet.
headers:  {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
http://weibo.cn/u/1?filter=0&page=1
'WeiBoScraper' object has no attribute 'cookie'
-- getting user name
'WeiBoScraper' object has no attribute 'html'
html not properly loaded, maybe cookies out of date or account being banned. change an account please
-- getting user info
'WeiBoScraper' object has no attribute 'html'
current account being banned, return to dispatch center, resign for new account..
用户名： 
全部微博数： 0
关注数： 0
粉丝数： 0


IndexError: list index out of range