### 解析tushare 的trading模块中的get_hist_data函数

In [2]:
# -*- coding:utf-8 -*- 
"""
交易数据接口 
Created on 2014/07/31
@author: Jimmy Liu
@group : waditu
@contact: jimmysoa@sina.cn
"""
from __future__ import division

import time
import json
import lxml.html
from lxml import etree
import pandas as pd
import numpy as np
import datetime
from stock import cons as ct
import re
from pandas.compat import StringIO
from stock import dateu as du
from stock.formula import MA
import os
from stock.conns import get_apis, close_apis
from stock.fundamental import get_stock_basics
try:
    from urllib.request import urlopen, Request
except ImportError:
    from urllib2 import urlopen, Request

In [3]:
def get_hist_data(code=None, start=None, end=None,
                  ktype='D', retry_count=3,
                  pause=0.001):
    """
        获取个股历史交易记录
    Parameters
    ------
      code:string
                  股票代码 e.g. 600848
      start:string
                  开始日期 format：YYYY-MM-DD 为空时取到API所提供的最早日期数据
      end:string
                  结束日期 format：YYYY-MM-DD 为空时取到最近一个交易日数据
      ktype：string
                  数据类型，D=日k线 W=周 M=月 5=5分钟 15=15分钟 30=30分钟 60=60分钟，默认为D
      retry_count : int, 默认 3
                 如遇网络等问题重复执行的次数 
      pause : int, 默认 0
                重复请求数据过程中暂停的秒数，防止请求间隔时间太短出现的问题
    return
    -------
      DataFrame
          属性:日期 ，开盘价， 最高价， 收盘价， 最低价， 成交量， 价格变动 ，涨跌幅，5日均价，10日均价，20日均价，5日均量，10日均量，20日均量，换手率
    """
    symbol = ct._code_to_symbol(code)
    url = ''
    if ktype.upper() in ct.K_LABELS:
        url = ct.DAY_PRICE_URL%(ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                ct.K_TYPE[ktype.upper()], symbol)
    elif ktype in ct.K_MIN_LABELS:
        url = ct.DAY_PRICE_MIN_URL%(ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                    symbol, ktype)
    else:
        raise TypeError('ktype input error.')
    
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            lines = urlopen(request, timeout = 10).read()
            if len(lines) < 15: #no data
                return None
        except Exception as e:
            print(e)
        else:
            js = json.loads(lines.decode('utf-8') if ct.PY3 else lines)
            cols = []
            if (code in ct.INDEX_LABELS) & (ktype.upper() in ct.K_LABELS):
                cols = ct.INX_DAY_PRICE_COLUMNS
            else:
                cols = ct.DAY_PRICE_COLUMNS
            if len(js['record'][0]) == 14:
                cols = ct.INX_DAY_PRICE_COLUMNS
            df = pd.DataFrame(js['record'], columns=cols)
            if ktype.upper() in ['D', 'W', 'M']:
                df = df.applymap(lambda x: x.replace(u',', u''))
                df[df==''] = 0
            for col in cols[1:]:
                df[col] = df[col].astype(float)
            if start is not None:
                df = df[df.date >= start]
            if end is not None:
                df = df[df.date <= end]
            if (code in ct.INDEX_LABELS) & (ktype in ct.K_MIN_LABELS):
                df = df.drop('turnover', axis=1)
            df = df.set_index('date')
            df = df.sort_index(ascending = False)
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)

In [4]:
code='603987'
start=None
end=None,
ktype='D'
retry_count=3
pause=0.001

symbol = ct._code_to_symbol(code)
print(symbol)

sh603987


In [5]:
url = ''
if ktype.upper() in ct.K_LABELS:
    url = ct.DAY_PRICE_URL%(ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                ct.K_TYPE[ktype.upper()], symbol)
elif ktype in ct.K_MIN_LABELS:
    url = ct.DAY_PRICE_MIN_URL%(ct.P_TYPE['http'], ct.DOMAINS['ifeng'],
                                    symbol, ktype)
else:
    raise TypeError('ktype input error.')

In [6]:
url

'http://api.finance.ifeng.com/akdaily/?code=sh603987&type=last'

In [7]:
#后复权
request = Request(ct.HIST_FQ_FACTOR_URL%(ct.P_TYPE['http'],
                                             ct.DOMAINS['vsf'], symbol))
ct.HIST_FQ_FACTOR_URL%(ct.P_TYPE['http'],ct.DOMAINS['vsf'], symbol)

'http://vip.stock.finance.sina.com.cn/api/json.php/BasicStockSrv.getStockFuQuanData?symbol=sh603987&type=hfq'

In [8]:
text = urlopen(request, timeout=10).read()
#print(text)

In [9]:
text = text.decode(encoding='utf-8')
text = text[1:len(text)-1]
#text = text.decode('utf-8') if ct.PY3 else text
text = text.replace('{_', '{"')
text = text.replace('total', '"total"')
text = text.replace('data', '"data"')
text = text.replace(':"', '":"')
text = text.replace('",_', '","')
text = text.replace('_', '-')
text = json.loads(text)

In [10]:
print(text)

{'total': 741, 'data': {'2019-12-03': '15.6117', '2019-12-02': '15.6336', '2019-11-29': '15.8522', '2019-11-28': '15.8304', '2019-11-27': '16.0709', '2019-11-26': '16.2021', '2019-11-25': '16.4645', '2019-11-22': '16.8362', '2019-11-21': '16.8362', '2019-11-20': '16.6175', '2019-11-19': '16.7487', '2019-11-18': '16.3114', '2019-11-15': '16.2896', '2019-11-14': '16.5082', '2019-11-13': '16.7269', '2019-11-12': '16.8799', '2019-11-11': '16.6831', '2019-11-08': '18.5416', '2019-11-07': '19.2851', '2019-11-06': '18.9134', '2019-11-05': '18.2355', '2019-11-04': '18.4761', '2019-11-01': '18.4979', '2019-10-31': '18.1044', '2019-10-30': '18.5198', '2019-10-29': '18.4979', '2019-10-28': '18.3230', '2019-10-25': '17.7108', '2019-10-24': '17.7545', '2019-10-23': '17.5796', '2019-10-22': '17.9076', '2019-10-21': '17.6889', '2019-10-18': '18.4761', '2019-10-17': '18.7385', '2019-10-16': '18.0825', '2019-10-15': '17.9295', '2019-10-14': '18.3449', '2019-10-11': '18.3230', '2019-10-10': '18.3449', '

In [11]:
df = pd.DataFrame({'date':list(text['data'].keys()), 'factor':list(text['data'].values())})
print(df)

           date   factor
0    2019-12-03  15.6117
1    2019-12-02  15.6336
2    2019-11-29  15.8522
3    2019-11-28  15.8304
4    2019-11-27  16.0709
5    2019-11-26  16.2021
6    2019-11-25  16.4645
7    2019-11-22  16.8362
8    2019-11-21  16.8362
9    2019-11-20  16.6175
10   2019-11-19  16.7487
11   2019-11-18  16.3114
12   2019-11-15  16.2896
13   2019-11-14  16.5082
14   2019-11-13  16.7269
15   2019-11-12  16.8799
16   2019-11-11  16.6831
17   2019-11-08  18.5416
18   2019-11-07  19.2851
19   2019-11-06  18.9134
20   2019-11-05  18.2355
21   2019-11-04  18.4761
22   2019-11-01  18.4979
23   2019-10-31  18.1044
24   2019-10-30  18.5198
25   2019-10-29  18.4979
26   2019-10-28  18.3230
27   2019-10-25  17.7108
28   2019-10-24  17.7545
29   2019-10-23  17.5796
..          ...      ...
711  2016-12-30  32.2900
712  2016-12-29  32.1800
713  2016-12-28  31.8800
714  2016-12-27  32.7800
715  2016-12-26  33.1800
716  2016-12-23  32.8900
717  2016-12-22  36.0500
718  2016-12-21  35.3500


In [12]:
start,end=None,None
start = du.today_last_year() if start is None else start
end = du.today() if end is None else end
qs = du.get_quarts(start, end)
qt = qs[0]
ct._write_head()


[Getting data:]

In [31]:
index=False;retry_count=3;pause=0.01
url=_get_index_url(index, code, qt)
#data = _parse_fq_data(_get_index_url(index, code, qt), index,retry_count, pause)

NameError: name '_get_index_url' is not defined

In [53]:
url
'''
下面是我自己加的
'''
url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/stockid/000546.phtml?year=2018&jidu=4'

In [54]:
request = Request(url)
text = urlopen(request, timeout=10).read()
text = text.decode('GBK')
print(text)

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
    <meta content="always" name="referrer">
    <meta name="renderer" content="webkit">
    <title>金圆股份(000546)股票股价,行情,新闻,财报数据_新浪财经_新浪网</title>
    <meta name="keywords" content="金圆股份,000546,金圆股份股票行情,000546股票行情,金圆股份股价,金圆股份实时行情,金圆股份交易,金圆股份实时资金流向,金圆股份机构研究报告,金圆股份点评,金圆股份新闻,金圆股份财务分析,新浪财经 " />
    <meta name="description" content="新浪财经为您提供金圆股份(000546)股票实时行情走势,实时资金流向,实时新闻资讯,研究报告,股吧互动,交易信息,个股点评,公告,财务指标分析等与金圆股份(000546)股票相关的信息与服务." />
    <meta name="stencil" content="PGLS000314" />
    <link rel="Stylesheet" type="text/css" href="//n.sinaimg.cn/finance/hq_kcb/base_stock_A_20190625.css?ts=3.8" />
    <link rel="Stylesheet" type="text/css" href="//n.sinaimg.cn/tech/66ceb6d9/20190429

In [55]:
vMS_FuQuanMarketHistoryhtml = lxml.html.parse(StringIO(text))

In [56]:
res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')

In [61]:
if ct.PY3:
    sarr = [etree.tostring(node).decode('utf-8') for node in res]
else:
    sarr = [etree.tostring(node) for node in res]
sarr = ''.join(sarr)

In [62]:
df = pd.read_html(sarr, skiprows = [0, 1])[0]

ValueError: No text parsed from document: 

In [59]:
if len(df) == 0:
      pd.DataFrame()
if index:
     df.columns = ct.HIST_FQ_COLS[0:7]
else:
     df.columns = ct.HIST_FQ_COLS
if df['date'].dtypes == np.object:
     df['date'] = df['date'].astype(np.datetime64)
df = df.drop_duplicates('date')

ValueError: Length mismatch: Expected axis has 2 elements, new values have 8 elements

In [43]:
data = _parse_fq_data(url, index,retry_count, pause)

  return self.apply('astype', dtype=dtype, **kwargs)


In [44]:
if index:
        url = ct.HIST_INDEX_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                             code, qt[0], qt[1])
else:
        url = ct.HIST_FQ_URL%(ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                              code, qt[0], qt[1])

In [45]:
url

'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/stockid/000546.phtml?year=2018&jidu=4'

In [46]:
if data is None:
     data = pd.DataFrame()
if len(qs)>1:
    for d in range(1, len(qs)):
        qt = qs[d]
        ct._write_console()
        df = _parse_fq_data(_get_index_url(index, code, qt), index,
                                retry_count, pause)

##

  return self.apply('astype', dtype=dtype, **kwargs)


##

In [47]:
import os
import re
import numbers
import collections
import warnings

from distutils.version import LooseVersion

import numpy as np

from pandas.io.common import _is_url, urlopen, parse_url
from pandas.io.parsers import TextParser
from pandas.compat import (lrange, lmap, u, string_types, iteritems,
                           raise_with_traceback, binary_type)
from pandas.core import common as com
from pandas import Series


try:
    import bs4
except ImportError:
    _HAS_BS4 = False
else:
    _HAS_BS4 = True


try:
    import lxml
except ImportError:
    _HAS_LXML = False
else:
    _HAS_LXML = True


try:
    import html5lib
except ImportError:
    _HAS_HTML5LIB = False
else:
    _HAS_HTML5LIB = True

In [48]:
_HAS_LXML

True

In [49]:
_HAS_BS4

True

In [50]:
_HAS_HTML5LIB

True

In [51]:
import bs4

In [52]:
import html5lib

### 任务

寻找连续N天上涨的标的，起点3天；上涨定义为收盘价大于开盘价

In [22]:
import tushare as ts
import pandas as pd 

计算从2019-11-01起曾经最多N天上涨的标的，起点3天
标：300738 奥飞数据连续11天上涨
标：300017 网宿科技连续12天上涨
标：601698 中国卫通连续12天上涨
标：600804 鹏博士连续13天上涨
标：002093 国脉科技连续10天上涨
标：002467 二六三连续10天上涨
标：600050 中国联通连续10天上涨
标：300383 光环新网连续14天上涨
标：603881 数据港连续12天上涨
标：000428 华天酒店连续8天上涨
标：600258 首旅酒店连续15天上涨
标：002306 ST云网连续10天上涨
标：000721 西安饮食连续12天上涨
标：002186 全 聚 德连续10天上涨
标：600754 锦江酒店连续14天上涨
标：000524 岭南控股连续9天上涨
标：601007 金陵饭店连续10天上涨
标：000007 全新好连续11天上涨
