Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

我读取json文件,显示乱码。 #6

Closed
kof0012 opened this issue Nov 4, 2017 · 5 comments
Closed

我读取json文件,显示乱码。 #6

kof0012 opened this issue Nov 4, 2017 · 5 comments
Labels

Comments

@kof0012
Copy link

kof0012 commented Nov 4, 2017

我抓今日头条的json,出现raise JSONDecodeError("Expecting value", s, err.value) from None
���������
出现类似这样的乱码。
image

@kof0012
Copy link
Author

kof0012 commented Nov 4, 2017

import hashlib
import json
import time

import pymysql
import requests
from fake_useragent import UserAgent
from requests.exceptions import RequestException
import trip


# Shared User-Agent generator; start_requests reads `ua.random` for each request.
ua = UserAgent()
# Module-level requests session.
# NOTE(review): no function visible in this snippet references `s` (getASCP
# only uses a local variable of the same name) — confirm it is still needed.
s = requests.session()


def getASCP(timestamp=None):
    """Build the ``as``/``cp`` signature pair sent with feed requests.

    Both values are derived from a Unix timestamp: the upper-case hex form
    of the timestamp is interleaved with the first and the last five
    characters of the upper-case MD5 hex digest of its decimal form.

    Args:
        timestamp: Unix time in seconds to sign. Defaults to the current
            time (``time.time()``); pass an explicit value to get
            reproducible output.

    Returns:
        An ``(AS, CP)`` tuple of strings. When the hex form of the
        timestamp is not exactly eight characters long, a fixed fallback
        pair is returned instead.
    """
    t = round(time.time() if timestamp is None else timestamp)
    hex_time = hex(t).upper()[2:]

    if len(hex_time) != 8:
        # The interleaving below indexes eight hex digits; anything else
        # falls back to the constant pair.
        return '479BB4B7254C150', '7E0AC8874BB0985'

    digest = hashlib.md5(str(t).encode(encoding='utf-8')).hexdigest().upper()
    head, tail = digest[:5], digest[-5:]

    # AS body: digest-head char followed by hex char, for hex digits 0..4.
    as_body = ''.join(d + h for d, h in zip(head, hex_time))
    # CP body: hex char followed by digest-tail char, for hex digits 3..7.
    cp_body = ''.join(h + d for h, d in zip(hex_time[3:], tail))

    return 'A1' + as_body + hex_time[-3:], hex_time[:3] + cp_body + 'E1'

@trip.coroutine
def start_requests(maxtime=0):
    """Request one page of the feed API and return its decoded JSON.

    Args:
        maxtime: Value sent as the ``max_behot_time`` query parameter
            (0 for the first page).

    Returns:
        The decoded JSON body when it contains a ``data`` key, otherwise
        ``None``. ``None`` is also returned when a ``RequestException`` is
        raised (the error is printed).
    """
    # The raw response is also published under the module-level name `r`.
    global r
    AS, CP = getASCP()
    headers = {'User-Agent': ua.random}
    feed_url = 'https://www.toutiao.com/api/pc/feed/'
    payloads = {'max_behot_time': maxtime, 'category': '__all__', 'utm_source': 'toutiao', 'widen': 1,
                'tadrequire': 'false', 'as': AS, 'cp': CP}
    try:
        r = yield trip.get(feed_url, params=payloads, headers=headers)

        # Force UTF-8 before decoding the body.
        r.encoding = "utf-8"
        print(r.encoding)
        r_js = r.json()
        # Check the decoded payload: the response object itself has no
        # keys() method, so the original `r.keys()` could never succeed.
        if 'data' in r_js:
            return r_js
        return None
    except RequestException as e:
        print('请求不成功', e)
        return None


async def parse_detail(response):
    """Store every non-ad feed item and follow ``next`` pages until none remain.

    Args:
        response: Decoded feed JSON (a dict that may carry ``data`` and
            ``next`` keys), or ``None``/empty, in which case nothing is done.

    Each kept item is printed and passed to ``insert_mysql``.

    The next page is awaited directly. The original handed an already
    created coroutine to ``trip.run`` and returned a new, un-awaited
    ``parse_detail`` coroutine, so following pages were never processed.
    NOTE(review): assumes the object returned by the ``@trip.coroutine``
    decorated ``start_requests`` is awaitable (true for Tornado futures) —
    confirm against the installed trip version.
    """
    while response:
        # A missing or null 'data' entry is treated as an empty page.
        for item in response.get('data') or []:
            # Only items explicitly flagged as non-ads are kept; items
            # without the flag are skipped, exactly as before.
            if item.get('is_feed_ad') == False:  # noqa: E712
                result = {
                    'title': item.get('title'),
                    'tags': item.get('chinese_tag'),
                    'comments': item.get('comments_count'),
                }
                print(result)
                insert_mysql(result)

        next_page = response.get('next')
        if not next_page:
            break
        response = await start_requests(maxtime=next_page.get('max_behot_time'))


def write_json(result):
    """Append ``result`` to ``tt.txt`` as a single line of JSON.

    Non-ASCII characters are written as-is (UTF-8) rather than escaped.
    """
    with open('tt.txt', mode='a', encoding='utf-8') as out:
        serialized = json.dumps(result, ensure_ascii=False)
        out.write(serialized + '\n')


def insert_mysql(result):
    """Insert or replace one scraped item in the ``lala`` table.

    Args:
        result: Mapping with ``title``, ``tags`` and ``comments`` keys.

    A connection is opened for the call and always closed afterwards.
    Failures are printed rather than raised (best-effort, as before).
    """
    conn = None
    try:
        conn = pymysql.Connect(host="127.0.0.1", port=3306, user='root', passwd='root', db='spider', charset='utf8')
        cursor = conn.cursor()
        sql_in = "replace into lala (title,tags,comments) values(%s,%s,%s)"
        cursor.execute(sql_in, (result['title'], result['tags'], result['comments']))
        conn.commit()
    except Exception as e:
        print(e)
        # conn is still None when connecting itself failed; the original
        # referenced an unbound name here in that case.
        if conn is not None:
            conn.rollback()
    finally:
        # Release the per-call connection instead of leaking it.
        if conn is not None:
            conn.close()


def main():
    """Fetch the first feed page, then parse it and follow later pages."""
    # NOTE(review): relies on trip.run returning the coroutine's result, as
    # the original code did — confirm against the installed trip version.
    response = trip.run(start_requests)
    if not response:
        # start_requests gives None when the request failed or the payload
        # had no 'data'; there is nothing to parse in that case.
        print('no feed data returned; nothing to parse')
        return

    # trip.run is given a callable to invoke; the original passed an
    # already-created coroutine object instead.
    trip.run(lambda: parse_detail(response))


# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()

@littlecodersh
Copy link
Owner

@kof0012 是我gzip处理的时候的问题,你更新一下版本(0.0.3)即可。

python -m pip install trip -U

@kof0012
Copy link
Author

kof0012 commented Nov 6, 2017

@littlecodersh 感谢回复,另外请问怎么在trip.run(fun)里写参数。实现trip.run(fun(args)),还是抓今日头条的json文件,想要递归回调(带参数),想了半天想不出来办法。。求教。

import trip
import hashlib
import json
import time
import pymysql
import requests
from fake_useragent import UserAgent
from requests.exceptions import RequestException

# Shared User-Agent generator; start_requests reads `ua.random` for each request.
ua = UserAgent()

# One MySQL connection and cursor, created when the module is loaded and
# reused by insert_mysql for every row.
# NOTE(review): host and credentials are hard-coded here — confirm this is
# intended outside of local testing.
conn = pymysql.Connect(host="127.0.0.1", port=3306,
                       user='root', passwd='root', db='spider', charset='utf8')
cursor = conn.cursor()


def getASCP():
    """Return the ``(AS, CP)`` signature strings for the current time.

    The current Unix time is rendered as upper-case hex and interleaved
    with the first/last five characters of the upper-case MD5 hex digest of
    its decimal form. If the hex form is not eight characters long, a fixed
    fallback pair is returned.
    """
    now = round(time.time())
    hex_now = hex(now).upper()[2:]
    md5_hex = hashlib.md5(str(now).encode(encoding='utf-8')).hexdigest().upper()

    if len(hex_now) == 8:
        prefix, suffix = md5_hex[:5], md5_hex[-5:]
        # digest char then hex char, over hex digits 0..4
        as_mid = ''.join(p + h for p, h in zip(prefix, hex_now[:5]))
        # hex char then digest char, over hex digits 3..7
        cp_mid = ''.join(h + q for h, q in zip(hex_now[3:], suffix))
        return 'A1' + as_mid + hex_now[-3:], hex_now[:3] + cp_mid + 'E1'

    return '479BB4B7254C150', '7E0AC8874BB0985'

@trip.coroutine
def start_requests(maxtime=0):
    """Request one page of the feed API and return its decoded JSON.

    Args:
        maxtime: Value sent as the ``max_behot_time`` query parameter
            (0 for the first page).

    Returns:
        The decoded JSON body, or ``None`` when a ``RequestException`` is
        raised (the error is printed).

    The raw response is also stored in the module-level name ``r``, which
    other code in this script reads.
    """
    global r
    AS, CP = getASCP()
    headers = {'User-Agent': ua.random}
    feed_url = 'https://www.toutiao.com/api/pc/feed/'
    payloads = {'max_behot_time': maxtime, 'category': '__all__', 'utm_source': 'toutiao', 'widen': 1,
                'tadrequire': 'false', 'as': AS, 'cp': CP}
    try:
        r = yield trip.get(feed_url, params=payloads, headers=headers)
        # Hand the decoded body back to the caller; the original decoded it
        # into an unused local and discarded it.
        return r.json()
    except RequestException as e:
        print('请求不成功', e)
        return None


def parse_detail(response):
    """Store every non-ad feed item and follow ``next`` pages until none remain.

    Args:
        response: Decoded feed JSON (a dict that may carry ``data`` and
            ``next`` keys), or ``None``/empty, in which case nothing is done.

    Each kept item is printed and passed to ``insert_mysql``. Following
    pages are fetched by running ``start_requests`` through ``trip.run`` and
    reading the response it stores in the module-level ``r``; the fetched
    response is also kept in the module-level ``ss``.
    """
    global r, ss
    # Local import so this function does not depend on a file-level import.
    from functools import partial

    while response:
        # A missing or null 'data' entry is treated as an empty page.
        for item in response.get('data') or []:
            # Only items explicitly flagged as non-ads are kept.
            if item.get('is_feed_ad') == False:  # noqa: E712
                source_url = item.get('source_url')
                if not source_url:
                    # No URL can be built for this item; the original
                    # raised TypeError on the string concatenation.
                    continue
                result = {
                    'title': item.get('title'),
                    'tags': item.get('chinese_tag'),
                    'comments': item.get('comments_count'),
                    'url': 'https://www.toutiao.com' + source_url,
                }
                print(result)
                insert_mysql(result)

        next_page = response.get('next')
        if not next_page:
            return

        # Clear the shared response first so a failed request (which leaves
        # `r` untouched) is not mistaken for a fresh page.
        r = None
        # trip.run takes a callable, so the argument is bound with partial
        # instead of calling start_requests directly (which only returns a
        # future, not a response with .json()).
        trip.run(partial(start_requests, maxtime=next_page.get('max_behot_time')))
        if r is None:
            return
        ss = r
        response = ss.json()



def write_json(result):
    """Serialize ``result`` as JSON and append it, newline-terminated, to ``tt.txt``.

    The file is opened in UTF-8 append mode and non-ASCII text is kept
    unescaped.
    """
    with open('tt.txt', 'a', encoding='utf-8') as handle:
        payload = json.dumps(result, ensure_ascii=False)
        handle.writelines([payload, '\n'])


def insert_mysql(result):
    """Upsert one scraped item into ``toutiaocomment`` via the shared cursor.

    Args:
        result: Mapping with ``title``, ``tags``, ``comments`` and ``url``
            keys.

    On a duplicate key only the ``comments`` column is updated. Any error is
    printed and the shared connection is rolled back.
    """
    upsert = "insert into toutiaocomment(title,tags,comments,url) VALUES(%s,%s,%s,%s) ON DUPLICATE KEY UPDATE comments=VALUES(comments)"
    columns = ('title', 'tags', 'comments', 'url')
    try:
        # Key lookups stay inside the try so a missing key is reported and
        # rolled back like any other failure.
        row = tuple(result[name] for name in columns)
        cursor.execute(upsert, row)
        conn.commit()
    except Exception as err:
        print(err)
        conn.rollback()

def main():
    """Fetch the first feed page and hand its decoded JSON to parse_detail."""
    trip.run(start_requests)
    # start_requests publishes its response as the module-level `r`, but
    # only when the request succeeded; look it up defensively so a failed
    # first request ends the run instead of raising NameError.
    first_response = globals().get('r')
    if first_response is None:
        return
    parse_detail(first_response.json())



# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()

@littlecodersh
Copy link
Owner

@kof0012

from functools import partial

@kof0012
Copy link
Author

kof0012 commented Nov 6, 2017

@littlecodersh 多谢提醒,已经解决。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

2 participants