diff --git a/NewsSpider/NewsSpider.py b/NewsSpider/NewsSpider.py
index 38ba4dac..1a2c7758 100644
--- a/NewsSpider/NewsSpider.py
+++ b/NewsSpider/NewsSpider.py
@@ -1,61 +1,40 @@
-# -*- coding: utf-8 -*-
-import os
-import sys
-import urllib2
+import os
 import requests
-import re
-from lxml import etree
-
-
-def StringListSave(save_path, filename, slist):
-    if not os.path.exists(save_path):
-        os.makedirs(save_path)
-    path = save_path+"/"+filename+".txt"
-    with open(path, "w+") as fp:
-        for s in slist:
-            fp.write("%s\t\t%s\n" % (s[0].encode("utf8"), s[1].encode("utf8")))
-
-def Page_Info(myPage):
-    '''Regex'''
-    mypage_Info = re.findall(r'<div class="titleBar" id=".*?"><h2>(.*?)</h2><div class="more"><a href="(.*?)">.*?</a></div></div>', myPage, re.S)
-    return mypage_Info
-
-def New_Page_Info(new_page):
-    '''Regex(slowly) or Xpath(fast)'''
-    # new_page_Info = re.findall(r'<td>.*?<a href="(.*?)\.html">(.*?)</a></td>', new_page, re.S)
-    # # new_page_Info = re.findall(r'<td>.*?<a href="(.*?)">(.*?)</a></td>', new_page, re.S) # bugs
-    # results = []
-    # for url, item in new_page_Info:
-    #     results.append((item, url+".html"))
-    # return results
-    dom = etree.HTML(new_page)
-    new_items = dom.xpath('//tr/td/a/text()')
-    new_urls = dom.xpath('//tr/td/a/@href')
-    assert(len(new_items) == len(new_urls))
-    return zip(new_items, new_urls)
-
-def Spider(url):
-    i = 0
-    print "downloading ", url
-    myPage = requests.get(url).content.decode("gbk")
-    # myPage = urllib2.urlopen(url).read().decode("gbk")
-    myPageResults = Page_Info(myPage)
-    save_path = u"网易新闻抓取"
-    filename = str(i)+"_"+u"新闻排行榜"
-    StringListSave(save_path, filename, myPageResults)
-    i += 1
-    for item, url in myPageResults:
-        print "downloading ", url
-        new_page = requests.get(url).content.decode("gbk")
-        # new_page = urllib2.urlopen(url).read().decode("gbk")
-        newPageResults = New_Page_Info(new_page)
-        filename = str(i)+"_"+item
-        StringListSave(save_path, filename, newPageResults)
-        i += 1
+from bs4 import BeautifulSoup
+
+
+def save_to_file(file_path, file_name, data):
+    # Create the output directory if it does not exist yet
+    if not os.path.exists(file_path):
+        os.makedirs(file_path)
+    path = os.path.join(file_path, file_name + '.txt')
+    with open(path, "w", encoding='utf-8') as file:
+        for item in data:
+            file.write(item + '\n')
+
+
+def get_page_info(page_content):
+    # find() returns the first match, so only the first ranking block is parsed
+    soup = BeautifulSoup(page_content, 'html.parser')
+    title = soup.find('h2', class_='titleBar').get_text(strip=True)
+    more_link = soup.find('div', class_='more').find('a')['href']
+    return title, more_link
+
+
+def spider(url):
+    response = requests.get(url)
+    # The rank page is GBK-encoded; set the encoding before reading response.text
+    response.encoding = 'gbk'
+    page_content = response.text
+    title, more_link = get_page_info(page_content)
+    print(f"Title: {title}")
+    print(f"More Link: {more_link}")
+    # Save the title and more link to a file
+    save_to_file('news', 'news_info', [title, more_link])
 
 
 if __name__ == '__main__':
-    print "start"
-    start_url = "http://news.163.com/rank/"
-    Spider(start_url)
-    print "end"
\ No newline at end of file
+    start_url = "http://news.163.com/rank/"
+    print("Start")
+    spider(start_url)
+    print("End")