diff --git a/NewsSpider/NewsSpider.py b/NewsSpider/NewsSpider.py
index 38ba4dac..1a2c7758 100644
--- a/NewsSpider/NewsSpider.py
+++ b/NewsSpider/NewsSpider.py
@@ -1,61 +1,36 @@
-# -*- coding: utf-8 -*-
-import os
-import sys
-import urllib2
import requests
-import re
-from lxml import etree
-
-
-def StringListSave(save_path, filename, slist):
-    if not os.path.exists(save_path):
-        os.makedirs(save_path)
-    path = save_path+"/"+filename+".txt"
-    with open(path, "w+") as fp:
-        for s in slist:
-            fp.write("%s\t\t%s\n" % (s[0].encode("utf8"), s[1].encode("utf8")))
-
-def Page_Info(myPage):
-    '''Regex'''
-    mypage_Info = re.findall(r'<div class="titleBar" id=".*?"><h2>(.*?)</h2><div class="more"><a href="(.*?)">.*?</a></div></div>', myPage, re.S)
-    return mypage_Info
-
-def New_Page_Info(new_page):
-    '''Regex(slowly) or Xpath(fast)'''
-    # new_page_Info = re.findall(r'<td class=".*?">.*?<a href="(.*?)\.html".*?>(.*?)</a></td>', new_page, re.S)
-    # # new_page_Info = re.findall(r'<td class=".*?">.*?<a href="(.*?)">(.*?)</a></td>', new_page, re.S) # bugs
-    # results = []
-    # for url, item in new_page_Info:
-    #     results.append((item, url+".html"))
-    # return results
-    dom = etree.HTML(new_page)
-    new_items = dom.xpath('//tr/td/a/text()')
-    new_urls = dom.xpath('//tr/td/a/@href')
-    assert(len(new_items) == len(new_urls))
-    return zip(new_items, new_urls)
-
-def Spider(url):
-    i = 0
-    print "downloading ", url
-    myPage = requests.get(url).content.decode("gbk")
-    # myPage = urllib2.urlopen(url).read().decode("gbk")
-    myPageResults = Page_Info(myPage)
-    save_path = u"网易新闻抓取"
-    filename = str(i)+"_"+u"新闻排行榜"
-    StringListSave(save_path, filename, myPageResults)
-    i += 1
-    for item, url in myPageResults:
-        print "downloading ", url
-        new_page = requests.get(url).content.decode("gbk")
-        # new_page = urllib2.urlopen(url).read().decode("gbk")
-        newPageResults = New_Page_Info(new_page)
-        filename = str(i)+"_"+item
-        StringListSave(save_path, filename, newPageResults)
-        i += 1
+import os
+from bs4 import BeautifulSoup
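+# BeautifulSoup is provided by the beautifulsoup4 package (pip install beautifulsoup4)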
+
+
+def save_to_file(file_path, file_name, data):
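+    """Write each item in data as a line of file_path/file_name.txt, creating the directory if needed."""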
+    if not os.path.exists(file_path):
+        os.makedirs(file_path)
+    path = os.path.join(file_path, file_name + '.txt')
+    with open(path, "w", encoding='utf-8') as file:
+        for item in data:
+            file.write(item + '\n')
+
+
+def get_page_info(page_content):
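+    """Extract the section title and its 'more' link from the ranking page HTML."""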
+    soup = BeautifulSoup(page_content, 'html.parser')
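+    # Per the old regex, the title sits in <div class="titleBar" ...><h2>...</h2>, so locate the div first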
+    title = soup.find('div', class_='titleBar').find('h2').get_text(strip=True)
+    more_link = soup.find('div', class_='more').find('a')['href']
+    return title, more_link
+
+
+def spider(url):
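+    """Download the ranking page, extract its title and 'more' link, and save them."""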
+    response = requests.get(url)
+    response.encoding = 'gbk'
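+    # The 163 ranking page is GBK-encoded (the old code decoded with "gbk"), so set it before reading .text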
+    page_content = response.text
+    title, more_link = get_page_info(page_content)
+    print(f"Title: {title}")
+    print(f"More Link: {more_link}")
+    # Save the title and more link to a file
+    save_to_file('news', 'news_info', [title, more_link])
+
+
if __name__ == '__main__':
- print "start"
- start_url = "http://news.163.com/rank/"
- Spider(start_url)
- print "end"
\ No newline at end of file
+ start_url = "http://example.com/start-page"
+ print("Start")
+ spider(start_url)
+ print("End")