## 爬取古泉园地网站内的图片
### 1、Crawler 类是爬图片的实现类
### 2、Test 类是单元测试类，可以在里面写针对每个函数的测试方法
### 3、Main 类是主程序执行类

In [None]:
import requests
import re
import os
import time
from bs4 import BeautifulSoup

class Crawler:
    """Scrape coin images from a single listing page.

    Creating an instance immediately downloads ``url`` and parses it. The
    ``catch_*`` methods then extract individual fields from one item ``div``
    of the parsed page, and ``iterate_div_lists`` downloads the image of every
    item it is given.
    """

    # Class-level defaults; __init__ overwrites all of them per instance.
    url = ""
    headers = ""
    savePath = ""
    # Parsed page, filled in by get_response_info().  Kept as None here so
    # that merely defining the class does not construct a parser object.
    soupContent = None
    # Seconds to wait for the server before a request is abandoned.
    timeout = 10

    def __init__(self, url, headers, savePath, timeout=10):
        """Store the settings and fetch the page right away.

        Args:
            url: Address of the listing page to parse.
            headers: HTTP headers sent with every request.
            savePath: Prefix prepended to each image file name (the callers
                in this file pass a directory path ending in ``/``).
            timeout: Seconds to wait for the server on each request; without
                one a stalled connection would block the crawl forever.
        """
        self.url = url
        self.headers = headers
        self.savePath = savePath
        self.timeout = timeout
        self.get_response_info()

    def get_response_info(self):
        """Download ``self.url`` and store the parsed HTML in ``self.soupContent``."""
        response = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        # The raw bytes are parsed; per the original author's note
        # response.text works as well but parses more slowly.
        self.soupContent = BeautifulSoup(response.content, "html.parser")

    def catch_page_sizes(self):
        """Return the text of the last link inside the pager ``div``.

        Raises:
            IndexError: If the pager ``div`` or its links are missing.
        """
        pageScope = self.soupContent.find_all(
            name="div", attrs={"class": "r_float filter-r f-filter-r"})
        return pageScope[0].find_all('a')[-1].get_text()

    def get_div_lists(self):
        """Return every ``div`` with class ``gq-box-detail`` (one per listed item)."""
        return self.soupContent.find_all(name="div", attrs={"class": "gq-box-detail"})

    def iterate_div_lists(self, divLists):
        """Download and save the image of every item ``div`` in ``divLists``.

        An item that cannot be processed (unexpected markup, bad image URL,
        network or file-system error) is reported and skipped so that one
        broken entry does not abort the rest of the crawl.
        """
        for eachDiv in divLists:
            try:
                imgUrl = self.catch_img_url(eachDiv)
                imgName = self.catch_name(eachDiv)
                self.download_and_save_images(imgUrl, imgName)
            # IndexError/KeyError: markup differs from what catch_* expects.
            # RequestException: invalid URL, timeout, HTTP error status.
            # OSError: the image file could not be written.
            except (IndexError, KeyError, OSError, requests.RequestException) as error:
                print("skipped one image: " + repr(error))

    def catch_img_url(self, eachDiv):
        """Return the ``data-original`` attribute of the item's first image.

        Raises:
            IndexError: If the ``div-imgs`` container or its ``img`` is missing.
            KeyError: If the ``img`` tag has no ``data-original`` attribute.
        """
        imgDiv = eachDiv.find_all(name="div", attrs={"class": "div-imgs"})
        imgSrc = imgDiv[0].find_all('img')
        return imgSrc[0]['data-original']

    def catch_name(self, eachDiv):
        """Return the text of the item's first ``div-name`` element."""
        nameDiv = eachDiv.find_all(name="div", attrs={"class": "div-name"})
        return nameDiv[0].get_text()

    def catch_price(self, eachDiv):
        """Return the item's price as ``{"moneyFlag": ..., "moneyNum": ...}``.

        ``moneyFlag`` is the text of the first ``em`` and ``moneyNum`` the text
        of the first ``i`` inside the item's ``div-price`` element.
        """
        priceDiv = eachDiv.find_all(name="div", attrs={"class": "div-price"})
        moneyFlag = priceDiv[0].find_all('em')[0].get_text()
        moneyNum = priceDiv[0].find_all('i')[0].get_text()
        return {"moneyFlag": moneyFlag, "moneyNum": moneyNum}

    def catch_description(self, eachDiv):
        """Return the text of the first ``i`` inside ``div-specification``.

        Also prints a separator line, as the original implementation did.
        """
        specDiv = eachDiv.find_all(name="div", attrs={"class": "div-specification"})
        coinDescription = specDiv[0].find_all('i')[0].get_text()
        print("====================================================")
        return coinDescription

    @staticmethod
    def _safe_file_name(imgName):
        """Return ``imgName`` stripped, with path-hostile characters replaced by ``_``."""
        # Path separators, characters reserved on common file systems and
        # control characters would either break open() or redirect the write.
        return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", str(imgName).strip())

    def _image_path(self, imgName):
        """Return the full path of the ``.jpg`` file used for ``imgName``."""
        return str(self.savePath) + self._safe_file_name(imgName) + ".jpg"

    def download_and_save_images(self, imgUrl, imgName):
        """Download ``imgUrl`` and write it to ``<savePath><imgName>.jpg``.

        Raises:
            requests.RequestException: On an invalid URL, a timeout or an HTTP
                error status.
            OSError: If the file cannot be written.
        """
        imgResponse = requests.get(imgUrl, headers=self.headers, timeout=self.timeout)
        # Without this an HTML error page would be stored under a .jpg name.
        imgResponse.raise_for_status()
        imgPath = self._image_path(imgName)
        saveDir = os.path.dirname(imgPath)
        if saveDir:
            os.makedirs(saveDir, exist_ok=True)
        # 'wb': the image is written as raw bytes.
        with open(imgPath, 'wb') as f:
            f.write(imgResponse.content)
    

In [None]:
# 单元测试类
class Test:
    """Manual smoke checks for individual ``Crawler`` methods."""

    # Site root and the first listing page derived from it.
    baseUrl = 'https://tuku.chcoin.com/'
    url = "".join((baseUrl, "listing-0-0-1-1.html"))
    # Prefix handed to the crawler for saved images.
    imageSavePath = '/Users/lujiahuan/4projects/yolov5+javaweb/yolov5/images/test_images/'
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36',
    }

    def unit_test(self):
        """Fetch the first listing page and print what ``catch_page_sizes`` returns."""
        pageSizeText = Crawler(
            self.url,
            self.headers,
            self.imageSavePath,
        ).catch_page_sizes()
        print(pageSizeText)
    

In [None]:
# 主函数类
class Main:
    """Program entry point: crawl the listing pages in order and save their images."""

    # Site root and the first listing page derived from it.
    baseUrl = 'https://tuku.chcoin.com/'
    firstPageUrl = baseUrl + "listing-0-0-1-1.html"
    # Prefix handed to the crawler for saved images.
    imageSavePath = '/Users/lujiahuan/4projects/yolov5+javaweb/yolov5/images/test_images/'
    # Browser-like User-Agent sent with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'}

    def page_url(self, pageNumber):
        """Return the URL of listing page ``pageNumber`` (pages start at 1)."""
        return self.baseUrl + "listing-0-0-1-" + str(pageNumber) + ".html"

    def main(self, maxPages=499):
        """Print the pager's page-count text, then crawl pages 1..``maxPages``.

        Args:
            maxPages: Number of the last listing page to crawl.  The default
                keeps the original behaviour (pages 1 to 499).
        """
        crawler = Crawler(self.firstPageUrl, self.headers, self.imageSavePath)
        pageSize = crawler.catch_page_sizes()
        # Only reported: per the original author's note, crawling every page
        # the site lists would download too many images, hence the fixed cap.
        print(pageSize)
        for pageNumber in range(1, maxPages + 1):
            crawler = Crawler(self.page_url(pageNumber), self.headers, self.imageSavePath)
            crawler.iterate_div_lists(crawler.get_div_lists())


In [None]:
# Test().unit_test()  # run the manual smoke check instead of the full crawl

# TODO: image fetching could be more robust (if a fetched image URL is wrong, the program should not abort but carry on with the rest)
Main().main()