In [110]:
# 安装第三方依赖库
!pip install pandas pyecharts selenium dataclasses_json dacite halo chromedriver-binary-auto

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
















In [94]:
import pandas as pd
import numpy as np
import json
from pyecharts.charts import Bar, Pie
from pyecharts import options as opts
from pyecharts.globals import ThemeType
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from urllib.parse import urlparse
# https://docs.python.org/3/library/dataclasses.html
from dataclasses import dataclass
# https://github.com/lidatong/dataclasses-json/stargazers
from dataclasses_json import dataclass_json
# https://github.com/konradhalas/dacite
from dacite import from_dict
# convert string to buffer, used in pd.read_csv
from io import StringIO
# https://github.com/ManrajGrover/halo
from halo import Halo
# https://realpython.com/python-logging/#the-logging-module
import logging

# https://pypi.org/project/chromedriver-binary-auto/
import chromedriver_binary
chromedriver_binary.chromedriver_filename
#!chromedriver-path



In [95]:
# Configure the root logger so that INFO-level messages and above are emitted
logging.basicConfig(level=logging.INFO)



In [96]:
# Input data embedded as a CSV string (parsed below with pd.read_csv).
# Header columns, in order: serial number, school name, portal website URL,
# supervising authority, location, education level.
inputData = '''
序号,学校名称,门户网站网址,主管部门,所在地,办学层次
1,云南大学,http://www.ynu.edu.cn/,云南省,昆明市,本科
2,昆明理工大学,http://www.kmust.edu.cn,云南省,昆明市,本科
3,云南农业大学,https://www.ynau.edu.cn/,云南省,昆明市,本科
4,西南林业大学,http://www.swfu.edu.cn/,云南省,昆明市,本科
5,昆明医科大学,http://www.kmmc.cn,云南省,昆明市,本科
6,大理大学,http://www.dali.edu.cn,云南省,大理白族自治州,本科
7,云南中医药大学,http://www.ynutcm.edu.cn/,云南省,昆明市,本科
8,云南师范大学,https://www.ynnu.edu.cn/,云南省,昆明市,本科
9,昭通学院,http://www.ztu.edu.cn/,云南省,昭通市,本科
10,曲靖师范学院,http://www.qjnu.edu.cn/,云南省,曲靖市,本科
11,普洱学院,http://www.peuni.cn/,云南省,普洱市,本科
12,保山学院,http://www.bsnc.cn/,云南省,保山市,本科
13,红河学院,http://www.uoh.edu.cn/,云南省,红河哈尼族彝族自治州,本科
14,云南财经大学,http://www.ynufe.edu.cn/,云南省,昆明市,本科
15,云南艺术学院,https://www.ynart.edu.cn/,云南省,昆明市,本科
16,云南民族大学,http://www.ynni.edu.cn/,云南省,昆明市,本科
17,玉溪师范学院,http://www.yxnu.edu.cn/,云南省,玉溪市,本科
18,楚雄师范学院,http://www.cxtc.edu.cn/,云南省,楚雄彝族自治州,本科
19,云南警官学院,https://www.ynpc.edu.cn/,云南省,昆明市,本科
20,昆明学院,http://www.kmu.edu.cn,云南省,昆明市,本科
21,文山学院,http://www.wsu.edu.cn/,云南省,文山壮族苗族自治州,本科
22,云南经济管理学院,https://www.ynjgy.com/,云南省教育厅,昆明市,本科
23,云南大学滇池学院,https://www.ynudcc.cn/,云南省教育厅,昆明市,本科
24,丽江旅游文化学院,http://www.lywhxy.com,云南省教育厅,丽江市,本科
25,昆明理工大学津桥学院,http://www.oxbridge.edu.cn/,云南省教育厅,昆明市,本科
26,云南师范大学商学院,http://www.ynnubs.com/,云南省教育厅,昆明市,本科
27,昆明文理学院（云南师范大学）,http://www.caskm.cn/,云南省教育厅,昆明市,本科
28,昆明医科大学海源学院,http://www.kyhyxy.com/,云南省教育厅,昆明市,本科
29,云南艺术学院文华学院,http://www.whxyart.cn/,云南省教育厅,昆明市,本科
30,云南工商学院,https://www.yngsxy.net/,云南省教育厅,昆明市,本科
31,滇西科技师范学院,http://www.wynu.edu.cn/,云南省,临沧市,本科
32,滇西应用技术大学,http://www.wyuas.edu.cn,云南省,大理市,本科
'''

# Alternative source: load the same table from a CSV file on disk.
# df=pd.read_csv('resources/云南省普通高等学校名单（本科）.csv')
df = pd.read_csv(StringIO(inputData))

# One record per school: its display name and the portal URL to analyse.
siteInfo = [
    {'name': name, 'url': url}
    for name, url in df[['学校名称', '门户网站网址']].values
]
# only use first n for testing
# siteInfo = siteInfo[:2]



In [97]:
@dataclass_json
@dataclass
class SeoInfo:
    '''
    SEO figures collected for a single site.

    Every integer field defaults to -1 and sslEnable defaults to False, so a
    freshly constructed instance carries only placeholder values.
    '''
    # Per search engine there is one "reference" and one "backLink" counter.
    # NOTE(review): "reference" is presumably the number of indexed pages and
    # "backLink" the number of inbound links -- confirm against the scraped page.
    referenceBaidu: int = -1
    backLinkBaidu: int = -1
    referenceSogou: int = -1
    backLinkSogou: int = -1
    reference360: int = -1
    backLink360: int = -1
    referenceGoogle: int = -1
    backLinkGoogle: int = -1
    # Whether the site was reported as SSL-enabled.
    sslEnable: bool = False



In [98]:
# 定义一些常用的Utils函数

from functools import reduce
def ilen(iterable):
    '''
    Count the items produced by an iterable.

    Works for iterators and generators that do not support len(); the
    iterable is consumed in the process.
    '''
    return sum(1 for _ in iterable)

def parseSpecialInt(data):
    '''
    Parse a number written with the Chinese ten-thousand marker, e.g. "2万230".

    The part before "万" is multiplied by 10000 and the part after it is
    added. A missing remainder ("2万") counts as zero, and text without the
    marker is parsed as an ordinary integer.

    Raises ValueError when a part is not a valid integer.
    '''
    head, marker, tail = data.partition('万')
    if not marker:
        # No marker present: nothing special to do.
        return int(head)
    # int('') raises, so an empty remainder has to be mapped to 0 explicitly.
    remainder = int(tail) if tail.strip() else 0
    return int(head) * 10000 + remainder

def parseInt(data):
    '''
    Parse an integer from text, returning -1 when it cannot be parsed.

    Text containing the "万" (ten-thousand) marker is delegated to
    parseSpecialInt; anything else goes through int(). Unparseable text and
    non-text input such as None are logged as a warning and yield -1.
    '''
    try:
        if '万' in data:
            return parseSpecialInt(data)
        return int(data)
    except (ValueError, TypeError):
        # TypeError covers non-string input (e.g. None), for which both the
        # membership test and int() fail.
        logging.warning(f'error parse int({data})')
        return -1

def getSeoUrl(originalUrl):
    '''
    Build the seo.chinaz.com report URL for a site.

    Only the hostname of originalUrl is used. A URL given without a scheme
    (e.g. "www.example.com") is accepted as well.

    Raises ValueError when no hostname can be extracted.
    '''
    hostname = urlparse(originalUrl).hostname
    if hostname is None:
        # Without a scheme urlparse treats the text as a path; a leading "//"
        # makes it parse the text as a network location instead.
        hostname = urlparse(f'//{originalUrl}').hostname
    if hostname is None:
        raise ValueError(f'cannot extract hostname from {originalUrl!r}')
    return f'http://seo.chinaz.com/{hostname}'

def getElementIntValue(element, defaultValue=-1):
    '''
    获取element的int值
    '''
    try:
        return parseInt(element.text)
    except:
        logging.warning(f'error got int value from {element}')
        return defaultValue

def findElementByCssSelector(browser, css_selector):
    '''
    Look up an element by CSS selector without raising.

    Returns the element found by the browser, or None when the lookup fails.
    '''
    try:
        # find_element(by, value) is available in both Selenium 3 and 4, while
        # find_element_by_css_selector was removed in Selenium 4.3.
        # 'css selector' is the value of selenium's By.CSS_SELECTOR.
        return browser.find_element('css selector', css_selector)
    except Exception as error:
        logging.debug(f'element {css_selector} not found: {error!r}')
        return None

def getSslEnable(element, defaultValue=False):
    '''
    Tell whether an image element marks the site as SSL-enabled.

    The element counts as "enabled" when its src attribute ends with
    'ssl-yes.png'. Returns defaultValue (and logs a warning) when the
    attribute cannot be read, for instance because element is None.
    '''
    try:
        return element.get_attribute('src').endswith('ssl-yes.png')
    except Exception:
        # Deliberately best-effort, but limited to Exception so that
        # KeyboardInterrupt / SystemExit still stop the run.
        logging.warning(f'error got bool value from {element}')
        return defaultValue



In [100]:
# Fetch each site's SEO report page with Selenium and parse the figures from it.

# Disable image loading in Chrome:
# https://tarunlalwani.com/post/selenium-disable-image-loading-different-browsers/#disabling-images-in-chrome
option = webdriver.ChromeOptions()
chrome_prefs = {
    "profile.default_content_settings": {"images": 2},
    "profile.managed_default_content_settings": {"images": 2},
}
option.add_experimental_option("prefs", chrome_prefs)

# CSS selector of the page element that holds each integer SeoInfo field.
seoIntSelectors = {
    # baidu
    'referenceBaidu': '#seo_BaiduSiteIndex_2 > a',
    'backLinkBaidu': '#seo_BaiduLink > a',
    # sogou, backLink data is always not available
    'referenceSogou': '#seo_SogouPages > a',
    'backLinkSogou': '#seo_SogouLink > a',
    # 360
    'reference360': '#seo_Pages360 > a',
    'backLink360': '#seo_Link360 > a',
    # Google
    'referenceGoogle': '#seo_GooglePages > a',
    'backLinkGoogle': '#seo_GoogleLink > a',
}
# SSL Enable?
sslEnableSelector = '#ssl > a:nth-child(2) > img'

browser = webdriver.Chrome(options=option)
try:
    total = len(siteInfo)
    for idx, siteInfoItem in enumerate(siteInfo):
        seoUrl = getSeoUrl(siteInfoItem['url'])
        spinner = Halo(text=f'[{idx+1:02}/{total:02}] processing {seoUrl}', spinner='dots')
        spinner.start()
        try:
            browser.get(seoUrl)
            intValues = {
                field: getElementIntValue(findElementByCssSelector(browser, selector))
                for field, selector in seoIntSelectors.items()
            }
            sslEnableElem = findElementByCssSelector(browser, sslEnableSelector)
            seoInfo = SeoInfo(sslEnable=getSslEnable(sslEnableElem), **intValues)
            siteInfoItem['seoInfo'] = seoInfo
        except BaseException:
            # Stop the spinner before the error propagates, so it does not
            # keep animating after the cell has failed.
            spinner.fail(text=f'process {seoUrl} failed!')
            raise
        spinner.succeed(text=f'process {seoUrl} finished!')
finally:
    # Always shut the browser down, even when a page fails half-way through.
    browser.quit()

v process http://seo.chinaz.com/www.ynu.edu.cn finished!
v process http://seo.chinaz.com/www.kmust.edu.cn finished!
v process http://seo.chinaz.com/www.ynau.edu.cn finished!
v process http://seo.chinaz.com/www.swfu.edu.cn finished!
v process http://seo.chinaz.com/www.kmmc.cn finished!
v process http://seo.chinaz.com/www.dali.edu.cn finished!
v process http://seo.chinaz.com/www.ynutcm.edu.cn finished!
v process http://seo.chinaz.com/www.ynnu.edu.cn finished!
v process http://seo.chinaz.com/www.ztu.edu.cn finished!
v process http://seo.chinaz.com/www.qjnu.edu.cn finished!
v process http://seo.chinaz.com/www.peuni.cn finished!
v process http://seo.chinaz.com/www.bsnc.cn finished!
v process http://seo.chinaz.com/www.uoh.edu.cn finished!
v process http://seo.chinaz.com/www.ynufe.edu.cn finished!
v process http://seo.chinaz.com/www.ynart.edu.cn finished!
v process http://seo.chinaz.com/www.ynni.edu.cn finished!
v process http://seo.chinaz.com/www.yxnu.edu.cn finished!
v process http://seo.ch

In [105]:
# Show the page counts recorded per site for each search engine

xaxis = [item['name'] for item in siteInfo]
yaxisBaidu = [item['seoInfo'].referenceBaidu for item in siteInfo]
yaxisSogou = [item['seoInfo'].referenceSogou for item in siteInfo]
yaxis360 = [item['seoInfo'].reference360 for item in siteInfo]
# Google figures are collected here but not added to the chart below.
yaxisGoogle = [item['seoInfo'].referenceGoogle for item in siteInfo]

bar = Bar({"theme": ThemeType.SHINE})
bar.add_xaxis(xaxis)
bar.add_yaxis("百度", yaxisBaidu)
bar.add_yaxis("搜狗", yaxisSogou)
bar.add_yaxis("360", yaxis360)

# Shadow-style axis pointer on the category axis.
categoryAxis = opts.AxisOpts(
    type_="category",
    axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
)
# Start the value axis below zero.
valueAxis = opts.AxisOpts(min_=-10)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="网页收录情况"),
    datazoom_opts=opts.DataZoomOpts(),
    xaxis_opts=categoryAxis,
    yaxis_opts=valueAxis,
)
bar.render_notebook()




In [106]:
# Show how many sites have SSL enabled versus disabled

sslEnableCount = ilen(item for item in siteInfo if item['seoInfo'].sslEnable)
sslDisableCount = ilen(item for item in siteInfo if not item['seoInfo'].sslEnable)

pie = Pie()
pie.add("", [['Enable SSL', sslEnableCount], ['Disable SSL', sslDisableCount]])
pie.set_global_opts(title_opts=opts.TitleOpts(title="SSL Enabled 统计图"))
# Label each slice as "<name>: <count>".
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
pie.render_notebook()



In [109]:
# Add the result columns to the df and export it as an Excel file
from pathlib import Path

df['百度收录'] = yaxisBaidu
df['搜狗收录'] = yaxisSogou
df['360收录'] = yaxis360
df['Google收录'] = yaxisGoogle
df['SSL启用'] = [item['seoInfo'].sslEnable for item in siteInfo]

# to_excel does not create missing directories, so make sure the target exists.
Path('resources').mkdir(parents=True, exist_ok=True)
# save the result to csv/excel
df.to_excel('resources/result.xlsx', index=False)



In [22]:
from pyecharts import options as opts
from pyecharts.charts import Bar
from time import sleep

# Scratch experiment: build a chart, render it, then add a series and render again.
categories = ["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"]
bar = Bar().add_xaxis(categories).add_yaxis("商家A", [5, 20, 36, 10, 75, 90])
# NOTE(review): the result of this call is not the cell's last expression --
# confirm whether the first chart is meant to be displayed.
bar.render_notebook()

# try to update the bar chart later
sleep(3)
bar.add_yaxis("商家B", [15, 10, 26, 20, 35, 30])
bar.render_notebook()

In [21]:
# Print the installed pyecharts version, for reference when reproducing the charts
import pyecharts
print(pyecharts.__version__)

1.9.0
