In [1]:
import pymongo

# Connection settings for the MongoDB instance that holds the scraped data.
MONGO_URI = "mongodb://127.0.0.1:27017/"
DB_NAME = "spider"

# `db` is the database handle every later cell queries.
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]

In [2]:
from pyecharts.globals import CurrentConfig, NotebookType

# Tell pyecharts that the charts are rendered inside JupyterLab; this must be
# set before any chart object is created or rendered in later cells.
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

In [3]:
import pprint

In [4]:
"""
近期一百页推荐中男女发博客数量
"""
data = list(
    db.article.aggregate(
        [
            {
                "$lookup": {
                    "from": "user",
                    "localField": "user_name",
                    "foreignField": "user_name",
                    "as": "user",
                }
            },
            {"$unwind": {"path": "$user"}},
            {"$group": {"_id": "$user.sex", "count": {"$count": {}}}},
        ]
    )
)
pprint.pprint(data)
from pyecharts.charts import Bar

result = {}
for item in data:
    result[item["_id"]] = item["count"]

# from pyecharts import options as opts
# 近期一百页推荐中男女发博客数量
bar1 = Bar().add_xaxis(["男", "女"]).add_yaxis("发帖量", [result["male"], result["famale"]])
bar1.load_javascript()

[{'_id': 'famale', 'count': 856}, {'_id': 'male', 'count': 1101}]


<pyecharts.render.display.Javascript at 0x1982f467c10>

In [5]:
# Render the `bar1` chart built in the previous cell inline in the notebook.
bar1.render_notebook()

In [13]:
%%html
<!-- Externally hosted image; presumably a capture of the bar chart rendered above (TODO confirm). -->
<img src="https://tva1.sinaimg.cn/large/008d89Swgy1h5sqdtm13cg30pk0dqwid.gif"/>

In [6]:
"""
码龄、发表博客量、性别关系
"""
from datetime import datetime

pipeline = [
    {
        "$addFields": {
            "join_time": {
                "$filter": {
                    "input": ["$join_time"],
                    "as": "d",
                    "cond": {"$ne": ["$$d", None]},
                }
            }
        }
    },
    {"$unwind": {"path": "$join_time"}},
    {
        "$addFields": {
            "join_time_date": {"$dateFromString": {"dateString": "$join_time"}}
        }
    },
    {
        "$addFields": {
            "join_year_util_now": {
                "$add": [
                    {
                        "$dateDiff": {
                            "startDate": "$join_time_date",
                            "endDate": datetime.now(),
                            "unit": "year",
                            "timezone": "Asia/Shanghai",
                        }
                    }
                ]
            }
        }
    },
    {
        "$group": {
            "_id": {"date": "$join_year_util_now", "sex": "$sex"},
            "count": {"$count": {}},
            "join_time": {"$first": "$join_time"},
        }
    },
    {"$addFields": {"sex": "$_id.sex", "join_year_util_now": "$_id.date"}},
    {"$sort": {"join_year_util_now": -1}},
    {
        "$addFields": {
            "sex": {
                "$filter": {
                    "input": ["$sex"],
                    "as": "s",
                    "cond": {"$ne": ["$$s", "famale"]},
                }
            }
        }
    },
    {"$unwind": {"path": "$sex"}},
    {
        "$addFields": {
            "sex": {
                "$filter": {
                    "input": ["$sex"],
                    "as": "s",
                    "cond": {"$ne": ["$$s", None]},
                }
            }
        }
    },
    {"$unwind": {"path": "$sex"}},
]

In [7]:
"""
码龄、发表博客量、性别关系
"""
import copy
male_data = list(db.user.aggregate(pipeline))
pipeline[-4]["$addFields"]["sex"]["$filter"]["cond"]["$ne"] = ["$$s", "male"]
famale_data = list(db.user.aggregate(pipeline))
data = list(db.user.aggregate(pipeline[0:-4]))

from pyecharts import options as opts
from pyecharts.charts import Line

xaxis = list(range(data[-1]["join_year_util_now"], data[0]["join_year_util_now"] + 1))

yaxis = []
for i in range(len(xaxis)):
    yaxis.append([i, 0, 0])
for item in data:
    yaxis[item["join_year_util_now"]][0] = item["join_year_util_now"]
    if item["sex"] == "male":
        yaxis[item["join_year_util_now"]][1] = item["count"]
    elif item["sex"] == "famale":
        yaxis[item["join_year_util_now"]][2] = item["count"]
dimensions = ["join_year_util_now", '男', '女']
source = [dimensions] + yaxis

pprint.pprint(source)
"""
pyecharts太垃圾了。用js版本的echarts很好
链接：https://echarts.apache.org/examples/zh/editor.html?c=line-smooth&code=PYBwLglsB2AEC8sDeAoWsA2BTA5l6AJgFzIC-ANGrGMMBpCCauumAE4Q55skDkAhgA8IAZ15UKVAvzD8RWMEyroRwAK5sAxlhIBtZS128AVsAjQA-gE8s_NhbWQMF6MADuvcrF6B2V0_fAZ01eAF1KFkMABi8AVgBmLwBGWIjQg3RdBMSE6JiANlTw9IAmLIBOLwAWWILC3XjYUqjYZJrw3QrEiMzmlLDanNgADkGvIuq-ttzK8obWwwB2L1ip2ATetNhdEdgi7YS59Jmx0YPNtcS904zuhO2Kq4SS1cXYdcKz-oSZ_YnDBI6dk03rVspVEg8VitgW0Ei8gQ9tvVoX9vl5kcUgWirkVuk99gZghI-oIAILCERMWDQfgAWx03kAgB6AEPz_GArCB6bxNDJcMA2FZeLBJOgrGTRJTqXS-IBF5UAFhGALWVAEXagAdTQXC2DyDhYCmbJDUdmcjDmLBqrx6tkcvhG6AmoWE0gAbiAA
"""

# line1 = Line().add_dataset(source).add_yaxis(series_name="男", y_axis=[]).add_yaxis(series_name="女", y_axis=[]).set_global_opts(
#         xaxis_opts=opts.AxisOpts(type_="category"),
#     )
# pprint.pprint(line1.dump_options())
# line1.render_notebook()

[['join_year_util_now', '男', '女'],
 [0, 53, 130],
 [1, 115, 56],
 [2, 119, 43],
 [3, 90, 30],
 [4, 101, 30],
 [5, 88, 23],
 [6, 49, 9],
 [7, 36, 10],
 [8, 28, 1],
 [9, 23, 2],
 [10, 18, 1],
 [11, 18, 4],
 [12, 17, 0],
 [13, 19, 1],
 [14, 20, 0],
 [15, 4, 1],
 [16, 6, 0],
 [17, 0, 0],
 [18, 3, 0],
 [19, 1, 0],
 [20, 0, 0],
 [21, 2, 1]]


'\npyecharts太垃圾了。用js版本的echarts很好\n链接：https://echarts.apache.org/examples/zh/editor.html?c=line-smooth&code=PYBwLglsB2AEC8sDeAoWsA2BTA5l6AJgFzIC-ANGrGMMBpCCauumAE4Q55skDkAhgA8IAZ15UKVAvzD8RWMEyroRwAK5sAxlhIBtZS128AVsAjQA-gE8s_NhbWQMF6MADuvcrF6B2V0_fAZ01eAF1KFkMABi8AVgBmLwBGWIjQg3RdBMSE6JiANlTw9IAmLIBOLwAWWILC3XjYUqjYZJrw3QrEiMzmlLDanNgADkGvIuq-ttzK8obWwwB2L1ip2ATetNhdEdgi7YS59Jmx0YPNtcS904zuhO2Kq4SS1cXYdcKz-oSZ_YnDBI6dk03rVspVEg8VitgW0Ei8gQ9tvVoX9vl5kcUgWirkVuk99gZghI-oIAILCERMWDQfgAWx03kAgB6AEPz_GArCB6bxNDJcMA2FZeLBJOgrGTRJTqXS-IBF5UAFhGALWVAEXagAdTQXC2DyDhYCmbJDUdmcjDmLBqrx6tkcvhG6AmoWE0gAbiAA\n'

In [8]:
%%html
<!-- Externally hosted images; presumably captures of the account-age line chart built with the JavaScript ECharts editor (TODO confirm). -->
<img src="https://tva1.sinaimg.cn/large/008d89Swgy1h5spiylcakj30ui0lm0vo.jpg"/>
<img src="https://tva1.sinaimg.cn/large/008d89Swgy1h5spkm5f7wg30uy0l8thy.gif"/>
<img src="https://tva1.sinaimg.cn/large/008d89Swgy1h5spnfapqlj30ud0l7ac2.jpg"/>

In [9]:
"""
发布博客量与发布地 地图
@link https://codesandbox.io/s/air-quality-forked-sk406u?from-embed

经纬度坐标太难找了，这里只显示了国内的
"""
data = db.article.aggregate([
    {
        "$lookup": {
            "from": "user",
            "localField": "user_name",
            "foreignField": "user_name",
            "as": "user"
        }
    },
    {
        "$unwind": {
            "path": "$user"
        }
    },
    {
        "$addFields": {
            "address": {
                "$filter": {
                    "input": [
                        "$user.ip_address"
                    ],
                    "as": "d",
                    "cond": {
                        "$ne": [
                            "$$d",
                             None
                        ]
                    }
                }
            }
        }
    },
    {
        "$unwind": {
            "path": "$address"
        }
    },
    {
        "$group": {
            "_id": "$address",
            "value": {
                "$count": {}
            },
            "name": {
                "$first": "$address"
            }
        }
    }
])
list(data)

[{'_id': '江苏省', 'value': 102, 'name': '江苏省'},
 {'_id': '安徽省', 'value': 56, 'name': '安徽省'},
 {'_id': '新加坡', 'value': 3, 'name': '新加坡'},
 {'_id': '吉林省', 'value': 14, 'name': '吉林省'},
 {'_id': '云南省', 'value': 8, 'name': '云南省'},
 {'_id': '青海省', 'value': 2, 'name': '青海省'},
 {'_id': '台湾省', 'value': 1, 'name': '台湾省'},
 {'_id': '宁夏', 'value': 2, 'name': '宁夏'},
 {'_id': '内蒙古', 'value': 3, 'name': '内蒙古'},
 {'_id': '天津市', 'value': 26, 'name': '天津市'},
 {'_id': '上海市', 'value': 63, 'name': '上海市'},
 {'_id': '广东省', 'value': 409, 'name': '广东省'},
 {'_id': '陕西省', 'value': 79, 'name': '陕西省'},
 {'_id': '香港', 'value': 10, 'name': '香港'},
 {'_id': '日本', 'value': 2, 'name': '日本'},
 {'_id': '河南省', 'value': 50, 'name': '河南省'},
 {'_id': '辽宁省', 'value': 18, 'name': '辽宁省'},
 {'_id': '美国', 'value': 3, 'name': '美国'},
 {'_id': '中国', 'value': 2, 'name': '中国'},
 {'_id': '黑龙江省', 'value': 16, 'name': '黑龙江省'},
 {'_id': '北京市', 'value': 176, 'name': '北京市'},
 {'_id': '四川省', 'value': 77, 'name': '四川省'},
 {'_id': '山西省', 'value':

In [10]:
%%html
<!-- Externally hosted images; presumably captures of the post-volume map built from the data above (TODO confirm). -->
<img src='https://tva1.sinaimg.cn/large/008d89Swgy1h5spf3rftkj31hc0psalb.jpg' />
<img src="https://tva1.sinaimg.cn/large/008d89Swgy1h5spju815pg31h80pohdt.gif"/>

In [11]:
"""
文章tag词云
"""
data = db.article.aggregate([
    {
        "$project": {
            "tags": 1
        }
    },
    {
        "$addFields": {
            "tags": {
                "$filter": {
                    "input": [
                        "$tags"
                    ],
                    "as": "d",
                    "cond": {
                        "$ne": [
                            "$$d",
                            None
                        ]
                    }
                }
            }
        }
    },
    {
        "$unwind": {
            "path": "$tags"
        }
    },
    {
        "$project": {
            "_id": 0,
            "tags": 1
        }
    },
    {
        "$unwind": {
            "path": "$tags"
        }
    },
    {
        "$group": {
            "_id": "$tags",
            "value": {
                "$count": {}
            }
        }
    },
    {
        "$sort": {
            "value": -1
        }
    }
])
result = []
for item in data:
    result.append(tuple(item.values()))
pprint.pprint(result)

[('开发语言', 582),
 ('java', 538),
 ('前端', 248),
 ('人工智能', 244),
 ('运维', 190),
 ('javascript', 181),
 ('数据库', 179),
 ('python', 171),
 ('服务器', 169),
 ('算法', 145),
 ('机器学习', 130),
 ('linux', 122),
 ('深度学习', 121),
 ('大数据', 121),
 ('ecmascript', 113),
 ('vue.js', 102),
 ('spring', 99),
 ('网络', 98),
 ('spring boot', 98),
 ('后端', 97),
 ('mybatis', 88),
 ('神经网络', 73),
 ('mysql', 67),
 ('职场和发展', 67),
 ('面试', 66),
 ('jvm', 65),
 ('servlet', 62),
 ('学习', 61),
 ('c++', 58),
 ('计算机视觉', 57),
 ('docker', 52),
 ('数据结构', 52),
 ('嵌入式硬件', 50),
 ('单片机', 48),
 ('容器', 46),
 ('html', 45),
 ('前端框架', 44),
 ('ide', 42),
 ('分布式', 42),
 ('redis', 39),
 ('sql', 37),
 ('缓存', 36),
 ('elementui', 32),
 ('css', 31),
 ('leetcode', 31),
 ('kubernetes', 30),
 ('云原生', 29),
 ('物联网', 29),
 ('arm', 28),
 ('dubbo', 27),
 ('stm32', 27),
 ('intellij-idea', 27),
 ('数据挖掘', 26),
 ('网络协议', 26),
 ('android', 25),
 ('安全', 24),
 ('opencv', 22),
 ('微服务', 22),
 ('搜索引擎', 22),
 ('elasticsearch', 21),
 ('zookeeper', 19),
 ('css3', 19),
 ('g

In [24]:
import pyecharts.options as opts
from pyecharts.charts import WordCloud

# Named `word_cloud` rather than `map`: binding the chart to `map` would shadow
# the builtin `map` for every cell executed afterwards in this kernel.
word_cloud = (
    WordCloud()
    # `result` is the list of (tag, count) tuples built by the tag cell.
    .add(series_name="热词标签", data_pair=result, word_size_range=[6, 66])
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="热词标签", title_textstyle_opts=opts.TextStyleOpts(font_size=23)
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
)
# Write the chart to an HTML file (pyecharts' default output path).
word_cloud.render()
# The chart does not display inline here; open the rendered HTML file to see it.
word_cloud.render_notebook()

In [25]:
%%html
<!-- Externally hosted images; presumably captures of the tag word cloud from the rendered HTML file (TODO confirm). -->
<img src="https://tva1.sinaimg.cn/large/008d89Swgy1h5sqjw4lkuj30pi0dfn58.jpg"/>
<img src="https://tva1.sinaimg.cn/large/008d89Swgy1h5sqjomcvxg30os0ean21.gif"/>