## 存入MongoDB

In [1]:
# Crawl the movie list with the project-local top_100 module and keep it as `res`.
from top_100 import crawling_movies
res = crawling_movies()
# Display the crawled result (the output below is a dict keyed by ranking).
res

{1: {'ranking': 1,
  'title': '霸王别姬',
  'stars': '张国荣,张丰毅,巩俐',
  'score': 9.5,
  'release_time': '1993-01-01',
  'img_url': 'https://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg'},
 2: {'ranking': 2,
  'title': '肖申克的救赎',
  'stars': '蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿',
  'score': 9.5,
  'release_time': '1994-09-10',
  'img_url': 'https://p0.meituan.net/movie/283292171619cdfd5b240c8fd093f1eb255670.jpg'},
 3: {'ranking': 3,
  'title': '罗马假日',
  'stars': '格利高里·派克,奥黛丽·赫本,埃迪·艾伯特',
  'score': 9.1,
  'release_time': '1953-09-02',
  'img_url': 'https://p0.meituan.net/movie/289f98ceaa8a0ae737d3dc01cd05ab052213631.jpg'},
 4: {'ranking': 4,
  'title': '这个杀手不太冷',
  'stars': '让·雷诺,加里·奥德曼,娜塔莉·波特曼',
  'score': 9.5,
  'release_time': '1994-09-14',
  'img_url': 'https://p1.meituan.net/movie/6bea9af4524dfbd0b668eaa7e187c3df767253.jpg'},
 5: {'ranking': 5,
  'title': '泰坦尼克号',
  'stars': '莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩',
  'score': 9.5,
  'release_time': '1998-04-03',
  'img_url': 'https://p1.meituan.net/mov

In [2]:
def load_config():
    """Parse ``config.json`` from the current working directory.

    Returns:
        The decoded JSON content of the file.
    """
    import json
    with open("config.json") as handle:
        settings = json.load(handle)
    return settings

In [3]:
def mongo_db(db_name):
    """Return a handle to the MongoDB database called ``db_name``.

    The server host is taken from the "host" entry of the configuration
    returned by load_config(); the port is fixed at 27017.
    """
    from pymongo import MongoClient
    host = load_config()["host"]
    return MongoClient(f'{host}:27017')[db_name]


# Database handle for 'maoyan'; the cells below read and write through it.
mongo = mongo_db('maoyan')

向MongoDB插入数据时，`insert_many`会就地修改传入的字典，为每个元素增加`_id`字段。为了后面能继续使用原始结果`res`，入库之前先将数据深拷贝一份，并用这份副本入库。

In [4]:
import copy
# Work on an independent copy so `res` itself is never touched by the insert.
res_deep_copy = copy.deepcopy(res)
# Flatten the mapping into the list of documents expected by insert_many.
list_to_mongo = list(res_deep_copy.values())
list_to_mongo[0]

{'ranking': 1,
 'title': '霸王别姬',
 'stars': '张国荣,张丰毅,巩俐',
 'score': 9.5,
 'release_time': '1993-01-01',
 'img_url': 'https://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg'}

In [5]:
# Entry for key 1 before insertion: no `_id` field yet.
res_deep_copy[1]

{'ranking': 1,
 'title': '霸王别姬',
 'stars': '张国荣,张丰毅,巩俐',
 'score': 9.5,
 'release_time': '1993-01-01',
 'img_url': 'https://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg'}

In [6]:
# Drop the collection first so re-running this cell does not duplicate documents.
mongo.top_100.drop()
# insert_many adds an `_id` key to each dict in list_to_mongo in place.
mongo.top_100.insert_many(list_to_mongo)

<pymongo.results.InsertManyResult at 0x7ff8584031c8>

In [7]:
# Same lookup after insertion: the dict now carries the `_id` added by insert_many.
res_deep_copy[1]

{'ranking': 1,
 'title': '霸王别姬',
 'stars': '张国荣,张丰毅,巩俐',
 'score': 9.5,
 'release_time': '1993-01-01',
 'img_url': 'https://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg',
 '_id': ObjectId('5d45b09db27ef8d970b32082')}

## 从MongoDB获取数据

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", 100)  # max number of columns to display, so wide frames are not truncated with an ellipsis
pd.set_option("expand_frame_repr", False)  # do not wrap the frame repr onto extra lines when there are many columns
%matplotlib inline
sns.set_style("darkgrid")

In [9]:
# find() returns a Cursor object rather than the documents themselves (see the repr below).
mongo.top_100.find()

<pymongo.cursor.Cursor at 0x7ff858b186a0>

In [10]:
# Load every document into a DataFrame and discard MongoDB's `_id` column.
df = pd.DataFrame(mongo.top_100.find()).drop("_id", axis=1)
# Show 5 randomly chosen rows.
df.sample(5)

Unnamed: 0,img_url,ranking,release_time,score,stars,title
69,https://p1.meituan.net/movie/0b0d45b58946078dd...,70,2015-02-13,9.2,"黎明,张曼玉,曾志伟",甜蜜蜜
83,https://p0.meituan.net/movie/b5ff0216e689b3fcc...,84,2013-07-31,8.8,"河正宇,李璟荣,李大为",恐怖直播
50,https://p0.meituan.net/movie/47dd790e19dad72b5...,51,2009-08-04,8.9,"爱德华·阿斯纳,乔丹·长井,鲍勃·彼德森",飞屋环游记
55,https://p0.meituan.net/movie/34998e31c6d07475f...,56,2012-11-22,9.1,"苏拉·沙玛,伊尔凡·可汗,塔布",少年派的奇幻漂流
99,https://p0.meituan.net/movie/30b20139e68c46d02...,100,2019-06-21,9.3,"柊瑠美,周冬雨,入野自由",千与千寻


### 9分以上的电影有哪些（取前10部）

In [11]:
def top_10_upper_9(mongo, score):
    """Return up to 10 movies whose ``score`` is at least ``score``.

    Args:
        mongo: Database handle exposing the ``top_100`` collection.
        score: Inclusive lower bound for the ``score`` field.

    Returns:
        A DataFrame of the matching documents with the ``_id`` column removed.
    """
    cursor = mongo.top_100.find({"score": {"$gte": score}}).limit(10)
    frame = pd.DataFrame(cursor)
    return frame.drop("_id", axis=1)


# Up to 10 movies scoring 9.0 or higher.
top_10_upper_9(mongo, 9.0)

Unnamed: 0,img_url,ranking,release_time,score,stars,title
0,https://p1.meituan.net/movie/20803f59291c47e1e...,1,1993-01-01,9.5,"张国荣,张丰毅,巩俐",霸王别姬
1,https://p0.meituan.net/movie/283292171619cdfd5...,2,1994-09-10,9.5,"蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿",肖申克的救赎
2,https://p0.meituan.net/movie/289f98ceaa8a0ae73...,3,1953-09-02,9.1,"格利高里·派克,奥黛丽·赫本,埃迪·艾伯特",罗马假日
3,https://p1.meituan.net/movie/6bea9af4524dfbd0b...,4,1994-09-14,9.5,"让·雷诺,加里·奥德曼,娜塔莉·波特曼",这个杀手不太冷
4,https://p1.meituan.net/movie/b607fba7513e7f15e...,5,1998-04-03,9.5,"莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩",泰坦尼克号
5,https://p0.meituan.net/movie/da64660f82b98cdc1...,6,1993-07-01,9.1,"周星驰,巩俐,郑佩佩",唐伯虎点秋香
6,https://p0.meituan.net/movie/46c29a8b8d8424bdd...,7,1940-05-17,9.2,"费雯·丽,罗伯特·泰勒,露塞尔·沃特森",魂断蓝桥
7,https://p0.meituan.net/movie/223c3e186db3ab4ea...,8,1939-12-15,9.1,"费雯·丽,克拉克·盖博,奥利维娅·德哈维兰",乱世佳人
8,https://p1.meituan.net/movie/ba1ed511668402605...,9,1992-01-01,9.1,"寺田农,鹫尾真知子,龟山助清",天空之城
9,https://p0.meituan.net/movie/b0d986a8bf89278af...,10,1993-12-15,9.2,"连姆·尼森,拉尔夫·费因斯,本·金斯利",辛德勒的名单


### 2010年以后的电影有哪些（取前10条）

In [12]:
def after_date(mongo, date):
    """Return up to 10 movies whose ``release_time`` is greater than ``date``.

    Args:
        mongo: Database handle exposing the ``top_100`` collection.
        date: Exclusive lower bound passed to ``$gt`` for ``release_time``.

    Returns:
        A DataFrame of the matching documents with the ``_id`` column removed.
    """
    cursor = mongo.top_100.find({'release_time': {"$gt": date}}).limit(10)
    frame = pd.DataFrame(cursor)
    return frame.drop("_id", axis=1)


# release_time values are 'YYYY-MM-DD' strings, so "2010" compares below every date in 2010.
after_date(mongo, "2010")

Unnamed: 0,img_url,ranking,release_time,score,stars,title
0,https://p0.meituan.net/movie/7787c10ad5e95b03c...,35,2010-03-12,9.3,"Forest,理查·基尔,琼·艾伦",忠犬八公的故事
1,https://p1.meituan.net/movie/bc022b86345c643ca...,36,2013-04-20,9.5,"尼古拉斯·凯奇,艾玛·斯通,瑞安·雷诺兹",疯狂原始人
2,https://p1.meituan.net/movie/2f344a9f9575edbca...,37,2010-09-01,9.2,"莱昂纳多·迪卡普里奥,渡边谦,约瑟夫·高登-莱维特",盗梦空间
3,https://p1.meituan.net/movie/91f575ec93f019f42...,40,2010-01-04,9.0,"萨姆·沃辛顿,佐伊·索尔达娜,米歇尔·罗德里格兹",阿凡达
4,https://p0.meituan.net/movie/b0d97e4158b47d653...,46,2010-05-14,9.0,"杰伊·巴鲁切尔,杰拉德·巴特勒,亚美莉卡·费雷拉",驯龙高手
5,https://p0.meituan.net/movie/845ce32778a1b3f25...,47,2011-05-12,9.2,"范·迪塞尔,保罗·沃克,道恩·强森",速度与激情5
6,https://p0.meituan.net/movie/85c2bfba6025bfbfb...,50,2010-07-09,9.0,"史蒂夫·卡瑞尔,杰森·席格尔,拉塞尔·布兰德",神偷奶爸
7,https://p1.meituan.net/movie/ca4a128a5a54d5b5e...,53,2011-12-08,9.1,"阿米尔·汗,黄渤,卡琳娜·卡普",三傻大闹宝莱坞
8,https://p0.meituan.net/movie/34998e31c6d07475f...,56,2012-11-22,9.1,"苏拉·沙玛,伊尔凡·可汗,塔布",少年派的奇幻漂流
9,https://p0.meituan.net/movie/92eb862c42c49f8e4...,59,2014-10-24,9.6,"周星驰,莫文蔚,吴孟达",大话西游之月光宝盒


### 查询某个明星的电影有哪些

In [15]:
def actors_movies(mongo, actor):
    """Return the movies whose ``stars`` field matches ``actor``.

    Args:
        mongo: Database handle exposing the ``top_100`` collection.
        actor: Pattern handed to MongoDB's ``$regex`` operator.

    Returns:
        A DataFrame of the matching documents with the ``_id`` column removed.
    """
    cursor = mongo.top_100.find({"stars": {'$regex': actor}})
    frame = pd.DataFrame(cursor)
    return frame.drop("_id", axis=1)


# Every movie whose cast string contains this actor's name.
actors_movies(mongo, "张国荣")

Unnamed: 0,img_url,ranking,release_time,score,stars,title
0,https://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg,1,1993-01-01,9.5,"张国荣,张丰毅,巩俐",霸王别姬
1,https://p0.meituan.net/movie/ae7245920d95c03765fe1615f3a1fe3865785.jpg,14,1997-05-30,9.2,"张国荣,梁朝伟,张震",春光乍泄
2,https://p1.meituan.net/movie/53b6f0b66882a53b08896c92076515a8236400.jpg,24,1993-02-05,8.9,"张国荣,梁朝伟,张学友",射雕英雄传之东成西就
3,https://p1.meituan.net/movie/7e471a9171a410ebc9413b2f1de67afc130067.jpg,39,1994-09-17,8.8,"张国荣,梁朝伟,刘嘉玲",东邪西毒
4,https://p1.meituan.net/movie/96d98200d2afb4b87ff189f9c15b6545568339.jpg,65,2011-04-30,9.2,"张国荣,王祖贤,午马",倩女幽魂
5,https://p0.meituan.net/movie/3e5f5f3aa4b7e5576521e26c2c7c894d253975.jpg,92,2017-11-17,9.2,"狄龙,张国荣,周润发",英雄本色
6,https://p0.meituan.net/movie/885fc379c614a2b4175587b95ac98eb95045650.jpg,93,2018-06-25,8.8,"张国荣,张曼玉,刘德华",阿飞正传


## mongo_main.py

In [None]:
# %load mongo_main.py
from top_100 import crawling_movies
from pymongo import MongoClient
import pandas as pd
import pprint
import copy

pd.set_option("display.max_columns", 100)  # max number of columns to display, so wide frames are not truncated with an ellipsis
pd.set_option("expand_frame_repr", False)  # do not wrap the frame repr onto extra lines when there are many columns
pd.set_option('max_colwidth', 255)  # maximum number of characters shown per cell

def load_config():
    """Read ``config.json`` from the working directory and return its parsed JSON."""
    import json
    with open("config.json") as config_file:
        parsed = json.load(config_file)
    return parsed
    
def mongo_db(db_name):
    """Connect to MongoDB and return the database named ``db_name``.

    The host comes from the "host" key of load_config(); port 27017 is
    hard-coded into the connection address.
    """
    address = f'{load_config()["host"]}:27017'
    return MongoClient(address)[db_name]


def data_to_mongo(mongo):
    """Crawl the movie list and replace the ``top_100`` collection with it.

    Args:
        mongo: Database handle exposing the ``top_100`` collection.

    The crawl runs before the collection is dropped, so an exception raised
    by crawling_movies() leaves the previously stored data untouched.
    """
    res = crawling_movies()
    # insert_many adds `_id` to the dicts it receives, so insert a deep copy
    # and leave the crawler's own result unmodified.
    documents = list(copy.deepcopy(res).values())
    # Only wipe the old data when there is something to put in its place;
    # with no documents, insert_many still raises as it did before.
    if documents:
        mongo.top_100.drop()
    mongo.top_100.insert_many(documents)


def top_10_upper_9(mongo, score):
    """Return up to 10 movies whose ``score`` is at least ``score``.

    Args:
        mongo: Database handle exposing the ``top_100`` collection.
        score: Inclusive lower bound for the ``score`` field.

    Returns:
        A list of at most 10 matching documents, without the ``_id`` field.
    """
    # `$gte` selects scores at or above the threshold; `$lte` returned the
    # movies scoring at or *below* it.
    query = {"score": {"$gte": score}}
    projection = {"_id": 0}
    # Cap the result at 10 documents, as the function name promises.
    res = mongo.top_100.find(query, projection).limit(10)
    return list(res)


def after_date(mongo, date):
    """Return the movies whose ``release_time`` is later than ``date``.

    Args:
        mongo: Database handle exposing the ``top_100`` collection.
        date: Exclusive lower bound compared against ``release_time`` with
            ``$gt`` (the caller in this file passes a 'YYYY-MM-DD' string).

    Returns:
        A list of the matching documents (including ``_id``).
    """
    # `$gt` selects releases after the date; `$lt` returned the earlier ones.
    query = {'release_time': {"$gt": date}}
    res = mongo.top_100.find(query)
    return list(res)


def actors_movies(mongo, actor):
    """Return the movies in ``top_100`` whose ``stars`` field matches ``actor``.

    Args:
        mongo: Database handle exposing the ``top_100`` collection.
        actor: Pattern matched against ``stars`` with MongoDB's ``$regex``
            operator, so a plain name matches anywhere in the cast string.

    Returns:
        A list of the matching documents (including ``_id``).
    """
    # `$exists` only checks that the field is present and ignores the actor
    # name; `$regex` performs the intended match on the cast string.
    query = {"stars": {"$regex": actor}}
    # Query `top_100`, the collection data_to_mongo() writes to, rather than
    # `top_100_json`, which nothing in this file populates.
    res = mongo.top_100.find(query)
    return list(res)


if __name__ == "__main__":
    # Open the connection.
    mongo = mongo_db('maoyan')
    # Load the crawled data into MongoDB (disabled; uncomment to re-crawl and repopulate).
    # data_to_mongo(mongo)
    # Query the data.

    # Materialise the cursor once: pprint on a Cursor prints only its repr,
    # whereas a list prints the documents and can be reused for the DataFrame.
    top_100s = list(mongo.top_100.find())
    pprint.pprint(top_100s)

    # Convert to a DataFrame for tabular viewing; errors="ignore" keeps an
    # empty collection (no `_id` column) from raising KeyError.
    df = pd.DataFrame(top_100s).drop(columns="_id", errors="ignore")
    print(df)

    print('电影9分以上的电影有哪些 取Top10')
    pprint.pprint(top_10_upper_9(mongo, 9.0))
    print('2010年以后的电影有哪些')
    pprint.pprint(after_date(mongo, '2010-01-01'))
    print('查询张国荣的电影有哪些')
    pprint.pprint(actors_movies(mongo, '张国荣'))
    # NOTE(review): this matches documents whose field '1' equals an empty
    # sub-document; the crawled documents have no such field, so this looks
    # like leftover experimentation — confirm whether it is still needed.
    pprint.pprint(list(mongo.top_100.find({'1': {}})))
