# 3. Get Only What You Need, and Fast
You can now query collections with ease and collect documents to examine and analyze with Python. But this process is sometimes slow and onerous for large collections and documents. This chapter is about various ways to speed up and simplify that process.

In [60]:
# Importing libraries
import expectexception

from pymongo import MongoClient
from pprint import pprint

from bson.regex import Regex
from operator import itemgetter
from collections import Counter

In [2]:
# Connect to MongoDB with the client's default settings
client = MongoClient()

# Handle to the "nobel" database used throughout this notebook
db = client.nobel

## 3.1 Projection

In [3]:
# Reviewing the laureates data
pprint(db.laureates.find_one({}))

{'_id': ObjectId('6706d88371ea025ecc350f6e'),
 'born': '1853-07-18',
 'bornCity': 'Arnhem',
 'bornCountry': 'the Netherlands',
 'bornCountryCode': 'NL',
 'died': '1928-02-04',
 'diedCountry': 'the Netherlands',
 'diedCountryCode': 'NL',
 'firstname': 'Hendrik Antoon',
 'gender': 'male',
 'id': '2',
 'prizes': [{'affiliations': [{'city': 'Leiden',
                               'country': 'the Netherlands',
                               'name': 'Leiden University'}],
             'category': 'physics',
             'motivation': '"in recognition of the extraordinary service they '
                           'rendered by their researches into the influence of '
                           'magnetism upon radiation phenomena"',
             'share': '2',
             'year': '1902'}],
 'surname': 'Lorentz'}


### Projection in MongoDB

In [4]:
# Keep only the nested "prizes.affiliations" field.
# "_id" is returned unless it is switched off explicitly with 0.
affiliations_only = {"prizes.affiliations": 1, "_id": 0}
docs = db.laureates.find(filter={}, projection=affiliations_only)
type(docs)

pymongo.cursor.Cursor

In [5]:
data = list(docs)
len(data)

934

In [6]:
pprint(data[:2])

[{'prizes': [{'affiliations': [{'city': 'Leiden',
                                'country': 'the Netherlands',
                                'name': 'Leiden University'}]}]},
 {'prizes': [{'affiliations': [{'city': 'Providence, RI',
                                'country': 'USA',
                                'name': 'Brown University'}]}]}]


In [7]:
# This does not work: "affiliations" lives inside "prizes", so projecting a
# top-level "affiliations" field selects nothing and each document comes back empty.
docs = db.laureates.find(filter={}, projection={"affiliations": 1, "_id": 0})
pprint(list(docs[:2]))

[{}, {}]


### Missing fields

In [8]:
# Select organizations with "gender": "org".
# Organizations have no "bornCountry", so that projected field is simply absent
# from the results.
docs = db.laureates.find(
    filter={"gender": "org"},
    projection=["bornCountry", "firstname"],
)
pprint(list(docs))

[{'_id': ObjectId('6706d88371ea025ecc3510a4'),
  'firstname': 'Comité international de la Croix Rouge (International '
               'Committee of the Red Cross)'},
 {'_id': ObjectId('6706d88371ea025ecc3510f9'),
  'firstname': 'Friends Service Council (The Quakers)'},
 {'_id': ObjectId('6706d88371ea025ecc3510fd'),
  'firstname': "United Nations Children's Fund (UNICEF)"},
 {'_id': ObjectId('6706d88371ea025ecc3510ff'),
  'firstname': 'International Labour Organization (I.L.O.)'},
 {'_id': ObjectId('6706d88371ea025ecc351109'),
  'firstname': 'Amnesty International'},
 {'_id': ObjectId('6706d88371ea025ecc351113'),
  'firstname': 'United Nations Peacekeeping Forces'},
 {'_id': ObjectId('6706d88371ea025ecc35111e'),
  'firstname': 'Pugwash Conferences on Science and World Affairs'},
 {'_id': ObjectId('6706d88371ea025ecc351121'),
  'firstname': 'International Campaign to Ban Landmines (ICBL)'},
 {'_id': ObjectId('6706d88371ea025ecc351125'),
  'firstname': 'Médecins Sans Frontières'},
 {'_id'

In [9]:
# only projected fields that exist are returned
#  laureates does not have `favoriteIceCreamFlavor` field
docs = db.laureates.find({}, ["favoriteIceCreamFlavor"])
pprint(list(docs)[:5])

[{'_id': ObjectId('6706d88371ea025ecc350f6e')},
 {'_id': ObjectId('6706d88371ea025ecc350f6f')},
 {'_id': ObjectId('6706d88371ea025ecc350f70')},
 {'_id': ObjectId('6706d88371ea025ecc350f71')},
 {'_id': ObjectId('6706d88371ea025ecc350f72')}]


### Simple aggregation

In [10]:
# The long way: an explicit accumulator loop
docs = db.laureates.find({}, ["prizes"])
n_prizes = 0
for doc in docs:
    # add the number of prizes held by this laureate
    n_prizes = n_prizes + len(doc["prizes"])
print(n_prizes)

941


In [11]:
# The short way: sum the per-laureate prize counts with a generator expression
docs = db.laureates.find({}, ["prizes"])
sum(len(doc["prizes"]) for doc in docs)

941

### Exercises

#### Shares of the 1903 Prize in Physics

In [12]:
# You want to examine the laureates of the 1903 prize in physics 
# and how they split the prize. Here is a query without projection:
pprint(db.laureates.find_one({
    "prizes": {
        "$elemMatch": {
            "category": "physics", 
            "year": "1903"
        }
    }
}))

{'_id': ObjectId('6706d88371ea025ecc350fb0'),
 'born': '1852-12-15',
 'bornCity': 'Paris',
 'bornCountry': 'France',
 'bornCountryCode': 'FR',
 'died': '1908-08-25',
 'diedCountry': 'France',
 'diedCountryCode': 'FR',
 'firstname': 'Antoine Henri',
 'gender': 'male',
 'id': '4',
 'prizes': [{'affiliations': [{'city': 'Paris',
                               'country': 'France',
                               'name': 'École Polytechnique'}],
             'category': 'physics',
             'motivation': '"in recognition of the extraordinary services he '
                           'has rendered by his discovery of spontaneous '
                           'radioactivity"',
             'share': '2',
             'year': '1903'}],
 'surname': 'Becquerel'}


In [13]:
# Which projection(s) will fetch ONLY the laureates' full names and
# prize share info? Experiment with the console to re-familiarize
# yourself with the structure of the laureates collection documents.
physics_1903 = {
    "prizes": {"$elemMatch": {"category": "physics", "year": "1903"}}
}
names_and_shares = {
    'firstname': 1,
    'surname': 1,
    'prizes.share': 1,
    '_id': 0,
}
pprint(list(db.laureates.find(filter=physics_1903, projection=names_and_shares)))

[{'firstname': 'Antoine Henri',
  'prizes': [{'share': '2'}],
  'surname': 'Becquerel'},
 {'firstname': 'Pierre', 'prizes': [{'share': '4'}], 'surname': 'Curie'},
 {'firstname': 'Marie',
  'prizes': [{'share': '4'}, {'share': '1'}],
  'surname': 'Curie, née Sklodowska'}]


In [14]:
pprint(list(db.laureates.find(
    filter = {
        "prizes": {
            "$elemMatch": {
                "category": "physics", 
                "year": "1903"
            }
        }
    },
    projection = ['firstname', 'surname', 'prizes.share']
)))

[{'_id': ObjectId('6706d88371ea025ecc350fb0'),
  'firstname': 'Antoine Henri',
  'prizes': [{'share': '2'}],
  'surname': 'Becquerel'},
 {'_id': ObjectId('6706d88371ea025ecc350fb1'),
  'firstname': 'Pierre',
  'prizes': [{'share': '4'}],
  'surname': 'Curie'},
 {'_id': ObjectId('6706d88371ea025ecc350fb2'),
  'firstname': 'Marie',
  'prizes': [{'share': '4'}, {'share': '1'}],
  'surname': 'Curie, née Sklodowska'}]


#### Rounding up the G.S. crew

In [15]:
# Fetch one laureate whose "firstname" starts with "Py"
# ("^" anchors the pattern at the beginning of the string)
starts_with_py = {"firstname": {"$regex": "^Py"}}
pprint(db.laureates.find_one(starts_with_py))

{'_id': ObjectId('6706d88371ea025ecc35104b'),
 'born': '1894-07-09',
 'bornCity': 'Kronshtadt',
 'bornCountry': 'Russian Empire (now Russia)',
 'bornCountryCode': 'RU',
 'died': '1984-04-08',
 'diedCity': 'Moscow',
 'diedCountry': 'USSR (now Russia)',
 'diedCountryCode': 'RU',
 'firstname': 'Pyotr Leonidovich',
 'gender': 'male',
 'id': '110',
 'prizes': [{'affiliations': [{'city': 'Moscow',
                               'country': 'USSR',
                               'name': 'Academy of Sciences'}],
             'category': 'physics',
             'motivation': '"for his basic inventions and discoveries in the '
                           'area of low-temperature physics"',
             'share': '2',
             'year': '1978'}],
 'surname': 'Kapitsa'}


In [16]:
# Laureates whose first name starts with "G" and whose surname starts with "S"
docs = db.laureates.find(
    filter={
        "firstname": {"$regex": "^G"},
        "surname": {"$regex": "^S"},
    }
)
pprint(docs[0])

{'_id': ObjectId('6706d88371ea025ecc35108b'),
 'born': '1903-12-19',
 'bornCity': 'Bradford, MA',
 'bornCountry': 'USA',
 'bornCountryCode': 'US',
 'died': '1996-06-06',
 'diedCity': 'Bar Harbor, ME',
 'diedCountry': 'USA',
 'diedCountryCode': 'US',
 'firstname': 'George D.',
 'gender': 'male',
 'id': '421',
 'prizes': [{'affiliations': [{'city': 'Bar Harbor, ME',
                               'country': 'USA',
                               'name': 'Jackson Laboratory'}],
             'category': 'medicine',
             'motivation': '"for their discoveries concerning genetically '
                           'determined structures on the cell surface that '
                           'regulate immunological reactions"',
             'share': '3',
             'year': '1980'}],
 'surname': 'Snell'}


In [17]:
docs = db.laureates.find(
       filter= {"firstname" : Regex("^G"),
                "surname" : Regex("^S")})
pprint(docs[0])

{'_id': ObjectId('6706d88371ea025ecc35108b'),
 'born': '1903-12-19',
 'bornCity': 'Bradford, MA',
 'bornCountry': 'USA',
 'bornCountryCode': 'US',
 'died': '1996-06-06',
 'diedCity': 'Bar Harbor, ME',
 'diedCountry': 'USA',
 'diedCountryCode': 'US',
 'firstname': 'George D.',
 'gender': 'male',
 'id': '421',
 'prizes': [{'affiliations': [{'city': 'Bar Harbor, ME',
                               'country': 'USA',
                               'name': 'Jackson Laboratory'}],
             'category': 'medicine',
             'motivation': '"for their discoveries concerning genetically '
                           'determined structures on the cell surface that '
                           'regulate immunological reactions"',
             'share': '3',
             'year': '1980'}],
 'surname': 'Snell'}


In [18]:
# Same "G... S..." query as before, but returning only the
# "firstname" and "surname" fields ("_id" is excluded explicitly).
docs = db.laureates.find(
    filter={
        "firstname": {"$regex": "^G"},
        "surname": {"$regex": "^S"},
    },
    projection={"firstname": 1, "surname": 1, "_id": 0},
)
pprint(docs[0])

{'firstname': 'George D.', 'surname': 'Snell'}


In [19]:
# Build the full names of laureates whose first name starts with "G" and
# whose surname starts with "S".
# The projection keeps only "firstname" and "surname"; the two are then
# joined with a single space for each returned document.
docs = db.laureates.find(
    filter={
        "firstname": {"$regex": "^G"},
        "surname": {"$regex": "^S"},
    },
    projection={"firstname": 1, "surname": 1, "_id": 0},
)
full_names = [" ".join((doc["firstname"], doc["surname"])) for doc in docs]
pprint(full_names)

['George D. Snell',
 'Gustav Stresemann',
 'Glenn Theodore Seaborg',
 'George J. Stigler',
 'George F. Smoot',
 'George E. Smith',
 'George P. Smith',
 'George Bernard Shaw',
 'Giorgos Seferis']


#### Doing our share of data validation

In [20]:
# Reviewing the data
pprint(db.prizes.find_one({}))

{'_id': ObjectId('6706d88371ea025ecc350d20'),
 'category': 'physics',
 'laureates': [{'firstname': 'Arthur',
                'id': '960',
                'motivation': '"for the optical tweezers and their application '
                              'to biological systems"',
                'share': '2',
                'surname': 'Ashkin'},
               {'firstname': 'Gérard',
                'id': '961',
                'motivation': '"for their method of generating high-intensity, '
                              'ultra-short optical pulses"',
                'share': '4',
                'surname': 'Mourou'},
               {'firstname': 'Donna',
                'id': '962',
                'motivation': '"for their method of generating high-intensity, '
                              'ultra-short optical pulses"',
                'share': '4',
                'surname': 'Strickland'}],
 'overallMotivation': '“for groundbreaking inventions in the field of laser '
                   

In [21]:
# Check that, for each prize, the shares of all its laureates add up to 1.
# Each "share" value is used as a denominator: '4' contributes 1/4.

# Sums of reciprocals such as 1/3 + 1/3 + 1/3 are computed in floating point,
# so compare against 1 with a small tolerance instead of exact equality.
SHARE_TOLERANCE = 1e-9

prizes = db.prizes.find({}, ["laureates.share"])
for prize in prizes:
    total_share = sum(1 / float(laureate['share'])
                      for laureate in prize["laureates"])
    if abs(total_share - 1) > SHARE_TOLERANCE:
        raise Exception('Wrong Data!')
print('All documents pass the validation check!')

All documents pass the validation check!


## 3.2 Sorting

### Sorting post-query with Python

In [22]:
docs= list(db.prizes.find({"category": "physics"}, ["year"]))
pprint([doc["year"] for doc in docs][:5])

['2018', '2015', '2014', '2012', '2011']


In [23]:
docs = sorted(docs, key=itemgetter("year"))
pprint([doc["year"] for doc in docs][:5])

['1901', '1902', '1903', '1904', '1905']


In [24]:
docs = sorted(docs, key=itemgetter("year"), reverse=True)
pprint([doc["year"] for doc in docs][:5])

['2018', '2017', '2016', '2015', '2014']


### Sorting in-query with MongoDB

In [25]:
# Let MongoDB do the ascending sort. The sort spec is a list of
# (field, direction) pairs, the same form every other query in this
# notebook uses.
# NOTE(review): a plain dict is presumably only accepted by newer PyMongo
# releases — confirm before using that form.
cursor = db.prizes.find({"category": "physics"}, ["year"],
                        sort=[("year", 1)])
print([doc["year"] for doc in cursor][:5])

['1901', '1902', '1903', '1904', '1905']


In [26]:
cursor = db.prizes.find({"category": "physics"}, ["year"],
                        sort=[("year", -1)])
print([doc["year"] for doc in cursor][:5])

['2018', '2017', '2016', '2015', '2014']


### Primary and secondary sorting

In [27]:
# Primary sort: year ascending; secondary sort: category descending
prizes_1967_to_1969 = db.prizes.find(
    filter={"year": {"$gt": "1966", "$lt": "1970"}},
    projection=["category", "year"],
    sort=[("year", 1), ("category", -1)],
)
for doc in prizes_1967_to_1969:
    print("{year} {category}".format(**doc))

1967 physics
1967 medicine
1967 literature
1967 chemistry
1968 physics
1968 peace
1968 medicine
1968 literature
1968 chemistry
1969 physics
1969 peace
1969 medicine
1969 literature
1969 economics
1969 chemistry


### Exercises

#### What the sort?

In [28]:
# Print the first five projected documents of a sorted query.
# The sort spec is a list of (field, direction) pairs, so the priority is
# explicit: "prizes.year" ascending first, then "born" descending.
# NOTE(review): Python dicts keep insertion order since 3.7, so ordering is
# not the obstacle to writing {"prizes.year": 1, "born": -1}; whether a dict
# is accepted here presumably depends on the installed PyMongo version — confirm.
sort_spec = [("prizes.year", 1), ("born", -1)]
docs = list(db.laureates.find(
    filter={"born": {"$gte": "1900"}, "prizes.year": {"$gte": "1954"}},
    projection={"born": 1, "prizes.year": 1, "_id": 0},
    sort=sort_spec,
))
for doc in docs[:5]:
    print(doc)

{'born': '1916-08-25', 'prizes': [{'year': '1954'}]}
{'born': '1915-06-15', 'prizes': [{'year': '1954'}]}
{'born': '1901-02-28', 'prizes': [{'year': '1954'}, {'year': '1962'}]}
{'born': '1913-07-12', 'prizes': [{'year': '1955'}]}
{'born': '1911-01-26', 'prizes': [{'year': '1955'}]}


#### What happens if a key is missing in some records?

In [29]:
%%expect_exception KeyError

# Sorting laureates by "surname"
sorted_laureates = sorted(list(db.laureates.find(filter={})), 
                          key=itemgetter("surname"))

[1;31m---------------------------------------------------------------------------[0m
[1;31mKeyError[0m                                  Traceback (most recent call last)
Cell [1;32mIn[29], line 2[0m
[0;32m      1[0m [38;5;66;03m# Sorting laureates by "surname"[39;00m
[1;32m----> 2[0m sorted_laureates [38;5;241m=[39m [38;5;28;43msorted[39;49m[43m([49m[38;5;28;43mlist[39;49m[43m([49m[43mdb[49m[38;5;241;43m.[39;49m[43mlaureates[49m[38;5;241;43m.[39;49m[43mfind[49m[43m([49m[38;5;28;43mfilter[39;49m[38;5;241;43m=[39;49m[43m{[49m[43m}[49m[43m)[49m[43m)[49m[43m,[49m[43m [49m
[0;32m      3[0m [43m                          [49m[43mkey[49m[38;5;241;43m=[39;49m[43mitemgetter[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43msurname[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m)[49m

[1;31mKeyError[0m: 'surname'


In [30]:
# We have 34 documents without surname
db.laureates.count_documents(filter={"surname": {"$exists": False}})

34

In [31]:
# Let the query drop the documents that lack "surname",
# then sort what remains by "surname" in Python.
laureates_with_surname = db.laureates.find(filter={"surname": {"$exists": True}})
sorted_laureates = sorted(laureates_with_surname, key=itemgetter("surname"))
pprint([laureate['surname'] for laureate in sorted_laureates[:5]])

["'t Hooft",
 '(John William Strutt)',
 '(Lord Edgar Algernon Robert Gascoyne Cecil)',
 'Abrikosov',
 'Addams']


#### Sorting together: MongoDB + Python

In [32]:
# Sorting together: MongoDB + Python
# In this exercise you'll explore the prizes in the physics category.
# We will use Python to sort laureates for one prize by last name, 
# and then MongoDB to sort prizes by year
# sort the laureates by surname

def all_laureates(prize):
    """Return the laureates of one prize as a single " and "-joined string.

    Laureates are listed by surname in sorted order. A laureate dict without
    a "surname" key is listed under its "firstname" instead of raising
    KeyError (an empty string is used if that is missing too).

    Args:
        prize: a prize document holding a "laureates" list of dicts.

    Returns:
        The names joined with " and ", e.g. "Lorentz and Zeeman".
    """
    def display_name(laureate):
        # Fall back to "firstname" when the record has no "surname"
        return laureate.get("surname", laureate.get("firstname", ""))

    # Sorting the extracted names gives the same order as sorting the
    # laureate dicts by that name and extracting afterwards.
    names = sorted(display_name(laureate) for laureate in prize["laureates"])
    return " and ".join(names)

# Physics prizes, oldest first, keeping only the year and the laureates' names
docs = db.prizes.find(
    filter={"category": "physics"},
    projection=["year", "laureates.firstname", "laureates.surname"],
    sort=[("year", 1)],
)

# One line per prize for the first ten: the year and the names from all_laureates
for doc in docs[:10]:
    names = all_laureates(doc)
    print("{year}: {names}".format(year=doc["year"], names=names))

# `doc` still refers to the last prize of the loop; show it as a sample document
print('--------------------')
pprint(doc)

1901: Röntgen
1902: Lorentz and Zeeman
1903: Becquerel and Curie and Curie, née Sklodowska
1904: (John William Strutt)
1905: von Lenard
1906: Thomson
1907: Michelson
1908: Lippmann
1909: Braun and Marconi
1910: van der Waals
--------------------
{'_id': ObjectId('6706d88371ea025ecc350f55'),
 'laureates': [{'firstname': 'Johannes Diderik', 'surname': 'van der Waals'}],
 'year': '1910'}


In [33]:
db.prizes.find_one({})

{'_id': ObjectId('6706d88371ea025ecc350d20'),
 'year': '2018',
 'category': 'physics',
 'overallMotivation': '“for groundbreaking inventions in the field of laser physics”',
 'laureates': [{'id': '960',
   'firstname': 'Arthur',
   'surname': 'Ashkin',
   'motivation': '"for the optical tweezers and their application to biological systems"',
   'share': '2'},
  {'id': '961',
   'firstname': 'Gérard',
   'surname': 'Mourou',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'},
  {'id': '962',
   'firstname': 'Donna',
   'surname': 'Strickland',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'}]}

#### Gap years

In [34]:
# Use sorting by multiple fields to see which categories are
# missing in which years.
# Categories that were awarded in 1901
original_categories = db.prizes.distinct('category', {'year': '1901'})
print(f'Original categories: {original_categories}\n')

# Project year and category; newest year first, categories in
# ascending order within each year
docs = db.prizes.find(
    filter={},
    projection={'year': 1, 'category': 1, '_id': 0},
    sort=[('year', -1), ('category', 1)],
)

# Group the categories of the first 20 documents by year.
# Note: the 20-document slice can cut the oldest listed year short, so a
# short list for that year does not by itself mean categories are missing.
year_category = {}
for doc in docs[:20]:
    year_category.setdefault(doc['year'], []).append(doc['category'])
pprint(year_category)

Original categories: ['chemistry', 'literature', 'medicine', 'peace', 'physics']

{'2015': ['chemistry', 'economics', 'literature'],
 '2016': ['chemistry',
          'economics',
          'literature',
          'medicine',
          'peace',
          'physics'],
 '2017': ['chemistry',
          'economics',
          'literature',
          'medicine',
          'peace',
          'physics'],
 '2018': ['chemistry', 'economics', 'medicine', 'peace', 'physics']}


## 3.3 What are indexes

### Gauging performance before indexing

In [35]:
%%timeit
docs = list(db.prizes.find({"year": "1901"}))

443 μs ± 17.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [36]:
%%timeit
docs = list(db.prizes.find({}, sort=[("year", 1)]))

2.36 ms ± 34.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Adding a single-field index

In [37]:
# index model: list of (field, direction) pairs.
# directions: 1 (ascending) and -1 (descending)
db.prizes.create_index([("year", 1)])

'year_1'

In [38]:
%%timeit
# Previously: 443 μs ± 17.1 μs
docs = list(db.prizes.find({"year": "1901"}))

267 μs ± 13.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [39]:
%%timeit
# Previously: 2.36 ms ± 34.3 μs
docs = list(db.prizes.find({}, sort=[("year", 1)]))

2.79 ms ± 144 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Adding a compound (multiple-field) index

In [47]:
%%timeit
docs = list(db.prizes.find(
    filter={"category": "economics"},
    projection={"year": 1, "_id": 0}
))

549 μs ± 5.78 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [48]:
%%timeit
doc = db.prizes.find_one(
    filter={"category": "economics"},
    projection={"year": 1, "_id": 0},
    sort=[("year", 1)]
)

613 μs ± 29.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [49]:
# creating the compound index
db.prizes.create_index([("category", 1), ("year", 1)])
# db.prizes.drop_index('category_1_year_1')   # To drop it

'category_1_year_1'

In [50]:
%%timeit

# index "covering" a query with projection
# before: 549 μs ± 5.78 μs
docs = list(db.prizes.find(
    filter={"category": "economics"},
    projection={"year": 1, "_id": 0}
))

316 μs ± 31.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [51]:
%%timeit

# index "covering" a query with projection and sorting
# before: 613 μs ± 29.5 μs
doc = db.prizes.find_one(
    filter={"category": "economics"},
    projection={"year": 1, "_id": 0},
    sort=[("year", 1)]
)

258 μs ± 18.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Learn more: ask your collection and your queries

In [54]:
# This helps confirm which indexes exist for a collection.
db.laureates.index_information() # always an index on "_id" field

{'_id_': {'v': 2, 'key': [('_id', 1)]}}

In [56]:
db.prizes.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)]},
 'year_1': {'v': 2, 'key': [('year', 1)]},
 'category_1_year_1': {'v': 2, 'key': [('category', 1), ('year', 1)]}}

In [57]:
# This provides output from its query plan detailing how a given query will execute.
db.laureates.find(
    filter={"firstname": "Marie"}, projection={"bornCountry": 1, "_id": 0}
).explain()

# ...
# 'winningPlan': {'stage': 'PROJECTION',
# 'transformBy': {'bornCountry': 1, '_id': 0},
# 'inputStage': {'stage': 'COLLSCAN',
# ...

{'explainVersion': '1',
 'queryPlanner': {'namespace': 'nobel.laureates',
  'parsedQuery': {'firstname': {'$eq': 'Marie'}},
  'indexFilterSet': False,
  'queryHash': '67081E99',
  'planCacheKey': '7FD1D01D',
  'optimizationTimeMillis': 0,
  'maxIndexedOrSolutionsReached': False,
  'maxIndexedAndSolutionsReached': False,
  'maxScansToExplodeReached': False,
  'prunedSimilarIndexes': False,
  'winningPlan': {'isCached': False,
   'stage': 'PROJECTION_SIMPLE',
   'transformBy': {'bornCountry': 1, '_id': 0},
   'inputStage': {'stage': 'COLLSCAN',
    'filter': {'firstname': {'$eq': 'Marie'}},
    'direction': 'forward'}},
  'rejectedPlans': []},
 'executionStats': {'executionSuccess': True,
  'nReturned': 1,
  'executionTimeMillis': 1,
  'totalKeysExamined': 0,
  'totalDocsExamined': 934,
  'executionStages': {'isCached': False,
   'stage': 'PROJECTION_SIMPLE',
   'nReturned': 1,
   'executionTimeMillisEstimate': 0,
   'works': 935,
   'advanced': 1,
   'needTime': 933,
   'needYield': 0,


In [58]:
db.laureates.create_index([("firstname", 1), ("bornCountry", 1)])
db.laureates.find({"firstname": "Marie"}, {"bornCountry": 1, "_id": 0}).explain()

# ...
# 'winningPlan': {'stage': 'PROJECTION',
# 'transformBy': {'bornCountry': 1, '_id': 0},
# 'inputStage': {'stage': 'IXSCAN',
# 'keyPattern': {'firstname': 1, 'bornCountry': 1},
# 'indexName': 'firstname_1_bornCountry_1',
# ...

{'explainVersion': '1',
 'queryPlanner': {'namespace': 'nobel.laureates',
  'parsedQuery': {'firstname': {'$eq': 'Marie'}},
  'indexFilterSet': False,
  'queryHash': '67081E99',
  'planCacheKey': '321AB6A0',
  'optimizationTimeMillis': 0,
  'maxIndexedOrSolutionsReached': False,
  'maxIndexedAndSolutionsReached': False,
  'maxScansToExplodeReached': False,
  'prunedSimilarIndexes': False,
  'winningPlan': {'isCached': False,
   'stage': 'PROJECTION_COVERED',
   'transformBy': {'bornCountry': 1, '_id': 0},
   'inputStage': {'stage': 'IXSCAN',
    'keyPattern': {'firstname': 1, 'bornCountry': 1},
    'indexName': 'firstname_1_bornCountry_1',
    'isMultiKey': False,
    'multiKeyPaths': {'firstname': [], 'bornCountry': []},
    'isUnique': False,
    'isSparse': False,
    'isPartial': False,
    'indexVersion': 2,
    'direction': 'forward',
    'indexBounds': {'firstname': ['["Marie", "Marie"]'],
     'bornCountry': ['[MinKey, MaxKey]']}}},
  'rejectedPlans': []},
 'executionStats': {'

### Exercises

#### Recently single?

In [59]:
# A prize might be awarded to a single laureate or to several.
# For each prize category, report the most recent year that
# a single laureate -- rather than several -- received a prize
# in that category. As part of this task, we ensure an index
# that speeds up finding prizes by category and then sorting
# results by decreasing year.

# Index model for compound sorting: category ascending, year descending
index_model = [('category', 1), ('year', -1)]
db.prizes.create_index(index_model)

# Collect the last single-laureate year for each category
report_lines = []
for category in sorted(db.prizes.distinct("category")):
    doc = db.prizes.find_one(
        {'category': category, "laureates.share": "1"},
        sort=[('year', -1)]
    )
    if doc is None:
        # find_one returns None when nothing matches, i.e. the category has
        # no prize with a share of "1"; skip it rather than crash in format()
        continue
    report_lines.append("{category}: {year}\n".format(**doc))
report = "".join(report_lines)
print(report)

chemistry: 2011
economics: 2017
literature: 2017
medicine: 2016
peace: 2017
physics: 1992



#### Born and affiliated

In [62]:
# Some countries are, for one or more laureates, both their
# country of birth ("bornCountry") and a country of affiliation
# for one or more of their prizes ("prizes.affiliations.country").
# Find the five countries of birth with the highest counts of
# such laureates.

# Ensure an index on country of birth
db.laureates.create_index([('bornCountry', 1)])

# For every country of birth, count the laureates that were born there
# and also have an affiliation in that same country
n_born_and_affiliated = {}
for country in db.laureates.distinct("bornCountry"):
    both_born_and_affiliated = {
        'bornCountry': country,
        "prizes.affiliations.country": country,
    }
    n_born_and_affiliated[country] = db.laureates.count_documents(both_born_and_affiliated)

five_most_common = Counter(n_born_and_affiliated).most_common(5)
pprint(five_most_common)

[('USA', 241),
 ('United Kingdom', 56),
 ('France', 26),
 ('Germany', 19),
 ('Japan', 17)]


## 3.4 Limits

### Limiting our exploration

In [66]:
# For every prize, either all of its laureates have share "3" or none does.
for doc in db.prizes.find({}, ["laureates.share"]):
    share_is_three = [laureate["share"] == "3"
                      for laureate in doc["laureates"]]
    # The assertion belongs inside the loop: placed after it, only the
    # last prize returned by the query would be checked.
    assert all(share_is_three) or not any(share_is_three)

In [68]:
docs = list(db.prizes.find({"laureates.share": "3"}))
len(docs)

69

In [70]:
# Using limits
for doc in db.prizes.find({"laureates.share": "3"}, limit=3):
    print("{year} {category}".format(**doc))

2016 chemistry
2015 chemistry
2014 physics


In [71]:
for doc in db.prizes.find({"laureates.share": "3"}, limit=5):
    print("{year} {category}".format(**doc))

2016 chemistry
2015 chemistry
2014 physics
2013 chemistry
2013 medicine


In [73]:
pprint(doc)

{'_id': ObjectId('6706d88371ea025ecc350d33'),
 'category': 'medicine',
 'laureates': [{'firstname': 'James E.',
                'id': '884',
                'motivation': '"for their discoveries of machinery regulating '
                              'vesicle traffic, a major transport system in '
                              'our cells"',
                'share': '3',
                'surname': 'Rothman'},
               {'firstname': 'Randy W.',
                'id': '885',
                'motivation': '"for their discoveries of machinery regulating '
                              'vesicle traffic, a major transport system in '
                              'our cells"',
                'share': '3',
                'surname': 'Schekman'},
               {'firstname': 'Thomas C.',
                'id': '886',
                'motivation': '"for their discoveries of machinery regulating '
                              'vesicle traffic, a major transport system in '
                 

### Skips and paging through results

In [80]:
for doc in db.prizes.find({"laureates.share": "3"}, limit=9):
    print("{year} {category}".format(**doc))

2016 chemistry
2015 chemistry
2014 physics
2013 chemistry
2013 medicine
2013 economics
2011 peace
2010 chemistry
2008 chemistry


In [84]:
# Pagination with skip: page k (counting from 0) skips the first 3*k matches
# and then takes the next 3
for page_index in range(3):
    print("Page", page_index + 1)
    page_docs = db.prizes.find({"laureates.share": "3"},
                               skip=3 * page_index, limit=3)
    for doc in page_docs:
        print("{year} {category}".format(**doc))
    print("--------------------")

Page 1
2016 chemistry
2015 chemistry
2014 physics
--------------------
Page 2
2013 chemistry
2013 medicine
2013 economics
--------------------
Page 3
2011 peace
2010 chemistry
2008 chemistry
--------------------


### Using cursor methods for {sort, skip, limit}

In [89]:
# No order applied
for doc in db.prizes.find({"laureates.share": "3"}).limit(6):
    print("{year} {category}".format(**doc))

2016 chemistry
2015 chemistry
2014 physics
2013 chemistry
2013 medicine
2013 economics


In [90]:
# No order applied
for doc in (db.prizes.find({"laureates.share": "3"}).skip(3).limit(3)):
    print("{year} {category}".format(**doc))

2013 chemistry
2013 medicine
2013 economics


In [91]:
for doc in (db.prizes.find({"laureates.share": "3"})
                     .sort([("year", 1)])
                     .skip(3)
                     .limit(3)):
    print("{year} {category}".format(**doc))

1954 medicine
1956 physics
1956 medicine


### Simpler sorts of sort

In [94]:
# Three equivalent ways to ask for an ascending sort on "year".
# sort is chained after skip and limit here; the results match the earlier
# cell where sort was chained first.

# 1) a list of (field, direction) pairs
cursor1 = (db.prizes.find({"laureates.share": "3"})
                    .skip(3)
                    .limit(3)
                    .sort([("year", 1)]))
# 2) field and direction as two separate arguments
cursor2 = (db.prizes.find({"laureates.share": "3"})
                    .skip(3)
                    .limit(3)
                    .sort("year", 1))
# 3) field only; the direction is left at its default
cursor3 = (db.prizes.find({"laureates.share": "3"})
                    .skip(3)
                    .limit(3)
                    .sort("year"))

In [95]:
docs = list(cursor1)
assert docs == list(cursor2) == list(cursor3)
for doc in docs:
    print("{year} {category}".format(**doc))

1954 medicine
1956 physics
1956 medicine


In [97]:
doc = db.prizes.find_one({"laureates.share": "3"}, 
                         skip=3, 
                         sort=[("year", 1)])
print("{year} {category}".format(**doc))

1954 medicine


### Exercises

#### Setting a new limit

In [99]:
# How many documents does the following expression return?
# 5: the second call to limit overrides the first
# You can think of the query parameters as being updated like a dictionary in Python
list(db.prizes.find(filter={"category": "economics"}, 
                    projection={"year": 1, "_id": 0})
       .sort("year")
       .limit(3)
       .limit(5))

[{'year': '1969'},
 {'year': '1970'},
 {'year': '1971'},
 {'year': '1972'},
 {'year': '1973'}]

#### The first five prizes with quarter shares

In [100]:
# Find the first five prizes (by year) with one or more laureates
# holding a 1/4 share. Project the prize category, year, and the
# laureates' motivations.

filter_ = {'laureates.share': '4'}
projection = ['category', 'year', 'laureates.motivation']
cursor = db.prizes.find(
    filter=filter_,
    projection=projection,
    sort=[('year', 1)],
    limit=5,
)
pprint(list(cursor))

[{'_id': ObjectId('6706d88371ea025ecc350f15'),
  'category': 'physics',
  'laureates': [{'motivation': '"in recognition of the extraordinary services '
                               'he has rendered by his discovery of '
                               'spontaneous radioactivity"'},
                {'motivation': '"in recognition of the extraordinary services '
                               'they have rendered by their joint researches '
                               'on the radiation phenomena discovered by '
                               'Professor Henri Becquerel"'},
                {'motivation': '"in recognition of the extraordinary services '
                               'they have rendered by their joint researches '
                               'on the radiation phenomena discovered by '
                               'Professor Henri Becquerel"'}],
  'year': '1903'},
 {'_id': ObjectId('6706d88371ea025ecc350ebc'),
  'category': 'chemistry',
  'laureates': [{'motivation':

#### Pages of particle-prized people

In [101]:
# Create the function get_particle_laureates that, given page_number
# and page_size, retrieves one page of prize data on laureates whose
# prize motivation contains the word "particle".
def get_particle_laureates(page_number=1, page_size=3):
    """Return one page of laureates whose prize motivation mentions "particle".

    Matching laureates are sorted by prize year and then by surname, and
    the page is cut out with skip/limit.

    Args:
        page_number: 1-based page index (int >= 1).
        page_size: number of laureates per page (int >= 1).

    Returns:
        A list of laureate documents holding "firstname", "surname" and
        "prizes" (plus "_id").

    Raises:
        ValueError: if page_number or page_size is not a positive integer.
    """
    # Check the type before comparing, so that non-numeric input produces the
    # ValueError below instead of a TypeError from the `<` comparison.
    if not isinstance(page_number, int) or page_number < 1:
        raise ValueError("Pages are natural numbers (starting from 1).")
    if not isinstance(page_size, int) or page_size < 1:
        raise ValueError("Page size must be a positive integer.")
    particle_laureates = list(
        db.laureates.find(
            {'prizes.motivation': {'$regex': "particle"}},
            ["firstname", "surname", "prizes"])
        .sort([('prizes.year', 1), ('surname', 1)])
        .skip(page_size * (page_number - 1))
        .limit(page_size))
    return particle_laureates
# Fetch pages 1 through 8 and show the first one
pages = [get_particle_laureates(page_number=number) for number in range(1, 9)]
pprint(pages[0])

[{'_id': ObjectId('6706d88371ea025ecc350fcd'),
  'firstname': 'Charles Thomson Rees',
  'prizes': [{'affiliations': [{'city': 'Cambridge',
                                'country': 'United Kingdom',
                                'name': 'University of Cambridge'}],
              'category': 'physics',
              'motivation': '"for his method of making the paths of '
                            'electrically charged particles visible by '
                            'condensation of vapour"',
              'share': '2',
              'year': '1927'}],
  'surname': 'Wilson'},
 {'_id': ObjectId('6706d88371ea025ecc350fe3'),
  'firstname': 'Sir John Douglas',
  'prizes': [{'affiliations': [{'city': 'Harwell, Berkshire',
                                'country': 'United Kingdom',
                                'name': 'Atomic Energy Research '
                                        'Establishment'}],
              'category': 'physics',
              'motivation': '"for their pione

------------------------------------