In [1]:
import pymongo
import pprint
import json
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')

In [8]:
# Load the Atlas credentials from a local JSON file kept out of the notebook.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open('credential.json', encoding='utf-8') as jfile:
    credential = json.load(jfile)

### Load Sample Data Through MongoDB Atlas

* Go to your Atlas account, click "Collections"
* Click "Load Sample Data"
* You can also see the data structure as well as some sample values through Atlas

In [7]:
# Open a client against the cluster named by the stored connection string.
connection_str = credential["connection_str"]
client = pymongo.MongoClient(connection_str)

# Show every database this client can see on the cluster.
client.list_database_names()

['sample_airbnb',
 'sample_analytics',
 'sample_geospatial',
 'sample_mflix',
 'sample_restaurants',
 'sample_supplies',
 'sample_training',
 'sample_weatherdata',
 'admin',
 'local']

In [9]:
# Select the restaurants sample database by name.
db = client["sample_restaurants"]

# A collection plays the role a table plays in a relational database.
db.list_collection_names()

['restaurants', 'neighborhoods']

In [11]:
# Count the number of documents in each collection.
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# Collection.count_documents with an empty filter counts every document.
print(db['restaurants'].count_documents({}))
print(db['neighborhoods'].count_documents({}))

25359
195


In [12]:
# Peek at a single restaurant document to see its fields.
db.restaurants.find_one()

{'_id': ObjectId('5eb3d668b31de5d588f4292a'),
 'address': {'building': '2780',
  'coord': [-73.98241999999999, 40.579505],
  'street': 'Stillwell Avenue',
  'zipcode': '11224'},
 'borough': 'Brooklyn',
 'cuisine': 'American',
 'grades': [{'date': datetime.datetime(2014, 6, 10, 0, 0),
   'grade': 'A',
   'score': 5},
  {'date': datetime.datetime(2013, 6, 5, 0, 0), 'grade': 'A', 'score': 7},
  {'date': datetime.datetime(2012, 4, 13, 0, 0), 'grade': 'A', 'score': 12},
  {'date': datetime.datetime(2011, 10, 12, 0, 0), 'grade': 'A', 'score': 12}],
 'name': 'Riviera Caterer',
 'restaurant_id': '40356018'}

In [14]:
# List only the top-level field names of one neighborhood document.
db.neighborhoods.find_one().keys()

dict_keys(['_id', 'geometry', 'name'])

## MongoDB Index

### Default Index

* By default, MongoDB gives an `_id` to each document in a collection and builds an index on it; this is the default index of each collection.
* However, a query that does not filter on an indexed field (such as a plain `find()`) still uses "COLLSCAN" (collection scan), which means MongoDB needs to scan every document in the collection to look for the relevant documents.

In [16]:
# Print the indexes currently defined on each collection.
print(db['restaurants'].index_information())
print(db['neighborhoods'].index_information())

{'_id_': {'v': 2, 'key': [('_id', 1)], 'ns': 'sample_restaurants.restaurants'}}
{'_id_': {'v': 2, 'key': [('_id', 1)], 'ns': 'sample_restaurants.neighborhoods'}}


In [17]:
# Pretty-print the query plan for an unfiltered find() on restaurants.
pprint(db['restaurants'].find().explain())

{'$clusterTime': {'clusterTime': Timestamp(1600551368, 1),
                  'signature': {'hash': b'R\xeey\xf5{\xf7\xf2\xec\xb2^\xfe\x9a'
                                        b'.\xf3\xb8\x8d\x81K\x81"',
                                'keyId': 6873329467658338307}},
 'executionStats': {'allPlansExecution': [],
                    'executionStages': {'advanced': 25359,
                                        'direction': 'forward',
                                        'docsExamined': 25359,
                                        'executionTimeMillisEstimate': 1,
                                        'isEOF': 1,
                                        'nReturned': 25359,
                                        'needTime': 1,
                                        'needYield': 0,
                                        'restoreState': 198,
                                        'saveState': 198,
                                        'stage': 'COLLSCAN',
                     

💡 If we check `executionStats` here, the "stage" is "COLLSCAN", which indicates that the query is using a collection scan. Therefore, `nReturned` and `totalDocsExamined` have the same value, which is also the total number of documents in the `restaurants` collection (the 25359 counted earlier).