# 2. Working with Distinct Values and Sets

In [1]:
# Importing libraries
import re

from pymongo import MongoClient
from pprint import pprint

from bson.regex import Regex

In [2]:
# Connect to MongoDB; with no arguments the driver uses its default host and port
client = MongoClient()

# Handle to the "nobel" database (attribute access is equivalent to client["nobel"])
db = client.nobel

## 2.1 Survey Distinct Values

In [3]:
# Fetch one laureate with at least three prizes: "prizes.2" addresses the
# third element (index 2) of the "prizes" array, so it exists only when the
# array holds three or more entries.
three_prizes_filter = {"prizes.2": {"$exists": True}}
pprint(db.laureates.find_one(three_prizes_filter))

{'_id': ObjectId('6706d88371ea025ecc3510a4'),
 'born': '0000-00-00',
 'died': '0000-00-00',
 'firstname': 'Comité international de la Croix Rouge (International Committee '
              'of the Red Cross)',
 'gender': 'org',
 'id': '482',
 'prizes': [{'affiliations': [[]],
             'category': 'peace',
             'share': '1',
             'year': '1917'},
            {'affiliations': [[]],
             'category': 'peace',
             'share': '1',
             'year': '1944'},
            {'affiliations': [[]],
             'category': 'peace',
             'share': '2',
             'year': '1963'}]}


### Using .distinct()

In [4]:
# Unique values of the top-level "gender" field across all laureate documents
db.laureates.distinct("gender")

['female', 'male', 'org']

### .distinct() with dot notation

In [5]:
# Dot notation reaches into the "prizes" array: collect the unique "category"
# values found in the laureates' prize subdocuments
db.laureates.distinct("prizes.category")

['chemistry', 'economics', 'literature', 'medicine', 'peace', 'physics']

### Exercises

In [6]:
# Are the distinct Nobel Prize categories cataloged by the "prizes"
# collection the same as those cataloged by the "laureates" collection?
# Query the "prizes" categories once and reuse the result for both the
# printout and the comparison instead of asking the server twice.
prize_categories = db.prizes.distinct("category")
print(prize_categories)
set(db.laureates.distinct("prizes.category")) == set(prize_categories)

['chemistry', 'economics', 'literature', 'medicine', 'peace', 'physics']


True

In [7]:
# Peek at one document to see the structure of the "prizes" collection
db.prizes.find_one({})

{'_id': ObjectId('6706d88371ea025ecc350d20'),
 'year': '2018',
 'category': 'physics',
 'overallMotivation': '“for groundbreaking inventions in the field of laser physics”',
 'laureates': [{'id': '960',
   'firstname': 'Arthur',
   'surname': 'Ashkin',
   'motivation': '"for the optical tweezers and their application to biological systems"',
   'share': '2'},
  {'id': '961',
   'firstname': 'Gérard',
   'surname': 'Mourou',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'},
  {'id': '962',
   'firstname': 'Donna',
   'surname': 'Strickland',
   'motivation': '"for their method of generating high-intensity, ultra-short optical pulses"',
   'share': '4'}]}

In [8]:
# Countries that appear as a country of death but never as a country of birth
died_in = set(db.laureates.distinct("diedCountry"))
born_in = set(db.laureates.distinct("bornCountry"))
countries = died_in.difference(born_in)
pprint(countries)

{'Barbados',
 'Czechoslovakia',
 'East Germany',
 'Gabon',
 'Greece',
 'Israel',
 'Jamaica',
 'Northern Rhodesia (now Zambia)',
 'Philippines',
 'Puerto Rico',
 'Tunisia',
 'USSR',
 'Yugoslavia (now Serbia)'}


In [9]:
# Number of distinct countries found among the laureates' prize affiliations
affiliation_countries = db.laureates.distinct("prizes.affiliations.country")
count = len(affiliation_countries)
print(count)

29


## 2.2 Distinct Values Given Filters

In [10]:
# Reviewing data: print one laureate having a prize whose "share" is the string "4"
pprint(db.laureates.find_one({"prizes.share": "4"}))

{'_id': ObjectId('6706d88371ea025ecc350f7c'),
 'born': '1936-01-10',
 'bornCity': 'Houston, TX',
 'bornCountry': 'USA',
 'bornCountryCode': 'US',
 'died': '0000-00-00',
 'firstname': 'Robert Woodrow',
 'gender': 'male',
 'id': '112',
 'prizes': [{'affiliations': [{'city': 'Holmdel, NJ',
                               'country': 'USA',
                               'name': 'Bell Laboratories'}],
             'category': 'physics',
             'motivation': '"for their discovery of cosmic microwave '
                           'background radiation"',
             'share': '4',
             'year': '1978'}],
 'surname': 'Wilson'}


### Finding Unique Values

In [11]:
# Unique prize categories across all laureates (no filter applied)
db.laureates.distinct("prizes.category")

['chemistry', 'economics', 'literature', 'medicine', 'peace', 'physics']

### Filtering

In [12]:
# Materialize every laureate having a prize whose "share" is the string "4".
# NOTE: despite the "gte" in the variable name, this is an exact match.
quarter_share_filter = {"prizes.share": "4"}
share_gte_4 = list(db.laureates.find(quarter_share_filter))
print(len(share_gte_4))
pprint(share_gte_4[:2])

66
[{'_id': ObjectId('6706d88371ea025ecc350f7c'),
  'born': '1936-01-10',
  'bornCity': 'Houston, TX',
  'bornCountry': 'USA',
  'bornCountryCode': 'US',
  'died': '0000-00-00',
  'firstname': 'Robert Woodrow',
  'gender': 'male',
  'id': '112',
  'prizes': [{'affiliations': [{'city': 'Holmdel, NJ',
                                'country': 'USA',
                                'name': 'Bell Laboratories'}],
              'category': 'physics',
              'motivation': '"for their discovery of cosmic microwave '
                            'background radiation"',
              'share': '4',
              'year': '1978'}],
  'surname': 'Wilson'},
 {'_id': ObjectId('6706d88371ea025ecc350f81'),
  'born': '1947-07-20',
  'bornCity': 'Frankfurt-on-the-Main',
  'bornCountry': 'West Germany (now Germany)',
  'bornCountryCode': 'DE',
  'died': '0000-00-00',
  'firstname': 'Gerd',
  'gender': 'male',
  'id': '128',
  'prizes': [{'affiliations': [{'city': 'Rüschlikon',
                    

### All together: Filtering and Finding Unique Values

In [13]:
# Distinct prize categories among laureates that have some prize with share "4"
quarter_share = {"prizes.share": '4'}
db.laureates.distinct("prizes.category", quarter_share)

['chemistry', 'medicine', 'physics']

In [14]:
# Same question asked of the "prizes" collection: distinct categories of
# prizes that list at least one laureate with share "4"
quarter_share_prizes = {"laureates.share": "4"}
db.prizes.distinct("category", quarter_share_prizes)

['chemistry', 'medicine', 'physics']

In [15]:
# Count laureates with more than one prize: "prizes.1" exists only when the
# "prizes" array has a second element.
multi_prize_filter = {"prizes.1": {"$exists": True}}
db.laureates.count_documents(multi_prize_filter)

6

In [16]:
# Distinct categories across the prizes of laureates holding more than one prize
has_second_prize = {"prizes.1": {"$exists": True}}
db.laureates.distinct("prizes.category", has_second_prize)

['chemistry', 'peace', 'physics']

### Exercises

In [17]:
# In which countries have USA-born laureates had affiliations for their prizes?
usa_born = {'bornCountry': 'USA'}
db.laureates.distinct("prizes.affiliations.country", usa_born)

['Australia', 'Denmark', 'USA', 'United Kingdom']

In [18]:
# Prize categories having at least one prize shared by three or more
# laureates ("laureates.2" exists only when a third laureate is listed).
criteria = {'laureates.2': {"$exists": True}}
triple_play_categories = set(db.prizes.distinct('category', criteria))
print(triple_play_categories)

# Confirm literature as the only category not satisfying the criteria.
all_categories = set(db.prizes.distinct('category'))
assert all_categories.difference(triple_play_categories) == {'literature'}

{'physics', 'medicine', 'peace', 'chemistry', 'economics'}


## 2.3 Filter Arrays using Distinct Values

### Reviewing laureates data

In [19]:
# Print one laureate document to recall the schema before querying its arrays
pprint(db.laureates.find_one({}))

{'_id': ObjectId('6706d88371ea025ecc350f6e'),
 'born': '1853-07-18',
 'bornCity': 'Arnhem',
 'bornCountry': 'the Netherlands',
 'bornCountryCode': 'NL',
 'died': '1928-02-04',
 'diedCountry': 'the Netherlands',
 'diedCountryCode': 'NL',
 'firstname': 'Hendrik Antoon',
 'gender': 'male',
 'id': '2',
 'prizes': [{'affiliations': [{'city': 'Leiden',
                               'country': 'the Netherlands',
                               'name': 'Leiden University'}],
             'category': 'physics',
             'motivation': '"in recognition of the extraordinary service they '
                           'rendered by their researches into the influence of '
                           'magnetism upon radiation phenomena"',
             'share': '2',
             'year': '1902'}],
 'surname': 'Lorentz'}


### Array fields and operators

In [20]:
# Count laureates with at least one prize whose category equals "physics"
db.laureates.count_documents({"prizes.category": "physics"})

209

In [21]:
# Count laureates with NO prize in physics: on an array field, "$ne" matches
# only documents where none of the elements equals the value, so a laureate
# holding one physics prize and one chemistry prize is excluded here.
db.laureates.count_documents({"prizes.category": {"$ne": "physics"}})

725

In [22]:
# Count laureates with at least one prize in any of these three categories
science_categories = ["physics", "chemistry", "medicine"]
db.laureates.count_documents({"prizes.category": {"$in": science_categories}})

604

In [23]:
# Count laureates with NO prize in any of these three categories: on an array
# field, "$nin" matches only documents where none of the elements is in the
# list, so this is the complement of the "$in" count.
db.laureates.count_documents({
    "prizes.category": {
        "$nin": ["physics", "chemistry", "medicine"]
    }
})

330

### Enter $elemMatch

In [24]:
# What if we want to filter on more than one field within a prize
# subdocument? A first attempt at counting laureates who won unshared prizes
# in physics is shown below, but it is not quite what we want: this filter
# matches only prize subdocuments that have these two fields and no others.
# No laureate has a prize subdocument that looks exactly like this; all
# prize subdocuments also have a "year" field, for instance.
exact_prize = {"category": "physics", "share": "1"}
db.laureates.count_documents({"prizes": exact_prize})

0

In [25]:
# Still not quite what we want. This filter matches laureate documents that
# satisfy two independent conditions: the "prizes" array has at least one
# subdocument with "category" equal to "physics", and it has at least one
# subdocument with "share" equal to "1". The two conditions may be satisfied
# by different prizes of the same laureate.
independent_conditions = {"prizes.category": "physics", "prizes.share": "1"}
db.laureates.count_documents(independent_conditions)

48

In [26]:
# Print one laureate document again to look at the fields inside a prize subdocument
pprint(db.laureates.find_one({}))

{'_id': ObjectId('6706d88371ea025ecc350f6e'),
 'born': '1853-07-18',
 'bornCity': 'Arnhem',
 'bornCountry': 'the Netherlands',
 'bornCountryCode': 'NL',
 'died': '1928-02-04',
 'diedCountry': 'the Netherlands',
 'diedCountryCode': 'NL',
 'firstname': 'Hendrik Antoon',
 'gender': 'male',
 'id': '2',
 'prizes': [{'affiliations': [{'city': 'Leiden',
                               'country': 'the Netherlands',
                               'name': 'Leiden University'}],
             'category': 'physics',
             'motivation': '"in recognition of the extraordinary service they '
                           'rendered by their researches into the influence of '
                           'magnetism upon radiation phenomena"',
             'share': '2',
             'year': '1902'}],
 'surname': 'Lorentz'}


In [27]:
# Finally, count all laureates that have at least one unshared prize in
# physics: "$elemMatch" requires a single prize subdocument to satisfy every
# condition at once.
solo_physics_prize = {"category": "physics", "share": "1"}
db.laureates.count_documents({"prizes": {"$elemMatch": solo_physics_prize}})

47

In [28]:
# Show one laureate having a single prize that is both in physics and unshared
unshared_physics_prize = {"category": "physics", "share": "1"}
pprint(db.laureates.find_one({"prizes": {"$elemMatch": unshared_physics_prize}}))

{'_id': ObjectId('6706d88371ea025ecc350f7f'),
 'born': '1943-06-28',
 'bornCity': 'Schroda',
 'bornCountry': 'German-occupied Poland (now Poland)',
 'bornCountryCode': 'PL',
 'died': '0000-00-00',
 'firstname': 'Klaus',
 'gender': 'male',
 'id': '126',
 'prizes': [{'affiliations': [{'city': 'Stuttgart',
                               'country': 'Federal Republic of Germany',
                               'name': 'Max-Planck-Institut für '
                                       'Festkörperforschung'}],
             'category': 'physics',
             'motivation': '"for the discovery of the quantized Hall effect"',
             'share': '1',
             'year': '1985'}],
 'surname': 'von Klitzing'}


In [29]:
# Operators can nest inside "$elemMatch" to make finer-grained queries.
# Here the previous filter is extended to count laureates only if they won
# a solo prize in physics before 1945. Years are stored as strings, so
# "$lt" compares them as text.
solo_physics_pre_1945 = {
    "category": "physics",
    "share": "1",
    "year": {"$lt": "1945"},
}
db.laureates.count_documents({"prizes": {"$elemMatch": solo_physics_pre_1945}})

29

In [30]:
# Show one laureate with a solo physics prize awarded before 1945
early_solo_physics = {
    "category": "physics",
    "share": "1",
    "year": {"$lt": "1945"},
}
pprint(db.laureates.find_one({"prizes": {"$elemMatch": early_solo_physics}}))

{'_id': ObjectId('6706d88371ea025ecc350fae'),
 'born': '1845-03-27',
 'bornCity': 'Lennep (now Remscheid)',
 'bornCountry': 'Prussia (now Germany)',
 'bornCountryCode': 'DE',
 'died': '1923-02-10',
 'diedCity': 'Munich',
 'diedCountry': 'Germany',
 'diedCountryCode': 'DE',
 'firstname': 'Wilhelm Conrad',
 'gender': 'male',
 'id': '1',
 'prizes': [{'affiliations': [{'city': 'Munich',
                               'country': 'Germany',
                               'name': 'Munich University'}],
             'category': 'physics',
             'motivation': '"in recognition of the extraordinary services he '
                           'has rendered by the discovery of the remarkable '
                           'rays subsequently named after him"',
             'share': '1',
             'year': '1901'}],
 'surname': 'Röntgen'}


### Exercises

In [31]:
# Number of laureates who won a shared prize in physics before 1945
# (a prize counts as shared when its "share" is anything other than "1").
shared_physics_pre_1945 = {
    "category": "physics",
    "share": {"$ne": "1"},
    "year": {"$lt": "1945"},
}
db.laureates.count_documents({"prizes": {"$elemMatch": shared_physics_pre_1945}})

19

In [32]:
# What is the approximate ratio of the number of laureates who won an
# unshared ({"share": "1"}) prize in physics after World War II
# ({"year": {"$gte": "1945"}}) to the number of laureates who won a shared
# prize in physics after World War II?
post_war = {"$gte": "1945"}
unshared_prize = {"category": "physics", "share": "1", "year": post_war}
shared_prize = {"category": "physics", "share": {"$ne": "1"}, "year": post_war}

count_unshared = db.laureates.count_documents(
    {"prizes": {"$elemMatch": unshared_prize}}
)
count_shared = db.laureates.count_documents(
    {"prizes": {"$elemMatch": shared_prize}}
)
count_unshared, count_shared, count_unshared / count_shared

(18, 143, 0.1258741258741259)

In [33]:
# What is this ratio for prize categories other than physics, chemistry,
# and medicine?
non_science = {"$nin": ["physics", "chemistry", "medicine"]}
from_1945 = {"$gte": "1945"}

# A single prize must satisfy category, share and year together.
unshared = {
    "prizes": {
        "$elemMatch": {"category": non_science, "share": "1", "year": from_1945}
    }
}
shared = {
    "prizes": {
        "$elemMatch": {
            "category": non_science,
            "share": {"$ne": "1"},
            "year": from_1945,
        }
    }
}

n_unshared = db.laureates.count_documents(unshared)
n_shared = db.laureates.count_documents(shared)
ratio = n_unshared / n_shared
print(ratio)

1.3653846153846154


In [34]:
# How many organizations won prizes before 1945 versus in or after 1945?
# Filter for organization laureates with at least one prize before 1945.
before = {"gender": "org", "prizes.year": {"$lt": "1945"}}

# Filter for organization laureates with at least one prize in or after 1945.
in_or_after = {"gender": "org", "prizes.year": {"$gte": "1945"}}

n_before, n_in_or_after = (
    db.laureates.count_documents(org_filter)
    for org_filter in (before, in_or_after)
)
# NOTE(review): an organization with prizes on both sides of 1945 satisfies
# both filters and is therefore counted in both totals.
ratio = n_in_or_after / (n_in_or_after + n_before)
print(ratio)

0.84


## 2.4 Distinct As You Like It

### Finding a substring with $regex

In [35]:
# Look at one laureate document whose "firstname" is exactly "Marie"
pprint(db.laureates.find_one({"firstname": "Marie"}))

{'_id': ObjectId('6706d88371ea025ecc350fb2'),
 'born': '1867-11-07',
 'bornCity': 'Warsaw',
 'bornCountry': 'Russian Empire (now Poland)',
 'bornCountryCode': 'PL',
 'died': '1934-07-04',
 'diedCity': 'Sallanches',
 'diedCountry': 'France',
 'diedCountryCode': 'FR',
 'firstname': 'Marie',
 'gender': 'female',
 'id': '6',
 'prizes': [{'affiliations': [[]],
             'category': 'physics',
             'motivation': '"in recognition of the extraordinary services they '
                           'have rendered by their joint researches on the '
                           'radiation phenomena discovered by Professor Henri '
                           'Becquerel"',
             'share': '4',
             'year': '1903'},
            {'affiliations': [{'city': 'Paris',
                               'country': 'France',
                               'name': 'Sorbonne University'}],
             'category': 'chemistry',
             'motivation': '"in recognition of her services to the a

In [36]:
# How can we filter for values of "bornCountry" that contain "Poland" as a
# substring? Use the "$regex" operator with the substring as the pattern.
contains_poland = {"bornCountry": {"$regex": "Poland"}}
db.laureates.distinct("bornCountry", contains_poland)

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

### Flag options for regular expressions

In [37]:
# Case-sensitive match: no "$options" are given, so "Poland" must appear
# with exactly this capitalization.
poland_exact_case = {"$regex": "Poland"}
case_sensitive = db.laureates.distinct("bornCountry", {"bornCountry": poland_exact_case})
case_sensitive

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

In [38]:
# The "i" option makes the match case-insensitive, so the lowercase pattern
# "poland" still finds "Poland".
poland_any_case = {"$regex": "poland", "$options": "i"}
case_insensitive = db.laureates.distinct("bornCountry", {"bornCountry": poland_any_case})
case_insensitive

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

In [39]:
# Check that the case-sensitive and case-insensitive queries returned the same countries
assert set(case_sensitive) == set(case_insensitive)

In [40]:
# The pymongo driver ships a bson package whose Regex class can be used in
# place of the {"$regex": ..., "$options": ...} form.
poland_regex = Regex("poland", "i")
db.laureates.distinct("bornCountry", {"bornCountry": poland_regex})

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

In [41]:
# Native Python regular expression objects are accepted as well. This is not
# the recommended form, though: the bson Regex class is more robust for MongoDB.
poland_pattern = re.compile("poland", flags=re.IGNORECASE)
db.laureates.distinct("bornCountry", {"bornCountry": poland_pattern})

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

### Beginning and ending (and escaping)

In [42]:
# To match only at the beginning of a field's value, start the pattern with
# the caret (^) character.
starts_with_poland = Regex("^Poland")
db.laureates.distinct("bornCountry", {"bornCountry": starts_with_poland})

['Poland',
 'Poland (now Belarus)',
 'Poland (now Lithuania)',
 'Poland (now Ukraine)']

In [43]:
# To match a literal opening parenthesis, rather than have it act as a regex
# grouping character, escape it with a backslash.
poland_then_now = Regex(r"^Poland \(now")
db.laureates.distinct("bornCountry", {"bornCountry": poland_then_now})

['Poland (now Belarus)', 'Poland (now Lithuania)', 'Poland (now Ukraine)']

In [44]:
# Finally, to match only at the end of a field's value, end the pattern with
# the dollar sign ($); the closing parenthesis is escaped to match literally.
ends_with_now_poland = Regex(r"now Poland\)$")
db.laureates.distinct("bornCountry", {"bornCountry": ends_with_now_poland})

['Austria-Hungary (now Poland)',
 'Free City of Danzig (now Poland)',
 'German-occupied Poland (now Poland)',
 'Germany (now Poland)',
 'Prussia (now Poland)',
 'Russian Empire (now Poland)']

In [45]:
# How many laureates in total have a first name beginning with "G" and a
# surname beginning with "S"? Both anchored patterns must match.
g_and_s_names = {
    "firstname": Regex(r"^G"),
    "surname": Regex(r"^S"),
}
db.laureates.count_documents(g_and_s_names)

9

In [46]:
# Use a regular expression object to filter for laureates with "Germany"
# anywhere in their "bornCountry" value.
criteria = {"bornCountry": Regex("Germany")}
germany_anywhere = db.laureates.distinct("bornCountry", criteria)
pprint(set(germany_anywhere))

{'Bavaria (now Germany)',
 'East Friesland (now Germany)',
 'Germany',
 'Germany (now France)',
 'Germany (now Poland)',
 'Germany (now Russia)',
 'Hesse-Kassel (now Germany)',
 'Mecklenburg (now Germany)',
 'Prussia (now Germany)',
 'Schleswig (now Germany)',
 'W&uuml;rttemberg (now Germany)',
 'West Germany (now Germany)'}


In [47]:
# Use a regular expression object to filter for laureates whose
# "bornCountry" value starts with "Germany".
criteria = {"bornCountry": Regex(r"^Germany")}
germany_first = db.laureates.distinct("bornCountry", criteria)
pprint(set(germany_first))

{'Germany',
 'Germany (now France)',
 'Germany (now Poland)',
 'Germany (now Russia)'}


In [48]:
# Use a regular expression object to filter for laureates born in what was
# Germany at the time but is now another country.
criteria = {"bornCountry": Regex(r"^Germany \(now")}
formerly_germany = db.laureates.distinct("bornCountry", criteria)
pprint(set(formerly_germany))

{'Germany (now Russia)', 'Germany (now France)', 'Germany (now Poland)'}


In [49]:
# Use a regular expression object to filter for laureates born in what is
# now Germany but was another country at the time.
criteria = {"bornCountry": Regex(r"now Germany\)$")}
now_germany = db.laureates.distinct("bornCountry", criteria)
pprint(set(now_germany))

{'Bavaria (now Germany)',
 'East Friesland (now Germany)',
 'Hesse-Kassel (now Germany)',
 'Mecklenburg (now Germany)',
 'Prussia (now Germany)',
 'Schleswig (now Germany)',
 'W&uuml;rttemberg (now Germany)',
 'West Germany (now Germany)'}


In [50]:
# Three people shared a Nobel prize "for their researches on
# semiconductors and their discovery of the transistor effect".
# Filter on "transistor" (case-insensitively) as a substring of a laureate's
# "prizes.motivation" values to find these laureates.
criteria = {"prizes.motivation": Regex("transistor", "i")}
first, last = "firstname", "surname"
# Fetch only the two name fields instead of whole documents, and read them
# with .get() because organization laureates are stored without a "surname"
# and would otherwise raise KeyError if a pattern ever matched one.
names_only = {first: 1, last: 1, "_id": 0}
print([
    (laureate.get(first), laureate.get(last))
    for laureate in db.laureates.find(criteria, names_only)
])

[('William Bradford', 'Shockley'), ('John', 'Bardeen'), ('Walter Houser', 'Brattain')]


---------------------------------------------