In [2]:
import pandas as pd
import numpy as np
import urllib.request
import os.path
import zipfile

# Remote location of the per-state baby-name archive and the local path it is cached under.
data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "babynamesbystate.zip"

# Download only when no cached copy exists. The bytes go to a temporary sibling
# file that is renamed into place once the transfer has finished, so an
# interrupted download cannot leave a truncated archive that the existence
# check would accept on the next run.
if not os.path.exists(local_filename):
    partial_filename = local_filename + ".part"
    try:
        with urllib.request.urlopen(data_url) as resp, open(partial_filename, 'wb') as f:
            f.write(resp.read())
        os.replace(partial_filename, local_filename)
    finally:
        # After a successful rename the partial file no longer exists; this only
        # removes leftovers from a failed download.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

# NOTE(review): zf stays open and bound, as before, in case later cells read
# other members of the archive -- confirm, and close it if nothing else uses it.
zf = zipfile.ZipFile(local_filename, 'r')

# The member is read with header=None, so the column names are supplied explicitly.
ca_name = 'CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.head()

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


In [3]:
# Label-based slice up to and including label 9, keeping every column
babynames_first_10_rows = babynames.loc[:9]

# A boolean list used as an indexer needs one entry per row: 10 entries for 10 rows.
# Rows paired with True are kept, rows paired with False are dropped.
keep_row = [True, False] * 5
babynames_first_10_rows[keep_row]

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
2,CA,F,1910,Dorothy,220
4,CA,F,1910,Frances,134
6,CA,F,1910,Evelyn,126
8,CA,F,1910,Virginia,101


In [4]:
# The same kind of boolean list also works as the row selector of .loc;
# the bare ":" keeps every column.
alternating_mask = [True, False] * 5
babynames_first_10_rows.loc[alternating_mask, :]

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
2,CA,F,1910,Dorothy,220
4,CA,F,1910,Frances,134
6,CA,F,1910,Evelyn,126
8,CA,F,1910,Virginia,101


In [5]:
# Compare every entry of the "Sex" column with "F"; the result holds one boolean per row
logical_operator = babynames["Sex"].eq("F")

# Indexing the DataFrame with that boolean result keeps only the rows marked True
babynames[logical_operator].head()

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


In [7]:
# Report how many boolean values the filter holds
value_total = len(logical_operator)
print(f"There are a total of {value_total} values in 'logical_operator'")


There are a total of 400762 values in 'logical_operator'


In [8]:
# .loc accepts a boolean condition as its row selector
is_female = babynames["Sex"] == "F"
babynames.loc[is_female].head()

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


In [11]:
# Three row-wise conditions joined with the element-wise "and" (&).
# Each comparison is parenthesized because & binds more tightly than == and <.
babynames[
    (babynames["Sex"] == "F")
    & (babynames["Year"] < 2000)
    & (babynames["State"] == "CA")
].head()

Unnamed: 0,State,Sex,Year,Name,Count
0,CA,F,1910,Mary,295
1,CA,F,1910,Helen,239
2,CA,F,1910,Dorothy,220
3,CA,F,1910,Margaret,163
4,CA,F,1910,Frances,134


In [13]:
# Keep rows whose name equals any one of four names; | is the element-wise "or"
name_column = babynames["Name"]
matches_any_name = (
    (name_column == "Jordan")
    | (name_column == "Carl")
    | (name_column == "Alex")
    | (name_column == "Sarah")
)
babynames[matches_any_name].head()

Unnamed: 0,State,Sex,Year,Name,Count
56,CA,F,1910,Sarah,29
368,CA,F,1911,Sarah,12
575,CA,F,1912,Sarah,29
887,CA,F,1913,Sarah,32
1211,CA,F,1914,Sarah,37


In [14]:
# isin marks every row whose "Name" value appears in the list
names = ["Bella", "Alex", "Ani", "Lisa"]
name_in_list = babynames["Name"].isin(names)
babynames[name_in_list].head()

Unnamed: 0,State,Sex,Year,Name,Count
6289,CA,F,1923,Bella,5
7512,CA,F,1925,Bella,8
12368,CA,F,1932,Lisa,5
14741,CA,F,1936,Lisa,8
17084,CA,F,1939,Lisa,5


In [16]:
# The .str accessor applies the string test to each name, producing one boolean per row
starts_with_jo = babynames["Name"].str.startswith("Jo")
babynames[starts_with_jo].head()

Unnamed: 0,State,Sex,Year,Name,Count
16,CA,F,1910,Josephine,66
179,CA,F,1910,Joyce,7
195,CA,F,1910,Joan,6
254,CA,F,1911,Josephine,70
466,CA,F,1911,Joan,5


In [23]:
# Select the "Count" column for the rows named Bella in a single .loc call
# (row condition first, column label second)
bella_counts = babynames.loc[babynames["Name"] == "Bella", "Count"]
bella_counts.head()

6289     5
7512     8
35477    5
54487    7
58451    6
Name: Count, dtype: int64

In [19]:
# Average of the Count values recorded for babies named Bella
bella_counts.mean()

270.1860465116279

In [21]:
# Max number of babies named Bella born in a given year.
# Series.max() computes this in one vectorized call; the builtin max()
# would iterate over the Series element by element in Python.
bella_counts.max()

902