In [17]:
# Import packages
import pandas as pd
import altair as alt

In [18]:
# Load FiveThirtyEight's US births table (SSA source, 2000-2014) straight from GitHub
url = 'https://raw.githubusercontent.com/fivethirtyeight/data/master/births/US_births_2000-2014_SSA.csv'
birthday_df = pd.read_csv(filepath_or_buffer=url)

In [19]:
# Peek at the first five rows to confirm the columns loaded as expected
birthday_df.head(n=5)

Unnamed: 0,year,month,date_of_month,day_of_week,births
0,2000,1,1,6,9083
1,2000,1,2,7,8006
2,2000,1,3,1,11363
3,2000,1,4,2,13032
4,2000,1,5,3,12558


In [20]:
# Peek at the last five rows to see where the data ends
birthday_df.tail(n=5)

Unnamed: 0,year,month,date_of_month,day_of_week,births
5474,2014,12,27,6,8656
5475,2014,12,28,7,7724
5476,2014,12,29,1,12811
5477,2014,12,30,2,13634
5478,2014,12,31,3,11990


In [21]:
# Total births per calendar year. Every numeric column gets summed, so only
# 'births' is meaningful afterwards; the summed month/day columns are not.
years_df = birthday_df.groupby("year", as_index=False).sum()

In [22]:
# years_df.head(20)

In [23]:


# Keep only the two columns the yearly chart needs; rows stay in year order
years_df1 = years_df.loc[:, ["year", "births"]]
years_df1

Unnamed: 0,year,births
0,2000,4149598
1,2001,4110963
2,2002,4099313
3,2003,4163060
4,2004,4186863
5,2005,4211941
6,2006,4335154
7,2007,4380784
8,2008,4310737
9,2009,4190991


In [24]:
# Show total births per year as a bar chart
alt.Chart(years_df1).mark_bar().encode(
    # ':O' treats year as a discrete (ordinal) category. Left untyped, an integer
    # column is encoded as quantitative, which draws thin bars on a continuous
    # axis and formats the tick labels like "2,000".
    x = alt.X('year:O', title = 'Year'),
    y = alt.Y('births:Q', title = 'Total births'),
).properties(
    title = 'US births per year'
)

# TODO: the yearly totals are all fairly close together, so differences are hard to see
# on an axis that starts at zero. Consider a view whose y-axis starts near 3,900,000 births.

In [25]:
# Next step: find the most common birthday by aggregating births by month and day of month.
# First list the columns that are available after the yearly aggregation.
years_df.columns

Index(['year', 'month', 'date_of_month', 'day_of_week', 'births'], dtype='object')

In [26]:
# Only month, day of month and births are needed to rank calendar days
days_df = birthday_df.loc[:, ['month', 'date_of_month', 'births']]

In [27]:
# Sum births for each calendar day (month + day of month) across all years.
# NOTE(review): Feb 29 only occurs in leap years, so its total is not directly
# comparable with the totals of the other days.
days_df1 = days_df.groupby(["month", "date_of_month"], as_index=False).sum()
days_df1.head(n=5)

Unnamed: 0,month,date_of_month,births
0,1,1,116030
1,1,2,144083
2,1,3,170115
3,1,4,171663
4,1,5,166682


In [28]:
# Sanity check: row count, dtypes and non-null counts of the per-day totals
days_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   month          366 non-null    int64
 1   date_of_month  366 non-null    int64
 2   births         366 non-null    int64
dtypes: int64(3)
memory usage: 8.7 KB


In [29]:
# Summary statistics; compare the 'births' min and max against the quartiles to spot outlier days
days_df1.describe()

Unnamed: 0,month,date_of_month,births
count,366.0,366.0,366.0
mean,6.513661,15.756831,169909.901639
std,3.455958,8.823592,11588.371352
min,1.0,1.0,41869.0
25%,4.0,8.0,165908.25
50%,7.0,16.0,170017.5
75%,9.75,23.0,175587.0
max,12.0,31.0,191024.0


In [30]:
# Order calendar days from most to fewest total births
top_days_df = days_df1.sort_values('births', ascending = False)
top_days_df.head(n=5)

Unnamed: 0,month,date_of_month,births
255,9,12,191024
262,9,19,190435
263,9,20,188123
353,12,19,186685
253,9,10,186668


In [31]:
# Month number -> month name, so the most common birthdays can be shown to the
# user with readable month names instead of numbers.
month_dict = dict(enumerate(
    [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ],
    start = 1,
))

# Build a display copy with month names; top_days_df itself keeps numeric months
month_names = top_days_df['month'].replace(month_dict)
top_days_df1 = top_days_df.assign(month = month_names)
top_days_df1.head(n=5)

Unnamed: 0,month,date_of_month,births
255,September,12,191024
262,September,19,190435
263,September,20,188123
353,December,19,186685
253,September,10,186668


In [32]:
# Order calendar days from fewest to most total births (ascending is the default)
low_days_df = days_df1.sort_values('births')
low_days_df.head(n=5)

Unnamed: 0,month,date_of_month,births
59,2,29,41869
359,12,25,96568
0,1,1,116030
358,12,24,120516
185,7,4,132229


In [33]:
# Let the user look up a birthday and report where it ranks among the 366 calendar days in the data,
# e.g. "April 13th is the 161st most common birthday, which puts you in the top 50%." This could be an
# interactive piece they explore after seeing the birthday data set.
# Give context for this data set so they understand which years the data covers (2000-2014), rather than
# assuming it is recent data (e.g. from 2020).

In [34]:
# Number the rows 1..N in their current order (most births first), giving each day its rank
n_days = top_days_df.shape[0]
top_days_df = top_days_df.assign(row_number = range(1, n_days + 1))
top_days_df.head(n=5)

Unnamed: 0,month,date_of_month,births,row_number
255,9,12,191024,1
262,9,19,190435,2
263,9,20,188123,3
353,12,19,186685,4
253,9,10,186668,5


In [35]:
# The bottom of the ranking: the five least common birthdays
top_days_df.tail(n=5)

Unnamed: 0,month,date_of_month,births,row_number
185,7,4,132229,362
358,12,24,120516,363
0,1,1,116030,364
359,12,25,96568,365
59,2,29,41869,366


In [None]:
# Work towards an ordinal-suffix column derived from row_number, so that the rank
# reads naturally when it is printed out to the user.

# Step one: pull out the final digit of each rank
top_days_df['last_digit'] = top_days_df['row_number'].mod(10)
top_days_df.head(n=5)
# top_days_df.tail()

Unnamed: 0,month,date_of_month,births,row_number,last_digit
255,9,12,191024,1,1
262,9,19,190435,2,2
263,9,20,188123,3,3
353,12,19,186685,4,4
253,9,10,186668,5,5


In [39]:
def simple_suffix(d):
    """Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for an integer.

    Parameters
    ----------
    d : int
        Either a single final digit (0-9) or a whole number such as a rank.

    Returns
    -------
    str
        'th' for numbers ending in 11, 12 or 13 (11th, 112th, ...); otherwise
        'st', 'nd' or 'rd' for a final digit of 1, 2 or 3, and 'th' for
        everything else.
    """
    # Numbers ending in 11-13 break the last-digit rule: 11th/12th/13th, not 11st/12nd/13rd.
    if 11 <= d % 100 <= 13:
        return 'th'
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(d % 10, 'th')

# Ranks ending in 11, 12 or 13 take 'th' (11th, 112th, 213th). The last digit alone
# cannot tell them apart from 1st/2nd/3rd, so force 'th' for those ranks.
is_teen = top_days_df['row_number'].mod(100).between(11, 13)
top_days_df['suffix'] = top_days_df['last_digit'].apply(simple_suffix).mask(is_teen, 'th')

In [40]:
# Check the suffix column against the first five ranks
top_days_df.head(n=5)

Unnamed: 0,month,date_of_month,births,row_number,last_digit,suffix
255,9,12,191024,1,1,st
262,9,19,190435,2,2,nd
263,9,20,188123,3,3,rd
353,12,19,186685,4,4,th
253,9,10,186668,5,5,th


In [None]:
# Now, given a birthday, we can look up its row_number and suffix and report it as the "Nth most common
# birthday" out of all the days in the year.