In [2]:
import pandas as pd

In [3]:
# The data source is from Malaysia DOSM (https://data.gov.my/data-catalogue/births)
URL_DATA = 'https://storage.data.gov.my/demography/births.parquet'

# Extract data from source
df = pd.read_parquet(URL_DATA)

print(df.head())

         date     state  births
0  1920-01-01  Malaysia      96
1  1920-01-02  Malaysia     115
2  1920-01-03  Malaysia     111
3  1920-01-04  Malaysia     101
4  1920-01-05  Malaysia      95


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    37833 non-null  object
 1   state   37833 non-null  object
 2   births  37833 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 886.8+ KB


In [6]:
# Task 1: Based on the lesson learned previously, convert the "DATE" column data type into datetime format
df["date"] = pd.to_datetime(df["date"],
                                  format= '%m/%d/%Y %I:%M:%S %p')

In [7]:
# Task 2a: Extract the Month name (e.g. January/February/March etc) into another column called "MONTH_NAME"
# Then get average of births aggregate/group by MONTH_NAME
# The expected dataframe is something like this:

# |----------------|-----------|
# |MONTH_NAME      | AVG_BIRTH |
# |----------------|-----------|
# |JANUARY XXX     |           |
# |FEBRUARY XXX    |           |
# |MARCH XXX       |           |

df.describe()

Unnamed: 0,date,births
count,37833,37833.0
mean,1971-10-16 00:00:00.000000008,950.482991
min,1920-01-01 00:00:00,43.0
25%,1945-11-23 00:00:00,459.0
50%,1971-10-16 00:00:00,1046.0
75%,1997-09-07 00:00:00,1379.0
max,2023-07-31 00:00:00,3868.0
std,,491.371681


In [8]:
df.head()

Unnamed: 0,date,state,births
0,1920-01-01,Malaysia,96
1,1920-01-02,Malaysia,115
2,1920-01-03,Malaysia,111
3,1920-01-04,Malaysia,101
4,1920-01-05,Malaysia,95


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    37833 non-null  datetime64[ns]
 1   state   37833 non-null  object        
 2   births  37833 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 886.8+ KB


In [10]:
df['month_name'] = df['date'].dt.month_name()

In [11]:
df.sample()

Unnamed: 0,date,state,births,month_name
35151,2016-03-28,Malaysia,1392,March


In [13]:
df.head(10)

Unnamed: 0,date,state,births,month_name
0,1920-01-01,Malaysia,96,January
1,1920-01-02,Malaysia,115,January
2,1920-01-03,Malaysia,111,January
3,1920-01-04,Malaysia,101,January
4,1920-01-05,Malaysia,95,January
5,1920-01-06,Malaysia,91,January
6,1920-01-07,Malaysia,85,January
7,1920-01-08,Malaysia,83,January
8,1920-01-09,Malaysia,96,January
9,1920-01-10,Malaysia,123,January


In [28]:
# Get average of births aggregate/group by MONTH_NAME
# Define a mapping of month names to their corresponding order
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 
               'July', 'August', 'September', 'October', 'November', 'December']

# Create the dataframe with average births, sorted by month
df_average_births = df\
                        .groupby(['month_name']) \
                        .agg(
                            avg_birth=('births', 'mean'),
                            )\
                        .reset_index()

# Sort by the order of months
df_average_births['month_number'] = df_average_births['month_name'].apply(lambda x: month_order.index(x) + 1)
df_average_births = df_average_births.sort_values('month_number').drop('month_number', axis=1)
df_average_births

Unnamed: 0,month_name,avg_birth
4,January,928.116935
3,February,904.976515
7,March,926.42897
0,April,945.3875
8,May,959.320099
6,June,960.348718
5,July,946.085608
1,August,951.493893
11,September,981.742071
10,October,985.878171


In [44]:
df_average_births.to_csv('Sarah_avg_birth_by_month.csv',index=False)

In [37]:
# Task 2b: Save the aggregated dataframe in task 2a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_month.csv
# Example: azhar_avg_birth_by_month.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

df_average_births.to_csv('Exercise\Sarah_avg_birth_by_month.csv',index=False)

  df_average_births.to_csv('Exercise\Sarah_avg_birth_by_month.csv',index=False)


In [39]:
# Task 3a: Create a dataframe to calculate average birth by the following generation group:
# --> Silent Generation - from 1928 to 1945 included
# --> Baby Boomers - from 1946 to 1964 included
# --> Gen X - from 1965 to 1980 included
# --> Gen Y - from 1981 to 1996 included
# The expected dataframe is something like this:

# |----------------|-----------|
# |GENERATION      | AVG_BIRTH |
# |----------------|-----------|
# |Gen X           |           |
# |Gen Y           |           |
# ......

# Hint: Use for-loop and if/else

df['year'] = df.date.dt.year



In [41]:
df.head()

Unnamed: 0,date,state,births,month_name,year
0,1920-01-01,Malaysia,96,January,1920
1,1920-01-02,Malaysia,115,January,1920
2,1920-01-03,Malaysia,111,January,1920
3,1920-01-04,Malaysia,101,January,1920
4,1920-01-05,Malaysia,95,January,1920


In [46]:
for i, x in enumerate(df['year']):
    if x <= 1945:
        df.loc[i, 'generation'] = 'Silent Gen'
    elif x <= 1964:
        df.loc[i, 'generation'] = 'Boomers'
    elif x <= 1980:
        df.loc[i, 'generation'] = 'Gen X'
    elif x <= 1996:
        df.loc[i, 'generation'] = 'Gen Y'
    else:
        df.loc[i, 'generation'] = 'Others'

In [48]:
df.head(10)

Unnamed: 0,date,state,births,month_name,year,generation
0,1920-01-01,Malaysia,96,January,1920,Silent Gen
1,1920-01-02,Malaysia,115,January,1920,Silent Gen
2,1920-01-03,Malaysia,111,January,1920,Silent Gen
3,1920-01-04,Malaysia,101,January,1920,Silent Gen
4,1920-01-05,Malaysia,95,January,1920,Silent Gen
5,1920-01-06,Malaysia,91,January,1920,Silent Gen
6,1920-01-07,Malaysia,85,January,1920,Silent Gen
7,1920-01-08,Malaysia,83,January,1920,Silent Gen
8,1920-01-09,Malaysia,96,January,1920,Silent Gen
9,1920-01-10,Malaysia,123,January,1920,Silent Gen


In [51]:
df_average_generation = df\
                        .groupby(['generation']) \
                        .agg(
                            avg_birth=('births', 'mean'),
                            )\
                        .reset_index()

df_average_generation

Unnamed: 0,generation,avg_birth
0,Boomers,764.404899
1,Gen X,1057.826318
2,Gen Y,1417.433778
3,Others,1407.644005
4,Silent Gen,265.74971


In [53]:
# Task 3b: Save the aggregated dataframe in task 3a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_generation.csv
# Example: azhar_avg_birth_by_generation.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.
df_average_generation.to_csv('Sarah_avg_birth_by_generation.csv',index=False)

In [15]:
subset_cols = ['date','state','births','month_name']

for col in subset_cols:
    print(df[col].unique())

<DatetimeArray>
['1920-01-01 00:00:00', '1920-01-02 00:00:00', '1920-01-03 00:00:00',
 '1920-01-04 00:00:00', '1920-01-05 00:00:00', '1920-01-06 00:00:00',
 '1920-01-07 00:00:00', '1920-01-08 00:00:00', '1920-01-09 00:00:00',
 '1920-01-10 00:00:00',
 ...
 '2023-07-22 00:00:00', '2023-07-23 00:00:00', '2023-07-24 00:00:00',
 '2023-07-25 00:00:00', '2023-07-26 00:00:00', '2023-07-27 00:00:00',
 '2023-07-28 00:00:00', '2023-07-29 00:00:00', '2023-07-30 00:00:00',
 '2023-07-31 00:00:00']
Length: 37833, dtype: datetime64[ns]
['Malaysia']
[  96  115  111 ... 1791 1754 1801]
['January' 'February' 'March' 'April' 'May' 'June' 'July' 'August'
 'September' 'October' 'November' 'December']


In [16]:
subset_cols = ['date','state','births','month_name']

for col in subset_cols:
    unique_values = df[col].unique()
    length_unique_values = len(unique_values)
    print(f"{unique_values}: {length_unique_values}")
    print()

<DatetimeArray>
['1920-01-01 00:00:00', '1920-01-02 00:00:00', '1920-01-03 00:00:00',
 '1920-01-04 00:00:00', '1920-01-05 00:00:00', '1920-01-06 00:00:00',
 '1920-01-07 00:00:00', '1920-01-08 00:00:00', '1920-01-09 00:00:00',
 '1920-01-10 00:00:00',
 ...
 '2023-07-22 00:00:00', '2023-07-23 00:00:00', '2023-07-24 00:00:00',
 '2023-07-25 00:00:00', '2023-07-26 00:00:00', '2023-07-27 00:00:00',
 '2023-07-28 00:00:00', '2023-07-29 00:00:00', '2023-07-30 00:00:00',
 '2023-07-31 00:00:00']
Length: 37833, dtype: datetime64[ns]: 37833

['Malaysia']: 1

[  96  115  111 ... 1791 1754 1801]: 1826

['January' 'February' 'March' 'April' 'May' 'June' 'July' 'August'
 'September' 'October' 'November' 'December']: 12

