In [1]:
import pandas as pd

In [2]:
# The data source is from Malaysia DOSM (https://data.gov.my/data-catalogue/births)
URL_DATA = 'https://storage.data.gov.my/demography/births.parquet'

# Extract data from source
df = pd.read_parquet(URL_DATA)

print(df.head())

         date     state  births
0  1920-01-01  Malaysia      96
1  1920-01-02  Malaysia     115
2  1920-01-03  Malaysia     111
3  1920-01-04  Malaysia     101
4  1920-01-05  Malaysia      95


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    37833 non-null  object
 1   state   37833 non-null  object
 2   births  37833 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 886.8+ KB


In [4]:
# Task 1: Based on the lesson learned previously, convert the "DATE" column data type into datetime format
df.date = pd.to_datetime(df.date)

In [5]:
df.dtypes

date      datetime64[ns]
state             object
births             int64
dtype: object

In [34]:
# Task 2a: Extract the Month name (e.g. January/February/March etc) into another column called "MONTH_NAME"
# Then get average of births aggregate/group by MONTH_NAME
# The expected dataframe is something like this:

# |----------------|-----------|
# |MONTH_NAME      | AVG_BIRTH |
# |----------------|-----------|
# |JANUARY XXX     |           |
# |FEBRUARY XXX    |           |
# |MARCH XXX       |           |
#   .....           

df['month_name'] = df.date.dt.month_name()
df['month_number'] = df.date.dt.month

In [35]:
df.head()

Unnamed: 0,date,state,births,year,generation,month_name,month_number
0,1920-01-01,Malaysia,96,1920,Silent Gen,January,1
1,1920-01-02,Malaysia,115,1920,Silent Gen,January,1
2,1920-01-03,Malaysia,111,1920,Silent Gen,January,1
3,1920-01-04,Malaysia,101,1920,Silent Gen,January,1
4,1920-01-05,Malaysia,95,1920,Silent Gen,January,1


In [36]:
df1 = df.groupby(['month_name', 'month_number']).agg(
    avg_birth = ('births','mean')
).reset_index()

In [41]:
df1.sort_values('month_number')

Unnamed: 0,month_name,month_number,avg_birth
4,January,1,928.116935
3,February,2,904.976515
7,March,3,926.42897
0,April,4,945.3875
8,May,5,959.320099
6,June,6,960.348718
5,July,7,946.085608
1,August,8,951.493893
11,September,9,981.742071
10,October,10,985.878171


In [43]:
# Task 2b: Save the aggregated dataframe in task 2a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_month.csv
# Example: azhar_avg_birth_by_month.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.
df1.to_parquet('df_agg.parquet', index=False)

In [7]:
# Task 3a: Create a dataframe to calculate average birth by the following generation group:
# --> Silent Generation - from 1928 to 1945 included
# --> Baby Boomers - from 1946 to 1964 included
# --> Gen X - from 1965 to 1980 included
# --> Gen Y - from 1981 to 1996 included
# The expected dataframe is something like this:

# |----------------|-----------|
# |GENERATION      | AVG_BIRTH |
# |----------------|-----------|
# |Gen X           |           |
# |Gen Y           |           |
# ......

# Hint: Use for-loop and if/else

df['year'] = df.date.dt.year

In [44]:
df.head()

Unnamed: 0,date,state,births,year,generation,month_name,month_number
0,1920-01-01,Malaysia,96,1920,Silent Gen,January,1
1,1920-01-02,Malaysia,115,1920,Silent Gen,January,1
2,1920-01-03,Malaysia,111,1920,Silent Gen,January,1
3,1920-01-04,Malaysia,101,1920,Silent Gen,January,1
4,1920-01-05,Malaysia,95,1920,Silent Gen,January,1


In [48]:
df.tail()

Unnamed: 0,date,state,births,year,generation,month_name,month_number
37828,2023-07-27,Malaysia,1199,2023,OTHERS,July,7
37829,2023-07-28,Malaysia,1220,2023,OTHERS,July,7
37830,2023-07-29,Malaysia,927,2023,OTHERS,July,7
37831,2023-07-30,Malaysia,938,2023,OTHERS,July,7
37832,2023-07-31,Malaysia,1098,2023,OTHERS,July,7


In [49]:
for i, x in enumerate(df['year']):
    # Enumerate = generate a row number when we run a for loop.
    # Here it is represented by i≠
    if x <= 1945:
        df.loc[i, 'generation'] = 'Silent Gen'
    elif x <= 1964:
        df.loc[i, 'generation'] = 'Boomers'
    elif x <= 1980:
        df.loc[i, 'generation'] = 'Gen X'
    elif x <= 1996:
        df.loc[i, 'generation'] = 'Gen Y'
    else:
        df.loc[i, 'generation'] = 'OTHERS'

In [53]:
df2 = df.groupby('generation')['births'].mean().reset_index()

In [54]:
# Task 3b: Save the aggregated dataframe in task 3a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_generation.csv
# Example: azhar_avg_birth_by_generation.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

df2.to_csv('generation.csv', index=False)