In [73]:
import pandas as pd

In [74]:
# The data source is from Malaysia DOSM (https://data.gov.my/data-catalogue/births)
URL_DATA = 'https://storage.data.gov.my/demography/births.parquet'

# Extract data from source
df = pd.read_parquet(URL_DATA)

print(df.head())

         date     state  births
0  1920-01-01  Malaysia      96
1  1920-01-02  Malaysia     115
2  1920-01-03  Malaysia     111
3  1920-01-04  Malaysia     101
4  1920-01-05  Malaysia      95


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    37833 non-null  object
 1   state   37833 non-null  object
 2   births  37833 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 886.8+ KB


In [76]:
# Task 1: Based on the lesson learned previously, convert the "DATE" column data type into datetime format
df["date"] = pd.to_datetime(df["date"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    37833 non-null  datetime64[ns]
 1   state   37833 non-null  object        
 2   births  37833 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 886.8+ KB


In [77]:
# Task 2a: Extract the Month name (e.g. January/February/March etc) into another column called "MONTH_NAME"
# Then get average of births aggregate/group by MONTH_NAME
# The expected dataframe is something like this:

# |----------------|-----------|
# |MONTH_NAME      | AVG_BIRTH |
# |----------------|-----------|
# |JANUARY XXX     |           |
# |FEBRUARY XXX    |           |
# |MARCH XXX       |           |
#   .....           

# insert MONTH_NAME column
df["MONTH_NAME"] = df["date"].dt.strftime("%B %Y")

# group by MONTH_NAME and calculate average birth as AVG_BIRTH
df_monthly_avg_birth = df.groupby("MONTH_NAME").agg(AVG_BIRTH = ('births', 'mean')).reset_index()

# insert month index column to sort MONTH_NAME
df_monthly_avg_birth["month_index"] = pd.to_datetime(df_monthly_avg_birth["MONTH_NAME"], format="%B %Y")

# sort dataframe by month_index and then delete the column
df_monthly_avg_birth = df_monthly_avg_birth.sort_values("month_index", ascending=True).drop("month_index", axis=1)

display(df_monthly_avg_birth)


Unnamed: 0,MONTH_NAME,AVG_BIRTH
414,January 1920,93.935484
310,February 1920,95.862069
726,March 1920,93.354839
0,April 1920,95.133333
830,May 1920,100.419355
...,...,...
829,March 2023,1281.193548
103,April 2023,1193.566667
933,May 2023,1245.548387
725,June 2023,1263.800000


In [78]:
# Task 2b: Save the aggregated dataframe in task 2a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_month.csv
# Example: azhar_avg_birth_by_month.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

df_monthly_avg_birth.to_csv("fairuz_avg_birth_by_month.csv", index=False)

In [79]:
# Task 3a: Create a dataframe to calculate average birth by the following generation group:
# --> Silent Generation - from 1928 to 1945 included
# --> Baby Boomers - from 1946 to 1964 included
# --> Gen X - from 1965 to 1980 included
# --> Gen Y - from 1981 to 1996 included
# The expected dataframe is something like this:

# |----------------|-----------|
# |GENERATION      | AVG_BIRTH |
# |----------------|-----------|
# |Gen X           |           |
# |Gen Y           |           |
# ......

# Hint: Use for-loop and if/else

# insert year column
df["year"] = df["date"].dt.year
df.head()


Unnamed: 0,date,state,births,MONTH_NAME,year
0,1920-01-01,Malaysia,96,January 1920,1920
1,1920-01-02,Malaysia,115,January 1920,1920
2,1920-01-03,Malaysia,111,January 1920,1920
3,1920-01-04,Malaysia,101,January 1920,1920
4,1920-01-05,Malaysia,95,January 1920,1920


In [80]:
def assign_generation(year):
    if year <= 1945:
        result = "Silent Generation"
    elif year >= 1946 and year <= 1964:
        result = "Baby Boomers"
    elif year >= 1965 and year <= 1980:
        result = "Gen X"
    elif year >= 1981 and year <= 1996:
        result = "Gen Y"
    else:
        result = "Others"
    return result

df["generation"] = df["year"].apply(assign_generation)

df.head()

Unnamed: 0,date,state,births,MONTH_NAME,year,generation
0,1920-01-01,Malaysia,96,January 1920,1920,Silent Generation
1,1920-01-02,Malaysia,115,January 1920,1920,Silent Generation
2,1920-01-03,Malaysia,111,January 1920,1920,Silent Generation
3,1920-01-04,Malaysia,101,January 1920,1920,Silent Generation
4,1920-01-05,Malaysia,95,January 1920,1920,Silent Generation


In [81]:
df_avg_birth_gen = df.groupby("generation")[["births"]].mean().reset_index()
df_avg_birth_gen.head()

Unnamed: 0,generation,births
0,Baby Boomers,764.404899
1,Gen X,1057.826318
2,Gen Y,1417.433778
3,Others,1407.644005
4,Silent Generation,265.74971


In [82]:
# Task 3b: Save the aggregated dataframe in task 3a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_generation.csv
# Example: azhar_avg_birth_by_generation.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

df_avg_birth_gen.to_csv("fairuz_avg_birth_by_generation.csv", index=False)