In [154]:
import pandas as pd

In [155]:
# The data source is from Malaysia DOSM (https://data.gov.my/data-catalogue/births)
URL_DATA = 'https://storage.data.gov.my/demography/births.parquet'

# Extract data from source
df = pd.read_parquet(URL_DATA)

print(df.head())

         date     state  births
0  1920-01-01  Malaysia      96
1  1920-01-02  Malaysia     115
2  1920-01-03  Malaysia     111
3  1920-01-04  Malaysia     101
4  1920-01-05  Malaysia      95


In [157]:
# Combine mismatch unique values for different columns and merge them into a single data frame
subset_columns = ['date', 'births']

for col in subset_columns:
    print(f"{col}: {df[col].unique()}")
    print()

date: [datetime.date(1920, 1, 1) datetime.date(1920, 1, 2)
 datetime.date(1920, 1, 3) ... datetime.date(2023, 7, 29)
 datetime.date(2023, 7, 30) datetime.date(2023, 7, 31)]

births: [  96  115  111 ... 1791 1754 1801]



In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    37833 non-null  object
 1   state   37833 non-null  object
 2   births  37833 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 886.8+ KB


In [159]:
# Task 1: Based on the lesson learned previously, convert the "DATE" column data type into datetime format
df['date'] = pd.to_datetime(df['date'])

In [160]:
df.dtypes

date      datetime64[ns]
state             object
births             int64
dtype: object

In [143]:
# Task 2a: Extract the Month name (e.g. January/February/March etc) into another column called "MONTH_NAME"
# Then get average of births aggregate/group by MONTH_NAME
# The expected dataframe is something like this:

# |----------------|-----------|
# |MONTH_NAME      | AVG_BIRTH |
# |----------------|-----------|
# |JANUARY XXX     |           |
# |FEBRUARY XXX    |           |
# |MARCH XXX       |           |
#   .....           

...


Ellipsis

In [161]:
#2a
df['MONTH_NAME'] = df['date'].dt.month_name()

In [150]:
df.sample(2)

Unnamed: 0,MONTH_NAME,AVG_BIRTH
7,March,926.42897
2,December,951.100846


In [162]:
df1 = df\
    .groupby(['MONTH_NAME'])\
    .agg(
        AVG_BIRTH=('births', 'mean')
    )\
    .reset_index()\
    #.sort_values('MONTH_NAME')
df1

Unnamed: 0,MONTH_NAME,AVG_BIRTH
0,April,945.3875
1,August,951.493893
2,December,951.100846
3,February,904.976515
4,January,928.116935
5,July,946.085608
6,June,960.348718
7,March,926.42897
8,May,959.320099
9,November,963.218123


In [None]:
# Task 2b: Save the aggregated dataframe in task 2a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_month.csv
# Example: azhar_avg_birth_by_month.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

...

In [163]:
df1.to_csv('dr_agg.csv', index=False)

In [13]:
# Task 3a: Create a dataframe to calculate average birth by the following generation group:
# --> Silent Generation - from 1928 to 1945 included
# --> Baby Boomers - from 1946 to 1964 included
# --> Gen X - from 1965 to 1980 included
# --> Gen Y - from 1981 to 1996 included
# The expected dataframe is something like this:

# |----------------|-----------|
# |GENERATION      | AVG_BIRTH |
# |----------------|-----------|
# |Gen X           |           |
# |Gen Y           |           |
# ......

# Hint: Use for-loop and if/else

...

In [14]:
# Task 3b: Save the aggregated dataframe in task 3a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_generation.csv
# Example: azhar_avg_birth_by_generation.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

...