In [1]:
import pandas as pd

In [2]:
# The data source is from Malaysia DOSM (https://data.gov.my/data-catalogue/births)
URL_DATA = 'https://storage.data.gov.my/demography/births.parquet'

# Extract data from source
df = pd.read_parquet("https://storage.data.gov.my/demography/births.parquet")

print(df.head())

         date     state  births
0  1920-01-01  Malaysia      96
1  1920-01-02  Malaysia     115
2  1920-01-03  Malaysia     111
3  1920-01-04  Malaysia     101
4  1920-01-05  Malaysia      95


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    37833 non-null  object
 1   state   37833 non-null  object
 2   births  37833 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 886.8+ KB


In [4]:
df.shape

(37833, 3)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    37833 non-null  object
 1   state   37833 non-null  object
 2   births  37833 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 886.8+ KB


In [7]:
# Task 1: Based on the lesson learned previously, convert the "DATE" column data type into datetime format
df["date"] = pd.to_datetime(df["date"],
                                  format= '%m/%d/%Y %I:%M:%S %p')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    37833 non-null  datetime64[ns]
 1   state   37833 non-null  object        
 2   births  37833 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 886.8+ KB


In [9]:
df.head(3)

Unnamed: 0,date,state,births
0,1920-01-01,Malaysia,96
1,1920-01-02,Malaysia,115
2,1920-01-03,Malaysia,111


In [21]:
# Task 2a: Extract the Month name (e.g. January/February/March etc) into another column called "MONTH_NAME"
# Then get average of births aggregate/group by MONTH_NAME
# The expected dataframe is something like this:

# |----------------|-----------|
# |MONTH_NAME      | AVG_BIRTH |
# |----------------|-----------|
# |JANUARY XXX     |           |
# |FEBRUARY XXX    |           |
# |MARCH XXX       |           |
#   .....           

...
df['MONTH_NAME2'] = df['date'].dt.month_name()
df['YEAR_NAME2'] = df['date'].dt.year
df.sample(3)

Unnamed: 0,date,state,births,MONTH_NAME,YEAR_NAME,MONTH_NAME2,YEAR_NAME2
25226,1989-01-24,Malaysia,1265,January,1989,January,1989
20510,1976-02-26,Malaysia,997,February,1976,February,1976
7697,1941-01-27,Malaysia,321,January,1941,January,1941


In [32]:
# AGGREGATE
MONTH_NAME3 = df\
                     .groupby( ['MONTH_NAME']  ) \
                     .agg(
                         AVG_BIRTH = ('births', 'mean'),
                         )\
                     .reset_index()
    
MONTH_NAME3

Unnamed: 0,MONTH_NAME,AVG_BIRTH
0,April,945.3875
1,August,951.493893
2,December,951.100846
3,February,904.976515
4,January,928.116935
5,July,946.085608
6,June,960.348718
7,March,926.42897
8,May,959.320099
9,November,963.218123


In [33]:
# Task 2b: Save the aggregated dataframe in task 2a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_month.csv
# Example: azhar_avg_birth_by_month.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

MONTH_NAME3.to_csv(r'C:\Users\ak0143\OneDrive\Kamal\TRAINING\DE 101\GitHub2-Branch\python-pandas\Kamal_avg_birth_by_month.csv', index=False)
# Best to use parquet

In [34]:
# Task 3a: Create a dataframe to calculate average birth by the following generation group:
# --> Silent Generation - from 1928 to 1945 included
# --> Baby Boomers - from 1946 to 1964 included
# --> Gen X - from 1965 to 1980 included
# --> Gen Y - from 1981 to 1996 included
# The expected dataframe is something like this:

# |----------------|-----------|
# |GENERATION      | AVG_BIRTH |
# |----------------|-----------|
# |Gen X           |           |
# |Gen Y           |           |
# ......

# Hint: Use for-loop and if/else



Unnamed: 0,YEAR_NAME,AVG_BIRTH
0,1920,106.010929
1,1921,116.468493
2,1922,105.115068
3,1923,164.986301
4,1924,209.871585
...,...,...
99,2019,1372.906849
100,2020,1327.043716
101,2021,1254.643836
102,2022,1199.205479


In [55]:
for i, x in enumerate(Gen_Agg['YEAR_NAME']):
    if x <= 1945:
        df.loc[i, 'Generation'] = 'Silent Gen'
    elif x <= 1964:
        df.loc[i, 'Generation'] = 'Boomers'
    elif x <= 1980:
        df.loc[i, 'Generation'] = 'Gen X'
    elif x <= 1996:
        df.loc[i,'Generation'] = 'Gen Y'
    else:
        df.loc[i,'Generation'] = 'OTHERS'

df.head(3)



Unnamed: 0,date,state,births,MONTH_NAME,YEAR_NAME,MONTH_NAME2,YEAR_NAME2,Generation
0,1920-01-01,Malaysia,96,January,1920,January,1920,Silent Gen
1,1920-01-02,Malaysia,115,January,1920,January,1920,Silent Gen
2,1920-01-03,Malaysia,111,January,1920,January,1920,Silent Gen


In [56]:
# AGGREGATE BY GENERATION
Gen_Agg = df\
                     .groupby( ['Generation']  ) \
                     .agg(
                         AVG_BIRTH = ('births', 'mean'),
                         )\
                     .reset_index()
    
Gen_Agg

Unnamed: 0,Generation,AVG_BIRTH
0,Boomers,90.894737
1,Gen X,100.375
2,Gen Y,95.0
3,OTHERS,93.111111
4,Silent Gen,94.076923


In [46]:
# for row in Gen_Agg.iterrows():
#     year_name = row[1]['YEAR_NAME']
#     avg_birth = row[1]['AVG_BIRTH']
    
#     if year_name <= 1945
#     elif year_name >=1946 and year_name <= 1964
#     elif year_name >=1965 and year_name <= 1980
#     elif year_name >=1981 and year_name <= 1996

#     # print(f"Year: {year_name}. Avg Birth {avg_birth}")
#     # print(row)
#     # print('----')


Year: 1920.0. Avg Birth 106.01092896174863
(0, YEAR_NAME    1920.000000
AVG_BIRTH     106.010929
Name: 0, dtype: float64)
----
Year: 1921.0. Avg Birth 116.46849315068494
(1, YEAR_NAME    1921.000000
AVG_BIRTH     116.468493
Name: 1, dtype: float64)
----
Year: 1922.0. Avg Birth 105.11506849315069
(2, YEAR_NAME    1922.000000
AVG_BIRTH     105.115068
Name: 2, dtype: float64)
----
Year: 1923.0. Avg Birth 164.986301369863
(3, YEAR_NAME    1923.000000
AVG_BIRTH     164.986301
Name: 3, dtype: float64)
----
Year: 1924.0. Avg Birth 209.87158469945356
(4, YEAR_NAME    1924.000000
AVG_BIRTH     209.871585
Name: 4, dtype: float64)
----
Year: 1925.0. Avg Birth 135.31506849315068
(5, YEAR_NAME    1925.000000
AVG_BIRTH     135.315068
Name: 5, dtype: float64)
----
Year: 1926.0. Avg Birth 154.07123287671232
(6, YEAR_NAME    1926.000000
AVG_BIRTH     154.071233
Name: 6, dtype: float64)
----
Year: 1927.0. Avg Birth 161.74794520547945
(7, YEAR_NAME    1927.000000
AVG_BIRTH     161.747945
Name: 7, dtype: 

In [57]:
# Task 3b: Save the aggregated dataframe in task 3a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_generation.csv
# Example: azhar_avg_birth_by_generation.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

Gen_Agg.to_csv(r'C:\Users\ak0143\OneDrive\Kamal\TRAINING\DE 101\GitHub2-Branch\python-pandas\Kamal_avg_birth_by_generation.csv', index=False)
# Best to use parquet