In [8]:
import pandas as pd

In [9]:
# The data source is from Malaysia DOSM (https://data.gov.my/data-catalogue/births)
URL_DATA = 'https://storage.data.gov.my/demography/births.parquet'

# Extract data from source
df = pd.read_parquet(URL_DATA)

print(df.head())

         date     state  births
0  1920-01-01  Malaysia      96
1  1920-01-02  Malaysia     115
2  1920-01-03  Malaysia     111
3  1920-01-04  Malaysia     101
4  1920-01-05  Malaysia      95


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    37833 non-null  object
 1   state   37833 non-null  object
 2   births  37833 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 886.8+ KB


In [11]:
# Task 1: Based on the lesson learned previously, convert the "DATE" column data type into datetime format
df['date'] = pd.to_datetime(df['date']) #change type to datetime

#can access using dot notation df.date

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    37833 non-null  datetime64[ns]
 1   state   37833 non-null  object        
 2   births  37833 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 886.8+ KB


In [12]:
# Task 2a: Extract the Month name (e.g. January/February/March etc) into another column called "MONTH_NAME"
# Then get average of births aggregate/group by MONTH_NAME
# The expected dataframe is something like this:

# |----------------|-----------|
# |MONTH_NAME      | AVG_BIRTH |
# |----------------|-----------|
# |JANUARY XXX     |           |
# |FEBRUARY XXX    |           |
# |MARCH XXX       |           |
#   .....           


df['Month_Name'] = df['date'].dt.month_name()

df['Month'] = df['date'].dt.month

df.sample(10)


Unnamed: 0,date,state,births,Month_Name,Month
32857,2009-12-16,Malaysia,1511,December,12
30417,2003-04-12,Malaysia,1266,April,4
18609,1970-12-13,Malaysia,820,December,12
15877,1963-06-21,Malaysia,875,June,6
16309,1964-08-26,Malaysia,876,August,8
28446,1997-11-18,Malaysia,1669,November,11
25202,1988-12-31,Malaysia,1807,December,12
21543,1978-12-25,Malaysia,1060,December,12
31798,2007-01-22,Malaysia,1256,January,1
14926,1960-11-12,Malaysia,911,November,11


In [13]:
df_month_average = df.groupby(['Month_Name','Month'])\
    .agg(average_birth = ('births', 'mean'))\
    .reset_index()
    
df_month_average.sort_values(['Month'])


Unnamed: 0,Month_Name,Month,average_birth
4,January,1,928.116935
3,February,2,904.976515
7,March,3,926.42897
0,April,4,945.3875
8,May,5,959.320099
6,June,6,960.348718
5,July,7,946.085608
1,August,8,951.493893
11,September,9,981.742071
10,October,10,985.878171


In [14]:
# Task 2b: Save the aggregated dataframe in task 2a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_month.csv
# Example: azhar_avg_birth_by_month.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

df_month_average.to_csv('syafeeq_avg_birth_by_month.csv', index = False)

In [15]:
df.dtypes

date          datetime64[ns]
state                 object
births                 int64
Month_Name            object
Month                  int32
dtype: object

In [16]:
# Task 3a: Create a dataframe to calculate average birth by the following generation group:
# --> Silent Generation - from 1928 to 1945 included
# --> Baby Boomers - from 1946 to 1964 included
# --> Gen X - from 1965 to 1980 included
# --> Gen Y - from 1981 to 1996 included
# The expected dataframe is something like this:

# |----------------|-----------|
# |GENERATION      | AVG_BIRTH |
# |----------------|-----------|
# |Gen X           |           |
# |Gen Y           |           |
# ......

# Hint: Use for-loop and if/else

df['year'] = df['date'].dt.year
df.head()


Unnamed: 0,date,state,births,Month_Name,Month,year
0,1920-01-01,Malaysia,96,January,1,1920
1,1920-01-02,Malaysia,115,January,1,1920
2,1920-01-03,Malaysia,111,January,1,1920
3,1920-01-04,Malaysia,101,January,1,1920
4,1920-01-05,Malaysia,95,January,1,1920


In [17]:
temp_df = []

for x in df['year']:
    if x <= 1945: 
        temp_df.append('Silent Generation')
    elif x <= 1964: 
        temp_df.append('Baby Boomers')
    elif x <= 1980: 
        temp_df.append('Gen X')
    elif x <= 1996: 
        temp_df.append('Gen Y')
    else:
        temp_df.append('Others')

df['generation'] = pd.DataFrame(temp_df)

df.sample(10)

Unnamed: 0,date,state,births,Month_Name,Month,year,generation
10953,1949-12-27,Malaysia,543,December,12,1949,Baby Boomers
31413,2006-01-02,Malaysia,1189,January,1,2006,Others
20929,1977-04-20,Malaysia,1201,April,4,1977,Gen X
20132,1975-02-13,Malaysia,996,February,2,1975,Gen X
25444,1989-08-30,Malaysia,1303,August,8,1989,Gen Y
21312,1978-05-08,Malaysia,1084,May,5,1978,Gen X
33336,2011-04-09,Malaysia,1312,April,4,2011,Others
8165,1942-05-10,Malaysia,504,May,5,1942,Silent Generation
6143,1936-10-26,Malaysia,375,October,10,1936,Silent Generation
30030,2002-03-21,Malaysia,1412,March,3,2002,Others


In [18]:
#iterate by each row
for i,x in df['year'].items():
    print(i,x)

0 1920
1 1920
2 1920
3 1920
4 1920
5 1920
6 1920
7 1920
8 1920
9 1920
10 1920
11 1920
12 1920
13 1920
14 1920
15 1920
16 1920
17 1920
18 1920
19 1920
20 1920
21 1920
22 1920
23 1920
24 1920
25 1920
26 1920
27 1920
28 1920
29 1920
30 1920
31 1920
32 1920
33 1920
34 1920
35 1920
36 1920
37 1920
38 1920
39 1920
40 1920
41 1920
42 1920
43 1920
44 1920
45 1920
46 1920
47 1920
48 1920
49 1920
50 1920
51 1920
52 1920
53 1920
54 1920
55 1920
56 1920
57 1920
58 1920
59 1920
60 1920
61 1920
62 1920
63 1920
64 1920
65 1920
66 1920
67 1920
68 1920
69 1920
70 1920
71 1920
72 1920
73 1920
74 1920
75 1920
76 1920
77 1920
78 1920
79 1920
80 1920
81 1920
82 1920
83 1920
84 1920
85 1920
86 1920
87 1920
88 1920
89 1920
90 1920
91 1920
92 1920
93 1920
94 1920
95 1920
96 1920
97 1920
98 1920
99 1920
100 1920
101 1920
102 1920
103 1920
104 1920
105 1920
106 1920
107 1920
108 1920
109 1920
110 1920
111 1920
112 1920
113 1920
114 1920
115 1920
116 1920
117 1920
118 1920
119 1920
120 1920
121 1920
122 1920
123

In [19]:
df_generation_average = df.groupby(['generation'])\
    .agg(average_birth = ('births', 'mean'))\
    .reset_index()
    
df_generation_average.sort_values(['generation'])


Unnamed: 0,generation,average_birth
0,Baby Boomers,764.404899
1,Gen X,1057.826318
2,Gen Y,1417.433778
3,Others,1407.644005
4,Silent Generation,265.74971


In [20]:
#use 1 line

df.groupby('generation')['births'].mean().reset_index()

Unnamed: 0,generation,births
0,Baby Boomers,764.404899
1,Gen X,1057.826318
2,Gen Y,1417.433778
3,Others,1407.644005
4,Silent Generation,265.74971


In [21]:
# Task 3b: Save the aggregated dataframe in task 3a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_generation.csv
# Example: azhar_avg_birth_by_generation.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

df.to_csv('syafeeq_avg_birth_by_generation.csv', index = False)