In [1]:
import pandas as pd

In [2]:
# Source: Malaysia DOSM births dataset (https://data.gov.my/data-catalogue/births)
URL_DATA = 'https://storage.data.gov.my/demography/births.parquet'

# Read the remote parquet file straight into a DataFrame
df = pd.read_parquet(path=URL_DATA)

# Show the first five rows as a quick sanity check of the columns
print(df.head(n=5))

         date     state  births
0  1920-01-01  Malaysia      96
1  1920-01-02  Malaysia     115
2  1920-01-03  Malaysia     111
3  1920-01-04  Malaysia     101
4  1920-01-05  Malaysia      95


In [5]:
# Print the distinct values of a few selected columns.
# Each column has a different number of distinct values, so they are
# printed one column at a time rather than placed in a single table.
subset_columns = ["date", "births"]

for col in subset_columns:
    print(f"{col}:{df[col].unique()}")

date:[datetime.date(1920, 1, 1) datetime.date(1920, 1, 2)
 datetime.date(1920, 1, 3) ... datetime.date(2023, 7, 29)
 datetime.date(2023, 7, 30) datetime.date(2023, 7, 31)]
births:[  96  115  111 ... 1791 1754 1801]


In [6]:
# Summarise column dtypes and non-null counts for the loaded frame.
# In the saved output below, `date` is reported as object dtype (not datetime64).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37833 entries, 0 to 37832
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    37833 non-null  object
 1   state   37833 non-null  object
 2   births  37833 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 886.8+ KB


In [7]:
import pandas as pd

# Build a side-by-side table of every column's distinct values.
# Columns have different numbers of distinct values, so shorter columns are
# padded at the bottom with a placeholder.

# One Series of unique values per column of `df`, each on its own 0..k-1 index
unique_per_column = [
    pd.Series(df[column].unique(), name=column) for column in df.columns
]

# Align all Series on their positional index in one step. A single concat
# replaces the earlier pairwise outer merge, which relied on
# `combined_df.empty` to detect the first iteration; that check is also true
# when a column has no values, which silently discarded the columns gathered
# so far.
combined_df = (
    pd.concat(unique_per_column, axis=1) if unique_per_column else pd.DataFrame()
)

# Replace the padding NaNs with '-'.
# Note: a padded numeric column becomes a mixed column of floats and '-'
# (e.g. births is shown as 96.0 rather than 96).
combined_df = combined_df.fillna('-')

# Reset the index to create a clean, numeric index
combined_df = combined_df.reset_index(drop=True)

print(combined_df)

             date     state births
0      1920-01-01  Malaysia   96.0
1      1920-01-02         -  115.0
2      1920-01-03         -  111.0
3      1920-01-04         -  101.0
4      1920-01-05         -   95.0
...           ...       ...    ...
37828  2023-07-27         -      -
37829  2023-07-28         -      -
37830  2023-07-29         -      -
37831  2023-07-30         -      -
37832  2023-07-31         -      -

[37833 rows x 3 columns]


In [16]:
from datetime import datetime

# Scratch check of strptime parsing with a two-digit year.
date = datetime.strptime(
    "24-08-10",  # sample date string
    "%y-%m-%d",  # two-digit year, month, day
)


In [22]:
# Task 2a: Extract the Month name (e.g. January/February/March etc) into another column called "MONTH_NAME"
# Then get average of births aggregate/group by MONTH_NAME
# The expected dataframe is something like this:

# |----------------|-----------|
# |MONTH_NAME      | AVG_BIRTH |
# |----------------|-----------|
# |JANUARY XXX     |           |
# |FEBRUARY XXX    |           |
# |MARCH XXX       |           |
#   .....           

import pandas as pd

# Step 1: Extract the month name.
# `date` is loaded as an object column of `datetime.date` values (see the
# df.info() output), and the `.dt` accessor only works on datetime-like
# dtypes, so convert first. This keeps the cell runnable on a fresh kernel.
# `month_name()` returns English names by default, matching `month_order`
# below, whereas strftime('%B') depends on the process locale.
df['MONTH_NAME'] = pd.to_datetime(df['date']).dt.month_name()

# Step 2: Average births per month name (named aggregation sets the column name)
births_by_month = (
    df.groupby('MONTH_NAME')['births']
    .agg(AVERAGE_BIRTHS='mean')
    .reset_index()
)

# Step 3: Sort by calendar order; groupby returns the months alphabetically.
# A temporary ordered categorical column provides the sort key.
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 
               'July', 'August', 'September', 'October', 'November', 'December']
births_by_month['month_order'] = pd.Categorical(births_by_month['MONTH_NAME'], categories=month_order, ordered=True)
births_by_month = births_by_month.sort_values('month_order').drop(columns='month_order')

# Step 4: Round the averages to 2 decimal places
births_by_month['AVERAGE_BIRTHS'] = births_by_month['AVERAGE_BIRTHS'].round(2)

# Display the result
print(births_by_month)


   MONTH_NAME  AVERAGE_BIRTHS
4     January          928.12
3    February          904.98
7       March          926.43
0       April          945.39
8         May          959.32
6        June          960.35
5        July          946.09
1      August          951.49
11  September          981.74
10    October          985.88
9    November          963.22
2    December          951.10


In [23]:
# Task 2b: Save the aggregated dataframe in task 2a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_month.csv
# Example: azhar_avg_birth_by_month.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

import pandas as pd

# Requires `births_by_month` from the Task 2a cell.

# Nickname used as the filename prefix
nickname = "hariz"

# Relative filename, so the CSV lands in the current working directory
filename = f"{nickname}_avg_birth_by_month.csv"

# Write the monthly averages; index=False leaves the row index out of the file
births_by_month.to_csv(path_or_buf=filename, index=False)

print(f"File saved as: {filename}")

File saved as: hariz_avg_birth_by_month.csv


In [24]:
# Task 3a: Create a dataframe to calculate average birth by the following generation group:
# --> Silent Generation - from 1928 to 1945 included
# --> Baby Boomers - from 1946 to 1964 included
# --> Gen X - from 1965 to 1980 included
# --> Gen Y - from 1981 to 1996 included
# The expected dataframe is something like this:

# |----------------|-----------|
# |GENERATION      | AVG_BIRTH |
# |----------------|-----------|
# |Gen X           |           |
# |Gen Y           |           |
# ......

# Hint: Use for-loop and if/else

import pandas as pd
import numpy as np

# Generation name -> (first year, last year), both bounds inclusive
generations = {
    'Silent Generation': (1928, 1945),
    'Baby Boomers': (1946, 1964),
    'Gen X': (1965, 1980),
    'Gen Y': (1981, 1996)
}

# `date` is loaded as an object column of `datetime.date` values, on which the
# `.dt` accessor fails. Convert once, outside the loop, so the cell runs on a
# fresh kernel and the year extraction is not repeated for every generation.
birth_years = pd.to_datetime(df['date']).dt.year

# Parallel lists collecting one entry per generation that has data
gen_names = []
avg_births = []

for gen_name, (start_year, end_year) in generations.items():
    # Series.between is inclusive on both ends by default
    gen_births = df.loc[birth_years.between(start_year, end_year), 'births']

    if not gen_births.empty:
        gen_names.append(gen_name)
        # Mean over every row of `df` whose year falls in the range
        avg_births.append(gen_births.mean())
    else:
        print(f"No data found for {gen_name}")

# Assemble the result table in the order the generations were defined
result_df = pd.DataFrame({
    'GENERATION': gen_names,
    'AVG_BIRTH': avg_births
})

# Round the AVG_BIRTH column to 2 decimal places
result_df['AVG_BIRTH'] = result_df['AVG_BIRTH'].round(2)

# Display the result
print(result_df)

          GENERATION  AVG_BIRTH
0  Silent Generation     319.76
1       Baby Boomers     764.40
2              Gen X    1057.83
3              Gen Y    1417.43


In [25]:
# Task 3b: Save the aggregated dataframe in task 3a above into CSV file without index (index=False) in the same folder as this file
# The filename shall be your nickname + _avg_birth_by_generation.csv
# Example: azhar_avg_birth_by_generation.csv
# No space is allowed. But make sure your nickname is recognizable for evaluation.

# Requires `result_df` from the Task 3a cell.

# Nickname used as the filename prefix
# NOTE(review): the Task 2b cell uses lowercase "hariz"; confirm which casing is intended.
nickname = "Hariz"

# Relative filename, so the CSV lands in the current working directory
filename = f"{nickname}_avg_birth_by_generation.csv"

# Write the per-generation averages; index=False leaves the row index out of the file
result_df.to_csv(path_or_buf=filename, index=False)

print(f"File saved as: {filename}")

File saved as: Hariz_avg_birth_by_generation.csv
