# Filipino Family Income and Expenditure: Regional Average


# 1. Import Python packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

# 2. Loading and Categorizing the Dataset

In [2]:
# Survey file, resolved relative to the notebook's working directory.
DATA_PATH = 'Family Income and Expenditure.csv'
data = pd.read_csv(DATA_PATH)

We categorize the columns of the dataset into six groups: expenditures, income, household head data, number of appliances, property data, and family composition.

In [3]:
from data_utils import _expenditures_data, _income_data, _householdhead_data, _appliances_data, _property_information, _family_composition

expenditures_data = _expenditures_data(data)
income_data = _income_data(data)
householdhead_data = _householdhead_data(data)
appliances_data = _appliances_data(data)
property_data = _property_information(data)
family_composition_data = _family_composition(data)

# Checking if all of the columns have been categorized 
collection = expenditures_data +  income_data + appliances_data +  householdhead_data + family_composition_data +  property_data
missing = [element for element in data.columns if element not in collection]
print(missing)


['Region']


# 3. Exploring the Dataset: Regional Averages

In this section, we will perform data grouping based on regions. Grouping the data per region allows us to explore regional disparities, identify unique economic conditions, and understand how family income and expenditures vary across different parts of the country. We seek to answer the following questions:

1. What are the regions with the highest and lowest average family income?
2. How do family expenditures differ across regions?
3. What are the major expense categories for families in different regions?
4. Is there a correlation between family income and certain types of expenditures?
5. Are there any significant outliers in the data that indicate unique economic conditions in certain regions?

In [8]:
from ipywidgets import interact, fixed
from regional_utils import bar_chart_regional_average, create_dataframe_min_max

## 3.1 Visualizing expenditures per region using bar charts

We will first visualize the total household income and the various expenditures per region using bar charts. Use the dropdown menu to select a specific quantity. The regions with the lowest and highest values are highlighted in red and green, respectively.



In [5]:
keys = ['Total Household Income'] + expenditures_data
interact(bar_chart_regional_average, data=fixed(data), key=keys)


interactive(children=(Dropdown(description='key', options=('Total Household Income', 'Total Food Expenditure',…

<function regional_utils.bar_chart_regional_average(data, key: list)>

In [9]:
max_min_dataframe = create_dataframe_min_max(data, keys)
max_min_dataframe

Unnamed: 0,Total Household Income,Total Food Expenditure,Bread and Cereals Expenditure,Total Rice Expenditure,Meat Expenditure,Total Fish and marine products Expenditure,Fruit Expenditure,Vegetables Expenditure,Restaurant and hotels Expenditure,Alcoholic Beverages Expenditure,Tobacco Expenditure,"Clothing, Footwear and Other Wear Expenditure",Housing and water Expenditure,Medical Care Expenditure,Transportation Expenditure,Communication Expenditure,Education Expenditure,Miscellaneous Goods and Services Expenditure,Special Occasions Expenditure,Crop Farming and Gardening expenses
0,NCR,NCR,VIII - Eastern Visayas,VIII - Eastern Visayas,NCR,ARMM,NCR,CAR,NCR,II - Cagayan Valley,III - Central Luzon,NCR,NCR,IVA - CALABARZON,NCR,NCR,NCR,NCR,IVA - CALABARZON,ARMM
1,ARMM,X - Northern Mindanao,II - Cagayan Valley,IX - Zasmboanga Peninsula,ARMM,X - Northern Mindanao,X - Northern Mindanao,X - Northern Mindanao,ARMM,ARMM,CAR,ARMM,ARMM,ARMM,ARMM,ARMM,ARMM,ARMM,ARMM,NCR


NCR has the highest average total household income and total food expenditures. The region also has the highest expenditures in meat, fruits, restaurant and hotels, clothing, housing and water, transportation, communication, education, and miscellaneous goods and services. Meanwhile, the region has the lowest expenditures in crop farming and gardening.

The high average total household income and total food expenditures in the region may be attributed to its status as the country's economic and political center, which offers higher-paying job opportunities that are not available in other regions. The resulting influx of people raises the demand for housing, which in turn leads to an increase in rent and property values.


## 3.2 Visualizing the relation between income and expenditures using scatter plot

In [10]:
from regional_utils import regional_average_dependence
interact(regional_average_dependence, data=fixed(data), key=expenditures_data)

interactive(children=(Dropdown(description='key', options=('Total Food Expenditure', 'Bread and Cereals Expend…

<function regional_utils.regional_average_dependence(data, key)>

## 3.3 Visualizing the differences in income and expenditure through a choropleth map

In [11]:
import geopandas as gpd
from regional_utils import get_string_inside_parenthesis, make_map_text

In [12]:
regions = gpd.GeoDataFrame.from_file('map/ph-regions-2015.shp')
regions.REGION = regions.REGION.apply(get_string_inside_parenthesis)

psgg_code = pd.read_csv('map/psgg_codes.csv', dtype=object)
map_names = psgg_code.loc[:, ['psgg_code', 'region']]
map_names['region'] = map_names['region'].apply(lambda x: make_map_text(x))
map_names.set_index('psgg_code', inplace=True, drop=True)
expenditures_income_data = expenditures_data.append('Total Household Income')

# Removing NIR in the regions shapes
regions_clean = regions.drop(regions.index[-1]) # Drop the last row
regional_averages = data.groupby("Region")[expenditures_data].mean().reset_index()

regional_averages["Region"] = [entry.split()[0] for entry in regional_averages["Region"]] # Representing the regions with numbers
regional_averages.loc[6, "Region"] = "IV-A" # Match the entry in regions_clean
regional_averages.loc[7, "Region"] = "IV-B" # match the entry in regions_clean
regional_averages.loc[2, "Region"] = "XIII" # match the entry in regions_clean

# Adding regional averages to the regions_clean dataframe
merged_df = pd.merge(regions_clean, regional_averages, left_on='REGION', right_on='Region', how='left')
region_order = merged_df['Region']

In [13]:
def choropleth(merged_df, key):
    plt.style.use('ggplot')
    fig, ax = plt.subplots(figsize=(10, 10))

    merged_df.plot(ax=ax, cmap='viridis', column =key, linewidth=1, legend=True)

    for i, point in merged_df.iterrows():
        point_centroid = point.geometry.centroid
        reg_n = region_order[i]
        ax.text(s=reg_n, x=point_centroid.x, y=point_centroid.y, fontsize='large')

    ax.set_title('PH Administrative Regions (Present)', fontfamily='helvetica', fontsize=20)
    ax.set_axis_off()


In [14]:
interact(choropleth, merged_df=fixed(merged_df), key=expenditures_data)

interactive(children=(Dropdown(description='key', options=('Total Food Expenditure', 'Bread and Cereals Expend…

<function __main__.choropleth(merged_df, key)>