#### Calculating the percent share of agriculture in total employment per municipality or province for the year 2022 (PSA Labor Force Survey)

Step 1: Import pandas

In [1]:
import pandas as pd

Step 2: Combine all csv files in folder into one DF

In [2]:
# Import glob module to find files in directory
import glob

In [3]:
# Collect the monthly LFS files whose names start with "L".
# The extension is matched case-insensitively ([cC][sS][vV]) so that both
# ".csv" and ".CSV" files are found on case-sensitive filesystems too, and
# the list is sorted so the load order does not depend on the OS.
file_list = sorted(glob.glob(r"L*.[cC][sS][vV]"))
file_list

['LFS PUF April 2022.csv',
 'LFS PUF August 2022.CSV',
 'LFS PUF December 2022.CSV',
 'LFS PUF February 2022.csv',
 'LFS PUF January 2022.csv',
 'LFS PUF July 2022.CSV',
 'LFS PUF June 2022.csv',
 'LFS PUF March 2022.csv',
 'LFS PUF May 2022.csv',
 'LFS PUF November 2022.CSV',
 'LFS PUF October 2022.CSV',
 'LFS PUF September 2022.CSV']

In [4]:
# Start from an empty frame that will hold the combined data for the year
lfs_2022_df = pd.DataFrame()


def check_str(col_name):
    """Return True when a column name should be loaded.

    A column is kept if its name contains any of the substrings
    'REG', '_WORK', '_PROVMUN' or '_PKB'; otherwise False is returned.
    Used as the ``usecols`` callable when reading the CSV files.
    """
    for fragment in ('REG', '_WORK', '_PROVMUN', '_PKB'):
        if fragment in col_name:
            return True
    return False

# Read each csv file into its own DF, keeping only the columns accepted by
# check_str and treating cells made of 1 to 6 spaces as NaN
monthly_frames = []
for fp in file_list:
    df = pd.read_csv(fp, usecols=check_str,
                     na_values=[' ', '  ', '   ', '    ', '     ', '      '])
    monthly_frames.append(df)

# Concatenate once at the end: calling pd.concat inside the loop re-copies
# the accumulated frame on every iteration. Fall back to an empty frame
# when no file matched, because pd.concat raises on an empty list.
if monthly_frames:
    lfs_2022_df = pd.concat(monthly_frames, ignore_index=True)
else:
    lfs_2022_df = pd.DataFrame()

# Display head of merged DF
lfs_2022_df.head(20)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC09_WORK,PUFC09A_WORK,PUFC11A_PROVMUN,PUFC15_PKB
0,1,1.0,2802.0,1.0,,,,
1,1,,,,,,,
2,1,,,,,,,
3,1,,,,,,,
4,1,1.0,2802.0,1.0,,,,
5,1,1.0,2820.0,82.0,,,,
6,1,2.0,,,,,,
7,1,1.0,2802.0,85.0,,,,
8,1,1.0,2802.0,85.0,,,,
9,1,2.0,,,,,,


Step 3: Merge all alike columns and drop extra columns

In [5]:
# Preview the first 20 rows that already carry a PUFC11_WORK value
lfs_2022_df[lfs_2022_df['PUFC11_WORK'].notnull()].head(20)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC09_WORK,PUFC09A_WORK,PUFC11A_PROVMUN,PUFC15_PKB
0,1,1.0,2802.0,1.0,,,,
4,1,1.0,2802.0,1.0,,,,
5,1,1.0,2820.0,82.0,,,,
6,1,2.0,,,,,,
7,1,1.0,2802.0,85.0,,,,
8,1,1.0,2802.0,85.0,,,,
9,1,2.0,,,,,,
10,1,2.0,,,,,,
11,1,1.0,2802.0,1.0,,,,
12,1,1.0,2802.0,1.0,,,,


In [6]:
# Back-fill PUFC11_WORK: where it is missing use PUFC09_WORK, and where it
# is still missing after that use PUFC09A_WORK
lfs_2022_df['PUFC11_WORK'] = (
    lfs_2022_df['PUFC11_WORK']
    .fillna(lfs_2022_df['PUFC09_WORK'])
    .fillna(lfs_2022_df['PUFC09A_WORK'])
)

lfs_2022_df.tail(10)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC09_WORK,PUFC09A_WORK,PUFC11A_PROVMUN,PUFC15_PKB
1655195,17,2.0,,,2.0,,,
1655196,17,,,,,,,
1655197,17,2.0,,,2.0,,,
1655198,17,1.0,,,1.0,1.0,5914.0,41.0
1655199,17,2.0,,,2.0,,,
1655200,17,2.0,,,2.0,,,
1655201,17,1.0,,,1.0,1.0,5914.0,41.0
1655202,17,1.0,,,1.0,1.0,1014.0,96.0
1655203,17,2.0,,,2.0,,,
1655204,17,,,,,,,


In [7]:
# The two source WORK columns have been folded into PUFC11_WORK; remove them
lfs_2022_df = lfs_2022_df.drop(columns=['PUFC09_WORK', 'PUFC09A_WORK'])
lfs_2022_df.tail(10)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC11A_PROVMUN,PUFC15_PKB
1655195,17,2.0,,,,
1655196,17,,,,,
1655197,17,2.0,,,,
1655198,17,1.0,,,5914.0,41.0
1655199,17,2.0,,,,
1655200,17,2.0,,,,
1655201,17,1.0,,,5914.0,41.0
1655202,17,1.0,,,1014.0,96.0
1655203,17,2.0,,,,
1655204,17,,,,,


Repeat Step 3 for PROVMUN and PKB columns

In [8]:
# Back-fill PUFC12A_PROVMUN from PUFC11A_PROVMUN wherever it is missing
lfs_2022_df['PUFC12A_PROVMUN'] = lfs_2022_df['PUFC12A_PROVMUN'].fillna(
    lfs_2022_df['PUFC11A_PROVMUN']
)

lfs_2022_df.tail(10)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC11A_PROVMUN,PUFC15_PKB
1655195,17,2.0,,,,
1655196,17,,,,,
1655197,17,2.0,,,,
1655198,17,1.0,5914.0,,5914.0,41.0
1655199,17,2.0,,,,
1655200,17,2.0,,,,
1655201,17,1.0,5914.0,,5914.0,41.0
1655202,17,1.0,1014.0,,1014.0,96.0
1655203,17,2.0,,,,
1655204,17,,,,,


In [9]:
# Back-fill PUFC16_PKB from PUFC15_PKB wherever it is missing
lfs_2022_df['PUFC16_PKB'] = lfs_2022_df['PUFC16_PKB'].fillna(
    lfs_2022_df['PUFC15_PKB']
)

lfs_2022_df.tail(10)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC11A_PROVMUN,PUFC15_PKB
1655195,17,2.0,,,,
1655196,17,,,,,
1655197,17,2.0,,,,
1655198,17,1.0,5914.0,41.0,5914.0,41.0
1655199,17,2.0,,,,
1655200,17,2.0,,,,
1655201,17,1.0,5914.0,41.0,5914.0,41.0
1655202,17,1.0,1014.0,96.0,1014.0,96.0
1655203,17,2.0,,,,
1655204,17,,,,,


In [10]:
# The source PROVMUN and PKB columns have been folded in; remove them
lfs_2022_df = lfs_2022_df.drop(columns=['PUFC11A_PROVMUN', 'PUFC15_PKB'])
lfs_2022_df.tail(10)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB
1655195,17,2.0,,
1655196,17,,,
1655197,17,2.0,,
1655198,17,1.0,5914.0,41.0
1655199,17,2.0,,
1655200,17,2.0,,
1655201,17,1.0,5914.0,41.0
1655202,17,1.0,1014.0,96.0
1655203,17,2.0,,
1655204,17,,,


Step 4: Save to new DF only the rows where WORK = 1 (employed)

In [11]:
# Keep only the rows whose work indicator equals 1
work_df = lfs_2022_df[lfs_2022_df['PUFC11_WORK'].eq(1)]
work_df.sample(20)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB
660507,9,1.0,7319.0,1.0
147820,14,1.0,1102.0,87.0
185153,1,1.0,3314.0,46.0
1348617,6,1.0,7902.0,84.0
428595,3,1.0,7701.0,41.0
85370,9,1.0,7221.0,1.0
159305,15,1.0,3631.0,84.0
1300473,6,1.0,1911.0,41.0
46621,5,1.0,4119.0,84.0
199226,7,1.0,1212.0,1.0


Step 5: Rename columns and drop work indicator column

In [12]:
# Old column name -> readable name; the work indicator is left out on
# purpose so that it is dropped by the selection below
col_names = dict(
    PUFREG='REGION',
    PUFC12A_PROVMUN='PROV_MUN',
    PUFC16_PKB='INDUSTRY',
)

work_df = work_df.rename(columns=col_names).loc[:, ['REGION', 'PROV_MUN', 'INDUSTRY']]
work_df.sample(20)

Unnamed: 0,REGION,PROV_MUN,INDUSTRY
1624053,6,7903.0,96.0
1302761,7,1247.0,1.0
345067,1,5532.0,85.0
358512,2,3108.0,1.0
712807,10,4302.0,86.0
1393541,6,3022.0,1.0
835943,13,7403.0,56.0
1391226,5,4114.0,3.0
1050814,17,5323.0,47.0
1389529,5,514.0,56.0


Step 6: Parse province out of PROV_MUN column

In [13]:
# Cast the province-municipality code to int to get rid of the ".0"
work_df['PROV_MUN'] = work_df['PROV_MUN'].astype(int)

# Province code = first two characters of the code once it is rendered as
# text and left-padded with zeroes to four characters
padded_codes = work_df['PROV_MUN'].astype(str).str.zfill(4)
work_df['PROVINCE'] = padded_codes.str[:2]

work_df.sample(20)

Unnamed: 0,REGION,PROV_MUN,INDUSTRY,PROVINCE
1224550,16,202,47.0,2
779934,12,6311,47.0,63
321104,1,2803,1.0,28
342235,1,3403,41.0,34
234469,3,5408,49.0,54
273775,17,5305,1.0,53
1289795,2,3122,1.0,31
500874,5,4103,96.0,41
607181,8,2612,84.0,26
1092587,4,5802,41.0,58


In [14]:
# Number of distinct municipality codes (missing values counted, as with unique())
work_df['PROV_MUN'].nunique(dropna=False)

1637

In [15]:
# Number of distinct province codes (missing values counted, as with unique())
work_df['PROVINCE'].nunique(dropna=False)

87

Step 7: Group DF by municipality

In [16]:
# Group the employed rows by municipality code and report the group count
by_muni = work_df.groupby('PROV_MUN')
by_muni.ngroups

1637

Step 8: Calculate percent share of agriculture in total employment per municipality

In [17]:
# Aggregate every municipality in one pass with group-by operations.
# This replaces a Python loop that built a one-row frame per group (from a
# dtype-less pd.Series()) and concatenated it onto the result each time,
# which re-copies the growing frame on every iteration.
agshare_by_muni = pd.DataFrame({
    # Region code per municipality: median of the group's codes, truncated
    # to int.
    # NOTE(review): a median of category codes is only meaningful when all
    # rows of a municipality share one region - confirm this holds.
    'REGION': by_muni['REGION'].median().astype(int),

    # Province code per municipality (text code converted to int first)
    'PROVINCE': by_muni['PROVINCE'].agg(lambda codes: codes.astype(int).median()),

    # Total employed = number of rows with a non-missing INDUSTRY
    'TOTAL_EMPLOYED': by_muni['INDUSTRY'].count(),

    # Agricultural employment = rows whose INDUSTRY code is below 4
    'AGRI_EMPLOYED': by_muni['INDUSTRY'].agg(lambda codes: codes.lt(4).sum()),
})

# The group keys (municipality codes) are the index; expose them as the
# third column
agshare_by_muni.insert(2, 'PROV_MUN', agshare_by_muni.index)

# Percent share of agriculture in total employment, rounded to 2 decimals
agshare_by_muni['PERCENT_AGRI'] = (
    agshare_by_muni['AGRI_EMPLOYED'] / agshare_by_muni['TOTAL_EMPLOYED'] * 100
).round(2)

# Use a plain 0..n-1 index and float columns so that later steps (the join
# on the float LOC_CODE key, the exported csv) see the same values as before
agshare_by_muni = agshare_by_muni.reset_index(drop=True).astype(float)

agshare_by_muni

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI
0,14.0,1.0,101.0,1725.0,286.0,16.58
1,14.0,1.0,102.0,161.0,111.0,68.94
2,14.0,1.0,103.0,492.0,181.0,36.79
3,14.0,1.0,104.0,104.0,52.0,50.00
4,14.0,1.0,105.0,52.0,28.0,53.85
...,...,...,...,...,...,...
1632,11.0,86.0,8603.0,2296.0,1248.0,54.36
1633,11.0,86.0,8604.0,1026.0,498.0,48.54
1634,11.0,86.0,8605.0,438.0,251.0,57.31
1635,9.0,97.0,9701.0,2236.0,470.0,21.02


Step 9: Read metadata file into DF of province & municipality names

In [18]:
# Metadata workbook (data dictionary) that holds the location value set
names_fp = r"lfs_november_2022_metadata(dictionary).xlsx"

# Give the two unlabeled spreadsheet columns meaningful names
new_cols = {
    'Unnamed: 2': 'LOCATION',
    'Unnamed: 3': 'LOC_CODE'
}

# Read the value-set sheet, skipping 168 rows at the top and 284 at the
# bottom, then keep only the two renamed columns.
# NOTE(review): presumably those offsets isolate the province/municipality
# list - re-check them if the workbook layout changes.
geo_names = (
    pd.read_excel(names_fp, sheet_name=r"lfs_november_2022_valueset",
                  skiprows=168, skipfooter=284)
    .rename(columns=new_cols)
    .loc[:, ['LOCATION', 'LOC_CODE']]
)

geo_names.sample(10)

Unnamed: 0,LOCATION,LOC_CODE
742,3515 LANAO DEL NORTE - Nunungan,3515.0
1070,4908 NUEVA ECIJA - City of Gapan,4908.0
9,0110 ABRA - Lagangilang,110.0
70,0501 ALBAY - Bacacay,501.0
35,0209 AGUSAN DEL NORTE - Nasipit,209.0
1110,5101 OCCIDENTAL MINDORO - Abra De Ilog,5101.0
741,3514 LANAO DEL NORTE - Munai,3514.0
516,2619 EASTERN SAMAR - Salcedo,2619.0
1603,8105 APAYAO - Luna,8105.0
1503,7204 ZAMBOANGA DEL NORTE - La Libertad,7204.0


In [19]:
# Break LOCATION apart at " - " into separate columns
names = geo_names['LOCATION'].str.split(' - ', expand=True)

# Province name: first piece with its first six characters removed.
# NOTE(review): this assumes the code prefix before the province name is
# exactly six characters wide - confirm against the raw sheet.
geo_names['PROV_NAME'] = names[0].str[6:]

# Municipality name: second piece
geo_names['MUN_NAME'] = names[1]

geo_names.sample(10)


Unnamed: 0,LOCATION,LOC_CODE,PROV_NAME,MUN_NAME
721,3424 LAGUNA - City of San Pablo,3424.0,LAGUNA,City of San Pablo
399,2111 CAVITE - Kawit,2111.0,CAVITE,Kawit
1471,6915 TARLAC - Santa Ignacia,6915.0,TARLAC,Santa Ignacia
409,2121 CAVITE - Ternate,2121.0,CAVITE,Ternate
101,0614 ANTIQUE - San Remigio,614.0,ANTIQUE,San Remigio
262,1409 BULACAN - Hagonoy,1409.0,BULACAN,Hagonoy
228,1245 BOHOL - Tubigon,1245.0,BOHOL,Tubigon
630,3045 ILOILO - Tigbauan,3045.0,ILOILO,Tigbauan
781,3632 LANAO DEL SUR - Calanogas,3632.0,LANAO DEL SUR,Calanogas
1037,4717 NORTH COTABATO - Aleosan,4717.0,NORTH COTABATO,Aleosan


In [20]:
# Drop the LOCATION column now that it has been split.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second run no longer raises KeyError.
geo_names = geo_names.drop(columns=['LOCATION'], errors='ignore')

geo_names.sample(10)

Unnamed: 0,LOC_CODE,PROV_NAME,MUN_NAME
397,2109.0,CAVITE,City of Imus
1338,6105.0,SIQUIJOR,San Juan
384,2007.0,CATANDUANES,Panganiban (Payo)
1206,5525.0,PANGASINAN,Manaoag
950,4323.0,MISAMIS ORIENTAL,Sugbongcogon
240,1309.0,BUKIDNON,Kitaotao
94,607.0,ANTIQUE,Tobias Fornier (Dao)
1092,4930.0,NUEVA ECIJA,Talavera
47,309.0,AGUSAN DEL SUR,San Luis
1080,4918.0,NUEVA ECIJA,Nampicuan


Step 10: Join names DF with employment DF (on muni code)

In [21]:
# Attach province/municipality names by matching the municipality code
# (PROV_MUN) against LOC_CODE; default inner join
agshare_by_muni = pd.merge(
    agshare_by_muni,
    geo_names,
    left_on='PROV_MUN',
    right_on='LOC_CODE',
)

agshare_by_muni.sample(10)

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME
847,15.0,38.0,3820.0,179.0,152.0,84.92,3820.0,MAGUINDANAO,Mamasapano
367,6.0,19.0,1908.0,186.0,73.0,39.25,1908.0,CAPIZ,Mambusao
929,10.0,43.0,4308.0,623.0,231.0,37.08,4308.0,MISAMIS ORIENTAL,City of Gingoog
25,14.0,1.0,126.0,157.0,115.0,73.25,126.0,ABRA,Tubo
909,10.0,42.0,4205.0,404.0,158.0,39.11,4205.0,MISAMIS OCCIDENTAL,Clarin
723,4.0,34.0,3430.0,6.0,0.0,0.0,3430.0,LAGUNA,Victoria
303,2.0,15.0,1527.0,509.0,237.0,46.56,1527.0,CAGAYAN,Solana
1493,9.0,72.0,7204.0,2.0,0.0,0.0,7204.0,ZAMBOANGA DEL NORTE,La Libertad
80,5.0,5.0,511.0,160.0,50.0,31.25,511.0,ALBAY,Manito
843,15.0,38.0,3816.0,74.0,45.0,60.81,3816.0,MAGUINDANAO,Talayan


Step 11: Clean up province and municipality names

In [22]:
# Remove parentheticals from province names: keep only the text before the
# first " (" and trim surrounding whitespace.
# The pattern is written as a raw string so the backslash reaches the regex
# engine unchanged; a plain ' \(' is an invalid escape sequence and makes
# Python emit a warning.
aliases = agshare_by_muni['PROV_NAME'].str.split(r' \(', expand=True)
agshare_by_muni['PROV_NAME'] = aliases[0].str.strip()

list(agshare_by_muni['PROV_NAME'].unique())

  aliases = agshare_by_muni['PROV_NAME'].str.split(' \(', expand=True)


['ABRA',
 'AGUSAN DEL NORTE',
 'AGUSAN DEL SUR',
 'AKLAN',
 'ALBAY',
 'ANTIQUE',
 'BASILAN',
 'BATAAN',
 'BATANES',
 'BATANGAS',
 'BENGUET',
 'BOHOL',
 'BUKIDNON',
 'BULACAN',
 'CAGAYAN',
 'CAMARINES NORTE',
 'CAMARINES SUR',
 'CAMIGUIN',
 'CAPIZ',
 'CATANDUANES',
 'CAVITE',
 'CEBU',
 'DAVAO DEL NORTE',
 'DAVAO DEL SUR',
 'DAVAO ORIENTAL',
 'EASTERN SAMAR',
 'IFUGAO',
 'ILOCOS NORTE',
 'ILOCOS SUR',
 'ILOILO',
 'ISABELA',
 'KALINGA',
 'LA UNION',
 'LAGUNA',
 'LANAO DEL NORTE',
 'LANAO DEL SUR',
 'LEYTE',
 'MAGUINDANAO',
 'City of Manila',
 'MARINDUQUE',
 'MASBATE',
 'MISAMIS OCCIDENTAL',
 'MISAMIS ORIENTAL',
 'MOUNTAIN PROVINCE',
 'NEGROS OCCIDENTAL',
 'NEGROS ORIENTAL',
 'NORTH COTABATO',
 'NORTHERN SAMAR',
 'NUEVA ECIJA',
 'NUEVA VIZCAYA',
 'OCCIDENTAL MINDORO',
 'ORIENTAL MINDORO',
 'PALAWAN',
 'PAMPANGA',
 'PANGASINAN',
 'QUEZON',
 'QUIRINO',
 'RIZAL',
 'ROMBLON',
 'SAMAR',
 'SIQUIJOR',
 'SORSOGON',
 'SOUTH COTABATO',
 'SOUTHERN LEYTE',
 'SULTAN KUDARAT',
 'SULU',
 'SURIGAO DEL NOR

In [23]:
# Remove parentheticals from municipality names: keep only the text before
# the first " (" and trim surrounding whitespace.
# Raw string: a plain ' \(' is an invalid escape sequence and makes Python
# emit a warning; the pattern seen by the regex engine is unchanged.
aliases = agshare_by_muni['MUN_NAME'].str.split(r' \(', expand=True)
agshare_by_muni['MUN_NAME'] = aliases[0].str.strip()

agshare_by_muni['MUN_NAME'].head(20)

  aliases = agshare_by_muni['MUN_NAME'].str.split(' \(', expand=True)


0         Bangued
1         Boliney
2           Bucay
3          Bucloc
4       Daguioman
5         Danglas
6         Dolores
7          La Paz
8           Lacub
9     Lagangilang
10        Lagayan
11       Langiden
12    Licuan-Baay
13           Luba
14      Malibcong
15         Manabo
16     Peñarrubia
17        Pidigan
18          Pilar
19     Sallapadan
Name: MUN_NAME, dtype: object

Step 11B: Clean up some rows (fix individual municipality names)

In [24]:
# Mapping of municipality names as they appear now -> corrected names
city_names = {
    'Calaca': 'City of Calaca',
    'Pres. Carlos P. Garcia': 'President Carlos P. Garcia',
    'Baliuag': 'City of Baliwag',
    'Carmona': 'City of Carmona',
    'Datu Saudi-Ampatuan': 'Datu Saudi Ampatuan',
    'Pio V. Corpuz': 'Pio V. Corpus',
    'Bacungan': 'Leon T. Postigo',
    'Hinoba-an': 'Hinoba-An',
}

# Apply the corrections to exact matches in MUN_NAME only
agshare_by_muni['MUN_NAME'] = agshare_by_muni['MUN_NAME'].replace(city_names)

# Show the rows that now carry one of the corrected names
new_names = list(city_names.values())
agshare_by_muni[agshare_by_muni['MUN_NAME'].isin(new_names)]

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME
142,4.0,10.0,1007.0,110.0,16.0,14.55,1007.0,BATANGAS,City of Calaca
217,7.0,12.0,1235.0,115.0,36.0,31.3,1235.0,BOHOL,President Carlos P. Garcia
255,3.0,14.0,1403.0,514.0,21.0,4.09,1403.0,BULACAN,City of Baliwag
391,4.0,21.0,2104.0,285.0,5.0,1.75,2104.0,CAVITE,City of Carmona
853,15.0,38.0,3826.0,115.0,69.0,60.0,3826.0,MAGUINDANAO,Datu Saudi Ampatuan
899,5.0,41.0,4116.0,211.0,133.0,63.03,4116.0,MASBATE,Pio V. Corpus
969,6.0,45.0,4512.0,107.0,66.0,61.68,4512.0,NEGROS OCCIDENTAL,Hinoba-An
1515,9.0,72.0,7226.0,7.0,1.0,14.29,7226.0,ZAMBOANGA DEL NORTE,Leon T. Postigo


In [25]:
# Rows whose PROV_NAME is one of these labels get the matching
# municipality name assigned
for prov_label, city_label in (('CITY OF ISABELA', 'City of Isabela'),
                               ('COTABATO CITY', 'City of Cotabato')):
    agshare_by_muni.loc[agshare_by_muni['PROV_NAME'].eq(prov_label), 'MUN_NAME'] = city_label

# Show the two resulting rows
agshare_by_muni[agshare_by_muni['MUN_NAME'].isin(['City of Isabela', 'City of Cotabato'])]

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME
1635,9.0,97.0,9701.0,2236.0,470.0,21.02,9701.0,CITY OF ISABELA,City of Isabela
1636,12.0,98.0,9804.0,4091.0,243.0,5.94,9804.0,COTABATO CITY,City of Cotabato


In [26]:
# Merge all Manila sub-municipalities (PROVINCE code 39) into one
# municipality row named "City of Manila".
# PROVINCE is the grouping key and is kept as a column by as_index=False,
# so it is not aggregated a second time.
manila = agshare_by_muni.loc[agshare_by_muni['PROVINCE'] == 39]\
        .groupby('PROVINCE', as_index=False)\
        .agg({
            'REGION': 'min',
            'PROV_MUN': 'min',
            'TOTAL_EMPLOYED': 'sum',
            'AGRI_EMPLOYED': 'sum',
            'LOC_CODE': 'min',
            'PROV_NAME': 'first'
        })

# Re-calculate PERCENT_AGRI from the summed counts
manila['PERCENT_AGRI'] = round(manila['AGRI_EMPLOYED'] / manila['TOTAL_EMPLOYED'] * 100, 2)

# Assign the municipality name directly. The earlier form renamed the first
# sub-municipality's name through a chained inplace replace, which is not a
# reliable write and left the row un-renamed whenever that first name was
# not the expected one - the row was then deleted by the drop below.
manila['MUN_NAME'] = 'City of Manila'

# Append the aggregated row; ignore_index renumbers rows 0..n-1, so every
# pre-existing row has a label below n_before
n_before = len(agshare_by_muni)
combined = pd.concat([agshare_by_muni, manila], ignore_index=True)

# Drop every pre-existing PROVINCE-39 row, keeping only the appended one
# (this also prevents a duplicate Manila row if the cell is run again)
is_old_manila = (combined['PROVINCE'] == 39) & (combined.index < n_before)
drop_indexes = combined.index[is_old_manila]
agshare_by_muni = combined.drop(index=drop_indexes)

agshare_by_muni.loc[agshare_by_muni['PROVINCE'] == 39]

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME
1637,13.0,39.0,3901.0,8087.0,21.0,0.26,3901.0,City of Manila,City of Manila


Step 12: Join agshare table with PSGC codes

In [27]:
# Excel workbook that lists municipalities together with their PSGC codes.
# NOTE(review): this is an absolute path on one machine, unlike the other
# inputs which are relative - consider moving the file next to the notebook.
psgc_fp = r"C:\Users\HOWARD\Desktop\DENR CCIMS\C - VULNERABILITY\Health Vulnerability\ABM3_PercentRural_PSA_2020.xlsx"

# Load only the province label, municipality name and municipality code
# columns from the workbook's default sheet
psgc_cols = ['ADM2', 'ADM3_EN', 'ADM3_CODE']
psgc_df = pd.read_excel(psgc_fp, usecols=psgc_cols)

psgc_df.head(10)

Unnamed: 0,ADM2,ADM3_EN,ADM3_CODE
0,"NCR, First District, City of Manila",City of Manila,133900000
1,"NCR, Second District",City of Mandaluyong,137401000
2,"NCR, Second District",City of Marikina,137402000
3,"NCR, Second District",City of Pasig,137403000
4,"NCR, Second District",Quezon City,137404000
5,"NCR, Second District",City of San Juan,137405000
6,"NCR, Third District",City of Caloocan,137501000
7,"NCR, Third District",City of Malabon,137502000
8,"NCR, Third District",City of Navotas,137503000
9,"NCR, Third District",City of Valenzuela,137504000


Clean up municipality names and codes before joining

In [28]:
# Clean up data: replace every "0" character in the names with "-"
# NOTE(review): presumably this repairs a known artefact in the source
# sheet - confirm that no municipality name legitimately contains a zero.
psgc_df['ADM3_EN'] = psgc_df['ADM3_EN'].replace('0', '-', regex=True)

# Remove parentheticals from municipality names: keep the text before the
# first " (" and trim whitespace. Raw string: a plain ' \(' is an invalid
# escape sequence and makes Python emit a warning; the regex is unchanged.
aliases = psgc_df['ADM3_EN'].str.split(r' \(', expand=True)
psgc_df['ADM3_EN'] = aliases[0].str.strip()

# Convert muni code to a string, left-padded with zeroes to 9 characters
psgc_df['ADM3_CODE'] = psgc_df['ADM3_CODE'].astype(str).str.zfill(9)

# Check the rows whose code starts with '0215' (Cagayan)
psgc_df.loc[psgc_df['ADM3_CODE'].str.startswith('0215')].head(30)

  aliases = psgc_df['ADM3_EN'].str.split(' \(', expand=True)


Unnamed: 0,ADM2,ADM3_EN,ADM3_CODE
225,Cagayan,Abulug,21501000
226,Cagayan,Alcala,21502000
227,Cagayan,Allacapan,21503000
228,Cagayan,Amulung,21504000
229,Cagayan,Aparri,21505000
230,Cagayan,Baggao,21506000
231,Cagayan,Ballesteros,21507000
232,Cagayan,Buguey,21508000
233,Cagayan,Calayan,21509000
234,Cagayan,Camalaniugan,21510000


In [29]:
# Label corrections applied in order; each entry reads
# (column to test, value to match, column to overwrite, new value):
#   1. ADM2 "Cotabato" becomes "North Cotabato"
#   2. the row(s) for City of Cotabato get ADM2 "Cotabato City"
#   3. the row(s) for City of Manila get ADM2 "City of Manila"
#   4. ADM3_EN "Sto. Tomas" becomes "Santo Tomas" wherever it appears
adm_fixes = (
    ('ADM2', 'Cotabato', 'ADM2', 'North Cotabato'),
    ('ADM3_EN', 'City of Cotabato', 'ADM2', 'Cotabato City'),
    ('ADM3_EN', 'City of Manila', 'ADM2', 'City of Manila'),
    ('ADM3_EN', 'Sto. Tomas', 'ADM3_EN', 'Santo Tomas'),
)

for match_col, match_value, target_col, new_value in adm_fixes:
    psgc_df.loc[psgc_df[match_col].eq(match_value), target_col] = new_value

Perform join

In [30]:
# Left-join the PSGC table onto the agri-share table, matching on the
# lower-cased (province name, municipality name) pair from each side
left_keys = [agshare_by_muni['PROV_NAME'].str.lower(),
             agshare_by_muni['MUN_NAME'].str.lower()]
right_keys = [psgc_df['ADM2'].str.lower(),
              psgc_df['ADM3_EN'].str.lower()]

agshare_by_muni_psgc = agshare_by_muni.merge(psgc_df, how='left',
                                             left_on=left_keys,
                                             right_on=right_keys)

# Spot-check the joined rows for Cagayan
agshare_by_muni_psgc[agshare_by_muni_psgc['PROV_NAME'].eq('CAGAYAN')]

Unnamed: 0,key_0,key_1,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME,ADM2,ADM3_EN,ADM3_CODE
277,cagayan,abulug,2.0,15.0,1501.0,224.0,119.0,53.12,1501.0,CAGAYAN,Abulug,Cagayan,Abulug,21501000
278,cagayan,alcala,2.0,15.0,1502.0,300.0,181.0,60.33,1502.0,CAGAYAN,Alcala,Cagayan,Alcala,21502000
279,cagayan,allacapan,2.0,15.0,1503.0,209.0,125.0,59.81,1503.0,CAGAYAN,Allacapan,Cagayan,Allacapan,21503000
280,cagayan,amulung,2.0,15.0,1504.0,153.0,120.0,78.43,1504.0,CAGAYAN,Amulung,Cagayan,Amulung,21504000
281,cagayan,aparri,2.0,15.0,1505.0,332.0,124.0,37.35,1505.0,CAGAYAN,Aparri,Cagayan,Aparri,21505000
282,cagayan,baggao,2.0,15.0,1506.0,320.0,183.0,57.19,1506.0,CAGAYAN,Baggao,Cagayan,Baggao,21506000
283,cagayan,ballesteros,2.0,15.0,1507.0,300.0,127.0,42.33,1507.0,CAGAYAN,Ballesteros,Cagayan,Ballesteros,21507000
284,cagayan,buguey,2.0,15.0,1508.0,147.0,98.0,66.67,1508.0,CAGAYAN,Buguey,Cagayan,Buguey,21508000
285,cagayan,calayan,2.0,15.0,1509.0,2.0,1.0,50.0,1509.0,CAGAYAN,Calayan,Cagayan,Calayan,21509000
286,cagayan,camalaniugan,2.0,15.0,1510.0,100.0,57.0,57.0,1510.0,CAGAYAN,Camalaniugan,Cagayan,Camalaniugan,21510000


In [31]:
# Rows with no ADM3_CODE found no PSGC match; an empty result means every
# row joined
agshare_by_muni_psgc[agshare_by_muni_psgc['ADM3_CODE'].isna()]

Unnamed: 0,key_0,key_1,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME,ADM2,ADM3_EN,ADM3_CODE


In [32]:
# List the columns of the joined table (includes the key_0/key_1 merge keys)
agshare_by_muni_psgc.columns

Index(['key_0', 'key_1', 'REGION', 'PROVINCE', 'PROV_MUN', 'TOTAL_EMPLOYED',
       'AGRI_EMPLOYED', 'PERCENT_AGRI', 'LOC_CODE', 'PROV_NAME', 'MUN_NAME',
       'ADM2', 'ADM3_EN', 'ADM3_CODE'],
      dtype='object')

In [33]:
# (rows, columns) of the joined table
agshare_by_muni_psgc.shape

(1624, 14)

In [34]:
# Build the prefixed municipality code: "PH" followed by the ADM3_CODE text
agshare_by_muni_psgc['ADM3_CODE_PH'] = agshare_by_muni_psgc['ADM3_CODE'].radd('PH')

agshare_by_muni_psgc.head(10)

Unnamed: 0,key_0,key_1,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME,ADM2,ADM3_EN,ADM3_CODE,ADM3_CODE_PH
0,abra,bangued,14.0,1.0,101.0,1725.0,286.0,16.58,101.0,ABRA,Bangued,Abra,Bangued,140101000,PH140101000
1,abra,boliney,14.0,1.0,102.0,161.0,111.0,68.94,102.0,ABRA,Boliney,Abra,Boliney,140102000,PH140102000
2,abra,bucay,14.0,1.0,103.0,492.0,181.0,36.79,103.0,ABRA,Bucay,Abra,Bucay,140103000,PH140103000
3,abra,bucloc,14.0,1.0,104.0,104.0,52.0,50.0,104.0,ABRA,Bucloc,Abra,Bucloc,140104000,PH140104000
4,abra,daguioman,14.0,1.0,105.0,52.0,28.0,53.85,105.0,ABRA,Daguioman,Abra,Daguioman,140105000,PH140105000
5,abra,danglas,14.0,1.0,106.0,53.0,11.0,20.75,106.0,ABRA,Danglas,Abra,Danglas,140106000,PH140106000
6,abra,dolores,14.0,1.0,107.0,233.0,87.0,37.34,107.0,ABRA,Dolores,Abra,Dolores,140107000,PH140107000
7,abra,la paz,14.0,1.0,108.0,442.0,163.0,36.88,108.0,ABRA,La Paz,Abra,La Paz,140108000,PH140108000
8,abra,lacub,14.0,1.0,109.0,154.0,114.0,74.03,109.0,ABRA,Lacub,Abra,Lacub,140109000,PH140109000
9,abra,lagangilang,14.0,1.0,110.0,283.0,92.0,32.51,110.0,ABRA,Lagangilang,Abra,Lagangilang,140110000,PH140110000


In [35]:
# Columns kept for the export, in output order
export_cols = ['REGION', 'PROV_NAME', 'ADM3_EN',
               'ADM3_CODE_PH', 'TOTAL_EMPLOYED',
               'AGRI_EMPLOYED', 'PERCENT_AGRI']

# Export names for the columns that are renamed
col_names = {
    'PROV_NAME': 'ADM2_EN',
    'ADM3_CODE_PH': 'ADM3_CODE',
    'REGION': 'REGION_NO'
}

# Select, rename, then cast REGION_NO to int to remove the decimal
final_df = (
    agshare_by_muni_psgc[export_cols]
    .rename(columns=col_names)
    .astype({'REGION_NO': int})
)

final_df.head(20)

Unnamed: 0,REGION_NO,ADM2_EN,ADM3_EN,ADM3_CODE,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI
0,14,ABRA,Bangued,PH140101000,1725.0,286.0,16.58
1,14,ABRA,Boliney,PH140102000,161.0,111.0,68.94
2,14,ABRA,Bucay,PH140103000,492.0,181.0,36.79
3,14,ABRA,Bucloc,PH140104000,104.0,52.0,50.0
4,14,ABRA,Daguioman,PH140105000,52.0,28.0,53.85
5,14,ABRA,Danglas,PH140106000,53.0,11.0,20.75
6,14,ABRA,Dolores,PH140107000,233.0,87.0,37.34
7,14,ABRA,La Paz,PH140108000,442.0,163.0,36.88
8,14,ABRA,Lacub,PH140109000,154.0,114.0,74.03
9,14,ABRA,Lagangilang,PH140110000,283.0,92.0,32.51


In [36]:
# Write the final table to csv in the working directory, without the index
final_df.to_csv(r'agshare_by_muni.csv', index=False)