#### Calculating the percent share of agriculture in total employment per municipality or province for the year 2022 (PSA Labor Force Survey)

Step 1: Import pandas

In [1]:
import pandas as pd

Step 2: Combine all csv files in folder into one DF

In [2]:
# Import glob module to find files in directory
import glob

In [3]:
# Store all csv filenames in list
file_list = glob.glob(r"L*.csv")
file_list

['LFS PUF April 2022.csv',
 'LFS PUF August 2022.CSV',
 'LFS PUF December 2022.CSV',
 'LFS PUF February 2022.csv',
 'LFS PUF January 2022.csv',
 'LFS PUF July 2022.CSV',
 'LFS PUF June 2022.csv',
 'LFS PUF March 2022.csv',
 'LFS PUF May 2022.csv',
 'LFS PUF November 2022.CSV',
 'LFS PUF October 2022.CSV',
 'LFS PUF September 2022.CSV']

In [4]:
# Create empty dataframe to store all combined DFs
lfs_2022_df = pd.DataFrame()

# Define function to check if column name...
# ...contains 'REG', 'WORK', 'PROVMUN', or 'PKB'
def check_str(col_name):
     substrs = ['REG','_WORK', '_PROVMUN', '_PKB']
     return any(x in col_name for x in substrs)

# For each csv file in folder...
for fp in file_list:
    # ...read into a DF, w/ specified cols & NaN values...
    df = pd.read_csv(fp, usecols=check_str,
                    na_values=[' ', '  ', '   ', '    ', '     ', '      '])
    
    # ...then append to empty DF for whole year
    lfs_2022_df = pd.concat([lfs_2022_df, df], ignore_index=True)
    
# Display head of merged DF
lfs_2022_df.head(20)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC09_WORK,PUFC09A_WORK,PUFC11A_PROVMUN,PUFC15_PKB
0,1,1.0,2802.0,1.0,,,,
1,1,,,,,,,
2,1,,,,,,,
3,1,,,,,,,
4,1,1.0,2802.0,1.0,,,,
5,1,1.0,2820.0,82.0,,,,
6,1,2.0,,,,,,
7,1,1.0,2802.0,85.0,,,,
8,1,1.0,2802.0,85.0,,,,
9,1,2.0,,,,,,


Step 3: Merge all alike columns and drop extra columns

In [5]:
# Examine first 20 rows where PUFC11_WORK is not null
lfs_2022_df.loc[~lfs_2022_df['PUFC11_WORK'].isnull()].head(20)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC09_WORK,PUFC09A_WORK,PUFC11A_PROVMUN,PUFC15_PKB
0,1,1.0,2802.0,1.0,,,,
4,1,1.0,2802.0,1.0,,,,
5,1,1.0,2820.0,82.0,,,,
6,1,2.0,,,,,,
7,1,1.0,2802.0,85.0,,,,
8,1,1.0,2802.0,85.0,,,,
9,1,2.0,,,,,,
10,1,2.0,,,,,,
11,1,1.0,2802.0,1.0,,,,
12,1,1.0,2802.0,1.0,,,,


In [6]:
# For all rows where PUFC11_WORK is null, copy values from PUFC09_WORK and PUFC09A_WORK
lfs_2022_df.loc[lfs_2022_df['PUFC11_WORK'].isnull(), 'PUFC11_WORK'] =\
    lfs_2022_df['PUFC09_WORK']

lfs_2022_df.loc[lfs_2022_df['PUFC11_WORK'].isnull(), 'PUFC11_WORK'] =\
    lfs_2022_df['PUFC09A_WORK']

lfs_2022_df.tail(10)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC09_WORK,PUFC09A_WORK,PUFC11A_PROVMUN,PUFC15_PKB
1655195,17,2.0,,,2.0,,,
1655196,17,,,,,,,
1655197,17,2.0,,,2.0,,,
1655198,17,1.0,,,1.0,1.0,5914.0,41.0
1655199,17,2.0,,,2.0,,,
1655200,17,2.0,,,2.0,,,
1655201,17,1.0,,,1.0,1.0,5914.0,41.0
1655202,17,1.0,,,1.0,1.0,1014.0,96.0
1655203,17,2.0,,,2.0,,,
1655204,17,,,,,,,


In [7]:
# Drop extra WORK columns
lfs_2022_df = lfs_2022_df.drop(['PUFC09_WORK', 'PUFC09A_WORK'], axis=1)
lfs_2022_df.tail(10)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC11A_PROVMUN,PUFC15_PKB
1655195,17,2.0,,,,
1655196,17,,,,,
1655197,17,2.0,,,,
1655198,17,1.0,,,5914.0,41.0
1655199,17,2.0,,,,
1655200,17,2.0,,,,
1655201,17,1.0,,,5914.0,41.0
1655202,17,1.0,,,1014.0,96.0
1655203,17,2.0,,,,
1655204,17,,,,,


Repeat Step 3 for PROVMUN and PKB columns

In [8]:
# For all rows where PUFC12A_PROVMUN is null, copy values from PUFC11A_PROVMUN
lfs_2022_df.loc[lfs_2022_df['PUFC12A_PROVMUN'].isnull(), 'PUFC12A_PROVMUN'] =\
    lfs_2022_df['PUFC11A_PROVMUN']

lfs_2022_df.tail(10)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC11A_PROVMUN,PUFC15_PKB
1655195,17,2.0,,,,
1655196,17,,,,,
1655197,17,2.0,,,,
1655198,17,1.0,5914.0,,5914.0,41.0
1655199,17,2.0,,,,
1655200,17,2.0,,,,
1655201,17,1.0,5914.0,,5914.0,41.0
1655202,17,1.0,1014.0,,1014.0,96.0
1655203,17,2.0,,,,
1655204,17,,,,,


In [9]:
# For all rows where PUFC6_PKB is null, copy values from PUFC15_PKB
lfs_2022_df.loc[lfs_2022_df['PUFC16_PKB'].isnull(), 'PUFC16_PKB'] =\
    lfs_2022_df['PUFC15_PKB']

lfs_2022_df.tail(10)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB,PUFC11A_PROVMUN,PUFC15_PKB
1655195,17,2.0,,,,
1655196,17,,,,,
1655197,17,2.0,,,,
1655198,17,1.0,5914.0,41.0,5914.0,41.0
1655199,17,2.0,,,,
1655200,17,2.0,,,,
1655201,17,1.0,5914.0,41.0,5914.0,41.0
1655202,17,1.0,1014.0,96.0,1014.0,96.0
1655203,17,2.0,,,,
1655204,17,,,,,


In [10]:
# Drop extra PROVMUN and PKB columns
lfs_2022_df = lfs_2022_df.drop(['PUFC11A_PROVMUN', 'PUFC15_PKB'], axis=1)
lfs_2022_df.tail(10)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB
1655195,17,2.0,,
1655196,17,,,
1655197,17,2.0,,
1655198,17,1.0,5914.0,41.0
1655199,17,2.0,,
1655200,17,2.0,,
1655201,17,1.0,5914.0,41.0
1655202,17,1.0,1014.0,96.0
1655203,17,2.0,,
1655204,17,,,


Step 4: Save to new DF only the rows where WORK = 1 (employed)

In [11]:
work_df = lfs_2022_df.loc[lfs_2022_df['PUFC11_WORK'] == 1]
work_df.sample(20)

Unnamed: 0,PUFREG,PUFC11_WORK,PUFC12A_PROVMUN,PUFC16_PKB
1437525,2,1.0,5701.0,1.0
686082,10,1.0,1316.0,1.0
1315493,12,1.0,6303.0,10.0
169835,16,1.0,312.0,1.0
444817,4,1.0,2112.0,1.0
1291169,3,1.0,807.0,47.0
1348372,6,1.0,4523.0,47.0
220557,14,1.0,1102.0,47.0
2364,1,1.0,2917.0,1.0
770445,12,1.0,4714.0,1.0


Step 5: Rename columns and drop work indicator column

In [12]:
col_names = {
    'PUFREG': 'REGION',
    'PUFC12A_PROVMUN': 'PROV_MUN',
    'PUFC16_PKB': 'INDUSTRY'
}

work_df = work_df.rename(columns=col_names)[['REGION', 'PROV_MUN', 'INDUSTRY']]
work_df.sample(20)

Unnamed: 0,REGION,PROV_MUN,INDUSTRY
1409894,13,3905.0,96.0
641759,8,6414.0,96.0
127156,13,7402.0,10.0
1586308,15,6602.0,47.0
427745,3,7114.0,47.0
135925,13,7603.0,96.0
559069,6,4530.0,84.0
544257,6,3034.0,49.0
128820,13,7405.0,47.0
1281685,15,3810.0,84.0


Step 6: Parse province out of PROV_MUN column

In [13]:
# Remove decimal place from prov-muni by converting to int
work_df['PROV_MUN'] = work_df['PROV_MUN'].astype(int)

# Convert to string, pad w/ leading zeroes up to 4 chars,
# then slice out first 2 chars as province code
work_df['PROVINCE'] = work_df['PROV_MUN'].astype(str)\
                      .str.zfill(4).str.slice(0,2)

work_df.sample(20)

Unnamed: 0,REGION,PROV_MUN,INDUSTRY,PROVINCE
851060,13,7503,33.0,75
187745,3,807,64.0,8
327022,1,2823,1.0,28
1491897,7,2228,95.0,22
1045447,17,5316,47.0,53
481807,5,1605,47.0,16
1146404,9,8309,47.0,83
1081476,3,6918,47.0,69
589688,7,2230,81.0,22
779854,12,6311,1.0,63


In [14]:
# Count number of unique municipality codes
len(work_df['PROV_MUN'].unique())

1637

In [15]:
# Count number of unique province codes
len(work_df['PROVINCE'].unique())

87

Step 7: Group DF by municipality

In [16]:
by_muni = work_df.groupby('PROV_MUN')
len(by_muni)

1637

Step 8: Calculate percent share of agriculture in total employment per municipality

In [17]:
# Create empty dataframe for aggregated values
agshare_by_muni = pd.DataFrame()

# Iterate over ach municipality
for key, group in by_muni:

    # Create empty series to store each calculation
    c = pd.Series()

    # Add region & muni code to series of values
    c['REGION'] = group['REGION'].median().astype(int)
    c['PROVINCE'] = group['PROVINCE'].astype(int).median()
    c['PROV_MUN'] = key

    # Count total number of employed (i.e. length per group)
    c['TOTAL_EMPLOYED'] = group['INDUSTRY'].count()

    # Count number of people employed in agriculture
    c['AGRI_EMPLOYED'] = group.loc[group['INDUSTRY'] < 4, 'INDUSTRY'].count()

    # Calculate ratio between agri & total employment
    # and round off values to 2 decimal places
    c['PERCENT_AGRI'] = c['AGRI_EMPLOYED'] / c['TOTAL_EMPLOYED'] * 100
    c['PERCENT_AGRI'] = round(c['PERCENT_AGRI'], 2)

    # Convert series into dataframe and transpose into a row
    row = c.to_frame().transpose()

    # Append new row into agshare_by_muni dataframe
    agshare_by_muni = pd.concat([agshare_by_muni, row], ignore_index=True)

agshare_by_muni

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI
0,14.0,1.0,101.0,1725.0,286.0,16.58
1,14.0,1.0,102.0,161.0,111.0,68.94
2,14.0,1.0,103.0,492.0,181.0,36.79
3,14.0,1.0,104.0,104.0,52.0,50.00
4,14.0,1.0,105.0,52.0,28.0,53.85
...,...,...,...,...,...,...
1632,11.0,86.0,8603.0,2296.0,1248.0,54.36
1633,11.0,86.0,8604.0,1026.0,498.0,48.54
1634,11.0,86.0,8605.0,438.0,251.0,57.31
1635,9.0,97.0,9701.0,2236.0,470.0,21.02


Step 9: Read metadata file into DF of province & municipality names

In [18]:
# Read metadata xlsx file as DF

names_fp = r"lfs_november_2022_metadata(dictionary).xlsx"

geo_names = pd.read_excel(names_fp, sheet_name=r"lfs_november_2022_valueset",
                          skiprows=168, skipfooter=284)

# Define dict to rename columns
new_cols = {
    'Unnamed: 2': 'LOCATION',
    'Unnamed: 3': 'LOC_CODE'
}

geo_names = geo_names.rename(columns=new_cols)[['LOCATION', 'LOC_CODE']]

geo_names.sample(10)

Unnamed: 0,LOCATION,LOC_CODE
1496,7111 ZAMBALES - San Marcelino,7111.0
166,1031 BATANGAS - City of Tanauan,1031.0
1214,5533 PANGASINAN - San Fabian,5533.0
147,1012 BATANGAS - Lemery,1012.0
1617,8301 ZAMBOANGA SIBUGAY - Alicia,8301.0
1604,8106 APAYAO - Pudtol,8106.0
984,4521 NEGROS OCCIDENTAL - Pontevedra,4521.0
1305,5915 ROMBLON - Santa Fe,5915.0
1019,4624 NEGROS ORIENTAL - Vallehermoso,4624.0
192,1209 BOHOL - Buenavista,1209.0


In [19]:
# Split LOCATION column into province and municipality
names = geo_names['LOCATION'].str.split(' - ', expand=True)

# Put first item of split list into PROV_NAME column
# But only get all chars after 4-digit code and space
geo_names['PROV_NAME'] = names[0].str.slice(start=6)

# Put second item of split list into MUN_NAME column
geo_names['MUN_NAME'] = names[1]

geo_names.sample(10)


Unnamed: 0,LOCATION,LOC_CODE,PROV_NAME,MUN_NAME
1389,6504 SULTAN KUDARAT - Isulan (Capital),6504.0,SULTAN KUDARAT,Isulan (Capital)
1584,7806 BILIRAN - Kawayan,7806.0,BILIRAN,Kawayan
1471,6915 TARLAC - Santa Ignacia,6915.0,TARLAC,Santa Ignacia
1295,5905 ROMBLON - Concepcion,5905.0,ROMBLON,Concepcion
204,1221 BOHOL - Duero,1221.0,BOHOL,Duero
1068,4906 NUEVA ECIJA - Cuyapo,4906.0,NUEVA ECIJA,Cuyapo
1353,6215 SORSOGON - Santa Magdalena,6215.0,SORSOGON,Santa Magdalena
76,0507 ALBAY - Libon,507.0,ALBAY,Libon
403,2115 CAVITE - Naic,2115.0,CAVITE,Naic
135,0906 BATANES - Uyugan,906.0,BATANES,Uyugan


In [20]:
# Drop unneeded LOCATION column
geo_names.drop(columns=['LOCATION'], inplace=True)

geo_names.sample(10)

Unnamed: 0,LOC_CODE,PROV_NAME,MUN_NAME
136,1001.0,BATANGAS,Agoncillo
175,1106.0,BENGUET,Itogon
1278,5802.0,RIZAL,City of Antipolo (Capital)
1458,6902.0,TARLAC,Bamban
1404,6607.0,SULU,Old Panamao
1442,6805.0,SURIGAO DEL SUR,Cantilan
1357,6306.0,SOUTH COTABATO,City of Koronadal (Capital)
1203,5522.0,PANGASINAN,Lingayen (Capital)
279,1502.0,CAGAYAN,Alcala
1304,5914.0,ROMBLON,San Jose


Step 10: Join names DF with employment DF (on muni code)

In [21]:
agshare_by_muni = agshare_by_muni.merge(right=geo_names, left_on='PROV_MUN', right_on='LOC_CODE')

agshare_by_muni.sample(10)

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME
627,6.0,30.0,3044.0,197.0,76.0,38.58,3044.0,ILOILO,Sara
1284,17.0,59.0,5904.0,348.0,115.0,33.05,5904.0,ROMBLON,Calatrava
705,4.0,34.0,3412.0,47.0,9.0,19.15,3412.0,LAGUNA,Luisiana
769,15.0,36.0,3624.0,264.0,75.0,28.41,3624.0,LANAO DEL SUR,Ditsaan-Ramain
858,15.0,38.0,3831.0,96.0,60.0,62.5,3831.0,MAGUINDANAO,Datu Anggal Midtimbang
1014,7.0,46.0,4625.0,136.0,71.0,52.21,4625.0,NEGROS ORIENTAL,Zamboanguita
114,15.0,7.0,710.0,193.0,164.0,84.97,710.0,BASILAN,Hadji Mohammad Ajul
1285,17.0,59.0,5905.0,27.0,11.0,40.74,5905.0,ROMBLON,Concepcion
106,15.0,7.0,702.0,1349.0,487.0,36.1,702.0,BASILAN,City of Lamitan (Capital)
737,10.0,35.0,3514.0,237.0,161.0,67.93,3514.0,LANAO DEL NORTE,Munai


Step 11: Clean up province and municipality names

In [22]:
# Remove parentheticals from province & municipality names
# agshare_by_muni['PROV_NAME'] = agshare_by_muni['PROV_NAME'].str.replace(r"\(.+\)", "")

aliases = agshare_by_muni['PROV_NAME'].str.split(' \(', expand=True)
agshare_by_muni['PROV_NAME'] = aliases[0].str.strip()

list(agshare_by_muni['PROV_NAME'].unique())

  aliases = agshare_by_muni['PROV_NAME'].str.split(' \(', expand=True)


['ABRA',
 'AGUSAN DEL NORTE',
 'AGUSAN DEL SUR',
 'AKLAN',
 'ALBAY',
 'ANTIQUE',
 'BASILAN',
 'BATAAN',
 'BATANES',
 'BATANGAS',
 'BENGUET',
 'BOHOL',
 'BUKIDNON',
 'BULACAN',
 'CAGAYAN',
 'CAMARINES NORTE',
 'CAMARINES SUR',
 'CAMIGUIN',
 'CAPIZ',
 'CATANDUANES',
 'CAVITE',
 'CEBU',
 'DAVAO DEL NORTE',
 'DAVAO DEL SUR',
 'DAVAO ORIENTAL',
 'EASTERN SAMAR',
 'IFUGAO',
 'ILOCOS NORTE',
 'ILOCOS SUR',
 'ILOILO',
 'ISABELA',
 'KALINGA',
 'LA UNION',
 'LAGUNA',
 'LANAO DEL NORTE',
 'LANAO DEL SUR',
 'LEYTE',
 'MAGUINDANAO',
 'City of Manila',
 'MARINDUQUE',
 'MASBATE',
 'MISAMIS OCCIDENTAL',
 'MISAMIS ORIENTAL',
 'MOUNTAIN PROVINCE',
 'NEGROS OCCIDENTAL',
 'NEGROS ORIENTAL',
 'NORTH COTABATO',
 'NORTHERN SAMAR',
 'NUEVA ECIJA',
 'NUEVA VIZCAYA',
 'OCCIDENTAL MINDORO',
 'ORIENTAL MINDORO',
 'PALAWAN',
 'PAMPANGA',
 'PANGASINAN',
 'QUEZON',
 'QUIRINO',
 'RIZAL',
 'ROMBLON',
 'SAMAR',
 'SIQUIJOR',
 'SORSOGON',
 'SOUTH COTABATO',
 'SOUTHERN LEYTE',
 'SULTAN KUDARAT',
 'SULU',
 'SURIGAO DEL NOR

In [23]:
# Do the same for municipality names
aliases = agshare_by_muni['MUN_NAME'].str.split(' \(', expand=True)
agshare_by_muni['MUN_NAME'] = aliases[0].str.strip()

agshare_by_muni['MUN_NAME'].head(20)

  aliases = agshare_by_muni['MUN_NAME'].str.split(' \(', expand=True)


0         Bangued
1         Boliney
2           Bucay
3          Bucloc
4       Daguioman
5         Danglas
6         Dolores
7          La Paz
8           Lacub
9     Lagangilang
10        Lagayan
11       Langiden
12    Licuan-Baay
13           Luba
14      Malibcong
15         Manabo
16     Peñarrubia
17        Pidigan
18          Pilar
19     Sallapadan
Name: MUN_NAME, dtype: object

Step 9B: Clean up some rows

In [24]:
# Define dict w/ keys = old names and values = fixed names
city_names = {
    'Calaca': 'City of Calaca',
    'Pres. Carlos P. Garcia': 'President Carlos P. Garcia',
    'Baliuag': 'City of Baliwag',
    'Carmona': 'City of Carmona',
    'Datu Saudi-Ampatuan': 'Datu Saudi Ampatuan',
    'Pio V. Corpuz': 'Pio V. Corpus',
    'Bacungan': 'Leon T. Postigo',
    'Hinoba-an': 'Hinoba-An'
}

agshare_by_muni.replace({'MUN_NAME': city_names}, inplace=True)

new_names = list(city_names.values())

agshare_by_muni.loc[agshare_by_muni['MUN_NAME'].isin(new_names)]

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME
142,4.0,10.0,1007.0,110.0,16.0,14.55,1007.0,BATANGAS,City of Calaca
217,7.0,12.0,1235.0,115.0,36.0,31.3,1235.0,BOHOL,President Carlos P. Garcia
255,3.0,14.0,1403.0,514.0,21.0,4.09,1403.0,BULACAN,City of Baliwag
391,4.0,21.0,2104.0,285.0,5.0,1.75,2104.0,CAVITE,City of Carmona
853,15.0,38.0,3826.0,115.0,69.0,60.0,3826.0,MAGUINDANAO,Datu Saudi Ampatuan
899,5.0,41.0,4116.0,211.0,133.0,63.03,4116.0,MASBATE,Pio V. Corpus
969,6.0,45.0,4512.0,107.0,66.0,61.68,4512.0,NEGROS OCCIDENTAL,Hinoba-An
1515,9.0,72.0,7226.0,7.0,1.0,14.29,7226.0,ZAMBOANGA DEL NORTE,Leon T. Postigo


In [25]:
# Assign municipality names for City of Isabela and City of Cotabato
agshare_by_muni.loc[agshare_by_muni['PROV_NAME'] == 'CITY OF ISABELA', 'MUN_NAME'] \
    = 'City of Isabela'

agshare_by_muni.loc[agshare_by_muni['PROV_NAME'] == 'COTABATO CITY', 'MUN_NAME'] \
    = 'City of Cotabato'

agshare_by_muni.loc[agshare_by_muni['MUN_NAME'].isin(['City of Isabela', 'City of Cotabato'])]

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME
1635,9.0,97.0,9701.0,2236.0,470.0,21.02,9701.0,CITY OF ISABELA,City of Isabela
1636,12.0,98.0,9804.0,4091.0,243.0,5.94,9804.0,COTABATO CITY,City of Cotabato


In [26]:
# Merge all Manila sub-municipalities into one municipality (City of Manila)
manila = agshare_by_muni.loc[agshare_by_muni['PROVINCE'] == 39]\
        .groupby('PROVINCE', as_index=False)\
        .agg({
            'REGION': 'min',
            'PROVINCE': 'min',
            'PROV_MUN': 'min',
            'TOTAL_EMPLOYED': 'sum',
            'AGRI_EMPLOYED': 'sum',
            'PERCENT_AGRI': 'min',
            'LOC_CODE': 'min',
            'PROV_NAME': 'first',
            'MUN_NAME': 'first'
        })

# Re-calculate PERCENT_AGRI and assign municipality name
manila['PERCENT_AGRI'] = round(manila['AGRI_EMPLOYED'] / manila['TOTAL_EMPLOYED'] * 100, 2)
manila['MUN_NAME'].replace({'Tondo I/II': 'City of Manila'}, inplace=True)

# Append new row into agshare DF
agshare_by_muni = pd.concat([agshare_by_muni, manila], ignore_index=True)

# Drop all Manila sub-municipalities
drop_indexes = agshare_by_muni.loc[(agshare_by_muni['PROVINCE'] == 39) & (agshare_by_muni['MUN_NAME'] != 'City of Manila')].index
agshare_by_muni.drop(drop_indexes, inplace=True)

agshare_by_muni.loc[agshare_by_muni['PROVINCE'] == 39]

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME
1637,13.0,39.0,3901.0,8087.0,21.0,0.26,3901.0,City of Manila,City of Manila


Step 12: Join agshare table with PGSC codes

In [27]:
# Define filepath to xlsx file with municipality PSGC codes
psgc_fp = r"C:\Users\HOWARD\Desktop\DENR CCIMS\C - VULNERABILITY\Health Vulnerability\ABM3_PercentRural_PSA_2020.xlsx"

# Read excel file as DF
psgc_df = pd.read_excel(psgc_fp, # sheet_name='PSGC',
                        usecols=['ADM3_EN', 'ADM3_CODE'])

psgc_df.head(10)

Unnamed: 0,ADM3_EN,ADM3_CODE
0,City of Manila,133900000
1,City of Mandaluyong,137401000
2,City of Marikina,137402000
3,City of Pasig,137403000
4,Quezon City,137404000
5,City of San Juan,137405000
6,City of Caloocan,137501000
7,City of Malabon,137502000
8,City of Navotas,137503000
9,City of Valenzuela,137504000


Clean up municipality names and codes before joining

In [None]:
# Clean up data: convert 0 into -
psgc_df['ADM3_EN'] = psgc_df['ADM3_EN'].replace('0', '-', regex=True)

# Remove parentheticals from municipality names
aliases = psgc_df['ADM3_EN'].str.split(' \(', expand=True)
psgc_df['ADM3_EN'] = aliases[0].str.strip()

# Convert muni code to string
psgc_df['ADM3_CODE'] = psgc_df['ADM3_CODE'].astype(str).str.zfill(9)

# Check all rows where code starts with '0215' (Cagayan)
psgc_df.loc[psgc_df['ADM3_CODE'].str.startswith('0215')].head(30)

Perform join

In [None]:
# Join province PSGC codes to province names
agshare_by_muni_psgc = agshare_by_muni.merge(psgc_df, how='left', 
                                             left_on='MUN_NAME', 
                                             right_on='ADM3_EN')

agshare_by_muni_psgc.loc[agshare_by_muni_psgc['PROV_NAME'] == 'CAGAYAN']

In [30]:
# Check if some rows failed to join
agshare_by_muni_psgc.loc[agshare_by_muni_psgc['ADM3_CODE'].isnull()]['MUN_NAME'].values

array([], dtype=object)

In [31]:
agshare_by_muni_psgc.columns

Index(['REGION', 'PROVINCE', 'PROV_MUN', 'TOTAL_EMPLOYED', 'AGRI_EMPLOYED',
       'PERCENT_AGRI', 'LOC_CODE', 'PROV_NAME', 'MUN_NAME', 'ADM3_EN',
       'ADM3_CODE'],
      dtype='object')

In [32]:
agshare_by_muni_psgc['ADM3_CODE_PH'] = 'PH' + agshare_by_muni_psgc['ADM3_CODE']

agshare_by_muni_psgc.head(10)

Unnamed: 0,REGION,PROVINCE,PROV_MUN,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI,LOC_CODE,PROV_NAME,MUN_NAME,ADM3_EN,ADM3_CODE,ADM3_CODE_PH
0,14.0,1.0,101.0,1725.0,286.0,16.58,101.0,ABRA,Bangued,Bangued,140101000,PH140101000
1,14.0,1.0,102.0,161.0,111.0,68.94,102.0,ABRA,Boliney,Boliney,140102000,PH140102000
2,14.0,1.0,103.0,492.0,181.0,36.79,103.0,ABRA,Bucay,Bucay,140103000,PH140103000
3,14.0,1.0,104.0,104.0,52.0,50.0,104.0,ABRA,Bucloc,Bucloc,140104000,PH140104000
4,14.0,1.0,105.0,52.0,28.0,53.85,105.0,ABRA,Daguioman,Daguioman,140105000,PH140105000
5,14.0,1.0,106.0,53.0,11.0,20.75,106.0,ABRA,Danglas,Danglas,140106000,PH140106000
6,14.0,1.0,107.0,233.0,87.0,37.34,107.0,ABRA,Dolores,Dolores,140107000,PH140107000
7,14.0,1.0,107.0,233.0,87.0,37.34,107.0,ABRA,Dolores,Dolores,45615000,PH045615000
8,14.0,1.0,107.0,233.0,87.0,37.34,107.0,ABRA,Dolores,Dolores,82606000,PH082606000
9,14.0,1.0,108.0,442.0,163.0,36.88,108.0,ABRA,La Paz,La Paz,140108000,PH140108000


In [36]:
# Select only relevant columns for final DF (for export)
final_df = agshare_by_muni_psgc[['REGION', 'PROV_NAME', 'ADM3_EN',
                                 'ADM3_CODE_PH', 'TOTAL_EMPLOYED',
                                 'AGRI_EMPLOYED', 'PERCENT_AGRI']]
# Define dict to rename some columns
col_names = {
    'PROV_NAME': 'ADM2_EN',
    'ADM3_CODE_PH': 'ADM3_CODE',
    'REGION': 'REGION_NO' # 'ADM1_CODE'
}

# Apply renaming dict
final_df = final_df.rename(columns=col_names)

# Convert REGION_NO to int to remove decimal

final_df['REGION_NO'] = final_df['REGION_NO'].astype(int)

# final_df['ADM1_CODE'] = 'PH' + final_df['ADM1_CODE'].astype(int).astype(str).str.zfill(2) + '0000000'

final_df.head(20)

Unnamed: 0,REGION_NO,ADM2_EN,ADM3_EN,ADM3_CODE,TOTAL_EMPLOYED,AGRI_EMPLOYED,PERCENT_AGRI
0,14,ABRA,Bangued,PH140101000,1725.0,286.0,16.58
1,14,ABRA,Boliney,PH140102000,161.0,111.0,68.94
2,14,ABRA,Bucay,PH140103000,492.0,181.0,36.79
3,14,ABRA,Bucloc,PH140104000,104.0,52.0,50.0
4,14,ABRA,Daguioman,PH140105000,52.0,28.0,53.85
5,14,ABRA,Danglas,PH140106000,53.0,11.0,20.75
6,14,ABRA,Dolores,PH140107000,233.0,87.0,37.34
7,14,ABRA,Dolores,PH045615000,233.0,87.0,37.34
8,14,ABRA,Dolores,PH082606000,233.0,87.0,37.34
9,14,ABRA,La Paz,PH140108000,442.0,163.0,36.88


In [37]:
final_df.to_csv(r'agshare_by_muni.csv', index=False)