#### Calculating the percent share of agriculture in total employment per municipality or province for the year 2022 (PSA Labor Force Survey)

Step 1: Import pandas

In [None]:
import pandas as pd

Step 2: Combine all csv files in folder into one DF

In [None]:
# Import glob module to find files in directory
import glob

In [None]:
# Store all csv filenames in list, sorted so that the files are always
# combined in the same order (glob returns matches in arbitrary order)
file_list = sorted(glob.glob(r"L*.csv"))
file_list

In [None]:
# Start from an empty dataframe that will hold the combined data
lfs_2022_df = pd.DataFrame()


def check_str(col_name):
    """Return True if col_name contains 'REG', '_WORK', '_PROVMUN', or '_PKB'."""
    for marker in ('REG', '_WORK', '_PROVMUN', '_PKB'):
        if marker in col_name:
            return True
    return False

# Cells holding only 1 to 6 spaces are treated as missing values
blank_values = [' ' * width for width in range(1, 7)]

# Read each csv file in folder into its own DF, keeping only the columns
# accepted by check_str
csv_frames = [
    pd.read_csv(fp, usecols=check_str, na_values=blank_values)
    for fp in file_list
]

# Combine everything with a single concat call: concatenating inside the loop
# re-copies the accumulated rows on every iteration, and seeding the concat
# with an empty DF can affect the resulting dtypes in newer pandas versions.
# pd.concat raises on an empty list, so fall back to an empty DF in that case.
if csv_frames:
    lfs_2022_df = pd.concat(csv_frames, ignore_index=True)
else:
    lfs_2022_df = pd.DataFrame()

# Display head of merged DF
lfs_2022_df.head(20)

Step 3: Merge all alike columns and drop extra columns

In [None]:
# Preview the first 20 rows that have a value in PUFC11_WORK
has_work_value = lfs_2022_df['PUFC11_WORK'].notna()
lfs_2022_df[has_work_value].head(20)

In [None]:
# Fill the gaps in PUFC11_WORK: take PUFC09_WORK first, then PUFC09A_WORK
# for any rows that are still null afterwards
for fallback_col in ('PUFC09_WORK', 'PUFC09A_WORK'):
    missing_work = lfs_2022_df['PUFC11_WORK'].isna()
    lfs_2022_df.loc[missing_work, 'PUFC11_WORK'] = lfs_2022_df[fallback_col]

lfs_2022_df.tail(10)

In [None]:
# The fallback WORK columns are no longer needed once merged into PUFC11_WORK
lfs_2022_df = lfs_2022_df.drop(columns=['PUFC09_WORK', 'PUFC09A_WORK'])
lfs_2022_df.tail(10)

Repeat Step 3 for PROVMUN and PKB columns

In [None]:
# Where PUFC12A_PROVMUN is null, take the value from PUFC11A_PROVMUN instead
missing_provmun = lfs_2022_df['PUFC12A_PROVMUN'].isna()
lfs_2022_df.loc[missing_provmun, 'PUFC12A_PROVMUN'] = \
    lfs_2022_df['PUFC11A_PROVMUN']

lfs_2022_df.tail(10)

In [None]:
# Where PUFC16_PKB is null, take the value from PUFC15_PKB instead
missing_pkb = lfs_2022_df['PUFC16_PKB'].isna()
lfs_2022_df.loc[missing_pkb, 'PUFC16_PKB'] = lfs_2022_df['PUFC15_PKB']

lfs_2022_df.tail(10)

In [None]:
# The fallback PROVMUN and PKB columns have been merged, so remove them
lfs_2022_df = lfs_2022_df.drop(columns=['PUFC11A_PROVMUN', 'PUFC15_PKB'])
lfs_2022_df.tail(10)

Step 4: Save to new DF only the rows where WORK = 1 (employed)

In [None]:
# Keep only the rows whose work indicator equals 1
is_employed = lfs_2022_df['PUFC11_WORK'] == 1
work_df = lfs_2022_df.loc[is_employed]
work_df.sample(20)

Step 5: Rename columns and drop work indicator column

In [None]:
# Map the survey column names to descriptive ones
col_names = dict(
    PUFREG='REGION',
    PUFC12A_PROVMUN='PROV_MUN',
    PUFC16_PKB='INDUSTRY',
)

# Rename, then keep only the renamed columns (this drops the work indicator)
work_df = work_df.rename(columns=col_names)[list(col_names.values())]
work_df.sample(20)

Step 6: Parse province out of PROV_MUN column

In [None]:
# Cast the prov-muni codes to int to get rid of the decimal place
prov_mun_codes = work_df['PROV_MUN'].astype(int)
work_df['PROV_MUN'] = prov_mun_codes

# As text, left-pad each code with zeroes to 4 chars; the first 2 chars
# are the province code
padded_codes = prov_mun_codes.astype(str).str.zfill(4)
work_df['PROVINCE'] = padded_codes.str[:2]

work_df.sample(20)

In [None]:
# Number of distinct municipality codes (nulls counted as a value, if any)
work_df['PROV_MUN'].nunique(dropna=False)

In [None]:
# Number of distinct province codes (nulls counted as a value, if any)
work_df['PROVINCE'].nunique(dropna=False)

Step 7: Group DF by municipality

In [None]:
# Group the employed rows by municipality code and report the group count
by_muni = work_df.groupby(by='PROV_MUN')
by_muni.ngroups

Step 8: Calculate percent share of agriculture in total employment per municipality

In [None]:
# Collect one record per municipality and build the dataframe once at the
# end: concatenating inside the loop re-copies the whole result on every
# iteration, and a dtype-less pd.Series() leaves every column as a generic
# object column instead of a numeric one.
muni_rows = []

# Iterate over each municipality
for key, group in by_muni:
    industry = group['INDUSTRY']

    # Count total number of employed (i.e. non-null INDUSTRY values per group)
    total_employed = industry.count()

    # Count number of people employed in agriculture (INDUSTRY code below 4)
    agri_employed = (industry < 4).sum()

    # Calculate ratio between agri & total employment and round off to
    # 2 decimal places; a group without any INDUSTRY value has no defined share
    if total_employed:
        percent_agri = round(agri_employed / total_employed * 100, 2)
    else:
        percent_agri = float('nan')

    # Add region, province & muni code alongside the calculated values
    muni_rows.append({
        'REGION': group['REGION'].median().astype(int),
        'PROVINCE': group['PROVINCE'].astype(int).median(),
        'PROV_MUN': key,
        'TOTAL_EMPLOYED': total_employed,
        'AGRI_EMPLOYED': agri_employed,
        'PERCENT_AGRI': percent_agri,
    })

# Explicit column list keeps the layout stable even when there are no groups
agshare_by_muni = pd.DataFrame(
    muni_rows,
    columns=['REGION', 'PROVINCE', 'PROV_MUN',
             'TOTAL_EMPLOYED', 'AGRI_EMPLOYED', 'PERCENT_AGRI'],
)

agshare_by_muni

Step 9: Read metadata file into DF of province & municipality names

In [None]:
# Path of the metadata xlsx file
names_fp = r"lfs_november_2022_metadata(dictionary).xlsx"

# Define dict to rename the unnamed spreadsheet columns
new_cols = {'Unnamed: 2': 'LOCATION', 'Unnamed: 3': 'LOC_CODE'}

# Read the value-set sheet, skipping the first 168 rows and the last 284 rows
raw_names = pd.read_excel(
    names_fp,
    sheet_name=r"lfs_november_2022_valueset",
    skiprows=168,
    skipfooter=284,
)

# Rename the columns and keep only the location name and code
geo_names = raw_names.rename(columns=new_cols)[list(new_cols.values())]

geo_names.sample(10)

In [None]:
# Break LOCATION apart at ' - ' into separate columns
names = geo_names['LOCATION'].str.split(' - ', expand=True)

# PROV_NAME: first part of the split, minus its first 6 characters
# (the leading code prefix)
geo_names['PROV_NAME'] = names[0].str[6:]

# MUN_NAME: second part of the split
geo_names['MUN_NAME'] = names[1]

geo_names.sample(10)


In [None]:
# LOCATION has been split into PROV_NAME and MUN_NAME, so remove it in place
del geo_names['LOCATION']

geo_names.sample(10)

Step 10: Join names DF with employment DF (on muni code)

In [None]:
# Attach province & municipality names by matching the muni code (PROV_MUN)
# against the location code (LOC_CODE)
agshare_by_muni = pd.merge(
    agshare_by_muni,
    geo_names,
    left_on='PROV_MUN',
    right_on='LOC_CODE',
)

agshare_by_muni.sample(10)

In [None]:
# Remove parentheticals from province names.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal string by default, which would leave the names unchanged.
# The whitespace left behind by a removed parenthetical is stripped as well.
# NOTE(review): MUN_NAME is not cleaned here - confirm whether it should be.
agshare_by_muni['PROV_NAME'] = (
    agshare_by_muni['PROV_NAME']
    .str.replace(r"\(.+\)", "", regex=True)
    .str.strip()
)

agshare_by_muni.loc[agshare_by_muni['PROVINCE'] == 60].sample(10)