In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules




In [2]:
# Load the cleaned dataset from the Excel workbook into a DataFrame.
# The path is relative, so the notebook must be run from the directory
# that contains the `data/` folder.
df = pd.read_excel('data/Final_Test_Cleaned_DF.xlsx')


In [3]:
# Sanity check: dimensions of the loaded frame as (rows, columns)
df.shape

(2040, 6)

In [4]:
# Inspect column names, non-null counts and dtypes before any transformation
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2040 entries, 0 to 2039
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Geolocation       2040 non-null   object 
 1   Major Occupation  2040 non-null   object 
 2   Year              2040 non-null   int64  
 3   Quarter           2040 non-null   object 
 4   Female            2040 non-null   float64
 5   Male              2040 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 95.8+ KB


In [5]:
# Treat Year as a categorical label rather than a number: cast it to string
# so it is later one-hot encoded like the other categorical columns.
df = df.astype({'Year': str})


In [6]:
# Count how many rows fall into each (Year, Quarter) combination
df.value_counts(subset=['Year', 'Quarter'])

Year  Quarter
2019  Q1         170
      Q2         170
      Q3         170
      Q4         170
2020  Q1         170
      Q2         170
      Q3         170
      Q4         170
2021  Q1         170
      Q2         170
      Q3         170
      Q4         170
Name: count, dtype: int64

In [7]:

# Function to create binary columns for each unique value in specified columns
def create_binary_columns(df, columns):
    """Build 0/1 indicator columns for every distinct value of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame containing the columns to encode.
    columns : iterable of str
        Names of the columns in ``df`` to expand into indicators.

    Returns
    -------
    pandas.DataFrame
        Frame sharing ``df``'s index with one integer (0/1) column per distinct
        non-missing value, named ``"<column>_<value>"``. Columns follow the
        order of ``columns`` and, within each, the order of first appearance.
    """
    indicators = {}
    for col in columns:
        series = df[col]
        # Missing values are skipped: NaN never compares equal to anything,
        # so its indicator would only ever be a useless all-zero column.
        for value in series.dropna().unique():
            indicators[f"{col}_{value}"] = (series == value).astype(int)

    # Assemble the frame in a single step rather than inserting columns one at
    # a time, which fragments the frame when there are many distinct values.
    # Passing the index keeps the result aligned with `df` even when no
    # indicator columns are produced.
    return pd.DataFrame(indicators, index=df.index)


# Preprocess the data
# Create binary columns for Geolocation and Major Occupation
columns_to_binarize = ['Geolocation', 'Major Occupation']
binary_df = create_binary_columns(df, columns_to_binarize)

# Add binary columns for employment levels (based on quartiles).
# Each of Female and Male is cut into four equal-frequency bins; the same
# ordered labels are used for both so the resulting items are comparable.
level_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
df['Female_Level'] = pd.qcut(df['Female'], q=4, labels=level_labels)
df['Male_Level'] = pd.qcut(df['Male'], q=4, labels=level_labels)

# Combine all one-hot encoded items (region, occupation, gender levels,
# year and quarter) into a single transaction matrix, one row per record.
binary_df = pd.concat([
    binary_df,
    pd.get_dummies(df['Female_Level'], prefix='Female'),
    pd.get_dummies(df['Male_Level'], prefix='Male'),
    pd.get_dummies(df['Year'], prefix='Year'),
    pd.get_dummies(df['Quarter'], prefix='Quarter')
], axis=1)

# Generate frequent itemsets using FP-Growth
min_support = 0.01  # Minimum support threshold (1%)
# The matrix mixes 0/1 integer columns (from create_binary_columns) with the
# get_dummies output. mlxtend recommends an all-boolean frame and warns about
# (and validates more slowly) anything else, so cast at the call site.
frequent_itemsets = fpgrowth(
    binary_df.astype(bool), min_support=min_support, use_colnames=True
)

# Generate association rules
min_confidence = 0.5  # Minimum confidence threshold (50%)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

# Sort rules by lift (measure of rule strength), strongest first
rules = rules.sort_values('lift', ascending=False)





In [8]:

# Lift pandas' column-count and line-width limits so the printed rules are
# not elided. These options are global and stay in effect for later cells.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Display the top 20 rules (the first 20 rows of `rules`)
print("\nTop 20 Association Rules:")
print(rules.head(20))

# Save the complete rule table to Excel, leaving out the index column
rules.to_excel('employment_association_rules.xlsx', index=False)
print("\nResults have been saved to 'employment_association_rules.xlsx'")



Top 20 Association Rules:
                                           antecedents  \
223                     (Female_Medium-High, Male_Low)   
58                  (Female_Low, Male_Low, Quarter_Q4)   
52                  (Quarter_Q3, Female_Low, Male_Low)   
34                   (Female_Low, Male_Low, Year_2019)   
42   (Quarter_Q1, Major Occupation_Armed Forces Occ...   
36   (Major Occupation_Armed Forces Occupations, Ye...   
48   (Major Occupation_Armed Forces Occupations, Qu...   
70                   (Female_Low, Male_Low, Year_2021)   
66   (Major Occupation_Armed Forces Occupations, Ye...   
30         (Major Occupation_Armed Forces Occupations)   
28                              (Female_Low, Male_Low)   
54   (Quarter_Q3, Major Occupation_Armed Forces Occ...   
72   (Major Occupation_Armed Forces Occupations, Ye...   
60   (Major Occupation_Armed Forces Occupations, Qu...   
40                  (Female_Low, Quarter_Q1, Male_Low)   
64                   (Female_Low, Male_Low, Y

In [9]:
# Additional analysis: Find specific patterns
def _rules_mentioning(rule_table, pattern):
    """Return the rows whose antecedents or consequents match ``pattern``.

    The itemsets are converted to their string form and searched with
    ``str.contains``, so ``pattern`` is interpreted as a regular expression.
    """
    in_antecedents = rule_table['antecedents'].astype(str).str.contains(pattern)
    in_consequents = rule_table['consequents'].astype(str).str.contains(pattern)
    return rule_table[in_antecedents | in_consequents]


# 1. Rules related to gender employment levels
gender_rules = _rules_mentioning(rules, 'Female|Male')
print("\nTop 10 Gender-related Rules:")
print(gender_rules.head(10))

# 2. Region-specific patterns
region_rules = _rules_mentioning(rules, 'Geolocation')
print("\nTop 10 Region-specific Rules:")
print(region_rules.head(10))


Top 10 Gender-related Rules:
                                           antecedents  \
223                     (Female_Medium-High, Male_Low)   
58                  (Female_Low, Male_Low, Quarter_Q4)   
52                  (Quarter_Q3, Female_Low, Male_Low)   
34                   (Female_Low, Male_Low, Year_2019)   
42   (Quarter_Q1, Major Occupation_Armed Forces Occ...   
36   (Major Occupation_Armed Forces Occupations, Ye...   
48   (Major Occupation_Armed Forces Occupations, Qu...   
70                   (Female_Low, Male_Low, Year_2021)   
66   (Major Occupation_Armed Forces Occupations, Ye...   
30         (Major Occupation_Armed Forces Occupations)   

                                     consequents  antecedent support  \
223             (Major Occupation_Professionals)            0.012745   
58   (Major Occupation_Armed Forces Occupations)            0.028922   
52   (Major Occupation_Armed Forces Occupations)            0.029902   
34   (Major Occupation_Armed Forces Occupat

In [10]:
# Render the full table of region-specific rules (every rule that mentions a
# Geolocation item), in the same order as `rules`.
region_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
391,"(Male_Low, Geolocation_NCR)",(Female_Low),0.011765,0.250000,0.011765,1.000000,4.000000,1.0,0.008824,inf,0.758929,0.047059,1.000000,0.523529
390,"(Female_Low, Geolocation_NCR)",(Male_Low),0.011765,0.250000,0.011765,1.000000,4.000000,1.0,0.008824,inf,0.758929,0.047059,1.000000,0.523529
411,"(Geolocation_REGION IVA, Year_2019)","(Male_High, Female_High)",0.019608,0.142647,0.010784,0.550000,3.855670,1.0,0.007987,1.905229,0.755455,0.071197,0.475129,0.312801
426,"(Geolocation_REGION VI, Male_Medium-Low)",(Female_Medium-High),0.017647,0.250000,0.016667,0.944444,3.777778,1.0,0.012255,13.500000,0.748503,0.066406,0.925926,0.505556
412,"(Geolocation_REGION IVA, Male_Medium-High)",(Female_High),0.017157,0.250000,0.015196,0.885714,3.542857,1.0,0.010907,6.562500,0.730271,0.060311,0.847619,0.473249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,"(Geolocation_REGION X, Male_Medium-Low)",(Female_Medium-High),0.019118,0.250000,0.010294,0.538462,2.153846,1.0,0.005515,1.625000,0.546155,0.039773,0.384615,0.289819
400,"(Male_Medium-Low, Geolocation_REGION II)",(Female_Low),0.022059,0.250000,0.011765,0.533333,2.133333,1.0,0.006250,1.607143,0.543233,0.045198,0.377778,0.290196
404,"(Geolocation_REGION III, Female_High)",(Male_Medium-High),0.026961,0.250000,0.014216,0.527273,2.109091,1.0,0.007475,1.586538,0.540433,0.054104,0.369697,0.292068
381,"(Geolocation_CARAGA, Male_Medium-Low)",(Female_Low),0.022549,0.250000,0.011765,0.521739,2.086957,1.0,0.006127,1.568182,0.532849,0.045113,0.362319,0.284399
