In [49]:
!pip install pyfpgrowth



# Find frequent QA combinations that correlate to Outcome

In [50]:
import numpy as np
import pandas as pd
import pyfpgrowth
import itertools

## Load pivoted data

In [64]:
# Read the survey responses from 'survey_data.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv('survey_data.csv')

Filter out rows whose outcome is missing (NaN); they cannot be used in the analyses below.

In [65]:
# Keep only the rows that have a recorded outcome. The explicit copy makes
# `data` an independent frame, so the column assignments and deletions in the
# following cells do not raise pandas' SettingWithCopyWarning.
data = data.loc[data['outcome'].notna()].copy()

## Create lists to store outcomes

In [66]:
# Collect every distinct label found in the pipe-separated outcome strings.
distinct_labels = set()
for labels in data['outcome'].str.split('|'):
    distinct_labels.update(labels)

risks = list(distinct_labels)
risks

['Gender equality',
 'Economic stability',
 'Displacement & migration',
 'Internal security',
 'Justice & rule of law',
 'Environment & climate',
 'Social cohesion, equality & non-discrimination',
 'Public health',
 'Regional & global influences',
 'Food security, agriculture & land',
 'Infrastructure & access to social services',
 'All of the above',
 'Democratic space',
 'Political stability']

Split each outcome string into a set of risk factors, then replace "All of the above" with the set of all individual risk factors.

In [67]:
def _expand_all_of_the_above(labels):
    """Turn one split outcome into a set of risk labels.

    `labels` is the list produced by splitting an outcome string on '|'.
    When "All of the above" was selected, the result is every entry of `risks`
    except "All of the above" itself; otherwise it is the selected labels.
    """
    selected = set(risks) & set(labels)
    if "All of the above" not in selected:
        return selected
    return {risk for risk in risks if risk != 'All of the above'}


data['outcome'] = data['outcome'].str.split('|').apply(_expand_all_of_the_above)

## Map question answers to ordinal scale

In [68]:
# Names of every column from the fifth one onward.
# NOTE(review): assumes the first four columns are not question columns - confirm.
cols = data.columns[4:].tolist()

def rep(x):
    """Map a raw survey answer onto a common ordinal label.

    Answer families being standardized:
      * national/UN coping capacity - very high, high, moderate, low, none, don't know
      * risk increase potential - yes: very strong, yes: strong, yes: moderate,
        yes: minor, no, don't know
    (The original note also lists identity and demographics answers, which are
    meant to be dropped rather than mapped.)

    Parameters
    ----------
    x : str or missing value
        The raw answer. None / NaN / pd.NA count as missing. Any other
        non-string value is converted with str() before matching.

    Returns
    -------
    str or None
        "Missing", "Very high", "High", "Moderate", "Minor", "Don't Know",
        "None" or "No"; None when no keyword matches.
    """
    if not isinstance(x, str):
        # Catches None and every NaN object, not only the np.nan singleton.
        if pd.isna(x):
            return "Missing"
        x = str(x)

    x = x.lower()

    if ('very' in x) and ('high' in x or 'strong' in x):
        return "Very high"

    if ('high' in x or 'strong' in x):
        return "High"

    if ('moderate' in x):
        return "Moderate"

    if ('low' in x or 'minor' in x):
        return "Minor"

    # "don't know" and "none" both contain the substring "no" ("k-no-w",
    # "no-ne"), so they must be tested before the plain "no" check; otherwise
    # both are reported as "No".
    if ("don't" in x or "don\u2019t" in x):
        return "Don't Know"

    if ("none" in x):
        return "None"

    if ("no" in x):
        return "No"

    # No keyword matched: the answer is left unmapped.
    return None
    

# Sort the candidate columns into three groups by name:
#   * contains 'q143' -> left untouched (treated separately)
#   * contains 'q170' -> removed from the frame
#   * anything else   -> recoded with rep()
kept_as_is = {name for name in cols if 'q143' in name}
to_drop = [name for name in cols if name not in kept_as_is and 'q170' in name]
to_recode = [name for name in cols if name not in kept_as_is and 'q170' not in name]

for name in to_drop:
    del data[name]

for name in to_recode:
    data[name] = data[name].apply(rep)

## Explode the Outcome column (which is an iterable) to separate rows

In [69]:
# One row per (respondent, outcome): each element of the outcome set gets its
# own row, and the index is renumbered from 0.
data = data.explode(column='outcome', ignore_index=True)

Remove identifier columns and prepare data for FP-Growth.

In [70]:
# Identifier columns carry no answer information, so they are left out of the
# frame handed to the pattern miner.
id_columns = ['Country', 'Survey_ID', 'Participant_ID']
data_fp = data.drop(columns=id_columns).copy()
data_fp.shape

(3527, 41)

## Necessary step for pre-pending question id to answers.

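If you skip this step, identical answer labels from different questions (e.g. "High" given to two different questions) become the same item, and the pattern mining algorithm produces erroneous item sets.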

In [71]:
# Prefix every answer with its question id ("<column>=<answer>").
# 'outcome' and 'q143' keep their raw values.
answer_columns = [name for name in data_fp.columns if name not in ('outcome', 'q143')]
for name in answer_columns:
    data_fp[name] = f'{name}=' + data_fp[name]

## Select questions with low % missing values

In [73]:
# The outcome plus the hand-picked question columns to mine.
selected_columns = [
    'outcome',
    'q109', 'q111', 'q112', 'q114', 'q115', 'q117', 'q118',
    'q132', 'q133', 'q135', 'q136', 'q138', 'q139', 'q141',
    'q143', 'q3', 'q7',
]
table = data_fp[selected_columns]

## Find frequent patterns

The threshold is a fraction of the number of rows in `table` from the previous cell, so make sure there is sufficient data. If the support is too high, you will not find any patterns, because the data is very sparse and has a large number of missing values.

In [83]:
# Minimum support as a fraction of the rows in `table`; it is multiplied by the
# row count before being passed to the pattern miner.
thresh = 0.05

In [84]:
# Convert the relative threshold into an absolute row count, then mine the
# frequent item sets from the rows of `table`.
min_support_count = thresh * len(table)
patterns = pyfpgrowth.find_frequent_patterns(table.to_numpy(), min_support_count)

In [85]:
# Display every mined item set together with its count.
patterns

{('q136=Very high',): 187,
 ('Public health',): 189,
 ('Internal security',): 205,
 ('Regional & global influences',): 217,
 ('q115=Moderate',): 251,
 ('q109=Very high',): 254,
 ('Displacement & migration', 'q112=Missing'): 184,
 ('Displacement & migration', 'q115=Missing'): 200,
 ('q115=Very high', 'q117=Minor'): 178,
 ('q112=Missing', 'q115=Very high'): 187,
 ('q115=Very high', 'q135=Minor'): 206,
 ('1: No', 'q135=Missing'): 186,
 ('1: No', 'q115=Missing', 'q135=Missing'): 180,
 ('1: No', 'q115=Missing'): 198,
 ('1: No', 'q112=Missing'): 208,
 ('1: No', 'q112=Missing', 'q138=Missing'): 194,
 ('1: No', 'q136=Missing'): 210,
 ('1: No', 'q114=Missing', 'q136=Missing'): 206,
 ('1: No', 'q138=Missing'): 210,
 ('1: No', 'q114=Missing'): 224,
 ('q118=Very high', 'q135=Missing'): 178,
 ('q115=Missing', 'q118=Very high', 'q135=Missing'): 178,
 ('q118=Very high', 'q139=Very high'): 179,
 ('q114=Missing', 'q118=Very high'): 186,
 ('q115=Missing', 'q118=Very high'): 191,
 ('q118=Very high', 'q13

Get relevant patterns
Keep only the patterns that contain an outcome together with at least one predictor, and in which no item is a missing value.

In [86]:
# Keep a mined pattern only when all of the following hold:
#   * it has more than one item,
#   * at least one item is an outcome label from `risks`,
#   * none of its items contains the text 'Missing'.
outcome_labels = set(risks)  # loop-invariant, so it is built once

relevant_patterns = {
    itemset: support
    for itemset, support in patterns.items()
    if len(itemset) > 1
    and not outcome_labels.isdisjoint(itemset)
    and not any('Missing' in item for item in itemset)
}

Now we can print the relevant patterns. The pre-pended question ids show which question each answer belongs to, so you can identify the question–answer pairs that frequently co-occur with an outcome at your chosen support threshold.

In [87]:
# Display the patterns that passed the filter, with their counts.
relevant_patterns

{('Democratic space', 'q139=High'): 180,
 ('Democratic space', 'q111=Minor'): 188,
 ('Political stability', 'q7=Minor'): 186,
 ('Political stability', 'q3=High'): 195,
 ('Political stability', 'q7=Moderate'): 226}