In [126]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [127]:
# Base set of categorical columns carried into every association-rule run
# (see `filters`, which extends or trims this list per outcome).
#
# Columns considered but deliberately left out:
#   Q16i_recoded
#   r_switch_dir            (too skewed)
#   r_indifference_point
#   s_switch_dir
#   s_indifference_point
keep_cols = [
    'atgs_cat',
    'gamble_freq_cat',
    'Q18_cat',
    'lotto_count',
    'Q20_cat',
    'delay_count',
]

In [128]:
# Mining configuration, keyed by the column used to split the data set.
# For each key:
#   opts     - values of that column; one rule set is mined per value.
#              They are strings because the CSV is loaded with dtype=str.
#   min_supp - minimum support handed to apriori for that run.
#   cols     - columns kept for that run (the split column itself included).
filters = {
    'is_gambler': {
        'opts': ['0', '1'],
        'min_supp': 0.08,
        # gamble_freq_cat is excluded from this run only.
        # NOTE(review): presumably because it overlaps with is_gambler — confirm.
        'cols': [
            col
            for col in [*keep_cols, 'is_gambler']
            if col != 'gamble_freq_cat'
        ],
    },
    'pgsi_binary': {
        'opts': ['0', '1'],
        'min_supp': 0.08,
        'cols': [*keep_cols, 'pgsi_binary'],
    },
}

In [129]:
# Mine association rules separately for every (filter column, value) pair in
# `filters` and write one CSV per pair: associations_<column>_<value>_lift.csv.

# The input file does not change between iterations, so read it once instead
# of re-parsing it for every subset. Everything is read as str so that the
# string values listed in `opts` compare equal to the cell contents.
filename = 'baseline_clean.csv'
baseline = pd.read_csv(filename, dtype=str)

for filter_col, spec in filters.items():
    for opt in spec['opts']:
        print(f'{filter_col}: {opt}')

        # Keep only the rows for this value and the columns configured for
        # this run, then one-hot encode: apriori expects a boolean
        # item-indicator matrix.
        # NOTE(review): spec['cols'] contains filter_col itself, which is
        # constant inside each subset, so its dummy column is True on every
        # row — confirm that this is intended.
        subset = baseline.loc[baseline[filter_col] == opt, spec['cols']]
        df = pd.get_dummies(subset).astype('bool')

        frequent_itemsets = apriori(df, min_support=spec['min_supp'], use_colnames=True)
        # Keep only rules whose lift is at least 1.1.
        rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.1)
        print(rules.shape)

        # Build the export frame. The itemsets are frozensets; sorting them
        # before converting to tuples gives a stable item order, so the CSV is
        # identical across kernel restarts (set iteration order of strings
        # depends on per-process hash randomisation). Series.map is used
        # rather than DataFrame.applymap, which is deprecated in recent pandas.
        cols = ['antecedents', 'consequents']
        out = pd.DataFrame({
            **{c: rules[c].map(lambda items: tuple(sorted(items))) for c in cols},
            'lift': rules['lift'],
            'support': rules['support'],
        })
        out['type'] = f'{filter_col}_{opt}'
        out = out.drop_duplicates()
        print(out.shape)

        out.to_csv(f'associations_{filter_col}_{opt}_lift.csv')

is_gambler: 0
(96, 10)
(96, 5)
is_gambler: 1
(30, 10)
(30, 5)
pgsi_binary: 0
(42, 10)
(42, 5)
pgsi_binary: 1
(240, 10)
(240, 5)
