In [1]:
import os
os.chdir("/home/ubuntu/code/pump_post_midterm/pump")

import json
import bnlearn as bn
import pandas as pd
import networkx as nx
from tqdm import tqdm


In [2]:
# Persona file to load; the path is relative to the current working directory.
persona_filename = 'opinions_qa/persona_val/American_Trends_Panel_W26/date0729_midterm_personas_full_sonnet.json'

with open(persona_filename, 'r') as f:
    data = json.load(f)

# Display the first user's persona list as a quick look at the record layout.
data[list(data)[0]]

[{'name': 'Core Personal Values',
  'description': "The participant's fundamental values, principles, and beliefs that guide their life decisions and shape their worldview on issues such as personal safety, freedom, self-reliance, non-violence, societal responsibility, and environmental conservation.",
  'level': 'high',
  'candidate_values': ['values personal safety/security',
   'values personal freedom/individual rights',
   'values self-reliance/self-sufficiency',
   'values non-violence',
   'values societal responsibility',
   'values environmental conservation'],
  'inferred_value': 'values personal safety/security'},
 {'name': 'Worldview and Belief System',
  'description': "The participant's overall belief system, worldview, and attitudes towards societal issues, crime, violence, political ideologies, cultural values, and the role of socioeconomic factors.",
  'level': 'high',
  'candidate_values': ['conservative worldview',
   'liberal worldview',
   'moderate worldview',
   

# Data Preprocessing

In [3]:
# verify data

def verify(data):
    """Print how many personas have a usable inferred value and how many do not.

    A persona counts as OK when its ``inferred_value`` is one of its
    ``candidate_values``, either as-is or after removing its first and last
    character. Prints ``"<ok> <wrong>"`` and returns nothing.
    """
    tally = {True: 0, False: 0}
    for personas in data.values():
        for persona in personas:
            inferred = persona['inferred_value']
            # Always computed, exactly like the original list construction.
            stripped = inferred[1:-1]
            candidates = persona['candidate_values']
            is_ok = inferred in candidates or stripped in candidates
            tally[is_ok] += 1

    print(tally[True], tally[False])

# Count valid vs. invalid inferred values in the loaded data (prints "ok wrong").
verify(data)

209730 2838


In [4]:
# clean data

def clean(data):
    """Drop personas whose inferred value is not one of their candidate values.

    ``data`` maps each user to a list of persona dicts and is modified in
    place. For every persona:

    * if ``inferred_value`` is in ``candidate_values`` it is kept unchanged;
    * if removing the first and last character yields a candidate and both
      removed characters are quote characters (``'`` or ``"``), the value is
      replaced by the stripped form and the persona is kept;
    * otherwise (including non-string values such as ``None``) the persona is
      removed from the user's list.
    """
    quote_chars = ("'", '"')
    for user in data.keys():
        kept = []
        for p in data[user]:
            inf_val = p['inferred_value']
            cands = p['candidate_values']
            if inf_val in cands:
                kept.append(p)
                continue
            # Only strings can carry stray wrapping quotes; anything else
            # (e.g. None) cannot be repaired and would fail on slicing.
            if not isinstance(inf_val, str):
                continue
            stripped = inf_val[1:-1]
            if stripped in cands and inf_val[0] in quote_chars and inf_val[-1] in quote_chars:
                p['inferred_value'] = stripped
                kept.append(p)
        data[user] = kept

# Clean in place, then re-run the check; the second number printed should now be 0.
clean(data)
verify(data)

209730 0


In [5]:
# categorize_data

# idx2opt_mapping[persona name] -> {index: candidate value}
# opt2idx_mapping[persona name] -> {candidate value: index}
idx2opt_mapping = {}
opt2idx_mapping = {}

# Scan every user rather than only the first one: a persona that clean()
# removed from the first user's list would otherwise get no mapping at all,
# and categorize() would fail with a KeyError for every other user.
# The first candidate list seen for a persona name defines its indices.
# NOTE(review): this assumes all users share the same candidate list per
# persona name - confirm against the data.
for personas in data.values():
    for persona in personas:
        p_name = persona['name']
        if p_name in idx2opt_mapping:
            continue
        cands = persona['candidate_values']
        idx2opt = {k + 1: v for k, v in enumerate(cands)}  # 0 for potential NAs
        idx2opt[0] = "Unknown"
        idx2opt_mapping[p_name] = idx2opt
        opt2idx_mapping[p_name] = {v: k for k, v in idx2opt.items()}

def categorize(data):
    """Replace each persona's inferred value with its integer index, in place.

    The index is looked up in the module-level ``opt2idx_mapping`` by persona
    name and current inferred value; an unknown name or value raises
    ``KeyError``. Each user's entry is rebound to a new list holding the same
    (now updated) persona dicts.
    """
    for user, personas in data.items():
        for persona in personas:
            lookup = opt2idx_mapping[persona['name']]
            persona['inferred_value'] = lookup[persona['inferred_value']]
        data[user] = list(personas)

# Replace every inferred value in `data` with its integer index (in place).
categorize(data)

In [6]:
# One row per user; one column per persona, named "<level>/<name>".
res = []

for user, personas in data.items():
    entry = {'user': user}
    for persona in personas:
        entry[f"{persona['level']}/{persona['name']}"] = persona['inferred_value']
    res.append(entry)

# A persona missing from a user's list leaves a NaN in that column; it is
# filled with 0, the index reserved for "Unknown".
raw_df = pd.DataFrame(res).fillna(0).astype(int)

# Compare column names by value (`!=`), not identity (`is not`), and take an
# explicit copy because columns of `df` are assigned to later on.
df = raw_df[[col for col in raw_df.columns if col != "user"]].copy()
df

Unnamed: 0,high/Core Personal Values,high/Worldview and Belief System,high/Risk Attitudes and Personality Traits,high/Attitudes Towards Guns and Gun Culture,high/Cultural Background and Traditions,high/Risk Attitudes and Tolerance,high/Political Ideology on Gun Ownership and Regulation,high/Trust in Authorities and Institutions,high/Sense of Responsibility,mid/Childhood Environment and Influences,...,low/Household Composition with Children,low/Financial and Economic Circumstances,low/Health and Lifestyle,low/Employment Status,low/Age and Life Experience,low/Geographic Risk Factors,low/Firearm-Related Recreational Activities,low/Occupational and Social Influences,low/Shooting Experience and Proficiency,low/Personal Experiences with Guns and Gun-Related Incidents
0,1,3,1,3,2,1,3,2,1,4,...,2,2,2,1,1,2,3,3,3,3
1,1,3,1,3,2,1,3,1,1,4,...,2,1,1,0,1,2,3,3,3,3
2,1,3,1,3,2,1,3,2,1,4,...,2,1,2,0,1,2,3,3,3,3
3,1,1,2,1,1,2,1,2,2,2,...,2,2,2,2,2,2,2,2,2,2
4,1,3,1,3,2,2,3,2,1,2,...,2,2,2,0,1,2,3,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4163,1,3,1,3,2,1,3,1,1,4,...,2,1,1,0,1,2,3,3,3,3
4164,1,1,1,3,2,2,1,2,2,4,...,2,2,1,0,1,2,3,2,3,2
4165,1,3,1,3,2,2,3,2,1,4,...,2,2,2,0,1,2,2,2,2,2
4166,1,3,1,3,2,2,3,2,1,2,...,2,2,2,2,1,2,3,2,3,2


# Creating the Bayesian Network (BN)

In [7]:

def get_bn(df, method, score, alpha=0.05):
    """Learn a Bayesian network from ``df`` and return the fitted model.

    Runs three bnlearn steps in order: structure learning, an independence
    test with pruning enabled, and parameter learning on the pruned model.

    Args:
        df: data frame handed to every bnlearn call.
        method: forwarded to ``bn.structure_learning.fit`` as ``methodtype``.
        score: forwarded to ``bn.structure_learning.fit`` as ``scoretype``.
        alpha: significance level forwarded to ``bn.independence_test``.
            The default of 0.05 is the value that used to be hard-coded.

    Returns:
        Whatever ``bn.parameter_learning.fit`` returns for the pruned model.
    """
    model = bn.structure_learning.fit(df, methodtype=method, scoretype=score)

    # Compute edge strength using chi-square independence test
    pruned_model = bn.independence_test(model, df, alpha=alpha, prune=True)

    return bn.parameter_learning.fit(pruned_model, df)


In [11]:
# Fit the network with methodtype 'hc' and scoretype 'bic' (see get_bn).
param_model = get_bn(df, method='hc', score='bic')

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [bic]
[bnlearn] >Compute structure scores for model comparison (higher is better).
[bnlearn] >Compute edge strength with [chi_square]
[bnlearn] >Edge [mid/Childhood Environment and Influences <-> mid/Moral Values and Ethical Principles] [P=0.42328] is excluded because it was not significant (P<0.05) with [chi_square]
[bnlearn] >Parameter learning> Computing parameters using [bayes]
[bnlearn] >Converting [<class 'pgmpy.base.DAG.DAG'>] to BayesianNetwork model.
[bnlearn] >Converting adjmat to BayesianNetwork.


KeyboardInterrupt: 

# Get new values (EM): re-predict the mid/high columns from the remaining columns using the BN

In [None]:
for _ in range(1):

    # The sample is used only for its column names.
    Xtest = bn.sampling(param_model, 10)
    cols = Xtest.columns

    # Columns to re-predict, and the columns passed to bn.predict as input.
    variables = [col for col in cols if col.startswith(('mid', 'high'))]
    evidence_cols = [col for col in cols if col not in variables]

    for var in tqdm(variables):
        Pout = bn.predict(param_model, df[evidence_cols], variables=[var])
        # Overwrite the column in both frames with the predicted values.
        df[var] = Pout[var]
        raw_df[var] = Pout[var]

    # param_model = get_bn(df)


  0%|          | 0/26 [00:00<?, ?it/s]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 150.15it/s]
  4%|▍         | 1/26 [00:05<02:07,  5.08s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 151.17it/s]
  8%|▊         | 2/26 [00:10<02:01,  5.06s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 150.35it/s]
 12%|█▏        | 3/26 [00:15<01:56,  5.06s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 151.11it/s]
 15%|█▌        | 4/26 [00:20<01:51,  5.06s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 150.86it/s]
 19%|█▉        | 5/26 [00:25<01:46,  5.06s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 151.61it/s]
 23%|██▎       | 6/26 [00:30<01:40,  5.05s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 146.38it/s]
 27%|██▋       | 7/26 [00:35<01:36,  5.10s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 152.31it/s]
 31%|███       | 8/26 [00:40<01:31,  5.07s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 143.57it/s]
 35%|███▍      | 9/26 [00:45<01:27,  5.14s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 152.10it/s]
 38%|███▊      | 10/26 [00:50<01:21,  5.10s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 148.11it/s]
 42%|████▏     | 11/26 [00:56<01:16,  5.12s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 147.11it/s]
 46%|████▌     | 12/26 [01:01<01:11,  5.14s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:03<00:00, 195.15it/s]
 50%|█████     | 13/26 [01:05<01:01,  4.77s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 152.21it/s]
 54%|█████▍    | 14/26 [01:10<00:58,  4.84s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 156.08it/s]
 58%|█████▊    | 15/26 [01:14<00:53,  4.85s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 148.69it/s]
 62%|██████▏   | 16/26 [01:20<00:49,  4.94s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 152.65it/s]
 65%|██████▌   | 17/26 [01:25<00:44,  4.95s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 152.16it/s]
 69%|██████▉   | 18/26 [01:30<00:39,  4.97s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 151.31it/s]
 73%|███████▎  | 19/26 [01:35<00:34,  4.99s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 148.18it/s]
 77%|███████▋  | 20/26 [01:40<00:30,  5.04s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 145.93it/s]
 81%|████████  | 21/26 [01:45<00:25,  5.09s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 148.37it/s]
 85%|████████▍ | 22/26 [01:50<00:20,  5.11s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 152.36it/s]
 88%|████████▊ | 23/26 [01:55<00:15,  5.08s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 149.37it/s]
 92%|█████████▏| 24/26 [02:00<00:10,  5.08s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:04<00:00, 155.92it/s]
 96%|█████████▌| 25/26 [02:05<00:05,  5.03s/it]

[bnlearn]> Remaining columns for inference: 16


100%|██████████| 753/753 [00:05<00:00, 149.48it/s]
100%|██████████| 26/26 [02:10<00:00,  5.03s/it]


In [None]:
# Show raw_df after the prediction loop above has overwritten its mid/high columns.
raw_df

Unnamed: 0,user,high/Core Personal Values,high/Worldview and Belief System,high/Risk Attitudes and Personality Traits,high/Attitudes Towards Guns and Gun Culture,high/Cultural Background and Traditions,high/Risk Attitudes and Tolerance,high/Political Ideology on Gun Ownership and Regulation,high/Trust in Authorities and Institutions,high/Sense of Responsibility,...,low/Household Composition with Children,low/Financial and Economic Circumstances,low/Health and Lifestyle,low/Employment Status,low/Age and Life Experience,low/Geographic Risk Factors,low/Firearm-Related Recreational Activities,low/Occupational and Social Influences,low/Shooting Experience and Proficiency,low/Personal Experiences with Guns and Gun-Related Incidents
0,2820,1,3,1,3,2,1,3,2,1,...,2,2,2,1,1,2,3,3,3,3
1,3718,1,3,1,3,2,1,3,1,1,...,2,1,1,0,1,2,3,3,3,3
2,362,1,3,1,3,2,1,3,2,1,...,2,1,2,0,1,2,3,3,3,3
3,1758,1,1,2,1,1,2,1,2,1,...,2,2,2,2,2,2,2,2,2,2
4,124,1,3,1,3,1,2,3,2,1,...,2,2,2,0,1,2,3,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4163,4162,1,3,1,3,2,1,3,1,1,...,2,1,1,0,1,2,3,3,3,3
4164,4163,1,3,1,3,2,2,3,2,1,...,2,2,1,0,1,2,3,2,3,2
4165,4164,1,3,1,3,2,2,3,2,1,...,2,2,2,0,1,2,2,2,2,2
4166,4166,1,3,1,3,1,2,3,2,1,...,2,2,2,2,1,2,3,2,3,2


In [15]:
# Decode integer category indices back to their labels for inspection.
# The labels go into a separate frame: raw_df must keep its integer codes
# because the export step looks values up in idx2opt_mapping by index, and
# keeping raw_df untouched also makes this cell safe to re-run.
decoded_df = raw_df.copy()

for var in raw_df.columns:
    if '/' not in var:
        continue
    # Column names are "<level>/<persona name>"; split once so a persona
    # name that itself contains '/' is not truncated.
    p_name = var.split('/', 1)[1]
    if p_name not in idx2opt_mapping:
        continue
    mapping = idx2opt_mapping[p_name]
    try:
        decoded_df[var] = raw_df[var].apply(lambda x: mapping[x])
    except KeyError:
        # A value with no entry in the mapping: report the column and its
        # mapping, and leave that column as integer codes.
        print(var)
        print(mapping)

decoded_df

Unnamed: 0,user,high/Core Personal Values,high/Worldview and Belief System,high/Risk Attitudes and Personality Traits,high/Attitudes Towards Guns and Gun Culture,high/Cultural Background and Traditions,high/Risk Attitudes and Tolerance,high/Political Ideology on Gun Ownership and Regulation,high/Trust in Authorities and Institutions,high/Sense of Responsibility,...,low/Household Composition with Children,low/Financial and Economic Circumstances,low/Health and Lifestyle,low/Employment Status,low/Age and Life Experience,low/Geographic Risk Factors,low/Firearm-Related Recreational Activities,low/Occupational and Social Influences,low/Shooting Experience and Proficiency,low/Personal Experiences with Guns and Gun-Related Incidents
0,2820,values personal safety/security,moderate worldview,risk-averse,neutral,anti-gun culture,risk-averse,moderate,low trust,high responsibility,...,no children present,unstable finances,poor health,employed with job security,younger,low-risk area,non-participant,no influence,non-shooter,no experiences
1,3718,values personal safety/security,moderate worldview,risk-averse,neutral,anti-gun culture,risk-averse,moderate,high trust,high responsibility,...,no children present,stable finances,good health,Unknown,younger,low-risk area,non-participant,no influence,non-shooter,no experiences
2,362,values personal safety/security,moderate worldview,risk-averse,neutral,anti-gun culture,risk-averse,moderate,low trust,high responsibility,...,no children present,stable finances,poor health,Unknown,younger,low-risk area,non-participant,no influence,non-shooter,no experiences
3,1758,values personal safety/security,conservative worldview,risk-taking,pro-gun,pro-gun culture,risk-taking,conservative,low trust,high responsibility,...,no children present,unstable finances,poor health,employed without job security,older,low-risk area,occasional participant,moderate influence,novice shooter,negative experiences
4,124,values personal safety/security,moderate worldview,risk-averse,neutral,pro-gun culture,risk-taking,moderate,low trust,high responsibility,...,no children present,unstable finances,poor health,Unknown,younger,low-risk area,non-participant,moderate influence,novice shooter,negative experiences
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4163,4162,values personal safety/security,moderate worldview,risk-averse,neutral,anti-gun culture,risk-averse,moderate,high trust,high responsibility,...,no children present,stable finances,good health,Unknown,younger,low-risk area,non-participant,no influence,non-shooter,no experiences
4164,4163,values personal safety/security,moderate worldview,risk-averse,neutral,anti-gun culture,risk-taking,moderate,low trust,high responsibility,...,no children present,unstable finances,good health,Unknown,younger,low-risk area,non-participant,moderate influence,non-shooter,negative experiences
4165,4164,values personal safety/security,moderate worldview,risk-averse,neutral,anti-gun culture,risk-taking,moderate,low trust,high responsibility,...,no children present,unstable finances,poor health,Unknown,younger,low-risk area,occasional participant,moderate influence,novice shooter,negative experiences
4166,4166,values personal safety/security,moderate worldview,risk-averse,neutral,pro-gun culture,risk-taking,moderate,low trust,high responsibility,...,no children present,unstable finances,poor health,employed without job security,younger,low-risk area,non-participant,moderate influence,non-shooter,negative experiences


# Output: write the values in raw_df back into the personas and save them as JSON

In [None]:

# Write the values held in raw_df back into the persona records, as labels.
for _, row in raw_df.iterrows():
    user = str(row['user'])
    for p in data[user]:
        mapping = idx2opt_mapping[p['name']]
        val = row[f"{p['level']}/{p['name']}"]
        if val in mapping:
            # Integer category index -> label.
            new_val = mapping[val]
        elif val in mapping.values():
            # Already a label (raw_df was decoded before this cell ran).
            new_val = val
        else:
            raise KeyError(
                f"value {val!r} for persona {p['name']!r} of user {user} "
                "is neither a known index nor a known label"
            )
        p['inferred_value'] = new_val

# NOTE(review): the file name says "hcaic", but the model above was fitted
# with score='bic' - confirm which score this output belongs to.
new_persona_filename = 'opinions_qa/persona_val/American_Trends_Panel_W26/date0826_personas_full_sonnet_bn_hcaic.json'

with open(new_persona_filename, 'w') as f:
    json.dump(data, f, indent=4)

# Temp General: the whole pipeline in a single cell, run once per scoring method

In [1]:
import os
os.chdir("/home/ubuntu/code/pump_post_midterm/pump")

import json
import bnlearn as bn
import pandas as pd
import networkx as nx
from tqdm import tqdm


def clean(data):
    """Drop personas whose inferred value is not one of their candidate values.

    ``data`` (user -> list of persona dicts) is modified in place. A value
    wrapped in one quote character (``'`` or ``"``) on each side is unwrapped
    when the unwrapped form is a candidate; every other non-matching value,
    including non-string values such as ``None``, causes the persona to be
    removed.
    """
    quote_chars = ("'", '"')
    for user in data.keys():
        kept = []
        for p in data[user]:
            inf_val = p['inferred_value']
            cands = p['candidate_values']
            if inf_val in cands:
                kept.append(p)
                continue
            # Non-strings cannot carry wrapping quotes and would fail on slicing.
            if not isinstance(inf_val, str):
                continue
            stripped = inf_val[1:-1]
            if stripped in cands and inf_val[0] in quote_chars and inf_val[-1] in quote_chars:
                p['inferred_value'] = stripped
                kept.append(p)
        data[user] = kept


def categorize(data):
    """Replace each persona's inferred value with its integer index, in place.

    Indices come from the module-level ``opt2idx_mapping``; an unknown
    persona name or value raises ``KeyError``.
    """
    for user in data:
        for p in data[user]:
            p['inferred_value'] = opt2idx_mapping[p['name']][p['inferred_value']]
        data[user] = list(data[user])


def get_bn(df, method, score, alpha=0.05):
    """Learn a Bayesian network from ``df`` and return the fitted model.

    Runs structure learning (``methodtype=method``, ``scoretype=score``), an
    independence test with pruning at significance level ``alpha``, and
    parameter learning on the pruned model.
    """
    model = bn.structure_learning.fit(df, methodtype=method, scoretype=score)

    # Compute edge strength using chi-square independence test
    pruned_model = bn.independence_test(model, df, alpha=alpha, prune=True)

    return bn.parameter_learning.fit(pruned_model, df)


persona_filename = 'opinions_qa/persona_val/American_Trends_Panel_W26/date0729_midterm_personas_full_sonnet.json'

for score in tqdm(['k2']):

    # Reload for every score: the steps below overwrite the inferred values
    # stored in `data`.
    with open(persona_filename, 'r') as f:
        data = json.load(f)

    # clean data
    clean(data)

    # categorize_data
    idx2opt_mapping = {}
    opt2idx_mapping = {}

    # Scan every user, not only the first: a persona removed from the first
    # user's list by clean() would otherwise have no mapping at all.
    # NOTE(review): assumes all users share the same candidate list per
    # persona name - confirm against the data.
    for personas in data.values():
        for persona in personas:
            p_name = persona['name']
            if p_name in idx2opt_mapping:
                continue
            cands = persona['candidate_values']
            idx2opt = {k + 1: v for k, v in enumerate(cands)}  # 0 for potential NAs
            idx2opt[0] = "Unknown"
            idx2opt_mapping[p_name] = idx2opt
            opt2idx_mapping[p_name] = {v: k for k, v in idx2opt.items()}

    categorize(data)

    # One row per user; one column per persona, named "<level>/<name>".
    res = []
    for user in data.keys():
        entry = {'user': user}
        for persona in data[user]:
            entry[f"{persona['level']}/{persona['name']}"] = persona['inferred_value']
        res.append(entry)

    # Missing personas become NaN and are filled with 0 ("Unknown").
    raw_df = pd.DataFrame(res).fillna(0).astype(int)
    # Explicit copy: columns of `df` are assigned to below.
    df = raw_df[[col for col in raw_df.columns if col != "user"]].copy()

    param_model = get_bn(df, method='hc', score=score)

    for _ in range(1):
        # The sample is used only for its column names.
        Xtest = bn.sampling(param_model, 10)
        cols = Xtest.columns
        variables = [col for col in cols if col.startswith(('mid', 'high'))]
        evidence_cols = [col for col in cols if col not in variables]
        for var in tqdm(variables):
            Pout = bn.predict(param_model, df[evidence_cols], variables=[var])
            df[var] = Pout[var]
            raw_df[var] = Pout[var]

        # param_model = get_bn(df)

    # Write the (integer) values in raw_df back into the personas as labels.
    for _, row in raw_df.iterrows():
        user = str(row['user'])
        for p in data[user]:
            val = row[f"{p['level']}/{p['name']}"]
            p['inferred_value'] = idx2opt_mapping[p['name']][val]

    new_persona_filename = f'opinions_qa/persona_val/American_Trends_Panel_W26/date0826_personas_full_sonnet_bn_hc{score}.json'

    with open(new_persona_filename, 'w') as f:
        json.dump(data, f, indent=4)

  0%|          | 0/1 [00:00<?, ?it/s]

[bnlearn] >Computing best DAG using [hc]
[bnlearn] >Set scoring type at [k2]
