### Importings

In [1]:
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

# set your path
DATASET_PATH = './datasets'

In [2]:
df = pd.read_sas(DATASET_PATH + '/ess6e02_4.sas7bdat', encoding = 'latin-1')
df_efa = pd.read_csv('EFA_2.csv')




### Filter columns

In [3]:
cols = ['cntry', 'sclmeet', 'inprdsc', 'aesfdrk', 'health', 'hlthhmp', 'sclact', 'crmvct',
        'fltsd', 'fltdpr', 'fltanx', 'flteeff', 'slprl', 'cldgng', 'fltlnl',
        'tmimdng', 'tmabdng', 'tmendng', 'flapppl', 'lchshcp',
        'stflife', 'happy', 'enjlf', 'wrhpp', 'fltpcfl', 'enrglot',
        'dclvlf', 'accdng', 'dngval', 'optftr', 'lotsgot', 'pstvms', 
        'flclpla', 'pplahlp', 'rehlppl', 'trstprl', 'trstlgl', 'trstplc', 'trstplt', 'trstprt', 'trstep', 'trstun', 
        'ppltrst', 'pplfair', 'pplhlp','deaimpp', 'flrms', 'sedirlf', 'wrbknrm', 'uempla','uempli']

df = df[cols]

### Dataprep

#### Summarizing similar well-beings based on EFA

In [4]:
# dropping columns that have different measures
df_efa.loc[df_efa['Item codes'].isin(['enjlf', 'wrhpp', 'fltpcfl', 'enrglot', 'pplfair', 'flclpla']), 'ESS Items'].tolist()

['Enjoyed life, how often past week',
 'Were happy, how often past week',
 'Felt calm and peaceful, how often past week',
 'Had lot of energy, how often past week',
 'Most people try to take advantage of you, or try to be fair',
 'Feel close to the people in local area']

In [5]:
df_efa = df_efa[~df_efa['Item codes'].isin(['enjlf', 'wrhpp', 'fltpcfl', 'enrglot', 'pplfair', 'flclpla'])] # dropping columns 

In [6]:
# creating dictionary to assign each factor to respective item code

efa_item_codes = dict()
for factor in df_efa.columns[2:]:
    efa_item_codes.update({factor: df_efa.loc[~df_efa[factor].isna(), 'Item codes'].tolist()})

In [7]:
def summarize_factors(df, factor, factor_name, greater_than, fill_values, efa_item_codes=efa_item_codes, df_efa=df_efa):
    '''Summarize variables of certain factor
    '''
    print(f'{factor} was named {factor_name} and consists on: \n')
    print('\n'.join(df_efa.loc[~df_efa[factor].isna(), 'ESS Items'].tolist()))
    
    df_factor = df[efa_item_codes[factor]] # filter only variables of the factor
    df_factor[df_factor > greater_than] = np.NaN
    df[factor_name] = df_factor.mean(axis=1).fillna(fill_values).astype(int) # summarize by the mean, desconsidering useless values
    return df

In [8]:
df = summarize_factors(df, 'Factor 1', 'Sadness', 4, 9)

Factor 1 was named Sadness and consists on: 

Felt sad, how often past week
Felt depressed, how often past week
Felt anxious, how often past week
Felt everything did as effort, how often past week
Sleep was restless, how often past week
Could not get going, how often past week
Felt lonely, how often past week


In [9]:
df = summarize_factors(df, 'Factor 2', 'Enthusiasm', 10, 99)

Factor 2 was named Enthusiasm and consists on: 

Interested in what you are doing, how much of the time
Absorbed in what you are doing, how much of the time
Enthusiastic about what you are doing, how much of the time
Feel appreciated by people you are close to


In [10]:
df = summarize_factors(df, 'Factor 3', 'Happiness', 10, 99)

Factor 3 was named Happiness and consists on: 

How satisfied with life as a whole
How happy are you


In [11]:
df = summarize_factors(df, 'Factor 4', 'Optimism', 5, 9)

Factor 4 was named Optimism and consists on: 

Free to decide how to live my life
Feel accomplishment from what I do
Feel what I do in life is valuable and worthwhile
Always optimistic about my future
There are lots of things I am good at
In general feel very positive about myself


In [12]:
df = summarize_factors(df, 'Factor 5', 'Trust', 10, 99)

Factor 5 was named Trust and consists on: 

Most people can be trusted or you can't be too careful
Most of the time people helpful or mostly looking out for themselves


In [13]:
df = summarize_factors(df, 'Factor 6', 'Helpful', 6, 9)

Factor 6 was named Helpful and consists on: 

Feel people in local area help one another
Receive help and support from people you are close to


#### Assigning values

In [14]:
df['cntry'] = df['cntry'].map({'AL':'Albania','BE':'Belgium','BG':'Bulgaria','CH':'Switzerland','CY':'Cyprus','CZ':'Czechia','DE':'Germany','DK':'Denmark','EE':'Estonia','ES':'Spain','FI':'Finland','FR':'France','GB':'United Kingdom','HU':'Hungary','IE':'Ireland','IL':'Israel','IS':'Iceland','IT':'Italy','LT':'Lithuania','NL':'Netherlands','NO':'Norway','PL':'Poland','PT':'Portugal','RU':'Russian Federation','SE':'Sweden','SI':'Slovenia','SK':'Slovakia','UA':'Ukraine','XK':'Kosovo'})

In [15]:
df['sclmeet'] = df['sclmeet'].map({1:'Never',2:'Less than once a month',3:'Once a month',4:'Several times a month',5:'Once a week',6:'Several times a week',7:'Every day',77:'Refusal',88:'Dont know',99:'No answer'})
df['inprdsc'] = df['inprdsc'].map({0:'None',1:'1',2:'2',3:'3',4:'4-6',5:'7-9',6:'10 or more',77:'Refusal',88:'Dont know',99:'No answer'})
df['aesfdrk'] = df['aesfdrk'].map({1:'Very safe',2:'Safe',3:'Unsafe',4:'Very unsafe',7:'Refusal',8:'Dont know',9:'No answer'})
df['health'] = df['health'].map({1:'Very good',2:'Good',3:'Fair',4:'Bad',5:'Very bad',7:'Refusal',8:'Dont know',9:'No answer'})
df['hlthhmp'] = df['hlthhmp'].map({1:'Yes a lot',2:'Yes to some extent',3:'No',7:'Refusal',8:'Dont know',9:'No answer'})

In [16]:
for col in ['sclact', 'crmvct']:
    df[col] = df[col].map({1:'Much less than most',2:'Less than most',3:'About the same',
                           4:'More than most',5:'Much more than most',7:'Refusal',8:'Dont know',9:'No answer'})

In [17]:
for col in ['fltsd', 'fltdpr', 'fltanx', 'flteeff', 'slprl', 'cldgng', 'fltlnl',
            'enjlf', 'wrhpp', 'fltpcfl', 'enrglot', 'Sadness']:
    df[col] = df[col].map({1:'None or almost none of the time',2:'Some of the time',3:'Most of the time',
                           4:'All or almost all of the time',7:'Refusal',8:'Dont know',9:'No answer'})

In [18]:
for col in ['dclvlf', 'accdng', 'dngval', 'optftr', 'lotsgot', 'pstvms', 'flclpla', 'lchshcp', 'flrms', 'wrbknrm', 'Optimism']:
    df[col] = df[col].map({1:'Agree strongly',2:'Agree',3:'Neither agree nor disagree',4:'Disagree',
                           5:'Disagree strongly',7:'Refusal',8:'Dont know',9:'No answer'})

In [19]:
for col in df.select_dtypes(include='float'):
    df[col] = df[col].astype('int16')

In [20]:
df.head()

Unnamed: 0,cntry,sclmeet,inprdsc,aesfdrk,health,hlthhmp,sclact,crmvct,fltsd,fltdpr,fltanx,flteeff,slprl,cldgng,fltlnl,tmimdng,tmabdng,tmendng,flapppl,lchshcp,stflife,happy,enjlf,wrhpp,fltpcfl,enrglot,dclvlf,accdng,dngval,optftr,lotsgot,pstvms,flclpla,pplahlp,rehlppl,trstprl,trstlgl,trstplc,trstplt,trstprt,trstep,trstun,ppltrst,pplfair,pplhlp,deaimpp,flrms,sedirlf,wrbknrm,uempla,uempli,Sadness,Enthusiasm,Happiness,Optimism,Trust,Helpful
0,Albania,Once a week,3,Very safe,Fair,No,About the same,Less than most,Some of the time,Most of the time,Some of the time,Some of the time,All or almost all of the time,,None or almost none of the time,8,8,8,10,Agree,6,8,Some of the time,Most of the time,Most of the time,Most of the time,Agree strongly,Agree,Agree,Agree strongly,Disagree,Agree,Agree,1,6,0,0,2,0,0,2,2,5,3,0,5,Disagree,3,Disagree,0,0,Some of the time,8,7,Agree,2,3
1,Albania,Once a month,1,Unsafe,Good,No,Less than most,Less than most,All or almost all of the time,All or almost all of the time,All or almost all of the time,Some of the time,Most of the time,,All or almost all of the time,10,5,0,10,Agree strongly,0,0,None or almost none of the time,None or almost none of the time,Some of the time,None or almost none of the time,Agree strongly,Agree,Agree strongly,Agree,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,0,6,0,0,10,8,10,10,10,0,88,0,0,Agree,10,Agree,1,0,Most of the time,6,0,Agree strongly,0,3
2,Albania,Every day,10 or more,Unsafe,Very good,No,About the same,Much less than most,Some of the time,Some of the time,None or almost none of the time,Most of the time,Some of the time,,Some of the time,10,10,10,3,Agree,5,5,Most of the time,Some of the time,Some of the time,Most of the time,Agree strongly,Agree,Agree strongly,Agree strongly,Agree strongly,Agree,Disagree,0,6,88,2,0,0,0,8,5,5,5,4,5,Disagree strongly,10,Disagree strongly,0,0,Some of the time,8,5,Agree strongly,4,3
3,Albania,Several times a week,2,Very safe,Fair,No,About the same,Less than most,Most of the time,Some of the time,Some of the time,Most of the time,Most of the time,,Most of the time,88,88,88,88,Agree strongly,1,4,Some of the time,Some of the time,Most of the time,Some of the time,Agree,Agree,Dont know,Agree,Dont know,Agree,Dont know,8,8,1,7,0,1,1,10,8,10,5,2,88,Agree strongly,88,Agree strongly,0,0,Some of the time,99,2,Agree,6,9
4,Albania,Several times a month,,Safe,Fair,No,Less than most,Less than most,Most of the time,Some of the time,Some of the time,Most of the time,All or almost all of the time,,Some of the time,10,88,88,8,Neither agree nor disagree,6,7,Dont know,Some of the time,None or almost none of the time,None or almost none of the time,Agree,Neither agree nor disagree,Agree,Disagree,Agree strongly,Neither agree nor disagree,Agree,4,6,8,6,7,8,7,9,10,5,6,7,88,Neither agree nor disagree,10,Agree,0,0,Some of the time,9,6,Agree,6,5


### Exporting

In [21]:
df.to_csv(DATASET_PATH + '/df_abt.csv', index=False)

In [22]:
df_trstprl_happiness = df[["trstprl", "Happiness"]]
df_trstprl_happiness.to_csv(DATASET_PATH + '/df_trstprl_happiness.csv', index=False)

In [23]:
DATASET_PATH

'./datasets'

In [24]:
df_h_t = df[["Happiness", "trstprl"]]
df_h_t.shape
# .apply(pd.Series.value_counts)

(54673, 2)

In [25]:
df_h_t = df_h_t[df_h_t["Happiness"] < 11]
df_h_t = df_h_t[df_h_t["trstprl"] < 11]

In [26]:
df_h_t

Unnamed: 0,Happiness,trstprl
0,7,0
1,0,0
3,2,1
4,6,8
5,7,4
...,...,...
54667,6,4
54668,5,3
54670,6,5
54671,7,3


In [27]:
crosstab = pd.crosstab(df_h_t.Happiness, df_h_t.trstprl)
crosstab 

trstprl,0,1,2,3,4,5,6,7,8,9,10
Happiness,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,321,35,31,24,15,28,4,7,2,2,10
1,322,93,73,52,31,48,14,8,9,6,7
2,485,204,165,142,76,104,34,21,27,7,14
3,616,284,345,289,169,233,70,63,42,10,11
4,720,388,473,530,335,391,148,119,87,16,18
5,1252,553,751,813,637,954,342,239,142,34,55
6,1145,611,786,983,781,1142,575,467,262,72,76
7,1183,632,1000,1306,1208,1765,1079,1052,654,168,98
8,1098,498,904,1296,1264,2170,1553,1710,1129,333,193
9,622,319,434,646,666,1233,901,1109,850,285,172


In [28]:
rows = []
for i in range(11):
    for k in range(11):
        new = {"Happiness": i, "trstprl": k, "value": crosstab.iloc[i,k]}
        rows.append(new)
counts = pd.DataFrame(rows)

In [29]:
counts.to_csv("./datasets/happiness_trstprl_count.csv")

In [30]:
counts

Unnamed: 0,Happiness,trstprl,value
0,0,0,321
1,0,1,35
2,0,2,31
3,0,3,24
4,0,4,15
...,...,...,...
116,10,6,257
117,10,7,356
118,10,8,319
119,10,9,110


In [31]:
df.head()

Unnamed: 0,cntry,sclmeet,inprdsc,aesfdrk,health,hlthhmp,sclact,crmvct,fltsd,fltdpr,fltanx,flteeff,slprl,cldgng,fltlnl,tmimdng,tmabdng,tmendng,flapppl,lchshcp,stflife,happy,enjlf,wrhpp,fltpcfl,enrglot,dclvlf,accdng,dngval,optftr,lotsgot,pstvms,flclpla,pplahlp,rehlppl,trstprl,trstlgl,trstplc,trstplt,trstprt,trstep,trstun,ppltrst,pplfair,pplhlp,deaimpp,flrms,sedirlf,wrbknrm,uempla,uempli,Sadness,Enthusiasm,Happiness,Optimism,Trust,Helpful
0,Albania,Once a week,3,Very safe,Fair,No,About the same,Less than most,Some of the time,Most of the time,Some of the time,Some of the time,All or almost all of the time,,None or almost none of the time,8,8,8,10,Agree,6,8,Some of the time,Most of the time,Most of the time,Most of the time,Agree strongly,Agree,Agree,Agree strongly,Disagree,Agree,Agree,1,6,0,0,2,0,0,2,2,5,3,0,5,Disagree,3,Disagree,0,0,Some of the time,8,7,Agree,2,3
1,Albania,Once a month,1,Unsafe,Good,No,Less than most,Less than most,All or almost all of the time,All or almost all of the time,All or almost all of the time,Some of the time,Most of the time,,All or almost all of the time,10,5,0,10,Agree strongly,0,0,None or almost none of the time,None or almost none of the time,Some of the time,None or almost none of the time,Agree strongly,Agree,Agree strongly,Agree,Agree strongly,Neither agree nor disagree,Neither agree nor disagree,0,6,0,0,10,8,10,10,10,0,88,0,0,Agree,10,Agree,1,0,Most of the time,6,0,Agree strongly,0,3
2,Albania,Every day,10 or more,Unsafe,Very good,No,About the same,Much less than most,Some of the time,Some of the time,None or almost none of the time,Most of the time,Some of the time,,Some of the time,10,10,10,3,Agree,5,5,Most of the time,Some of the time,Some of the time,Most of the time,Agree strongly,Agree,Agree strongly,Agree strongly,Agree strongly,Agree,Disagree,0,6,88,2,0,0,0,8,5,5,5,4,5,Disagree strongly,10,Disagree strongly,0,0,Some of the time,8,5,Agree strongly,4,3
3,Albania,Several times a week,2,Very safe,Fair,No,About the same,Less than most,Most of the time,Some of the time,Some of the time,Most of the time,Most of the time,,Most of the time,88,88,88,88,Agree strongly,1,4,Some of the time,Some of the time,Most of the time,Some of the time,Agree,Agree,Dont know,Agree,Dont know,Agree,Dont know,8,8,1,7,0,1,1,10,8,10,5,2,88,Agree strongly,88,Agree strongly,0,0,Some of the time,99,2,Agree,6,9
4,Albania,Several times a month,,Safe,Fair,No,Less than most,Less than most,Most of the time,Some of the time,Some of the time,Most of the time,All or almost all of the time,,Some of the time,10,88,88,8,Neither agree nor disagree,6,7,Dont know,Some of the time,None or almost none of the time,None or almost none of the time,Agree,Neither agree nor disagree,Agree,Disagree,Agree strongly,Neither agree nor disagree,Agree,4,6,8,6,7,8,7,9,10,5,6,7,88,Neither agree nor disagree,10,Agree,0,0,Some of the time,9,6,Agree,6,5


## Create dataset for arc plot

In [75]:
group_mapping = {
    "trstplt": 0,
    "trstprl": 1,
    "trstlgl": 2,
    "trstplc": 3,
    "Happiness": 4,
    "trstprt": 5,
    "trstep": 6,
    "trstun": 7
}

In [76]:
ess = {}
ess["nodes"] = []
for k, v in group_mapping.items():
    for i in range(11):
        ess["nodes"].append({"name": f"{k}_{i}", "group": v, "index": v*11+i})

In [77]:
columns_to_filter = group_mapping.keys()
cleaned_df = df.copy()
for col in columns_to_filter:
    cleaned_df = cleaned_df[cleaned_df[col] < 11]

In [78]:
from collections import defaultdict

links = defaultdict(int)
trst_cols = [col for col in columns_to_filter if col != "Happiness"]
for i, row in cleaned_df.iterrows():
    for col in trst_cols:
        links[f"{col}_{row[col]}_{row['Happiness']}"] += 1 

In [79]:
node_to_index_mapping = {node["name"]: node["index"] for node in ess["nodes"]}

In [80]:
happiness_val_index_mapping = {f"Happiness_{i}": 44+i for i in range(11)}

In [81]:
import random
# format links for vega
formatted_links = []
for k, v in links.items():
    trst_col, trust_val, happiness_val = k.split("_")
    formatted_links.append({'source': node_to_index_mapping[f"{trst_col}_{trust_val}"], 
                            'target': node_to_index_mapping[f"Happiness_{happiness_val}"], 
                            'value': v/100})

ess["links"] = formatted_links

In [82]:
with open("arc_plot_trst_vs_happiness.json", "w+") as f:
    json.dump(ess, f)

## Create dataset for edge bundling

In [39]:
ess_network = {}
ess_network["nodes"] = []
for k, v in group_mapping.items():
    for i in range(11):
        ess_network["nodes"].append({"name": f"{k}_{i}", "group": v, "index": v*11+i})

In [40]:
from collections import defaultdict

links = defaultdict(int)
trst_cols = [col for col in columns_to_filter if col != "Happiness"]
for _, row in cleaned_df.iterrows():
    for i, col_i in enumerate(columns_to_filter):
        for j, col_j in enumerate(columns_to_filter):
            if j > i:
                links[f"{col_i}_{row[col_i]}__{col_j}_{row[col_j]}"] += 1 

In [41]:
formatted_links = []
for k, v in links.items():
    if k.startswith("Happiness"):
        target, source = k.split("__")
    else:
        source, target = k.split("__")
    formatted_links.append({'source': node_to_index_mapping[source], 'target': node_to_index_mapping[target], 'value': v})

ess_network["links"] = formatted_links

In [42]:
with open("network_plot_trst_vs_happiness.json", "w+") as f:
    json.dump(ess_network, f)

## Create dataset for happiness only

In [43]:
ess_network = {}
ess_network["nodes"] = []
for k, v in group_mapping.items():
    for i in range(11):
        ess_network["nodes"].append({"name": f"{k}_{i}", "group": v, "index": v*11+i})

In [44]:
from collections import defaultdict

links = defaultdict(int)
trst_cols = [col for col in columns_to_filter if col != "Happiness"]
for _, row in cleaned_df.iterrows():
    for i, col_i in enumerate(columns_to_filter):
        for j, col_j in enumerate(columns_to_filter):
            if j > i:
                links[f"{col_i}_{row[col_i]}__{col_j}_{row[col_j]}"] += 1 

for col in columns_to_filter:
    for i in range(11):
        for j in range(11):
            if j > i: 
                links[f"{col}_{i}__{col}_{j}"] = (10 - (j - i)) - 5

In [45]:
formatted_links = []
for k, v in links.items():
    if k.startswith("Happiness"):
        target, source = k.split("__")
    else:
        source, target = k.split("__")
    
    if v > 150 or (source == target ):  # This reveals some interpretation
        formatted_links.append({'source': node_to_index_mapping[source], 'target': node_to_index_mapping[target], 'value': v})

ess_network["links"] = formatted_links

with open("network_plot_trst_to_only_trstprl.json", "w+") as f:
    json.dump(ess_network, f)

In [46]:
group_mapping

{'Happiness': 4,
 'trstprl': 1,
 'trstlgl': 2,
 'trstplc': 3,
 'trstplt': 0,
 'trstprt': 5,
 'trstep': 6,
 'trstun': 7}