In [103]:
import pandas as pd
import numpy as np

In [104]:
# Location of the survey CSV, relative to the notebook's working directory.
file_path = './../data/Empirical_investigation.csv'

# The first CSV column becomes the DataFrame index (index_col=0).
df = pd.read_csv(file_path, index_col=0, sep=',')


In [105]:
# Labels assigned to the six ranking columns.
rank_list = [
    'rank1', 'rank2', 'rank3',
    'rank4', 'rank5', 'rank6',
]
# The same labels without the final entry.
rank_list2 = rank_list[:-1]


def beautify_df(df_input):
    """Return a new frame holding only the ranking columns of ``df_input``.

    The index is moved back into a regular column first. That column (now in
    first position) and the last column are then dropped, and the columns
    that remain are relabelled with the module-level ``rank_list``.
    """
    trimmed = df_input.reset_index().iloc[:, 1:-1]
    trimmed.columns = rank_list
    return trimmed

# Rebinds `df` to the trimmed frame; the frame as loaded from the CSV is no longer reachable under this name.
df = beautify_df(df)

In [106]:
# Labels of the values that appear in the ranking columns.
CON = 'Convenience'
PRI = 'Privacy'
AUT = 'Autonomy'
DIV = 'Diversity'
NOB = 'No Bias'
TRA = 'Transparency'

# Labels of the ranking columns (one per ranked position).
rank_list = ['rank1', 'rank2', 'rank3', 'rank4', 'rank5', 'rank6']

values = {CON, PRI, AUT, DIV, NOB, TRA}
unwanted_values = {NOB}

def remove_unwanted_values(df_input, values=values, unwanted_values=unwanted_values, rank_list=rank_list):
    """Drop the unwanted values from every row and close the resulting gaps.

    Each row of ``df_input`` is turned into a list, the first occurrence of
    every unwanted value is removed from it, and the shortened rows are
    assembled into a new DataFrame.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame whose rows are processed.
    values : iterable
        All values that can occur in a row. Any iterable is accepted; it is
        converted to a set internally.
    unwanted_values : iterable
        Values to remove from each row. Any iterable is accepted; duplicates
        are ignored.
    rank_list : list
        Column labels; only the first ``len(values) - len(unwanted)`` entries
        are used for the result.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame without the unwanted values and the column labels it uses.

    Raises
    ------
    ValueError
        If a row does not contain one of the unwanted values (raised by
        ``list.remove``).
    """
    # Coerce to sets so that callers may pass lists or tuples as well; the
    # set difference below is only defined for set operands.
    unwanted = set(unwanted_values)

    wanted_rows = []
    for entry in df_input.values:
        row = entry.tolist()
        for value in unwanted:
            # Removes the first occurrence only; later entries shift left.
            row.remove(value)
        wanted_rows.append(row)

    shortened_rank_list = rank_list[:len(set(values) - unwanted)]
    return pd.DataFrame(wanted_rows, columns=shortened_rank_list), shortened_rank_list

In [107]:
# removing No Bias
# Note: this rebinds both `df` and the global `rank_list`, which from here on is the shortened label list.
df, rank_list = remove_unwanted_values(df)

In [108]:
# Second name for the label-valued frame. This is an alias of the same object, not a copy.
df_string = df

In [109]:
# Display only: rows ordered by the rank columns in turn. The result is not assigned, so `df` keeps its order.
df.sort_values(rank_list)

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
2,Autonomy,Diversity,Convenience,Privacy,Transparency
11,Convenience,Autonomy,Diversity,Privacy,Transparency
6,Convenience,Autonomy,Transparency,Diversity,Privacy
0,Convenience,Diversity,Privacy,Autonomy,Transparency
7,Convenience,Diversity,Transparency,Privacy,Autonomy
4,Diversity,Transparency,Privacy,Autonomy,Convenience
1,Privacy,Autonomy,Transparency,Convenience,Diversity
5,Privacy,Autonomy,Transparency,Diversity,Convenience
8,Privacy,Autonomy,Transparency,Diversity,Convenience
10,Privacy,Convenience,Diversity,Autonomy,Transparency


In [110]:
# Integer code assigned to each value label.
scale_mapper = {CON:1, PRI:2, AUT:3, DIV:4, NOB:5, TRA:6}

def apply_scale_mapper(df_input, scale_mapper=scale_mapper):
    """Return a new frame in which value labels are replaced by integer codes.

    Every column of ``df_input`` is processed, so the function works for any
    number of rank columns instead of exactly ``rank1`` .. ``rank5``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame whose cells hold the labels to translate.
    scale_mapper : dict
        Mapping from label to replacement. Cells whose value is not a key of
        the mapping are left as they are (``Series.replace`` semantics).

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and column labels as ``df_input``.
    """
    return pd.DataFrame(
        {column: df_input[column].replace(scale_mapper) for column in df_input.columns},
        # Keeps the row index intact even when there are no columns to map.
        index=df_input.index,
    )

In [111]:
# `df` now refers to the integer-coded frame; `df_string` still refers to the label-valued one.
df = apply_scale_mapper(df)

In [112]:
# Display only: the integer-coded rows ordered by the rank columns in turn.
df.sort_values(rank_list)

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
11,1,3,4,2,6
6,1,3,6,4,2
0,1,4,2,3,6
7,1,4,6,2,3
10,2,1,4,3,6
1,2,3,6,1,4
5,2,3,6,4,1
8,2,3,6,4,1
3,2,6,1,3,6
9,2,6,4,3,1


In [113]:
def gen_ppl_string_representation(df):
    """Map each row position of ``df`` to the string form of that row.

    Rows are read from ``df.values`` and numbered from 0 in iteration order;
    the frame's own index labels are not used. Each dictionary value is
    ``str(row)`` of the row array, for example ``'[1 4 2 3 6]'``, so the
    brackets and separating spaces are part of the string.
    """
    return {position: str(row) for position, row in enumerate(df.values)}

In [114]:
# One string per respondent, built from the integer-coded frame.
ppl_dict = gen_ppl_string_representation(df)

In [115]:
# Display only: inspect the generated strings.
ppl_dict

{0: '[1 4 2 3 6]',
 1: '[2 3 6 1 4]',
 2: '[3 4 1 2 6]',
 3: '[2 6 1 3 6]',
 4: '[4 6 2 3 1]',
 5: '[2 3 6 4 1]',
 6: '[1 3 6 4 2]',
 7: '[1 4 6 2 3]',
 8: '[2 3 6 4 1]',
 9: '[2 6 4 3 1]',
 10: '[2 1 4 3 6]',
 11: '[1 3 4 2 6]',
 12: '[2 6 4 3 1]',
 13: '[6 2 4 1 3]'}

In [116]:
from similarities import jaro_distance
from collections import defaultdict
def ppl_with_same_values(ppl_dict, jaccard_dist = 0.9):
    """Collect, for every person, the other people with a similar string.

    Every ordered pair of distinct keys in ``ppl_dict`` is scored with
    ``jaro_distance``; when the score is strictly greater than
    ``jaccard_dist`` the second key is appended to the first key's list.
    Keys without any match do not appear in the result.

    NOTE(review): the threshold parameter is called ``jaccard_dist`` although
    the score comes from ``jaro_distance``; the name is kept for callers.
    Presumably ``jaro_distance`` returns a similarity (higher means more
    alike) -- confirm against the ``similarities`` module.

    Returns a ``defaultdict(list)`` mapping key -> list of matching keys.
    """
    matches = defaultdict(list)
    for key_a, text_a in ppl_dict.items():
        for key_b, text_b in ppl_dict.items():
            if key_a != key_b and jaro_distance(text_a, text_b) > jaccard_dist:
                matches[key_a].append(key_b)
    return matches

In [117]:
# Display only: use a stricter threshold (0.95) than the function's default of 0.9.
ppl_with_same_values(ppl_dict, 0.95)

defaultdict(list,
            {0: [7, 10, 11],
             1: [5, 8],
             2: [11],
             4: [9, 12],
             5: [1, 8, 9, 12],
             6: [11],
             7: [0],
             8: [1, 5, 9, 12],
             9: [4, 5, 8, 12],
             10: [0],
             11: [0, 2, 6],
             12: [4, 5, 8, 9]})

In [118]:
# Hard-coded groups of respondent positions; they appear to be assembled by hand
# from the output of ppl_with_same_values(ppl_dict, 0.95) -- re-check them if the data changes.
# Position 11 is listed in both similar1 and similar3; positions 3 and 13 are in no group.
similar1 = [0, 7, 10, 11]
similar2 = [1, 5, 4, 8, 9, 12]
similar3 = [2, 6, 11]

In [119]:
# Pick the first group's rows by position from the label-valued frame, then display them.
persona1 = df_string.iloc[similar1, :]
persona1

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
0,Convenience,Diversity,Privacy,Autonomy,Transparency
7,Convenience,Diversity,Transparency,Privacy,Autonomy
10,Privacy,Convenience,Diversity,Autonomy,Transparency
11,Convenience,Autonomy,Diversity,Privacy,Transparency


In [120]:
# Pick the second group's rows by position from the label-valued frame, then display them.
persona2 = df_string.iloc[similar2, :]
persona2

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
1,Privacy,Autonomy,Transparency,Convenience,Diversity
5,Privacy,Autonomy,Transparency,Diversity,Convenience
4,Diversity,Transparency,Privacy,Autonomy,Convenience
8,Privacy,Autonomy,Transparency,Diversity,Convenience
9,Privacy,Transparency,Diversity,Autonomy,Convenience
12,Privacy,Transparency,Diversity,Autonomy,Convenience


In [121]:
# Pick the third group's rows by position from the label-valued frame, then display them.
persona3 = df_string.iloc[similar3, :]
persona3

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
2,Autonomy,Diversity,Convenience,Privacy,Transparency
6,Convenience,Autonomy,Transparency,Diversity,Privacy
11,Convenience,Autonomy,Diversity,Privacy,Transparency
