In [227]:
import pandas as pd
import numpy as np

from similarities import jaro_distance, jaro_winkler
from collections import defaultdict

In [228]:
file_path = './../data/Empirical_investigation.csv'

df = pd.read_csv(file_path, index_col=0, sep=',')


In [229]:
rank_list = ['rank1', 'rank2', 'rank3', 'rank4', 'rank5', 'rank6']
rank_list2 = rank_list[:len(rank_list)-1]

def beautify_df(df_input):
    df_cur = df_input.reset_index()
    # remove unnecessary columns
    df_cur = df_cur.iloc[:, 1: len(df_cur.columns)-1]

    df_cur.columns = rank_list
    return df_cur

df = beautify_df(df)

In [230]:
df = df.drop(3)

In [231]:
CON = 'Convenience'
PRI = 'Privacy'
AUT = 'Autonomy'
DIV = 'Diversity'
NOB = 'No Bias'
TRA = 'Transparency'

rank_list = ['rank1', 'rank2', 'rank3', 'rank4', 'rank5', 'rank6']

values = {CON, PRI, AUT, DIV, NOB, TRA}
# unwanted_values = {PRI, DIV, NOB, CON}
unwanted_values = {NOB}

def remove_unwanted_values(df_input, values=values, unwanted_values=unwanted_values, rank_list=rank_list):
    wanted_values = []
    for index, entry in enumerate(df_input.values):
        wanted_values.append(entry.tolist())
        for value in unwanted_values:
            wanted_values[index].remove(value)
    if unwanted_values:
        len_values = values - unwanted_values
    else:
        len_values = values

    shortened_rank_list = rank_list[:len(len_values)]
    return pd.DataFrame(wanted_values, columns=shortened_rank_list), shortened_rank_list

In [232]:
# removing No Bias
df, rank_list = remove_unwanted_values(df)

In [233]:
df_string = df

In [234]:
df.sort_values(rank_list)

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
2,Autonomy,Diversity,Convenience,Privacy,Transparency
10,Convenience,Autonomy,Diversity,Privacy,Transparency
5,Convenience,Autonomy,Transparency,Diversity,Privacy
0,Convenience,Diversity,Privacy,Autonomy,Transparency
6,Convenience,Diversity,Transparency,Privacy,Autonomy
3,Diversity,Transparency,Privacy,Autonomy,Convenience
1,Privacy,Autonomy,Transparency,Convenience,Diversity
4,Privacy,Autonomy,Transparency,Diversity,Convenience
7,Privacy,Autonomy,Transparency,Diversity,Convenience
9,Privacy,Convenience,Diversity,Autonomy,Transparency


In [235]:
scale_mapper = {CON:1, PRI:2, AUT:3, DIV:4, NOB:5, TRA:6}

def apply_scale_mapper(df_input, scale_mapper=scale_mapper):
    df_int = pd.DataFrame()
    for i in range(1, len(df_input.columns) + 1):
        df_int["rank"+str(i)] = df_input["rank"+str(i)].replace(scale_mapper)
    return df_int

In [236]:
df = apply_scale_mapper(df)

In [237]:
def gen_ppl_string_representation(df):
    ppl_dict = {}
    for i, entry in enumerate(df.values):
        ppl_dict[i] = ''.join(str(entry))
    return ppl_dict

In [238]:
ppl_dict = gen_ppl_string_representation(df)

In [239]:
def ppl_with_same_values(ppl_dict, winkler = True, threshold = 0.9):
    sim_ppl = defaultdict(list)
    for person1 in ppl_dict:
        for person2 in ppl_dict:
            if person1 == person2:
                continue
            if winkler:
                if jaro_winkler(ppl_dict[person1], ppl_dict[person2]) > threshold:
                    sim_ppl[person1].append(person2)
            else:
                if jaro_distance(ppl_dict[person1], ppl_dict[person2]) > threshold:
                    sim_ppl[person1].append(person2)

    return sim_ppl

In [249]:
ppl_with_same_values(ppl_dict, True, 0.95)

defaultdict(list,
            {0: [5, 6, 9, 10],
             1: [4, 7, 8, 9, 11],
             2: [10],
             3: [8, 11],
             4: [1, 7, 8, 11],
             5: [0, 10],
             6: [0],
             7: [1, 4, 8, 11],
             8: [1, 3, 4, 7, 11],
             9: [0, 1],
             10: [0, 2, 5],
             11: [1, 3, 4, 7, 8]})

In [262]:
similar1 = [0, 5, 6, 10]
similar2 = [1, 4, 7, 8, 11]
similar3 = [2, 3, 9, 12]

In [263]:
persona1 = df_string.iloc[similar1, :]
persona1

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
0,Convenience,Diversity,Privacy,Autonomy,Transparency
5,Convenience,Autonomy,Transparency,Diversity,Privacy
6,Convenience,Diversity,Transparency,Privacy,Autonomy
10,Convenience,Autonomy,Diversity,Privacy,Transparency


In [264]:
persona2 = df_string.iloc[similar2, :]
persona2

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
1,Privacy,Autonomy,Transparency,Convenience,Diversity
4,Privacy,Autonomy,Transparency,Diversity,Convenience
7,Privacy,Autonomy,Transparency,Diversity,Convenience
8,Privacy,Transparency,Diversity,Autonomy,Convenience
11,Privacy,Transparency,Diversity,Autonomy,Convenience


In [260]:
persona3 = df_string.iloc[similar3, :]
persona3

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
2,Autonomy,Diversity,Convenience,Privacy,Transparency
3,Diversity,Transparency,Privacy,Autonomy,Convenience
12,Transparency,Privacy,Diversity,Convenience,Autonomy


In [241]:
similar1 = [0, 7, 10, 11]
similar2 = [1, 5, 8, 9, 12]
similar3 = [2, 6, 11]

In [242]:
persona1 = df_string.iloc[similar1, :]
persona1

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
0,Convenience,Diversity,Privacy,Autonomy,Transparency
7,Privacy,Autonomy,Transparency,Diversity,Convenience
10,Convenience,Autonomy,Diversity,Privacy,Transparency
11,Privacy,Transparency,Diversity,Autonomy,Convenience


In [243]:
persona2 = df_string.iloc[similar2, :]
persona2

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
1,Privacy,Autonomy,Transparency,Convenience,Diversity
5,Convenience,Autonomy,Transparency,Diversity,Privacy
8,Privacy,Transparency,Diversity,Autonomy,Convenience
9,Privacy,Convenience,Diversity,Autonomy,Transparency
12,Transparency,Privacy,Diversity,Convenience,Autonomy


In [244]:
persona3 = df_string.iloc[similar3, :]
persona3

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
2,Autonomy,Diversity,Convenience,Privacy,Transparency
6,Convenience,Diversity,Transparency,Privacy,Autonomy
11,Privacy,Transparency,Diversity,Autonomy,Convenience


In [245]:
ppl_with_same_values(ppl_dict, True, 0.90)

defaultdict(list,
            {0: [5, 6, 9, 10],
             1: [4, 7, 8, 9, 11],
             2: [5, 10],
             3: [8, 11, 12],
             4: [1, 7, 8, 11],
             5: [0, 2, 10],
             6: [0],
             7: [1, 4, 8, 11],
             8: [1, 3, 4, 7, 9, 11, 12],
             9: [0, 1, 8, 11],
             10: [0, 2, 5],
             11: [1, 3, 4, 7, 8, 9, 12],
             12: [3, 8, 11]})

In [246]:
sim1 = [0, 6, 7, 11]
sim2 = [1, 3, 5, 8, 9, 10, 12, 13]

In [247]:
df_string.iloc[sim1, :]

Unnamed: 0,rank1,rank2,rank3,rank4,rank5
0,Convenience,Diversity,Privacy,Autonomy,Transparency
6,Convenience,Diversity,Transparency,Privacy,Autonomy
7,Privacy,Autonomy,Transparency,Diversity,Convenience
11,Privacy,Transparency,Diversity,Autonomy,Convenience


In [248]:
df_string.iloc[sim2, :]

IndexError: positional indexers are out-of-bounds

In [None]:
similar1 = [0, 3, 6, 7, 10, 11]
similar2 = [1, 4, 5, 8, 9, 10, 12, 13]
# similar3 = [2, 6, 11]

In [None]:
df_string.iloc[similar1, :]

In [None]:
df_string.iloc[similar2, :]

In [None]:
df_string