In [1]:
import pandas as pd
import os
from typing import List

Read the data into two lists, one per voting system. Each item in a list holds the data for one number of dimensions (2, 6, or 10).

In [2]:
# Sorted listing of the working directory, so files are always read in the same order.
files = sorted(os.listdir())
# Every file whose name contains the voting-system keyword is parsed as CSV.
limited = list(map(pd.read_csv, filter(lambda name: 'limited' in name, files)))
plurality = list(map(pd.read_csv, filter(lambda name: 'plurality' in name, files)))

Drop all run IDs that have only a single sample associated with them. Those runs are invalid (e.g. the number of winners exceeds the number of candidates).

In [3]:
def drop_single_sample_trials(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows whose run number occurs more than once in ``df``.

    Args:
        df: Frame with a ``'[run number]'`` column.

    Returns:
        The rows of ``df`` belonging to runs with at least two samples;
        the original row index is preserved.
    """
    run_ids = df['[run number]']
    samples_per_run = run_ids.value_counts()
    multi_sample_runs = samples_per_run[samples_per_run.gt(1)].index
    return df.loc[run_ids.isin(multi_sample_runs)]

Drop features that remain constant throughout all runs.

In [4]:
def drop_irrelevant_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the four fixed feature columns.

    Args:
        df: Frame containing the columns ``movement-speed``,
            ``max-utility-for-voting``, ``repulsion-distance`` and
            ``change-satisfaction``.

    Returns:
        A new frame with those columns removed.

    Raises:
        KeyError: If any of the listed columns is missing from ``df``.
    """
    return df.drop(
        columns=[
            'movement-speed',
            'max-utility-for-voting',
            'repulsion-distance',
            'change-satisfaction',
        ]
    )

In [5]:
def fix_run_id(df: pd.DataFrame) -> pd.DataFrame:
    """Renumber the run ids to consecutive integers starting at 0.

    Ids are assigned in order of first appearance in ``df``. The input frame
    is left untouched; earlier versions overwrote the column on the caller's
    frame, which silently changed data the caller still held.

    Args:
        df: Frame with a ``'[run number]'`` column.

    Returns:
        A new frame, identical to ``df`` except for the renumbered
        ``'[run number]'`` column (column order is preserved).
    """
    run_col = '[run number]'
    # old id -> new id, numbered by order of first occurrence
    new_ids = {old: new for new, old in enumerate(df[run_col].unique())}
    return df.assign(**{run_col: df[run_col].map(new_ids)})

Concatenate a list of dataframes, taking into account overlapping run IDs.

In [6]:
def concat_dfs(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Clean each frame and stack them with globally unique run ids.

    Every frame is passed through ``drop_single_sample_trials``,
    ``drop_irrelevant_features`` and ``fix_run_id``. Its run ids are then
    shifted so that they start right after the largest id used by the
    previous frames.

    The shift is a running offset rather than a multiple of the first
    frame's run count, so ids stay unique even when the frames contain
    different numbers of runs. When all frames have equally many runs the
    result is the same as with the fixed multiple.

    Args:
        dfs: Raw frames, each with a ``'[run number]'`` column.

    Returns:
        The concatenation of the cleaned frames (original row indices kept).

    Raises:
        ValueError: If ``dfs`` is empty (raised by ``pd.concat``).
    """
    run_col = '[run number]'
    shifted = []
    next_free_id = 0
    for raw in dfs:
        cleaned = fix_run_id(drop_irrelevant_features(drop_single_sample_trials(raw)))
        # An empty frame has no ids to shift and must not disturb the offset.
        if not cleaned.empty:
            cleaned = cleaned.assign(**{run_col: cleaned[run_col] + next_free_id})
            next_free_id = cleaned[run_col].max() + 1
        shifted.append(cleaned)
    return pd.concat(shifted)

In [7]:
# Collapse each list of per-dimension frames into a single frame.
# Both names are rebound in one tuple assignment: if either call fails,
# neither name has been overwritten yet and the cell can simply be re-run.
limited, plurality = concat_dfs(limited), concat_dfs(plurality)

In [8]:
# Move the plurality run ids so they start right after the largest limited id.
# Subtracting the current minimum first makes this cell safe to re-run: a plain
# `+=` would shift the ids again on every execution. On the first run the
# minimum is 0 (ids come out of concat_dfs numbered from 0), so the result is
# the same as a plain shift by max + 1.
plurality['[run number]'] = (
    plurality['[run number]']
    - plurality['[run number]'].min()
    + limited['[run number]'].max()
    + 1
)

Concatenate the resulting `limited` and `plurality` dataframes and save the result in Parquet format.

In [9]:
# Stack both voting systems into one frame with a fresh 0..n-1 row index.
all_data = pd.concat([limited, plurality], ignore_index=True)

In [10]:
# Discard the step-0 rows: the voter turnout is always 0 at that point.
all_data = all_data.loc[all_data['[step]'].ne(0)]

In [11]:
# Persist the per-step data (step-0 rows already removed) before the turnout is averaged.
all_data.to_parquet('all_data.parquet')

Take the average voter turnout per run ID.

In [12]:
# Replace every row's voter turnout with the mean turnout of the run it belongs to.
all_data = all_data.assign(**{
    'voter-turnout': all_data.groupby('[run number]')['voter-turnout'].transform('mean'),
})

In [13]:
# Remove the per-step columns, then drop the rows that became exact duplicates.
all_data = all_data.drop(columns=['average-satisfaction', '[step]']).drop_duplicates()

In [14]:
# Persist the frame holding the per-run averaged voter turnout, duplicates removed.
all_data.to_parquet('averaged_data.parquet')