2 main distributions:

number of voters per cluster: e.g. (1000,1000,750,500,200,200,100,100,100)

number of votes per project: e.g. (7000,6500,6000,4500,4000,3000,2000,2000,1500)

2 sub distributions:

number of projects in a cluster as a function of cluster size

number of approvals per voter: based on first two distributions



In [2]:
import numpy as np
from numpy import random
import copy 
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import umap
from collections import Counter

import statistics
from pylab import rcParams
from random import sample

In [595]:
class data_generator():
    """Generate synthetic approval ballots, grouped into voter clusters.

    Clusters are generated from the largest to the smallest. For every
    cluster a set of projects is drawn (weighted by the votes still
    outstanding per project), every voter gets a number of approvals, and
    each voter's ballot is sampled without replacement from a noisy version
    of the overall project distribution. After each cluster the outstanding
    votes per project are reduced by the votes that cluster cast, so later
    clusters favour projects that still need votes.

    Args:
        voters_per_cluster: Number of voters in each cluster (any order; it
            is sorted in descending order internally).
        votes_per_project: Target number of votes for each project.
        cluster_independence: Standard deviation of the normal draw for the
            number of projects in a cluster. The lower it is, the more the
            number of projects depends on the cluster size.
        spread_of_approvals: Spread of the number of approvals per voter
            within a cluster (used both as offset from the cluster's project
            count and as standard deviation).
        adcavpd: Mean of the noise added to the overall project distribution
            to obtain a cluster's project distribution.
        sdcavpd: Standard deviation of that noise.
        seed: Optional seed for the internal random generator; ``None``
            (default) gives a non-reproducible generator.
        verbose: When true (default), print the average approvals per voter
            and the outstanding votes before each cluster is generated.

    Raises:
        ValueError: If ``voters_per_cluster`` or ``votes_per_project`` is
            empty.
    """

    # Scales the average approvals per voter into the expected number of
    # projects of the largest cluster.
    _PROJECTS_PER_APPROVAL = 2.2

    def __init__(self, voters_per_cluster, votes_per_project, cluster_independence=3, spread_of_approvals=1.5, adcavpd=0.15, sdcavpd=0.08, seed=None, verbose=True):
        self.voters_per_cluster = np.array(sorted(voters_per_cluster, reverse=True))
        self.votes_per_project = np.array(votes_per_project)
        if self.voters_per_cluster.size == 0 or self.votes_per_project.size == 0:
            raise ValueError("voters_per_cluster and votes_per_project must be non-empty")

        self.cluster_independence = cluster_independence
        self.spread_of_approvals = spread_of_approvals
        self.avg_diff_cluster_all_voters_project_dist = adcavpd
        self.std_diff_cluster_all_voters_project_dist = sdcavpd
        self.verbose = verbose

        self.max_voters_per_cluster = self.voters_per_cluster.max()
        self.n_projects = len(self.votes_per_project)
        self.n_clusters = len(self.voters_per_cluster)
        # Single source of randomness so that `seed` makes runs reproducible.
        self.rng = random.default_rng(seed)

    @property
    def voters_per_cluster_dist(self):
        """Share of all (remaining) voters that belongs to each cluster."""
        return self.voters_per_cluster / self.voters_per_cluster.sum()

    @property
    def votes_per_project_dist(self):
        """Share of all outstanding votes per project.

        Falls back to a uniform distribution when no votes are outstanding,
        so the result can always be used as sampling probabilities.
        """
        total = self.votes_per_project.sum()
        if total <= 0:
            return np.full(self.n_projects, 1.0 / self.n_projects)
        return self.votes_per_project / total

    @property
    def avg_approvals_per_voter(self):
        """Outstanding votes divided by remaining voters (0.0 without voters)."""
        n_voters = self.voters_per_cluster.sum()
        if n_voters == 0:
            return 0.0
        return self.votes_per_project.sum() / n_voters

    def make_clusters(self):
        """Generate the ballots of every cluster.

        Returns:
            A list with one entry per cluster (largest cluster first); each
            entry is a list with one array of approved project indices per
            voter.

        The outstanding votes and remaining voters are updated while the
        clusters are generated and restored afterwards, so the generator can
        be called repeatedly.
        """
        initial_voters = self.voters_per_cluster.copy()
        initial_votes = self.votes_per_project.copy()
        clusters = []
        try:
            for cluster_size in initial_voters:
                if self.verbose:
                    print(self.avg_approvals_per_voter)
                    print(self.votes_per_project)

                npic = self.n_projects_in_cluster(cluster_size)
                project_set = self.generate_project_set(npic)
                approvals = self.approvals_per_voter(npic, cluster_size)
                project_dist = self.gen_cluster_project_dist(project_set)
                ballots = self.gen_cluster_ballot(project_set, approvals, project_dist)

                # Subtract the votes this cluster cast from the outstanding
                # votes (never below zero) and drop the cluster just handled.
                if ballots:
                    cast = np.bincount(np.concatenate(ballots), minlength=self.n_projects)
                    self.votes_per_project = np.maximum(self.votes_per_project - cast, 0)
                self.voters_per_cluster = self.voters_per_cluster[1:]

                clusters.append(ballots)
        finally:
            self.voters_per_cluster = initial_voters
            self.votes_per_project = initial_votes
        return clusters

    def gen_cluster_ballot(self, project_set, approvals_per_voter, cluster_project_dist):
        """Sample one ballot per voter from ``project_set``.

        Each ballot is drawn without replacement using
        ``cluster_project_dist`` as probabilities. A voter's number of
        approvals is capped at the number of projects with non-zero
        probability, because sampling without replacement cannot return more
        than that.
        """
        max_approvals = int(np.count_nonzero(cluster_project_dist))
        return [
            self.rng.choice(
                project_set,
                min(int(n_approvals), max_approvals),
                p=cluster_project_dist,
                replace=False,
                shuffle=False,
            )
            for n_approvals in approvals_per_voter
        ]

    def gen_cluster_project_dist(self, project_set):
        """Return the cluster's sampling distribution over ``project_set``.

        The overall project distribution restricted to ``project_set`` is
        perturbed with normal noise, clipped at zero and renormalised. If
        every weight is clipped to zero a uniform distribution is returned.
        """
        noise = self.rng.normal(
            self.avg_diff_cluster_all_voters_project_dist,
            self.std_diff_cluster_all_voters_project_dist,
            len(project_set),
        )
        weights = np.maximum(self.votes_per_project_dist[project_set] + noise, 0)
        total = weights.sum()
        if total <= 0:
            return np.full(len(project_set), 1.0 / len(project_set))
        return weights / total

    def n_projects_in_cluster(self, cluster_size):
        """Draw the number of projects for a cluster of ``cluster_size`` voters.

        The lower ``cluster_independence`` is, the more the number of
        projects depends on the cluster size. The result is an ``int``
        between 1 and the number of projects that can still be drawn (those
        with non-zero probability in ``votes_per_project_dist``).
        """
        expected = (
            self.avg_approvals_per_voter
            * self._PROJECTS_PER_APPROVAL
            * (cluster_size / self.max_voters_per_cluster)
        )
        drawable = int(np.count_nonzero(self.votes_per_project_dist))
        # Scalar draw: converting a 1-element array with int() is deprecated.
        draw = self.rng.normal(expected, self.cluster_independence)
        return int(np.clip(draw, 1, drawable))

    def generate_project_set(self, n_projects_in_cluster):
        """Draw distinct project indices, weighted by outstanding votes."""
        return self.rng.choice(
            self.n_projects,
            n_projects_in_cluster,
            p=self.votes_per_project_dist,
            replace=False,
            shuffle=False,
        )

    def approvals_per_voter(self, n_projects_in_cluster, cluster_size):
        """Draw the number of approvals for every voter of a cluster.

        ``spread_of_approvals`` determines the spread of approvals per voter
        within a cluster. Values are clipped to
        ``[1, n_projects_in_cluster]`` and truncated to integers.
        """
        draws = self.rng.normal(
            n_projects_in_cluster - self.spread_of_approvals,
            self.spread_of_approvals,
            cluster_size,
        )
        return np.clip(draws, 1, n_projects_in_cluster).astype(int)

    def __call__(self):
        """Shorthand for :meth:`make_clusters`."""
        return self.make_clusters()

In [597]:
# Demo input: cluster sizes 1000, 900, ..., 200 and per-project vote targets
# 5000, 4700, ..., 1700.
VOTERS_PER_CLUSTER = [*range(1000, 100, -100)]
VOTES_PER_PROJECT = [*range(5000, 1500, -300)]

tmp = data_generator(VOTERS_PER_CLUSTER, VOTES_PER_PROJECT)()

# Tally how often each project was approved over all ballots of all clusters,
# then show the tally next to the requested votes per project.
approval_counts = Counter(
    project
    for cluster in tmp
    for ballot in cluster
    for project in ballot
)
print(approval_counts)
print(VOTES_PER_PROJECT)



7.444444444444445
[5000 4700 4400 4100 3800 3500 3200 2900 2600 2300 2000 1700]
6.878863636363636
[4089 3830 3593 3157 2960 2680 2264 2155 1821 1641 1143  934]
6.104285714285714
[3239 3019 2794 2409 2155 1852 1502 1424 1407  828  381  355]
5.33962962962963
[2530 2327 2083 1703 1513 1200  909  800  823  174    0  355]
4.767
[1948 1893 1490 1102  916  645  302  800  216  174    0   48]
4.722857142857142
[1442 1409 1015  639  465  225  179  800  216  174    0   48]
6.4655555555555555
[1151 1409  750  402  465  225  179  800  216  174    0   48]
8.684
[ 864 1138  507  402  465   56  179  509    0  174    0   48]
13.585
[623 874 278 212 226  56 179 269   0   0   0   0]
Counter({0: 4507, 2: 4122, 3: 4019, 1: 3986, 4: 3655, 5: 3444, 6: 3021, 7: 2726, 8: 2655, 9: 2357, 10: 2240, 11: 1795})
[5000, 4700, 4400, 4100, 3800, 3500, 3200, 2900, 2600, 2300, 2000, 1700]


In [469]:
def generate_ballot(m, p=0.5):
    """Draw a random approval ballot over ``m`` projects.

    Args:
        m: Number of projects (length of the ballot).
        p: Probability that a project is approved. Values below 0 or above 1
            behave like 0 and 1 respectively.

    Returns:
        An integer array of length ``m`` containing only 0 and 1.
    """
    # Thresholding instead of rounding gives the same result for p in [0, 1]
    # and keeps the ballot binary for out-of-range p (rounding produced
    # values such as 2 or -1 there).
    return (random.rand(m) + (p - 0.5) > 0.5).astype(int)

class Cluster():
    """A cluster of ``n`` voters whose ballots are variations of one ballot.

    A random "cluster ballot" is drawn over a random subset of the ``m``
    projects (the remaining projects are never approved by this cluster).
    Every voter's ballot is a copy of the cluster ballot with a random number
    of entries flipped.

    Args:
        m: Total number of projects (must be an integer).
        n: Number of voters in the cluster.
        permutation_rate: Upper bound, as a fraction of the cluster's
            projects, on the number of flips applied to a voter's ballot.
        p: Approval probability passed to ``generate_ballot`` for the cluster
            ballot.
    """

    # Fraction of the m projects this cluster votes on; the other projects
    # stay 0 in every ballot of the cluster.
    _ACTIVE_FRACTION = 0.95

    def __init__(self, m, n, permutation_rate=0.1, p=0.5):
        self.m = m
        self.n = n
        self.permutation_rate = permutation_rate

        n_active = int(m * self._ACTIVE_FRACTION)
        # Positions (in the full ballot) of the projects the cluster votes on.
        # Drawn with numpy.random like every other draw in this class, so
        # seeding numpy.random reproduces the whole cluster.
        self._index_map = random.permutation(self.m)[:n_active]
        # Compact cluster ballot over the active projects only; the voter
        # ballots are derived from this compact form.
        self._mean = generate_ballot(n_active, p)
        self.__create_n_voter_ballots(n_active)

        # Expand the cluster ballot to all m projects.
        full_ballot = np.zeros(self.m)
        full_ballot[self._index_map] = self._mean
        self._mean = full_ballot

    def __create_n_voter_ballots(self, m):
        """Fill ``self._ballots`` (n x self.m) with one ballot per voter."""
        self._ballots = np.zeros((self.n, self.m))
        for voter in range(self.n):
            self._ballots[voter, self._index_map] = self.__permutate_ballot(m)

    def __permutate_ballot(self, m):
        """Return a copy of the compact cluster ballot with random flips.

        Between 0 and ``int(permutation_rate * m)`` randomly chosen entries
        are flipped. The same entry may be chosen more than once.
        """
        new_ballot = self._mean.copy()
        if m == 0:
            return new_ballot
        n_flips = random.randint(int(self.permutation_rate * m) + 1)
        for idx in random.randint(m, size=n_flips):
            # Flip the chosen entry itself (read and write the same index).
            new_ballot[idx] = 1 - new_ballot[idx]
        return new_ballot

    @property
    def ballots(self):
        """Array of shape (n, m) with one 0/1 ballot per voter."""
        return self._ballots

    @property
    def cluster_ballot(self):
        """The ballot (length m) all voter ballots are derived from."""
        return self._mean

    def __repr__(self):
        return str(self._ballots)

    def statistics(self):
        """Print summary statistics of the cluster and show two plots.

        Note: this changes the global matplotlib ``rcParams`` (figure size,
        dpi and font size).
        """
        rcParams['figure.figsize'] = 7, 4
        rcParams['figure.dpi'] = 75
        rcParams.update({'font.size': 10})

        deviation = np.abs(self._ballots - self._mean)
        print(f"Probability that a voter votes the same for one project compared to the cluster ballot = {1-np.mean(deviation)}")
        n_unique = len(np.unique(self._ballots, axis=0))
        print(f"Number of unique ballots in this cluster = {n_unique}")
        # Count the voters whose ballot equals the cluster ballot exactly
        # (not merely the size of the most common ballot).
        n_same = int(np.all(self._ballots == self._mean, axis=1).sum())
        print(f"Number of voters that vote the cluster ballot = {n_same}")

        # Histogram of the number of entries in which voters differ from the
        # cluster ballot.
        plt.plot(*np.unique(deviation.sum(1), return_counts=True), '-o')
        plt.xlabel("difference between voter ballot and cluster ballot")
        plt.ylabel("number of voters")
        plt.show()

        # Per project: cluster ballot versus share of voters approving it.
        plt.plot(self._mean*100, 'o', label="cluster ballot")
        plt.plot(self._ballots.mean(0)*100, 'x', label="voters")
        plt.legend()
        plt.xlim(0, self.m)
        plt.xlabel("Projects")
        plt.ylabel("Chance of approving a project(%)")
        plt.show()
        
    