In [1]:
import os
import json
import re
import itertools 
import math
import pandas as pd
import numpy as np
import scipy
from scipy import signal
from matplotlib import cm, pyplot as plt
from statistics import stdev 
%matplotlib inline


## Pseudo dataset
- K classes from K Gaussian distributions (10 dimentional)
- N items, item difficulty $1/\beta_j$ sampled from N(1, 1), $\beta_j \in [0, +\infty)$.
    - $\beta_j=0$ difficult item 
    - $\beta_j=+\infty$ easy item
- M workers, worker competence $\alpha_i$ sampled from N(1,1), $\alpha_i \in (-\infty, +\infty)$. 
    - $\alpha_i=-\infty$ bad worker
    - $\alpha_i=+\infty$ good worker
-  $N \times M$ correlation matrix $\gamma_{ij}$ with $\gamma_{ij} \in [0, +\infty)$.
    - $\gamma_{ij}=0$ no matching between worker i and item j
    - $\gamma_{ij}=+\infty$ high matching score between worker i and item j
    
- assume 80% of the items are annotated by only 10% of the workers. Use $y = x^{-1.5}$ to make the distribution for sampling workers.

In [6]:
def sample_true_label(N, K=5, dim=10):
    samples = []
    # setting for mixture gaussians
    mus = np.eye(dim)
    for j in range(N):
        # sample a class
        k = np.random.randint(K)
        
        # sample a point from k-th gaussian distribution
        item_feature = np.random.multivariate_normal(np.eye(dim)[k], np.eye(dim))
        
        samples.append((j, item_feature, k))
        
    return samples


def sample_item_difficulty(N):
    betas = np.random.normal(loc=1.0, scale=1.0, size=N)
    betas[betas<0] = 1e-5
    return 1/betas


def sample_worker_competence(M, power=-1.5):
    alphas = np.random.normal(loc=1.0, scale=1.0, size=M)
    sampling_probs = np.array([i**power for i in range(M)])
    sampling_probs = sampling_probs/sampling_probs.sum()
    return alphas, sampling_probs


def sample_correlation_matrix(M,N):
    gammas = np.random.normal(loc=1.0, scale=1.0, size=(M, N))
    gammas[gammas<0] = 1e-5

    return gammas

def sample_crowd_label(K, true_label, item_difficulty, worker_competence, correlation_matrix ):
    cp = 1.0/ (1 + np.exp(-item_difficulty*worker_competence*correlation_matrix))  # correct prob
    wp = (1-cp)/(K-1)  # wrong probs
    
    p = np.full(K, wp)
    p[true_label] = cp
   
    label = np.random.choice(a=K, p=p, size=1)[0]
    return label

def generate_pseudo_data(K, N, M, worker_num_per_task):
    """
    Generate pseudo data.
    param K: number of classes.
    param N: number of items.
    param M: number of workers.
    param worker_num_per_task indicates the sparsity of the crowd annotation.
    """
    data = []  # item_id, item_difficulty, worker_id, worker_competence, correlation, item_feature, true_label, crowd_label
    
    
    # ground truth
    items = sample_true_label(N, K)
    item_difficulties = sample_item_difficulty(N)
    worker_competences, sampling_probs = sample_worker_competence(M, power=1.5)

    
    correlations = sample_correlation_matrix(M, N)
    

    # sample alpha
    worker_ids = np.random.choice(a=M, p=sampling_probs, size=worker_num_per_task, replace=False)
    for worker_id in worker_ids:
        worker_competence = worker_competences[worker_id]
        
        #sample beta
        for item_id, item_feature, true_label in items:
            item_difficulty = item_difficulties[item_id]

            correlation = correlations[worker_id][item_id]
            
            print("correlation", correlation)

            
            crowd_label = sample_crowd_label(K, true_label, item_difficulty, worker_competence, correlation)
            
            if correlation < 0.5:
                crowd_label == 0
            elif correlation > 100:
                crowd_label == 1
            
            data.append((item_id, item_difficulty, worker_id, worker_competence, correlation, true_label, crowd_label))
            


    return data



In [10]:
pseudo= generate_pseudo_data(K=2, N=100, M=6, worker_num_per_task=5)

noise min  -0.8420185015289741
correlations stdev 0.8440683287267176
correlations max 4.67114226156416
correlations min  1e-05
correlations mean  1.078136738053467
correlation 0.6228680144975192
new correlation 1.4663299151886613
----
correlation 1.6041872012060319
correlation 0.39348117414899986
new correlation 1.2369430748401422
----
correlation 1.1085440026387394
correlation 0.5270874740509193
new correlation 1.3705493747420614
----
correlation 1.7625253324296515
correlation 1.0823044354392992
correlation 1e-05
new correlation 0.8434719006911422
----
correlation 1.387021631862168
correlation 1.6843673117827815
correlation 1e-05
new correlation 0.8434719006911422
----
correlation 0.5293844184492891
new correlation 1.3728463191404314
----
correlation 1.5820323388553157
correlation 0.6285148111289265
new correlation 1.4719767118200688
----
correlation 0.8427407955189076
new correlation 1.6862026962100498
----
correlation 2.580748906445876
correlation 1.837940932259857
correlation 1.318



correlation 1e-05
new correlation 0.8434719006911422
----
correlation 1.1223543147499548
correlation 1.620571162944365
correlation 1.6048279678029016
correlation 0.9019261263350298
correlation 1e-05
new correlation 0.8434719006911422
----
correlation 0.7304574762690053
new correlation 1.5739193769601476
----
correlation 0.4634412947982425
new correlation 1.3069031954893848
----
correlation 1.0482617725781256
correlation 0.7899245555380083
new correlation 1.6333864562291507
----
correlation 0.7658258110885346
new correlation 1.6092877117796769
----
correlation 0.007397743887363695
new correlation 0.8508596445785059
----
correlation 1.8111183313499277
correlation 3.080125744625287
correlation 2.160556899418973
correlation 0.24983336699460568
new correlation 1.093295267685748
----
correlation 0.9827805591540214
correlation 1e-05
new correlation 0.8434719006911422
----


In [11]:
df = pd.DataFrame(pseudo, columns =['item_id', 'item_difficulty', 'worker_id', 'worker_competence', 'correlation', 'true_label', 'crowd_label']) 


df

Unnamed: 0,item_id,item_difficulty,worker_id,worker_competence,correlation,true_label,crowd_label
0,0,2.339102,4,-1.134306,1.466330,1,0
1,1,100000.000000,4,-1.134306,1.604187,1,0
2,2,0.834904,4,-1.134306,1.236943,1,1
3,3,3.108295,4,-1.134306,1.108544,0,0
4,4,100000.000000,4,-1.134306,1.370549,0,1
...,...,...,...,...,...,...,...
495,95,0.948356,1,0.744876,3.080126,0,0
496,96,8.003625,1,0.744876,2.160557,0,0
497,97,0.723474,1,0.744876,1.093295,1,1
498,98,1.025777,1,0.744876,0.982781,1,1


In [21]:
answer = df[['item_id', 'worker_id', 'crowd_label']]
answer.columns=["question", "worker", "answer"]
answer.to_csv("answer.csv", index=None)

In [22]:
truth = df[['item_id', 'true_label']]
truth.columns=["question", "truth"]
truth.to_csv("truth.csv", index=None)