In [40]:
import random
import pandas as pd
import numpy as np
import plotly.express as px

Begin by defining some parameters:

In [41]:
n_users = 2000
n_tweets = 5003
tweets_per_person = 50
avg_time = 19 # Average survey took 19 minutes as per Tamara
sd_time = 5 # Just a guess, we could get a more precise estimate if desired
avg_user_freq = 2 # Average number of users starting per minute

Next, define a class Survey which we'll use to track people taking the survey simultaneously

In [42]:
class User():
    def __init__(self, u_id, finish_time):
        self.u_id = u_id
        self.finish_time = finish_time

class Survey():
    def __init__(self, n):
        self.cur_users = []
        self.remaining_q_ids = [i for i in range(n)]
        self.assignment_dict = {}
        self.duplicate_tracker = {i:0 for i in self.remaining_q_ids}
        self.sys_time = 0
        self.cur_user_time = {}
        self.max_user_time = {}
        
    def assign_user(self, user):
        self.cur_users.append(user.u_id)
        self.cur_user_time[user.u_id] = 0
        self.max_user_time[user.u_id] = user.finish_time
        assignment_index = random.choice(range(len(self.remaining_q_ids)))
        assignment = self.remaining_q_ids[assignment_index]
        self.assignment_dict[user.u_id] = assignment
        
        
    def increment_time(self):
        self.sys_time += 1
        for user in self.cur_users:
            self.cur_user_time[user] += 1
            if self.cur_user_time[user] >= self.max_user_time[user]: 
                # If user is now finished, we update the remaining question ID list
                self.cur_users.remove(user)
                if self.assignment_dict[user] in self.remaining_q_ids:
                    # If the user's assigned set of questions is still in the remaining q_ids list, remove it
                    self.remaining_q_ids.remove(self.assignment_dict[user])
                else:
                    # If not, we had a duplicate, so increment the value of the q_id's duplicate tracker dict
                    self.duplicate_tracker[self.assignment_dict[user]] += 1
                
        
    

Generate some users:

In [43]:
users = [User(i, avg_time+random.choice(range(-sd_time,sd_time))) for i in range(n_users)]

In [44]:
survey = Survey(len(users))
survey.assign_user(users.pop(0))

while len(survey.cur_users) > 0 or len(users) > 0:
    # While there are either people who have not yet taken the survey, or there are people currently taking it...
    # Determine how many new people will take the survey this minute
    new_users = min(avg_user_freq + random.choice(range(-avg_user_freq,avg_user_freq)), len(users)) # This is a bit arbitrary but will give us a more realistic flow of people
    for i in range(new_users):
        survey.assign_user(users.pop(0))
    
    survey.increment_time()

In [45]:
duplicates = pd.DataFrame(survey.duplicate_tracker, index=["dups"]).transpose()
fig = px.histogram(duplicates, x="dups")
fig.show()

In [46]:
share_one_duplicate = 100*sum(duplicates.dups==1)/len(duplicates.dups)
share_two_duplicates = 100*sum(duplicates.dups==2)/len(duplicates.dups)
share_more_duplicates = 100*sum(duplicates.dups>2)/len(duplicates.dups)
unassigned = 100*len(survey.remaining_q_ids)/len(duplicates.dups)
print(share_one_duplicate, "percent of question sets were accidentally assigned twice")
print(share_two_duplicates, "percent of question sets were accidentally assigned three times")
print(share_more_duplicates, "percent of question sets were accidentally assigned more than three times")
print(unassigned, "percent of question sets were entirely unassigned")


3.6 percent of question sets were accidentally assigned twice
0.3 percent of question sets were accidentally assigned three times
0.0 percent of question sets were accidentally assigned more than three times
4.2 percent of question sets were entirely unassigned


While it doesn't appear we're getting very many duplicates with these parameters, we should look at the number of times each individual question gets assigned:

In [47]:
from sampling_tweets import getTweetAssignments

# Using the script I created to assign tweets, get lists of tweets that correspond to the tweet groupings assigned in the above exercize.
output_df = getTweetAssignments(n_tweets, n_users, tweets_per_person)

In [48]:
# For each set of tweets to be assigned, count up the number of times individual tweets were seen, factoring in imperfect quota allocation
total_usage_count = {i:0 for i in range(n_tweets)}
for tweet_set in range(len(output_df.tweets_assigned)):
    for individual_tweet in range(len(output_df.tweets_assigned[tweet_set])):
        total_usage_count[output_df.tweets_assigned[tweet_set][individual_tweet]] += 1 * (duplicates.dups[tweet_set] + 1)

In [49]:
max_sampled = np.array(list(total_usage_count.values())).max()
min_sampled = np.array(list(total_usage_count.values())).min()
avg_sampled = round(np.array(list(total_usage_count.values())).mean(),1)

print('The most sampled tweet was seen', max_sampled, 'times.')
print('The least sampled tweet was seen', min_sampled, 'times.')
print('The average tweet was seen', avg_sampled, 'times.')



The most sampled tweet was seen 29 times.
The least sampled tweet was seen 19 times.
The average tweet was seen 20.8 times.


This seems encouraging! The final step is to run a Monte Carlo to see how we do given different parameters.


In [50]:
def run_sim(n_simulations, n_users, n_tweets, tweets_per_person, avg_user_freq, avg_time, sd_time):
    output_df = getTweetAssignments(n_tweets, n_users, tweets_per_person)
    simulation_stats = pd.DataFrame({"avg_user_frequency":[], "avg_time":[], "sd_time":[], "max_sampled":[], "min_sampled":[], "avg_sampled":[], "share_one_duplicate":[], "share_two_duplicates":[], "share_more_duplicates":[], "share_unassigned":[]})


    for _ in range(n_simulations):
        avg_user_freq_sim = max(np.random.normal(avg_user_freq,5), 0.51) # Don't want to include simulations with a frequency too close to zero for computational reasons
        # We're more interested in situations with high frequencies anyways, so I biased this exercize upwards such that mean number of users per minute is 5
        avg_time_sim = max(np.random.normal(avg_time,5), 0)
        sd_time_sim = max(np.random.normal(sd_time,4), 0)
        
        users = [User(i, avg_time_sim+random.uniform(-sd_time_sim,sd_time_sim)) for i in range(n_users)]
        survey = Survey(len(users))
        survey.assign_user(users.pop(0))

        while len(survey.cur_users) > 0 or len(users) > 0:
            # While there are either people who have not yet taken the survey, or there are people currently taking it...
            # Determine how many new people will take the survey this minute
            new_users = min(round(avg_user_freq_sim + random.uniform(-avg_user_freq_sim, avg_user_freq_sim)),len(users))
            # Here, the number of new users in a given minute is taken to be nearest whole number to the avg number per minute plus some random value
            # The random value is between the positive and negative value of the avg per minute
            for i in range(new_users):
                survey.assign_user(users.pop(0))
            
            survey.increment_time()
            
        duplicates = pd.DataFrame(survey.duplicate_tracker, index=["dups"]).transpose()
        share_one_duplicate = 100*sum(duplicates.dups==1)/len(duplicates.dups)
        share_two_duplicates = 100*sum(duplicates.dups==2)/len(duplicates.dups)
        share_more_duplicates = 100*sum(duplicates.dups>2)/len(duplicates.dups)
        unassigned = 100*len(survey.remaining_q_ids)/len(duplicates.dups)

        total_usage_count = {i:0 for i in range(n_tweets)}
        for tweet_set in range(len(output_df.tweets_assigned)):
            for individual_tweet in range(len(output_df.tweets_assigned[tweet_set])):
                total_usage_count[output_df.tweets_assigned[tweet_set][individual_tweet]] += 1 * (duplicates.dups[tweet_set] + 1)
        
        max_sampled = np.array(list(total_usage_count.values())).max()
        min_sampled = np.array(list(total_usage_count.values())).min()
        avg_sampled = round(np.array(list(total_usage_count.values())).mean(),1)
        simulation_stats = simulation_stats.append({"avg_user_frequency":avg_user_freq_sim, "avg_time":avg_time_sim, "sd_time":sd_time_sim,"max_sampled":max_sampled, "min_sampled":min_sampled, "avg_sampled":avg_sampled, "share_one_duplicate":share_one_duplicate, "share_two_duplicates":share_two_duplicates, "share_more_duplicates":share_more_duplicates, "share_unassigned":unassigned}, ignore_index=True)
    
    return(simulation_stats)

In this simulation, I try to deliberately skew parameters towards negative scenarios:
- The mean number of users joining per minute is taken to be 20, even though in the previous survey we saw a mean of 2.
- average time taken to complete the survey is taken to be 30 minutes, even though we saw an average of 19 in the previous survey

In [51]:
# Note that the simulation can take quite some time to run depending on each of the 4 parameters.
# In particular, if n_users and n_tweets have a common denominator there is a chance that getTweetAssignments will never halt
# (There is probably a solution for this, but it doesn't happen frequently and we can always just generate one or two extra/fewer tweet groupings to avoid it)
simulation_stats = run_sim(n_simulations=50, n_users=2000, n_tweets=5001, tweets_per_person=50, avg_user_freq=20, avg_time=30, sd_time=5)
simulation_stats

Unnamed: 0,avg_user_frequency,avg_time,sd_time,max_sampled,min_sampled,avg_sampled,share_one_duplicate,share_two_duplicates,share_more_duplicates,share_unassigned
0,19.012941,21.333938,7.914415,38.0,19.0,24.5,16.7,2.6,0.15,22.35
1,20.991051,27.343467,6.306401,38.0,19.0,25.0,16.8,3.15,0.55,24.8
2,23.506868,24.980147,4.932555,37.0,19.0,24.9,17.5,2.65,0.65,24.75
3,26.243675,28.410443,6.309984,41.0,19.0,25.8,19.3,3.9,0.6,29.05
4,15.270438,25.730801,11.35219,38.0,19.0,24.8,17.3,2.7,0.35,23.85
5,20.798835,33.354476,5.304197,37.0,19.0,25.5,18.1,4.15,0.3,27.35
6,19.642578,30.612501,5.031673,36.0,19.0,25.3,17.7,3.6,0.6,26.75
7,17.976153,31.86008,3.449655,35.0,19.0,24.9,18.2,2.9,0.15,24.45
8,16.203936,31.655713,6.252779,38.0,19.0,24.9,17.1,3.2,0.4,24.75
9,20.141457,38.183746,5.836467,39.0,19.0,26.0,18.25,4.35,1.1,30.25


The share of tweet groupings that go unassigned correlates strongly with the frequency with which users join the survey.
Although some tweets do get seen more frequently, we never find a single case where a tweet is seen an unacceptably low number of times.

In [52]:
print('Correlation between average users per minute and share of tweet groups unassigned:', round(simulation_stats.avg_user_frequency.corr(simulation_stats.share_unassigned),4))
print()
print('The average maximum number of times a tweet was seen across simulations was', simulation_stats.max_sampled.mean())
print('The average minimum number of times a tweet was seen across simulations was', simulation_stats.min_sampled.mean())
print('Across all simulations, no tweet was ever seen fewer than', simulation_stats.min_sampled.min(), 'times.')

Correlation between average users per minute and share of tweet groups unassigned: 0.75

The average maximum number of times a tweet was seen across simulations was 37.5
The average minimum number of times a tweet was seen across simulations was 19.0
Across all simulations, no tweet was ever seen fewer than 19.0 times.


What if we have considerably fewer tweets?

In [53]:
simulation_stats = run_sim(n_simulations=50, n_users=2000, n_tweets=1801, tweets_per_person=50, avg_user_freq=20, avg_time=30, sd_time=5)

print('The average maximum number of times a tweet was seen across simulations was', simulation_stats.max_sampled.mean())
print('The average minimum number of times a tweet was seen across simulations was', simulation_stats.min_sampled.mean())
print('Across all simulations, no tweet was ever seen fewer than', simulation_stats.min_sampled.min(), 'times.')
print()
print("Under perfect sampling, we'd expect each tweet to be seen an average of", round((2000*50)/1801,2), 'times')

The average maximum number of times a tweet was seen across simulations was 85.86
The average minimum number of times a tweet was seen across simulations was 58.38
Across all simulations, no tweet was ever seen fewer than 56.0 times.

Under perfect sampling, we'd expect each tweet to be seen an average of 55.52 times


If we have fewer users?

In [54]:
simulation_stats = run_sim(n_simulations=50, n_users=619, n_tweets=5001, tweets_per_person=50, avg_user_freq=20, avg_time=30, sd_time=5)

print('The average maximum number of times a tweet was seen across simulations was', simulation_stats.max_sampled.mean())
print('The average minimum number of times a tweet was seen across simulations was', simulation_stats.min_sampled.mean())
print('Across all simulations, no tweet was ever seen fewer than', simulation_stats.min_sampled.min(), 'times.')
print()
print("Under perfect sampling, we'd expect each tweet to be seen an average of", round((619*50)/5001,2), 'times')

The average maximum number of times a tweet was seen across simulations was 17.48
The average minimum number of times a tweet was seen across simulations was 6.0
Across all simulations, no tweet was ever seen fewer than 6.0 times.

Under perfect sampling, we'd expect each tweet to be seen an average of 6.19 times


Of course 6 ratings is not enough, but overlapping assignment still does not cause a problem - under perfect distribution each tweet could only be seen an average of 6.2 times with these numbers, and the system still manages to show each tweet a minimum of 6 times.