In [8]:
import random
import pandas as pd
import numpy as np
import plotly.express as px

Begin by defining some parameters:

In [9]:
n_users = 2000
n_tweets = 5003
tweets_per_person = 50
avg_time = 19 # Average survey took 19 minutes as per Tamara
sd_time = 5 # Just a guess, we could get a more precise estimate if desired
avg_user_freq = 2 # Average number of users starting per minute

Next, define a class Survey which we'll use to track people taking the survey simultaneously

In [10]:
class User():
    def __init__(self, u_id, finish_time):
        self.u_id = u_id
        self.finish_time = finish_time

class Survey():
    def __init__(self, n):
        self.cur_users = []
        self.remaining_q_ids = [i for i in range(n)]
        self.assignment_dict = {}
        self.duplicate_tracker = {i:0 for i in self.remaining_q_ids}
        self.sys_time = 0
        self.cur_user_time = {}
        self.max_user_time = {}
        
    def assign_user(self, user):
        self.cur_users.append(user.u_id)
        self.cur_user_time[user.u_id] = 0
        self.max_user_time[user.u_id] = user.finish_time
        assignment_index = random.choice(range(len(self.remaining_q_ids)))
        assignment = self.remaining_q_ids[assignment_index]
        self.assignment_dict[user.u_id] = assignment
        
        
    def increment_time(self):
        self.sys_time += 1
        for user in self.cur_users:
            self.cur_user_time[user] += 1
            if self.cur_user_time[user] >= self.max_user_time[user]: 
                # If user is now finished, we update the remaining question ID list
                self.cur_users.remove(user)
                if self.assignment_dict[user] in self.remaining_q_ids:
                    # If the user's assigned set of questions is still in the remaining q_ids list, remove it
                    self.remaining_q_ids.remove(self.assignment_dict[user])
                else:
                    # If not, we had a duplicate, so increment the value of the q_id's duplicate tracker dict
                    self.duplicate_tracker[self.assignment_dict[user]] += 1
                
        
    

Generate some users:

In [11]:
users = [User(i, avg_time+random.choice(range(-sd_time,sd_time))) for i in range(n_users)]

In [12]:
survey = Survey(len(users))
survey.assign_user(users.pop(0))

while len(survey.cur_users) > 0 or len(users) > 0:
    # While there are either people who have not yet taken the survey, or there are people currently taking it...
    # Determine how many new people will take the survey this minute
    new_users = min(avg_user_freq + random.choice(range(-avg_user_freq,avg_user_freq)), len(users)) # This is a bit arbitrary but will give us a more realistic flow of people
    for i in range(new_users):
        survey.assign_user(users.pop(0))
    
    survey.increment_time()

In [13]:
duplicates = pd.DataFrame(survey.duplicate_tracker, index=["dups"]).transpose()
fig = px.histogram(duplicates, x="dups")
fig.show()

In [14]:
share_one_duplicate = 100*sum(duplicates.dups==1)/len(duplicates.dups)
share_two_duplicates = 100*sum(duplicates.dups==2)/len(duplicates.dups)
share_more_duplicates = 100*sum(duplicates.dups>2)/len(duplicates.dups)
unassigned = 100*len(survey.remaining_q_ids)/len(duplicates.dups)
print(share_one_duplicate, "percent of question sets were accidentally assigned twice")
print(share_two_duplicates, "percent of question sets were accidentally assigned three times")
print(share_more_duplicates, "percent of question sets were accidentally assigned more than three times")
print(unassigned, "percent of question sets were entirely unassigned")


4.15 percent of question sets were accidentally assigned twice
0.15 percent of question sets were accidentally assigned three times
0.0 percent of question sets were accidentally assigned more than three times
4.45 percent of question sets were entirely unassigned


While it doesn't appear we're getting very many duplicates with these parameters, we should look at the number of times each individual question gets assigned:

In [15]:
from sampling_tweets import getTweetAssignments

# Using the script I created to assign tweets, get lists of tweets that correspond to the tweet groupings assigned in the above exercize.
output_df = getTweetAssignments(n_tweets, n_users, tweets_per_person)

In [16]:
# For each set of tweets to be assigned, count up the number of times individual tweets were seen, factoring in imperfect quota allocation
total_usage_count = {i:0 for i in range(n_tweets)}
for tweet_set in range(len(output_df.tweets_assigned)):
    for individual_tweet in range(len(output_df.tweets_assigned[tweet_set])):
        total_usage_count[output_df.tweets_assigned[tweet_set][individual_tweet]] += 1 * (duplicates.dups[tweet_set] + 1)

In [17]:
max_sampled = np.array(list(total_usage_count.values())).max()
min_sampled = np.array(list(total_usage_count.values())).min()
avg_sampled = round(np.array(list(total_usage_count.values())).mean(),1)

print('The most sampled tweet was seen', max_sampled, 'times.')
print('The least sampled tweet was seen', min_sampled, 'times.')
print('The average tweet was seen', avg_sampled, 'times.')



The most sampled tweet was seen 28 times.
The least sampled tweet was seen 19 times.
The average tweet was seen 20.9 times.


This seems encouraging! The final step is to run a Monte Carlo to see how we do given different parameters

In [18]:
n_users = 2000
n_tweets = 5009
tweets_per_person = 50
output_df = getTweetAssignments(n_tweets, n_users, tweets_per_person)

In [19]:
n_simulations = 50
simulation_stats = pd.DataFrame({"avg_user_frequency":[], "avg_time":[], "sd_time":[], "max_sampled":[], "min_sampled":[], "avg_sampled":[], "share_one_duplicate":[], "share_two_duplicates":[], "share_more_duplicates":[], "share_unassigned":[]})


for _ in range(n_simulations):
    avg_user_freq_sim = max(avg_user_freq + np.random.normal(15,5), 0.51) # Don't want to include simulations with a frequency too close to zero for computational reasons
    # We're more interested in situations with high frequencies anyways, so I biased this exercize upwards such that mean number of users per minute is 5
    avg_time_sim = max(avg_time + np.random.normal(0,5), 0)
    sd_time_sim = max(sd_time + np.random.normal(0,4), 0)
    
    users = [User(i, avg_time_sim+random.uniform(-sd_time_sim,sd_time_sim)) for i in range(n_users)]
    survey = Survey(len(users))
    survey.assign_user(users.pop(0))

    while len(survey.cur_users) > 0 or len(users) > 0:
        # While there are either people who have not yet taken the survey, or there are people currently taking it...
        # Determine how many new people will take the survey this minute
        new_users = min(round(avg_user_freq_sim + random.uniform(-avg_user_freq_sim, avg_user_freq_sim)),len(users))
        # Here, the number of new users in a given minute is taken to be nearest whole number to the avg number per minute plus some random value
        # The random value is between the positive and negative value of the avg per minute
        for i in range(new_users):
            survey.assign_user(users.pop(0))
        
        survey.increment_time()
        
    duplicates = pd.DataFrame(survey.duplicate_tracker, index=["dups"]).transpose()
    share_one_duplicate = 100*sum(duplicates.dups==1)/len(duplicates.dups)
    share_two_duplicates = 100*sum(duplicates.dups==2)/len(duplicates.dups)
    share_more_duplicates = 100*sum(duplicates.dups>2)/len(duplicates.dups)
    unassigned = 100*len(survey.remaining_q_ids)/len(duplicates.dups)

    total_usage_count = {i:0 for i in range(n_tweets)}
    for tweet_set in range(len(output_df.tweets_assigned)):
        for individual_tweet in range(len(output_df.tweets_assigned[tweet_set])):
            total_usage_count[output_df.tweets_assigned[tweet_set][individual_tweet]] += 1 * (duplicates.dups[tweet_set] + 1)
    
    max_sampled = np.array(list(total_usage_count.values())).max()
    min_sampled = np.array(list(total_usage_count.values())).min()
    avg_sampled = round(np.array(list(total_usage_count.values())).mean(),1)
    simulation_stats = simulation_stats.append({"avg_user_frequency":avg_user_freq_sim, "avg_time":avg_time_sim, "sd_time":sd_time_sim,"max_sampled":max_sampled, "min_sampled":min_sampled, "avg_sampled":avg_sampled, "share_one_duplicate":share_one_duplicate, "share_two_duplicates":share_two_duplicates, "share_more_duplicates":share_more_duplicates, "share_unassigned":unassigned}, ignore_index=True)

In [20]:
simulation_stats

Unnamed: 0,avg_user_frequency,avg_time,sd_time,max_sampled,min_sampled,avg_sampled,share_one_duplicate,share_two_duplicates,share_more_duplicates,share_unassigned
0,26.130615,21.589255,3.227445,40.0,19.0,24.8,17.4,3.0,0.25,24.25
1,20.441981,14.176853,10.984282,34.0,19.0,23.7,13.95,1.85,0.3,18.6
2,12.814233,23.464203,9.214951,35.0,19.0,23.8,14.9,2.05,0.1,19.3
3,21.532391,14.883751,1.510841,35.0,19.0,24.2,15.85,2.1,0.35,21.2
4,22.156318,9.345324,4.442159,33.0,19.0,23.3,13.7,1.25,0.15,16.65
5,10.783063,19.778839,6.73478,32.0,19.0,23.0,13.3,0.85,0.0,15.0
6,25.147456,14.659691,8.958617,38.0,19.0,24.0,14.4,2.25,0.5,20.45
7,10.84549,13.77045,7.545174,30.0,19.0,22.4,10.5,0.75,0.0,12.0
8,11.809255,15.906797,0.555514,35.0,19.0,23.0,12.75,1.05,0.1,15.15
9,16.989979,17.966579,0.0,33.0,19.0,23.6,14.35,1.65,0.25,18.4


The share of tweet groupings that go unassigned correlates strongly with the frequency with which users join the survey.
Although some tweets do get seen more frequently, it is very rare for even tweet to be observed an unacceptably low number of times.

In [21]:
print('Correlation between average users per minute and share of tweet groups unassigned:', round(simulation_stats.avg_user_frequency.corr(simulation_stats.share_unassigned),4))
print()
print('The average maximum number of times a tweet was seen across simulations was', simulation_stats.max_sampled.mean())
print('The average minimum number of times a tweet was seen across simulations was', simulation_stats.min_sampled.mean())
print('Across all simulations, no tweet was ever seen fewer than', simulation_stats.min_sampled.min(), 'times.')

Correlation between average users per minute and share of tweet groups unassigned: 0.6753

The average maximum number of times a tweet was seen across simulations was 35.12
The average minimum number of times a tweet was seen across simulations was 19.0
Across all simulations, no tweet was ever seen fewer than 19.0 times.


In [22]:
from sampling_tweets import getUniparkInputs
getUniparkInputs(output_df)