In [115]:
import random
import pandas as pd
import numpy as np
import plotly.express as px
from sampling_tweets import getTweetAssignments

Begin by defining some parameters:

(Note: I am using an odd number of tweets because the function that assigns lists of tweets for each person can occasionally get caught in an infinite loop if *n_users* and *n_tweets* share a common denominator. There is probably a way to fix this, but it's infrequent and something we can easily adjust if neccessary.)


In [116]:
n_users = 2000
n_tweets = 5003
tweets_per_person = 50
avg_time = 19 # Average survey took 19 minutes as per Tamara
sd_time = 5 # Just a guess, we could get a more precise estimate if desired
avg_user_freq = 2 # Average number of users starting per minute

Next, define a class Survey which we'll use to track people taking the survey simultaneously

In [117]:
class User():
    def __init__(self, u_id, finish_time):
        self.u_id = u_id
        self.finish_time = finish_time

class Survey():
    def __init__(self, n):
        self.cur_users = []
        self.remaining_q_ids = [i for i in range(n)]
        self.assignment_dict = {}
        self.duplicate_tracker = {i:0 for i in self.remaining_q_ids}
        self.sys_time = 0
        self.cur_user_time = {}
        self.max_user_time = {}
        
    def assign_user(self, user):
        self.cur_users.append(user.u_id)
        self.cur_user_time[user.u_id] = 0
        self.max_user_time[user.u_id] = user.finish_time
        assignment_index = random.choice(range(len(self.remaining_q_ids)))
        assignment = self.remaining_q_ids[assignment_index]
        self.assignment_dict[user.u_id] = assignment
        
        
    def increment_time(self):
        self.sys_time += 1
        for user in self.cur_users:
            self.cur_user_time[user] += 1
            if self.cur_user_time[user] >= self.max_user_time[user]: 
                # If user is now finished, we update the remaining question ID list
                self.cur_users.remove(user)
                if self.assignment_dict[user] in self.remaining_q_ids:
                    # If the user's assigned set of questions is still in the remaining q_ids list, remove it
                    self.remaining_q_ids.remove(self.assignment_dict[user])
                else:
                    # If not, we had a duplicate, so increment the value of the q_id's duplicate tracker dict
                    self.duplicate_tracker[self.assignment_dict[user]] += 1
    
    def check_unassigned(self):
        # At the end of a survey, we'll set the value of any unassigned tweet sets to -1 in the duplicate tracker
        for i in self.remaining_q_ids:
            self.duplicate_tracker[i] = -1 
                
        
    

I'll run a Monte Carlo to see how we do given different parameters:

In [118]:
def run_sim(n_simulations, n_users, n_tweets, tweets_per_person, avg_user_freq, avg_time, sd_time, acceptable_views):
    output_df = getTweetAssignments(n_tweets, n_users, tweets_per_person)
    simulation_stats = pd.DataFrame({"avg_user_frequency":[], "avg_time":[], "sd_time":[], "max_sampled":[], "min_sampled":[], "avg_sampled":[], "share_one_duplicate":[], "share_two_duplicates":[], "share_more_duplicates":[], "share_grps_unassigned":[], "n_undersampled":[]})


    for _ in range(n_simulations):
        avg_user_freq_sim = max(np.random.normal(avg_user_freq,5), 0.51) # Don't want to include simulations with a frequency too close to zero for computational reasons
        # We're more interested in situations with high frequencies anyways, so I biased this exercize upwards such that mean number of users per minute is 5
        avg_time_sim = max(np.random.normal(avg_time,5), 0)
        sd_time_sim = max(np.random.normal(sd_time,4), 0)
        
        users = [User(i, avg_time_sim+random.uniform(-sd_time_sim,sd_time_sim)) for i in range(n_users)]
        survey = Survey(len(users))
        survey.assign_user(users.pop(0))

        while len(survey.cur_users) > 0 or len(users) > 0:
            # While there are either people who have not yet taken the survey, or there are people currently taking it...
            # Determine how many new people will take the survey this minute
            new_users = min(round(avg_user_freq_sim + random.uniform(-avg_user_freq_sim, avg_user_freq_sim)),len(users))
            # Here, the number of new users in a given minute is taken to be nearest whole number to the avg number per minute plus some random value
            # The random value is between the positive and negative value of the avg per minute
            for i in range(new_users):
                survey.assign_user(users.pop(0))
            
            survey.increment_time()
            
        survey.check_unassigned()
        duplicates = pd.DataFrame(survey.duplicate_tracker, index=["dups"]).transpose()
        share_one_duplicate = 100*sum(duplicates.dups==1)/len(duplicates.dups)
        share_two_duplicates = 100*sum(duplicates.dups==2)/len(duplicates.dups)
        share_more_duplicates = 100*sum(duplicates.dups>2)/len(duplicates.dups)
        unassigned = 100*len(survey.remaining_q_ids)/len(duplicates.dups)

        total_usage_count = {i:0 for i in range(n_tweets)}
        for tweet_set in range(len(output_df.tweets_assigned)):
            for individual_tweet in range(len(output_df.tweets_assigned[tweet_set])):
                total_usage_count[output_df.tweets_assigned[tweet_set][individual_tweet]] += 1 * (duplicates.dups[tweet_set] + 1)
        
        max_sampled = np.array(list(total_usage_count.values())).max()
        min_sampled = np.array(list(total_usage_count.values())).min()
        avg_sampled = round(np.array(list(total_usage_count.values())).mean(),1)
        n_undersampled = len(dict((k,v) for k, v in total_usage_count.items() if v < acceptable_views))
        simulation_stats = simulation_stats.append({"avg_user_frequency":avg_user_freq_sim, "avg_time":avg_time_sim, "sd_time":sd_time_sim,"max_sampled":max_sampled, "min_sampled":min_sampled, "avg_sampled":avg_sampled, "share_one_duplicate":share_one_duplicate, "share_two_duplicates":share_two_duplicates, "share_more_duplicates":share_more_duplicates, "share_grps_unassigned":unassigned, "n_undersampled":n_undersampled}, ignore_index=True)
    
    return(simulation_stats)

Under our expected parameters (as measured in the previous survey), results look reasonable:

In [119]:
# Keep in mind that the length of the simulation is highly dependant on parameters. Under the given parameters this should take ~3 minutes.
# Runtime increases in n_simulations, n_users, n_tweets, tweets_per_person, and avg_time. It decreases in avg_user_freq.
simulation_stats = run_sim(n_simulations=50, n_users=2000, n_tweets=5001, tweets_per_person=50, avg_user_freq=2, avg_time=19, sd_time=5, acceptable_views=15)
simulation_stats

Unnamed: 0,avg_user_frequency,avg_time,sd_time,max_sampled,min_sampled,avg_sampled,share_one_duplicate,share_two_duplicates,share_more_duplicates,share_grps_unassigned,n_undersampled
0,2.667677,8.930857,3.207425,27.0,15.0,20.0,3.2,0.1,0.0,3.4,0.0
1,0.51,18.071338,4.775156,26.0,16.0,20.0,1.55,0.0,0.0,1.55,0.0
2,0.51,20.707317,2.030713,28.0,16.0,20.0,1.65,0.1,0.0,1.85,0.0
3,0.534843,21.471812,7.64603,26.0,16.0,20.0,1.95,0.15,0.0,2.25,0.0
4,0.51,11.660689,0.244248,26.0,16.0,20.0,1.15,0.0,0.0,1.15,0.0
5,2.103153,24.317818,0.0,29.0,12.0,20.0,6.45,0.4,0.1,7.55,6.0
6,5.462439,14.669317,2.708176,30.0,12.0,20.0,7.45,0.7,0.0,8.85,21.0
7,5.418803,14.947438,11.344619,30.0,13.0,20.0,8.15,0.6,0.0,9.35,24.0
8,0.51,15.145077,3.236084,26.0,16.0,20.0,1.6,0.0,0.0,1.6,0.0
9,0.51,17.331895,0.0,26.0,16.0,20.0,1.55,0.05,0.0,1.65,0.0


In [120]:
print('The average maximum number of times a tweet was seen across simulations was', simulation_stats.max_sampled.mean())
print('The average minimum number of times a tweet was seen across simulations was', simulation_stats.min_sampled.mean())
print('Across all simulations, no tweet was ever seen fewer than', simulation_stats.min_sampled.min(), 'times.')

The average maximum number of times a tweet was seen across simulations was 27.38
The average minimum number of times a tweet was seen across simulations was 14.3
Across all simulations, no tweet was ever seen fewer than 10.0 times.


Now I try to deliberately skew parameters towards negative scenarios:
- The mean number of users joining per minute is taken to be 15, even though in the previous survey we saw a mean of 2.
- average time taken to complete the survey is taken to be 30 minutes, even though we saw an average of 19 in the previous survey

In [121]:
simulation_stats = run_sim(n_simulations=50, n_users=2000, n_tweets=5001, tweets_per_person=50, avg_user_freq=15, avg_time=30, sd_time=5, acceptable_views=15)
simulation_stats

Unnamed: 0,avg_user_frequency,avg_time,sd_time,max_sampled,min_sampled,avg_sampled,share_one_duplicate,share_two_duplicates,share_more_duplicates,share_grps_unassigned,n_undersampled
0,10.177777,28.685616,6.653103,35.0,9.0,20.0,15.4,1.7,0.1,19.15,154.0
1,17.654167,39.375728,1.482494,35.0,9.0,20.0,18.4,3.1,0.75,27.0,333.0
2,16.847338,34.634144,0.0,35.0,8.0,20.0,18.2,3.5,0.55,26.95,298.0
3,21.642939,28.036823,0.0,36.0,9.0,20.0,17.8,3.3,0.75,26.7,318.0
4,13.336486,30.66678,8.309248,33.0,7.0,20.0,16.0,2.65,0.3,22.3,209.0
5,18.058914,33.19235,3.649091,33.0,7.0,20.0,18.4,2.7,0.6,25.7,279.0
6,11.327499,36.771639,2.756176,35.0,9.0,20.0,17.0,2.45,0.3,22.85,216.0
7,18.097815,26.49051,7.112295,33.0,9.0,20.0,17.75,3.35,0.35,25.5,295.0
8,13.621518,26.980018,0.826933,33.0,11.0,20.0,15.35,1.95,0.4,20.45,170.0
9,14.770611,29.288261,1.427813,33.0,9.0,20.0,17.85,2.25,0.3,23.35,243.0


Under these unfavorable conditions, we do see low counts for some tweets. Despite this, the absolute worst case scenario still had 93% of tweets being seen at least 15 times. 

In [123]:
print('Correlation between average users per minute and share of tweet groups unassigned:', round(simulation_stats.avg_user_frequency.corr(simulation_stats.share_grps_unassigned),4))
print()
print('The average maximum number of times a tweet was seen across simulations was', simulation_stats.max_sampled.mean())
print('The average minimum number of times a tweet was seen across simulations was', simulation_stats.min_sampled.mean())
print('Across all simulations, no tweet was ever seen fewer than', simulation_stats.min_sampled.min(), 'times.')

Correlation between average users per minute and share of tweet groups unassigned: 0.9026

The average maximum number of times a tweet was seen across simulations was 33.98
The average minimum number of times a tweet was seen across simulations was 8.68
Across all simulations, no tweet was ever seen fewer than 6.0 times.


What if we have considerably fewer tweets?

In [125]:
simulation_stats = run_sim(n_simulations=50, n_users=2000, n_tweets=1801, tweets_per_person=50, avg_user_freq=20, avg_time=30, sd_time=5, acceptable_views=30)

print('The average maximum number of times a tweet was seen across simulations was', simulation_stats.max_sampled.mean())
print('The average minimum number of times a tweet was seen across simulations was', simulation_stats.min_sampled.mean())
print('Across all simulations, no tweet was ever seen fewer than', simulation_stats.min_sampled.min(), 'times.')
print()
print("Under perfect sampling, we'd expect each tweet to be seen an average of", round((2000*50)/1801,2), 'times')

The average maximum number of times a tweet was seen across simulations was 76.56
The average minimum number of times a tweet was seen across simulations was 36.44
Across all simulations, no tweet was ever seen fewer than 29.0 times.

Under perfect sampling, we'd expect each tweet to be seen an average of 55.52 times


If we have fewer users?

In [128]:
simulation_stats = run_sim(n_simulations=50, n_users=1209, n_tweets=5001, tweets_per_person=50, avg_user_freq=20, avg_time=30, sd_time=5, acceptable_views=15)
simulation_stats

Unnamed: 0,avg_user_frequency,avg_time,sd_time,max_sampled,min_sampled,avg_sampled,share_one_duplicate,share_two_duplicates,share_more_duplicates,share_grps_unassigned,n_undersampled
0,20.59307,28.743503,0.0,23.0,2.0,12.1,18.444996,3.722084,1.240695,29.693962,3962.0
1,24.130308,31.877707,5.225153,27.0,1.0,12.1,17.535153,5.955335,1.323408,33.498759,3854.0
2,15.92573,30.606181,4.939474,25.0,3.0,12.1,18.858561,4.218362,0.992556,30.272953,3948.0
3,23.346036,39.093299,4.046866,26.0,2.0,12.1,18.610422,5.128205,1.736973,34.656741,3851.0
4,23.202344,35.906188,1.970042,25.0,2.0,12.1,19.1067,5.210918,1.323408,33.664185,3870.0
5,20.727629,23.603934,4.9184,24.0,2.0,12.1,18.444996,4.631927,0.992556,30.686518,3965.0
6,19.603658,31.1806,8.539157,26.0,1.0,12.1,19.520265,4.71464,1.075269,32.42349,3867.0
7,25.252125,24.320531,6.494336,25.0,3.0,12.1,19.520265,4.301075,1.157982,31.761787,3937.0
8,7.805479,36.555064,2.095722,23.0,3.0,12.1,17.204301,2.646816,0.248139,23.242349,4183.0
9,18.032864,33.619536,0.0,25.0,2.0,12.1,19.023987,5.707196,0.496278,32.009926,3919.0
