In [None]:
# Standard library
import os
import pickle

# Numerical / statistics stack
import numpy as np
import scipy.stats as stats

# Plotting
import matplotlib.pyplot as plt

# Default font for the figures produced below
plt.rcParams['font.family'] = 'Serif'
plt.rcParams['font.serif'] = 'Liberation Serif'

In [None]:
# Zipfian(a, n) parameters; later cells read these names as well
a = 1.5
n = 25

# Support 1..n and the probability mass on each point
x = np.arange(1, n + 1)
pmf = stats.zipfian.pmf(x, a, n)
print(pmf)

# Bar chart of the pmf with a log-scaled y axis
fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(x, pmf, width=0.5, color="black", edgecolor='black', zorder=100)
ax.set_xlabel('Max # of Systems in an Interleaved Trace ')
ax.set_ylabel('Probability')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
# Dotted horizontal grid on both major and minor ticks
ax.yaxis.grid(True, which='both', linestyle=':', color='gray', alpha=0.7)
fig.tight_layout()

# Save into outputs/zipf (created on demand), then display
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/zipfian.pdf', format='pdf')
plt.show()

In [None]:
# Re-derive the Zipfian(a, n) pmf by hand:
#   z(k) = 1 / (k**a * H),  with  H = sum_{j=1..n} 1 / j**a
harm = sum(1 / k**a for k in range(1, n + 1))
zipf_self = [1 / ((k**a) * harm) for k in range(1, n + 1)]

print(zipf_self)

# Plot on an explicit figure/axes pair. The support is rebuilt from n here
# because the notebook-level `x` is rebound to other ranges by later cells,
# and tight_layout() must act on THIS figure rather than a figure object
# left over from an earlier cell.
support = np.arange(1, n + 1)
fig, ax = plt.subplots()
ax.bar(support, zipf_self)
ax.set_xlabel('x')
ax.set_ylabel('P(X=x)')
ax.set_yscale('log')
ax.set_title('Zipfian({},{})'.format(a, n))
fig.tight_layout()
plt.show()

print("exp(-2) = ", np.exp(-2))
# probability of zero cut trace, reported as z(1) * exp(-2); z(1) is taken
# from the pmf computed in this cell so no other cell's state is needed
print(f"Probability of zero cut trace = z(1)exp(-2)={zipf_self[0]*np.exp(-2)}, where z(x) is the pmf of Zipfian({a},{n})")

In [None]:
# Mean of the Zipfian(a, n) distribution
m = stats.zipfian.mean(a, n)
# Label is built from a and n so it cannot drift from the values actually used
print(f"Mean of Zipfian({a},{n}) = ", m)

# Twice the mean; presumably the average number of cuts, mirroring the
# Poisson(2 * sample) draw in the simulation cell -- TODO confirm
avg_c = 2*m

# Balls-in-bins estimate: throwing (avg_c + 1) items uniformly into m bins
# leaves each bin empty with probability (1 - 1/m)**(avg_c + 1)
print("Approx. Expected # of empty bins = ", m*(1 - 1/m)**(avg_c+1))

In [None]:
def approx_prob_of_leak(alpha, zipf_pmf, backstory_len=7, context_len=250):
    """Approximate the probability of a leak for a given ``alpha``.

    For every ``m`` from ``alpha + 1`` to ``len(zipf_pmf)`` the function adds

        P(Poisson(m) > 1 - m * ln((m - alpha) / m)) * zipf_pmf[m - 1]

    to a running total, prints a one-line summary and returns the total.

    Args:
        alpha: Integer threshold; only ``m > alpha`` contribute.
        zipf_pmf: Sequence where ``zipf_pmf[m - 1]`` is the probability
            weight for ``m`` (1-based support).
        backstory_len: Multiplied by ``alpha`` and subtracted from
            ``context_len`` in the printed summary only.
        context_len: Total number of context indices used in the printed
            summary (defaults to the previously hard-coded 250).

    Returns:
        The accumulated approximate probability (0 when ``alpha`` is at least
        ``len(zipf_pmf)``, because no term contributes).
    """
    # NOTE(review): the Poisson rate here is m, whereas the simulation cell
    # draws the number of cuts from Poisson(2 * m) -- confirm this is intended.
    total = 0
    for m in range(alpha + 1, len(zipf_pmf) + 1):
        lb = 1 - m*np.log((m - alpha)/m)
        # sf() is the survival function 1 - cdf, evaluated without the
        # cancellation error of an explicit subtraction on tiny tails
        total += stats.poisson.sf(lb, m) * zipf_pmf[m-1]

    print(f"Approx. prob. of leak for alpha = {alpha} is {total}\n {context_len - backstory_len*alpha} indices in the context left for data\n")
    return total

In [None]:
# Zipfian(a, n) pmf over its full support 1..n. The support is built
# explicitly instead of reading the notebook-level `x`, which other cells
# rebind to different ranges.
zipf_pmf = stats.zipfian.pmf(np.arange(1, n + 1), a, n)

# Print the approximate leak probability for alpha = 1..20
for alpha in range(1, 21):
    approx_prob_of_leak(alpha, zipf_pmf)

## frequencies of # of systems in a trace

In [None]:
# Number of simulated traces (1e7, stored as an int)
N = int(1e7)

# Per-trace accumulators appended to by the sampling loop:
#   cards         - number of distinct systems in each trace
#   Cs            - value of C drawn for each trace
#   sys_in_traces - the array of system indices drawn for each trace
cards, Cs, sys_in_traces = [], [], []


In [None]:
# Simulate N interleaved traces. For each trace:
#   sample ~ Zipfian(a, n)                    (max # of systems)
#   C      ~ Poisson(2 * sample)              (# of cuts)
#   C + 1 system indices drawn uniformly from {0, ..., sample - 1}
#   cardinality = number of distinct indices that were drawn
#
# A seeded generator makes the run reproducible.
rng = np.random.default_rng(0)

# Start from empty accumulators so that re-running this cell replaces the
# previous batch instead of appending a second one next to fresh `samples`.
cards, Cs, sys_in_traces = [], [], []

samples = stats.zipfian.rvs(a, n, size=int(N), random_state=rng)
# No per-sample printing: with N iterations it would flood the cell output.
for sample in samples:
    C = rng.poisson(2 * sample)
    Cs.append(C)
    # high bound is exclusive, so indices lie in 0 .. sample - 1
    sys_in_trace = rng.integers(0, sample, size=C + 1)
    sys_in_traces.append(sys_in_trace)
    # a set drops repeated indices; its size is the trace's cardinality
    cards.append(len(set(sys_in_trace)))

#save the samples, cards, Cs, and sys_in_traces to a file
output_data = {
    'samples': samples,
    'cards': cards,
    'Cs': Cs,
    'sys_in_traces': sys_in_traces
}
# Make sure the target directory exists even if no plotting cell ran first
os.makedirs('outputs/zipf', exist_ok=True)
path = f"outputs/zipf/zipf_frequency_metadata.pkl"
with open(path, 'wb') as f:
    pickle.dump(output_data, f)

In [None]:
# Read back the simulation results saved by the sampling cell.
# pickle.load can execute code embedded in the file, so only point this at
# files produced by this notebook.
path = f"outputs/zipf/zipf_frequency_metadata.pkl"
with open(path, 'rb') as f:
    data = pickle.load(f)

# Restore the four saved fields under their notebook-level names
samples, cards, Cs, sys_in_traces = (
    data[field] for field in ('samples', 'cards', 'Cs', 'sys_in_traces')
)

In [None]:
# Quick look at the loaded data: number of traces and the first ten cardinalities
first_ten_cards = cards[:10]
print("len(cards) = ", len(cards))
print("cards[0:10] = ", first_ten_cards)

In [None]:
# compute the frequency of each cardinality
unique, counts = np.unique(cards, return_counts=True)
counts = counts / N  # normalize counts to get frequency
# create a dictionary of cardinality and frequency
cardinality_freq = dict(zip(unique, counts))

#plot the frequency of each cardinality
fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(cardinality_freq.keys(), cardinality_freq.values(), color="black", width=0.5, edgecolor='black', zorder=100)
ax.set_xlabel('# of Unique Systems in an Interleaved Trace')
# set Nsci to N in scientific notation with 0 decimal places
Nsci = '{:.0e}'.format(N)
ax.set_ylabel(f'Frequency after {Nsci} trials')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
#add y axis grid lines
ax.yaxis.grid(True, linestyle=':', which='both', color='gray', alpha=0.7)
fig.tight_layout()
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/unique_systems_dist.pdf', format='pdf')
plt.show()

In [None]:
# Colours for the two-series comparison plots (color1 / color2 are read by
# a later cell as well)
paired_colors = plt.get_cmap('Paired')
start_col = 0
color1, color2 = "black", "silver"

# Analytic Zipfian(a, n) pmf on its support 1..n
x = np.arange(1, n + 1)
pmf = stats.zipfian.pmf(x, a, n)
print(pmf)

# Observed cardinalities as an array so the bars can be shifted sideways
card_freq_keys = np.array(list(cardinality_freq.keys()))

# Grouped bars: analytic pmf shifted left by 0.2, empirical frequency shifted right by 0.2
fig, ax = plt.subplots(figsize=(7, 3.5))
ax.bar(x - 0.2, pmf, width=0.4, color=color1, edgecolor='black', zorder=100, label='Prob. of Max # of Systems in Trace')
ax.bar(card_freq_keys + 0.2, cardinality_freq.values(), width=0.4, color=color2, edgecolor='black', zorder=100, label='Freq. of # of Unique Systems in Trace')
ax.set_xlabel('# of Systems')
ax.set_ylabel('Probability/Frequency')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
# Dotted horizontal grid on both major and minor ticks
ax.yaxis.grid(True, which='both', linestyle=':', color='gray', alpha=0.7)
ax.legend()
fig.tight_layout()

# Save into outputs/zipf (created on demand), then display
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/zipfian_and_unique_systems_pmf.pdf', format='pdf')
plt.show()

In [None]:
#compute the frequency of each value of C in Cs
unique_C, counts_C = np.unique(Cs, return_counts=True)
counts_C = counts_C / N  # normalize counts to get frequency
# create a dictionary of C and frequency
C_freq = dict(zip(unique_C, counts_C))

#plot the frequency of each value of C
fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(C_freq.keys(), C_freq.values(), color="black", width=0.5, edgecolor='black', zorder=100)
ax.set_xlabel('# of Cuts in an Interleaved Trace')
ax.set_ylabel(f'Frequency after {Nsci} trials')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
#add y axis grid lines
ax.yaxis.grid(True, linestyle=':', which='both', color='gray', alpha=0.7)
fig.tight_layout()
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/num_cuts_dist.pdf', format='pdf')
plt.show()

In [None]:
#plot the ccdf of the zipfian distribution
fig, ax = plt.subplots(figsize=(5, 3))

a = 1.5
n = 25

x = np.arange(1, n+1)
cdf = stats.zipfian._cdf(x,a,n)
ccdf = 1 - cdf

ccdf = ccdf + pmf
print(ccdf)
ax.bar(x, ccdf, color="black", width=0.5, edgecolor='black', zorder=100)
ax.set_xlabel('x (Max # of Systems in an Interleaved Trace)')
ax.set_ylabel('Prob. of at least x')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
#set minor y ticks in log scale
# ax.set_yticks([1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1])
#add y axis grid lines
ax.yaxis.grid(True, linestyle=':', which='both', color='gray', alpha=0.7)
fig.tight_layout()
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/zipfian_ccdf.pdf', format='pdf')
plt.show()




In [None]:
#plot the ccdf of the cardinality distribution
fig, ax = plt.subplots(figsize=(5, 3))
x = np.arange(1, max(cardinality_freq.keys()) + 1)
ccdf_cards = 1 - np.array([cardinality_freq.get(i, 0) for i in x]).cumsum()
ccdf_cards += np.array([cardinality_freq.get(i, 0) for i in x])
print(ccdf_cards)
ax.bar(x, ccdf_cards, color="black", width=0.5, edgecolor='black', zorder=100)
ax.set_xlabel('x (# of Unique Systems in an Interleaved Trace)')
ax.set_ylabel(f'Freq. of at least x after {Nsci} trials')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
ax.yaxis.grid(True, linestyle=':', which='both', color='gray', alpha=0.7)
fig.tight_layout()
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/unique_systems_ccdf.pdf', format='pdf')
plt.show()

In [None]:
#plot the ccdf of the zipfian pmf and the cardinality distribution on the same plot
fig, ax = plt.subplots(figsize=(5, 3))
x1 = np.arange(1, n+1)
cdf1 = stats.zipfian._cdf(x1,a,n)
ccdf1 = 1 - cdf1
ccdf1 = ccdf1 + pmf
# ax.bar(x1 - 0.2, ccdf1, color=color1, width=0.4, edgecolor='black', label='Prob. of Max # of Systems in Trace', zorder=100)
ax.plot(x1 - 0.02, ccdf1, color=color1, marker='o', markersize=5, label='Prob. of Max # of Systems in Trace', zorder=100)
x2 = np.arange(1, max(cardinality_freq.keys()) + 1)
ccdf2 = 1 - np.array([cardinality_freq.get(i, 0) for i in x2]).cumsum()
ccdf2 += np.array([cardinality_freq.get(i, 0) for i in x2])
# ax.bar(x2 + 0.2, ccdf2, color=color2, width=0.4, edgecolor='black', zorder=100, label='Freq. of # of Unique Systems in Trace')
ax.plot(x2 + 0.02, ccdf2, color=color2, marker='o', markersize=5, zorder=100, label='Freq. of # of Unique Systems in Trace', markeredgecolor='dimgray', markeredgewidth=0.5)
ax.set_xlabel('x (# of Systems)')
ax.set_ylabel('Prob./Freq. of at least x Systems')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
ax.yaxis.grid(True, linestyle=':', which='both', color='gray', alpha=0.7)
ax.legend()
fig.tight_layout()
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/zipfian_and_unique_systems_ccdf.pdf', format='pdf')
plt.show()

In [None]:
# compute the ccdf of C
fig, ax = plt.subplots(figsize=(5, 3))
x_C = np.arange(0, max(C_freq.keys()) + 1)
#reverse x_C to start from the maximum value
rev_x_C = x_C[::-1]
print("rev_x_C = ", rev_x_C)
print("sum of C_freq = ", sum(C_freq.values()))
print("C_freq = ", C_freq)
ccdf_C = np.array([C_freq.get(i, 0) for i in rev_x_C]).cumsum()[::-1]
print("ccdf_C = ", ccdf_C)

# ccdf_C += np.array([C_freq.get(i, 0) for i in x_C], dtype=float)
# print(ccdf_C)
ax.plot(x_C, ccdf_C, color="black", zorder=100, linewidth=2, marker='o', markersize=2)
ax.set_xlabel('x (# of Cuts in an Interleaved Trace)')
ax.set_ylabel(f'Freq. of at least x after {Nsci} trials')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
#add y axis grid lines
ax.yaxis.grid(True, linestyle=':', which='both', color='gray', alpha=0.7)
fig.tight_layout()
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/num_cuts_ccdf.pdf', format='pdf')
plt.show()

In [None]:
#poisson pmf with parameter lam
lam = 2
x = np.arange(0, max(Cs)+1)
pmf = stats.poisson.pmf(x, lam)

zipf_pmf = stats.zipfian.pmf(np.arange(1, n+1), a, n)

prob_cuts = []
for c in x:
    prob_cut = 0
    for lam in range(1, n+1):
        prob_cut += stats.poisson.pmf(c, 2*lam) * zipf_pmf[lam-1]
    prob_cuts.append(prob_cut)

print(prob_cuts)
print("sum of prob_cuts = ", sum(prob_cuts))


In [None]:

# Bar chart of the analytic cut-count probabilities (prob_cuts over x),
# log-scaled y axis
fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(x, prob_cuts, width=0.5, color="black", edgecolor="black", zorder=100)
ax.set_xlabel('# of Cuts in an Interleaved Trace')
ax.set_ylabel('Probability')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
# Dotted horizontal grid on both major and minor ticks
ax.yaxis.grid(True, which='both', linestyle=':', color='gray', alpha=0.7)
fig.tight_layout()

# Save into outputs/zipf (created on demand), then display
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/poisson_num_cuts.pdf', format='pdf')
plt.show()

In [None]:
# plot the ccdf of the poisson num cuts distribution
fig, ax = plt.subplots(figsize=(5, 3))
x = np.arange(0, max(Cs)+1)
cdf = np.array(prob_cuts).cumsum()
ccdf = 1 - cdf + np.array(prob_cuts)
print(ccdf)
ax.plot(x, ccdf, color="black", marker='o', markersize=2, zorder=100, linewidth=2)
ax.set_xlabel('x (# of Cuts in an Interleaved Trace)')
ax.set_ylabel('Prob. of at least x')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
#add y axis grid lines
ax.yaxis.grid(True, linestyle=':', which='both', color='gray', alpha=
0.7)
fig.tight_layout()
# plt.title('Poisson({})'.format(lam))
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/poisson_num_cuts_ccdf.pdf', format='pdf')
plt.show()

In [None]:
# For every trace, find how many times each distinct system occurs in it,
# then tally those occurrence counts over all traces:
#   freq_of_freq[k]  = number of (trace, system) pairs where the system occurs k times
#   num_distinct_sys = total number of (trace, system) pairs
freq_of_freq = {}
num_distinct_sys = 0
for trace in sys_in_traces:
    unique, counts = np.unique(trace, return_counts=True)
    num_distinct_sys += len(unique)
    for count in counts:
        # missing keys start from 0
        freq_of_freq[count] = freq_of_freq.get(count, 0) + 1


In [None]:
#normalize freq_of_freq
norm_freq_of_freq = {}
for key in freq_of_freq:
    norm_freq_of_freq[key] = freq_of_freq[key] / num_distinct_sys


In [None]:
#sum the values of freq_of_freq
print(num_distinct_sys)
sum_freq_of_freq = sum(freq_of_freq.values())
print("sum of freq_of_freq = ", sum_freq_of_freq)
print("sum of norm_freq_of_freq = ", sum(norm_freq_of_freq.values()))

In [None]:
#make a bar plot of freq_of_freq
fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(norm_freq_of_freq.keys(), norm_freq_of_freq.values(), color="black", width=0.5, edgecolor='black', zorder=100)
ax.set_xlabel('# of Times a System Appears in a Trace')
ax.set_ylabel(f'Frequency per System Appearance')
ax.set_yscale('log')
ax.set_ylim(6e-8, 1.1)
#add y axis grid lines
ax.yaxis.grid(True, linestyle=':', which='both', color='gray', alpha=0.7)
fig.tight_layout()
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/freq_of_sys_appears.pdf', format='pdf')
plt.show()

In [None]:
# for each list in sys_in_traces, count the number of unique systems that are present for every new system in the trace
unique_sys_over_time_counts = {}
trace_count = 0
num_sys_total = 0
for trace in sys_in_traces:
    num_sys_total += len(trace)
    seen = set()
    for i, sys in enumerate(trace):
        # if i == 0:
        #     print("\n\n")
        if sys not in seen:
            count = 0
        else:
            count = len(seen) 

        if count in unique_sys_over_time_counts:
            unique_sys_over_time_counts[count] += 1
        else:
            unique_sys_over_time_counts[count] = 1

        seen.add(sys) # add the system to the set
        # print("i = ", i, " sys = ", sys, " seen = ", seen, " count = ", count)
    trace_count += 1

In [None]:
# Consistency check: every trace position lands in exactly one bucket, so the
# two totals printed below should agree.
bucket_total = sum(unique_sys_over_time_counts.values())
print("num_sys_total = ", num_sys_total)
print("sum of unique_sys_over_time_counts = ", bucket_total)

In [None]:
#normalize unique_sys_over_time_counts
norm_unique_sys_over_time_counts = {}
for key in unique_sys_over_time_counts:
    norm_unique_sys_over_time_counts[key] = unique_sys_over_time_counts[key] / num_sys_total
    

In [None]:
#plot the normalized unique_sys_over_time_counts
fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(norm_unique_sys_over_time_counts.keys(), norm_unique_sys_over_time_counts.values(), color="black", width=0.5, edgecolor='black', zorder=100)
ax.set_xlabel('# of System to Choose Between for Recall')
ax.set_ylabel(f'Frequency per # of Systems Chosen')
ax.set_yscale('log')
ax.set_ylim(1e-7, 1.1)
#add y axis grid lines
ax.yaxis.grid(True, linestyle=':', which='both', color='gray', alpha=0.7)
fig.tight_layout()
os.makedirs('outputs/zipf', exist_ok=True)
fig.savefig('outputs/zipf/sys_choice_recall.pdf', format='pdf')
plt.show()