In [1]:
import pandas
import numpy
from collections import Counter


def get_freq_dist(input_df, year):
    """Sum one year's name counts, grouped by the first letter of each name.

    Args:
        input_df: DataFrame with a "Name" column and a "<year> Count" column.
        year: Year used to build the "<year> Count" column name.

    Returns:
        A list of 26 ints: position 0 is the total for names starting with
        "A", position 25 the total for names starting with "Z".

    Raises:
        KeyError: if "Name" or the year's count column is missing.
        ValueError: if a count string is not an integer once commas are removed.
    """
    col_name = f"{year} Count"
    freq = [0] * 26
    letter_to_index = {chr(x + 65): x for x in range(26)}

    # Iterate over the values directly: per-row label lookups are slow and
    # return a Series instead of a scalar when the index is not unique.
    for name, raw_count in zip(input_df["Name"], input_df[col_name]):
        # Missing names arrive as NaN (a float); report and skip them.
        if not isinstance(name, str) or not name:
            print("ERROR: ", name)
            continue

        # Case-insensitive lookup; a first character outside A-Z is reported
        # and skipped rather than aborting the whole tally.
        index = letter_to_index.get(name[0].upper())
        if index is None:
            print("ERROR: ", name)
            continue

        if isinstance(raw_count, str):
            # "[x]" is the marker for a count that is not available.
            if raw_count != "[x]":
                freq[index] += int(raw_count.replace(',', ''))
        elif not pandas.isna(raw_count):
            # The column was parsed as numeric because it contained no
            # commas or "[x]" markers; use the value as-is.
            freq[index] += int(raw_count)

    return freq


def get_prob_dist(input_df, year):
    """Turn one year's first-letter totals into relative frequencies.

    Args:
        input_df: DataFrame passed straight through to ``get_freq_dist``.
        year: Year passed straight through to ``get_freq_dist``.

    Returns:
        ``(prob_dist, count)``: the per-letter totals divided by their sum,
        and that sum.

    Raises:
        ZeroDivisionError: if the totals for the year sum to zero.
    """
    letter_totals = get_freq_dist(input_df, year)
    count = sum(letter_totals)
    return [total / count for total in letter_totals], count


def multi_year_prob_data(input_df, years):
    """Pool first-letter totals over several years and normalise them.

    Args:
        input_df: DataFrame passed to ``get_freq_dist`` once per year.
        years: Iterable of years to accumulate.

    Returns:
        ``(prob_dist, count)``: the pooled per-letter totals divided by their
        sum, and that sum.

    Raises:
        ZeroDivisionError: if the pooled totals sum to zero (for example when
            ``years`` is empty).
    """
    pooled = [0] * 26
    for yr in years:
        yearly = get_freq_dist(input_df, yr)
        # Element-wise addition of this year's 26 totals to the running sums.
        pooled = [running + extra for running, extra in zip(pooled, yearly)]

    count = sum(pooled)
    return [total / count for total in pooled], count

def print_dist(dist):
    """Print the values of ``dist`` on one bracketed line, three decimals each."""
    formatted = [f"{value:.3f}" for value in dist]
    print("[" + " ".join(formatted) + "]")
    
    
# Load the two name-count tables; both CSV files are expected in the
# current working directory.
girlnames = pandas.read_csv("f-babynames1996to2021.csv")
boynames = pandas.read_csv("m-babynames1996to2021.csv")

# First-letter distribution pooled over 1996-2021 for each table; only the
# distribution (element 0 of the returned pair) is kept.
prob_distb = numpy.array(multi_year_prob_data(boynames, range(1996, 2022))[0])
prob_distg = numpy.array(multi_year_prob_data(girlnames, range(1996, 2022))[0])

# Equal-weight average of the two distributions.
total_dist = (prob_distb + prob_distg) / 2

print(total_dist)

[0.11496008 0.03309375 0.06283095 0.03468293 0.0747514  0.02670963
 0.02488444 0.04748531 0.02877411 0.08992809 0.03774962 0.07547558
 0.08621922 0.02522859 0.02882965 0.01771189 0.00054666 0.05018882
 0.06493722 0.03924377 0.0013474  0.00536945 0.01096349 0.00071069
 0.0041952  0.01318207]


In [42]:
# Target word whose letters are looked up in the first-letter distribution.
WORD = 'bermuda'
# Lowercase letter -> position in total_dist (0 for "a" ... 25 for "z").
letter_to_index = {chr(x+97):x for x in range(26)}

# Pair each letter of WORD with its first-letter probability, then order
# the pairs from most to least likely.
indexed = [(letter, total_dist[letter_to_index[letter]]) for letter in WORD]
indexed = sorted(indexed, key=lambda x: x[1], reverse=True)
inletters, inprobs = zip(*indexed)

# The original printed `inletter`, a name never defined in this notebook,
# which raises NameError on a fresh kernel.
print(inletters)
print(inprobs)



('a', 'm', 'e', 'r', 'd', 'b', 'u')
(0.11496007733872045, 0.08621921732623261, 0.07475139833647483, 0.05018881507866309, 0.034682927937531444, 0.033093753553561006, 0.0013474028563640237)


In [72]:
def test(state, depth):
    if all(x==1 for x in state):
        return
    #testcopy = state.copy()
    for i in reversed(range(len(state))):
        if state[i] == 0:
            state[i]=1
            test(state, depth+1)
            state[i]=0
            print(state, depth)
            


# Trace the visiting order for a three-slot state, starting at depth 0.
test(state=[0] * 3, depth=0)


[0, 1, 1] 2
[0, 0, 1] 1
[1, 0, 1] 2
[0, 0, 1] 1
[0, 0, 0] 0
[0, 1, 1] 2
[0, 1, 0] 1
[1, 1, 0] 2
[0, 1, 0] 1
[0, 0, 0] 0
[1, 0, 1] 2
[1, 0, 0] 1
[1, 1, 0] 2
[1, 0, 0] 1
[0, 0, 0] 0


In [87]:
def recursive(state, k, fld, cur=1):
    """Sum path weights over every order in which the 0 slots can be filled.

    For each slot ``i`` still at 0 (visited from the last index to the
    first) the branch weight is ``1 - (1 - fld[i]) ** k``. The weight passed
    down a branch is ``cur`` times that branch weight times, when ``k > 1``,
    the product of ``1 - weight`` for the slots already tried at this level.
    A path contributes its accumulated weight once every slot equals 1.

    NOTE(review): this appears to estimate the chance that ``k`` draws cover
    every letter; confirm the intended probability model with the author.

    Args:
        state: Mutable list of flags; modified in place and restored before
            returning.
        k: Exponent applied to ``1 - fld[i]``.
        fld: Per-slot probabilities, indexed like ``state``.
        cur: Weight accumulated on the path so far.

    Returns:
        ``cur`` if every slot is already 1, otherwise the sum of the values
        returned by all branches.
    """
    if not any(flag != 1 for flag in state):
        return cur

    earlier_missed = 1  # product of (1 - weight) for slots tried earlier
    accumulated = 0

    for pos in range(len(state) - 1, -1, -1):
        if state[pos] != 0:
            continue

        hit_prob = 1 - pow(1 - fld[pos], k)
        path_weight = cur * hit_prob * earlier_missed

        state[pos] = 1
        accumulated += recursive(state, k, fld, path_weight)
        state[pos] = 0

        if k > 1:
            earlier_missed *= (1 - hit_prob)

    return accumulated

# Evaluate the estimate for a spread of k values: 1-10, then coarser steps.
values = [*range(1, 11), 15, 20, 30, 40, 50, 100, 500]
for k in values:
    # A fresh all-zero state, one slot per letter of WORD.
    answer = recursive([0] * len(WORD), k, inprobs)
    print(f"{k: 5d} {answer:.4g}")

    1 2.898e-07
    2 1.525e-05
    3 0.000155
    4 0.0007026
    5 0.002063
    6 0.004632
    7 0.008683
    8 0.01432
    9 0.02149
   10 0.03003
   15 0.08467
   20 0.142
   30 0.2367
   40 0.3114
   50 0.3752
  100 0.6109
  500 0.9911
