In [7]:
import numpy as np
from typing import Sequence

# This function estimates the probability of having more than n rainy days with a Monte Carlo simulation: it simulates many sequences of days from the per-day rain probabilities and returns the fraction of sequences in which the number of rainy days is greater than n.
def prob_rain_more_than_n(p: Sequence[float], n: int, num_simulations: int = 10000, seed=None) -> float:
    """Estimate the probability of more than ``n`` rainy days by simulation.

    Each day ``i`` is simulated as an independent 0/1 draw that is rainy with
    probability ``p[i]``. ``num_simulations`` complete sequences of days are
    drawn and the fraction that contain strictly more than ``n`` rainy days is
    returned, so the result is a Monte Carlo estimate rather than an exact
    value.

    Args:
        p: Per-day rain probabilities, one entry per day.
        n: Threshold; only sequences with strictly more than ``n`` rainy days
            are counted.
        num_simulations: Number of simulated sequences. Must be positive.
        seed: Optional seed. When given, a dedicated
            ``np.random.default_rng(seed)`` generator is used so the estimate
            is reproducible. When ``None`` (the default) the global
            ``np.random`` state is used.

    Returns:
        The estimated probability as a plain Python ``float``.

    Raises:
        ValueError: If ``num_simulations`` is not positive.
    """
    if num_simulations <= 0:
        # With no simulated sequences np.mean would return nan and only warn.
        raise ValueError("num_simulations must be a positive integer")

    probs = np.asarray(p, dtype=float)
    rng = np.random if seed is None else np.random.default_rng(seed)

    # One row per simulated sequence, one 0/1 column per day.
    draws = rng.binomial(n=1, p=probs, size=(num_simulations, len(probs)))
    rainy_days_counts = draws.sum(axis=1)

    # Fraction of simulated sequences that exceed the threshold.
    return float(np.mean(rainy_days_counts > n))
# Example usage: every one of the 365 days is given the same 0.3 probability of rain,
# and we estimate the probability of seeing more than 100 rainy days.
p = [0.3] * 365  
n = 100
print(prob_rain_more_than_n(p, n))

0.8485


In [13]:
from typing import List, Tuple, Dict, Sequence

# This function converts the pronunciation dictionary into a dictionary where the keys are tuples of phonemes and the values are lists of words that match those phonemes.
def preprocess_dictionary(pronunciation_dict: List[str]) -> Dict[Tuple[str, ...], List[str]]:
    """Index pronunciation entries by their phoneme sequence.

    Each entry is a whitespace-separated string whose first token is the word
    and whose remaining tokens are its phonemes, e.g. ``"BOOK B UH K"``.

    Args:
        pronunciation_dict: The pronunciation entries to index.

    Returns:
        A dict mapping each phoneme tuple to the list of words that have that
        pronunciation, in the order the words appear in the input. Blank or
        whitespace-only entries are skipped.
    """
    phoneme_to_words: Dict[Tuple[str, ...], List[str]] = {}
    for entry in pronunciation_dict:
        parts = entry.split()
        if not parts:
            # Blank entry: there is no word to index.
            continue
        word, *phonemes = parts
        phoneme_to_words.setdefault(tuple(phonemes), []).append(word)
    return phoneme_to_words

# Function to find word combinations given a sequence of phonemes
def find_word_combos_with_pronunciation(phonemes: Sequence[str], dictionary=None) -> List[List[str]]:
    """Find every way to read a phoneme sequence as a sequence of words.

    Args:
        phonemes: The phoneme sequence to split into words.
        dictionary: Optional list of pronunciation entries in the format
            accepted by ``preprocess_dictionary``. When ``None`` (the default)
            the module-level ``pronunciation_dict`` is used, which must be
            defined before the call.

    Returns:
        A list of word lists. Each word list, read in order, has pronunciations
        that concatenate to exactly ``phonemes``. An empty phoneme sequence
        yields ``[[]]``; a sequence that cannot be split yields ``[]``.
    """
    entries = pronunciation_dict if dictionary is None else dictionary
    phoneme_to_words = preprocess_dictionary(entries)

    sequence = tuple(phonemes)
    total = len(sequence)
    # No slice longer than the longest known pronunciation can ever match.
    longest = max((len(key) for key in phoneme_to_words), default=0)
    result: List[List[str]] = []

    # Recursive backtracking: try every dictionary pronunciation that starts at
    # `start`, append each matching word to `path`, and recurse on the rest.
    def backtrack(start: int, path: List[str]) -> None:
        if start == total:
            # The whole sequence is consumed; record a copy of the current path.
            result.append(path[:])
            return
        last_end = min(total, start + longest)
        for end in range(start + 1, last_end + 1):
            words = phoneme_to_words.get(sequence[start:end])
            if not words:
                continue
            for word in words:
                path.append(word)
                backtrack(end, path)
                path.pop()

    backtrack(0, [])
    return result

# Example pronunciation dictionary: each entry is a word followed by its
# space-separated phonemes. THEIR and THERE share one pronunciation, and TOMATO
# is listed twice with two different pronunciations.
# find_word_combos_with_pronunciation reads this module-level name when it is
# called, so it must be defined before the example call below.
pronunciation_dict = [
    "ABACUS AE B AH K AH S",
    "BOOK B UH K",
    "THEIR DH EH R",
    "THERE DH EH R",
    "TOMATO T AH M AA T OW",
    "TOMATO T AH M EY T OW"
]

# Example usage: the sequence is the THEIR/THERE pronunciation repeated twice,
# so every pairing of those two words is a valid combination.
phoneme_sequence = ["DH", "EH", "R", "DH", "EH", "R"]
combinations = find_word_combos_with_pronunciation(phoneme_sequence)
print(combinations)


[['THEIR', 'THEIR'], ['THEIR', 'THERE'], ['THERE', 'THEIR'], ['THERE', 'THERE']]


In [None]:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdint.h>

/* Size of the word buffer in a WordCount, including the terminating NUL. */
#define MAX_WORD_LEN 100
/* Maximum number of distinct words that find_frequent_words keeps track of. */
#define MAX_WORDS 10000

/* One distinct word together with the number of times it has been seen. */
typedef struct {
    char word[MAX_WORD_LEN];  /* NUL-terminated word text */
    int count;                /* number of occurrences counted so far */
} WordCount;

/*
 * Converts the NUL-terminated string `str` to lower case in place.
 * Each character is passed to tolower() as an unsigned char so that
 * characters with negative char values are handled safely.
 */
void to_lower_case(char *str) {
    char *cursor = str;
    while (*cursor) {
        *cursor = tolower((unsigned char) *cursor);
        cursor++;
    }
}

int compare_counts(const void *a, const void *b) {
    return ((WordCount *)b)->count - ((WordCount *)a)->count;
}

char **find_frequent_words(const char *path, int32_t n) {
    FILE *file = fopen(path, "r");
    if (!file) {
        fprintf(stderr, "Error opening file.\n");
        return NULL;
    }

    WordCount word_counts[MAX_WORDS];
    int word_count_size = 0;

    char buffer[1024];
    while (fgets(buffer, sizeof(buffer), file)) {
        char *token = strtok(buffer, " \n\t\r.,;:!?()[]\"");
        while (token) {
            to_lower_case(token);

            int found = 0;
            for (int i = 0; i < word_count_size; i++) {
                if (strcmp(word_counts[i].word, token) == 0) {
                    word_counts[i].count++;
                    found = 1;
                    break;
                }
            }

            if (!found && word_count_size < MAX_WORDS) {
                strncpy(word_counts[word_count_size].word, token, MAX_WORD_LEN - 1);
                word_counts[word_count_size].word[MAX_WORD_LEN - 1] = '\0';
                word_counts[word_count_size].count = 1;
                word_count_size++;
            }

            token = strtok(NULL, " \n\t\r.,;:!?()[]\"");
        }
    }

    fclose(file);

    qsort(word_counts, word_count_size, sizeof(WordCount), compare_counts);

    char **result = malloc(n * sizeof(char *));
    for (int i = 0; i < n && i < word_count_size; i++) {
        result[i] = strdup(word_counts[i].word);
    }

    return result;
}


In [23]:
import re
from collections import Counter

def find_frequent_words(path: str, n: int, encoding=None) -> list:
    """Return the ``n`` most frequent words in a text file.

    The text is lower-cased and split into words with the regular expression
    ``\\b\\w+\\b``, so punctuation separates words and an apostrophe splits a
    contraction into two words.

    Args:
        path: Path of the text file to read.
        n: Number of words to return.
        encoding: Optional text encoding passed to ``open``. ``None`` (the
            default) keeps the platform default encoding.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least frequent,
        as produced by ``collections.Counter.most_common``.
    """
    find_words = re.compile(r'\b\w+\b').findall
    word_counts = Counter()

    # Count line by line so the whole file never has to be held in memory;
    # a word cannot span a line break, so the counts are the same.
    with open(path, 'r', encoding=encoding) as file:
        for line in file:
            word_counts.update(find_words(line.lower()))

    return word_counts.most_common(n)

# Example usage: print the 10 most frequent words of shakespeare.txt, one
# "word: count" pair per line. The path is relative, so the file is looked up
# in the current working directory.
path = 'shakespeare.txt'
n = 10
frequent_words = find_frequent_words(path, n)
for word, count in frequent_words:
    print(f'{word}: {count}')


the: 6287
and: 5690
i: 5111
to: 4934
of: 3760
you: 3211
my: 3120
a: 3018
that: 2664
in: 2403
