In [18]:
from sonnet_processing import read_sonnet_file, preprocess_sonnet, get_sonnet_phoneme_dict, get_cmu_simvecs_embedding, create_cmu_mean_simvecs_embedding_df, create_phoneme_embedding_df
import json
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

In [19]:
def create_dist_output(poet_list, counts_or_embedding):
    """Write the selected poets' sonnets and their pairwise distances to JSON.

    Two files are written using relative paths:

    * ``sonnets[_<poet>...].json`` -- the list of sonnets accumulated from
      ``read_sonnet_file``.
    * ``sonnet_dists_<counts_or_embedding>[_<poet>...].json`` -- the pairwise
      distance matrix between sonnets, as a nested list.

    Poet names appear in both file names in the order given in ``poet_list``.
    The sonnet files themselves are always read in the fixed order
    Shakespeare, Spenser, Sidney, regardless of the order in ``poet_list``.
    The distance-matrix file name is printed before any work is done.

    Parameters
    ----------
    poet_list : list of str
        Poets to include. Only the lowercase names ``"shakespeare"``,
        ``"spenser"`` and ``"sidney"`` cause a sonnet file to be read; any
        other entry only ends up in the output file names.
    counts_or_embedding : str
        ``"counts"``: cosine distances over the normalized phoneme-count
        matrix from ``create_phoneme_embedding_df`` (called with lexical
        stress disabled).
        ``"embedding"``: euclidean distances over the matrix from
        ``create_cmu_mean_simvecs_embedding_df``.

    Raises
    ------
    ValueError
        If ``counts_or_embedding`` is neither ``"counts"`` nor
        ``"embedding"``. Raised before anything is printed or written.
    """
    # Validate first: previously an unknown mode fell through both branches
    # and crashed on an unbound `dists` after the sonnets file was written.
    if counts_or_embedding not in ("counts", "embedding"):
        raise ValueError(
            "counts_or_embedding must be 'counts' or 'embedding', got "
            + repr(counts_or_embedding)
        )

    # Both output names share the same "_poet_poet..." suffix.
    poet_suffix = "".join("_" + poet for poet in poet_list)
    dists_filename = "sonnet_dists_" + counts_or_embedding + poet_suffix + ".json"
    sonnet_text_filename = "sonnets" + poet_suffix + ".json"
    print(dists_filename)

    # (key expected in poet_list, source text file, label passed to read_sonnet_file)
    # Trailing numbers are the original author's notes: 154 / 89 / 108.
    poet_sources = (
        ("shakespeare", "text/sonnets.txt", "Shakespeare"),       # 154
        ("spenser", "text/spenser_sonnets.txt", "Spenser"),       # 89
        ("sidney", "text/sidney_sonnets.txt", "Sidney"),          # 108
    )

    sonnets_list = []
    curr_sonnet_idx = 0
    for poet_key, text_path, poet_label in poet_sources:
        if poet_key in poet_list:
            # read_sonnet_file hands back the extended list and the next index.
            sonnets_list, curr_sonnet_idx = read_sonnet_file(
                text_path, poet_label, sonnets_list, curr_sonnet_idx
            )

    with open(sonnet_text_filename, 'w') as outfile:
        json.dump(sonnets_list, outfile)

    preprocessed_sonnets_list = [
        preprocess_sonnet(sonnet["text"]) for sonnet in sonnets_list
    ]

    if counts_or_embedding == "counts":
        with_lexical_stress = False
        _, normalized_phoneme_count_mtx, _ = create_phoneme_embedding_df(
            preprocessed_sonnets_list, with_lexical_stress
        )
        dists = cosine_distances(normalized_phoneme_count_mtx)
    else:  # "embedding" -- guaranteed by the validation above
        _, cmu_embeddings_as_mtx, _ = create_cmu_mean_simvecs_embedding_df(
            preprocessed_sonnets_list
        )
        dists = euclidean_distances(cmu_embeddings_as_mtx)

    # The distance array is not JSON-serializable as-is; convert to nested lists.
    dists_as_lists = dists.tolist()

    with open(dists_filename, "w") as f:
        json.dump(dists_as_lists, f)
        

In [20]:
# Every poet combination to export: all three together, each poet alone,
# then each pair.
poet_list_of_lists = [
    ["shakespeare", "spenser", "sidney"],
    ["shakespeare"],
    ["spenser"],
    ["sidney"],
    ["shakespeare", "spenser"],
    ["shakespeare", "sidney"],
    ["spenser", "sidney"],
]

# For each combination, write the phoneme-count distances first and the
# embedding distances second.
for poet_list in poet_list_of_lists:
    for counts_or_embedding in ("counts", "embedding"):
        create_dist_output(poet_list, counts_or_embedding)

sonnet_dists_counts_shakespeare_spenser_sidney.json
sonnet_dists_embedding_shakespeare_spenser_sidney.json
sonnet_dists_counts_shakespeare.json
sonnet_dists_embedding_shakespeare.json
sonnet_dists_counts_spenser.json
sonnet_dists_embedding_spenser.json
sonnet_dists_counts_sidney.json
sonnet_dists_embedding_sidney.json
sonnet_dists_counts_shakespeare_spenser.json
sonnet_dists_embedding_shakespeare_spenser.json
sonnet_dists_counts_shakespeare_sidney.json
sonnet_dists_embedding_shakespeare_sidney.json
sonnet_dists_counts_spenser_sidney.json
sonnet_dists_embedding_spenser_sidney.json
