In [10]:
import glob
import string
from collections import defaultdict, Counter

from style.constants import FILE_PATH_BOOK_DS
from style.dataset.reader import DatasetReader

In [2]:
def read_books(container_path='/Users/kerem/playground/kerem-side-projects-monorepo/style-resources/datasets/book_ds'):
    files = glob.glob(f"{container_path}/*/*.txt", recursive=True)
    authors_books_map = defaultdict(list)

    for file in files:
        file_split = file.split('/')
        book_id = file_split[-1]
        author_name = file_split[-2]
        with open(file) as f:
            authors_books_map[author_name].append((book_id, f.read()))
    return authors_books_map

In [3]:
authors = read_books()

In [4]:
def get_number_of_authors(authors_book_map: dict):
    return len(authors_book_map)

In [5]:
def get_number_of_books(authors_book_map: dict):
    return sum([len(books) for books in authors_book_map.values()])

get_number_of_books(authors)    

2184

In [6]:
def get_book_distribution_by_author(authors_book_map: dict):
    return sorted([(k, len(v)) for k, v in authors_book_map.items()], key=lambda tup: tup[1], reverse=True)    

get_book_distribution_by_author(authors)

[('mark_twain', 165),
 ('william_shakespeare', 163),
 ('honoré_de_balzac', 115),
 ('charles_dickens', 97),
 ('arthur_conan_doyle', 94),
 ('nathaniel_hawthorne', 83),
 ('h._g._(herbert_george)_wells', 77),
 ('andrew_lang', 61),
 ('jack_london', 52),
 ('walter_scott', 51),
 ('joseph_conrad', 51),
 ('daniel_defoe', 50),
 ('jules_verne', 49),
 ('rudyard_kipling', 47),
 ('thomas_hardy', 46),
 ('edith_wharton', 45),
 ('g._k._(gilbert_keith)_chesterton', 44),
 ('émile_zola', 44),
 ('bernard_shaw', 42),
 ('p._g._(pelham_grenville)_wodehouse', 41),
 ('tolstoy', 41),
 ('louisa_may_alcott', 38),
 ('edgar_rice_burroughs', 31),
 ('guy_de_maupassant', 31),
 ('frances_hodgson_burnett', 30),
 ('l._frank_(lyman_frank)_baum', 27),
 ('plato', 26),
 ('anton_pavlovich_chekhov', 24),
 ('robert_w._(robert_william)_chambers', 23),
 ('oscar_wilde', 23),
 ('beatrix_potter', 23),
 ('edgar_allan_poe', 20),
 ('herman_melville', 20),
 ('robert_louis_stevenson', 19),
 ('j._m._(james_matthew)_barrie', 19),
 ('henrik_

In [7]:
def get_average_number_of_books_by_author(authors_book_map):
    return get_number_of_books(authors_book_map) / get_number_of_authors(authors_book_map)

get_average_number_of_books_by_author(authors)

27.3

In [8]:
from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

In [9]:
def count_words_by_books(container_path='/Users/kerem/playground/kerem-side-projects-monorepo/style-resources/datasets/book_ds'):
    files = glob.glob(f"{container_path}/*/*.txt", recursive=True)
    author_books_word_counts = defaultdict(list)

    for file in files:
        file_split = file.split('/')
        book_id = file_split[-1]
        author_name = file_split[-2]
        with open(file) as f:
            author_books_word_counts[author_name].append((book_id, len(f.read().split())))
        
    return author_books_word_counts
        
            
            
count_words_by_books()

defaultdict(list,
            {'l._m._(lucy_maud)_montgomery': [('5341.txt', 43234),
              ('5340.txt', 73574),
              ('5342.txt', 90548),
              ('5343.txt', 84009),
              ('24873.txt', 66088),
              ('24875.txt', 49620),
              ('1354.txt', 69060),
              ('3796.txt', 106399),
              ('24874.txt', 76782),
              ('24876.txt', 93891),
              ('24877.txt', 90846),
              ('24878.txt', 110683)],
             'g._k._(gilbert_keith)_chesterton': [('1721.txt', 28039),
              ('11505.txt', 62909),
              ('1696.txt', 47119),
              ('35115.txt', 7041),
              ('1720.txt', 62776),
              ('2015.txt', 55611),
              ('1695.txt', 61240),
              ('12037.txt', 15472),
              ('62467.txt', 7048),
              ('1718.txt', 60807),
              ('45811.txt', 43459),
              ('1719.txt', 20515),
              ('63084.txt', 47403),
              ('12245.txt'

In [10]:
def remove_punctuations(words: str): 
    translator = str.maketrans('', '', string.punctuation)
    return words.translate(translator)

In [11]:
def number_of_unique_words(container_path='/Users/kerem/playground/kerem-side-projects-monorepo/style-resources/datasets/book_ds'):

    files = glob.glob(f"{container_path}/*/*.txt", recursive=True)
    unique_words = set()

    for file in files:
        with open(file) as f:
            words = remove_punctuations(f.read().lower()).split()
            unique_words.update(words)
    return len(unique_words), unique_words


length, words = number_of_unique_words()

In [12]:
take(10, words)

['themhaving',
 '27thbefore',
 'manettes',
 'wellsalted',
 'papermakingif',
 'paidthough',
 'choiceless',
 'flowerto',
 'tenpage',
 'goldin']

In [13]:
print(length)

899375


In [5]:
dataset = DatasetReader.load_files(
    FILE_PATH_BOOK_DS, n=1000)

In [12]:
documents_numbers_by_author = Counter(dataset.target)

In [None]:
documents_numbers_by_author.sort()

In [14]:
doc_num_sorted = {k: v for k, v in sorted(documents_numbers_by_author.items(), key=lambda item: item[1])}

In [15]:
doc_num_sorted

{'emily_dickinson': 35,
 'frederick_douglass': 45,
 'alexander_pope': 73,
 'franz_kafka': 113,
 'beatrix_potter': 117,
 'emily_brontë': 119,
 'kate_chopin': 123,
 'lewis_carroll': 125,
 'alexandre_dumas': 160,
 'jonathan_swift': 161,
 'byron': 184,
 'jane_austen': 189,
 'mary_hallock_foote': 232,
 'virginia_woolf': 355,
 'w._e._b._(william_edward_burghardt)_du_bois': 393,
 'niccolò_machiavelli': 395,
 'henry_david_thoreau': 407,
 'agatha_christie': 420,
 'f._scott_(francis_scott)_fitzgerald': 447,
 'james_joyce': 468,
 'nikolai_vasilevich_gogol': 483,
 'george_sand': 511,
 'stendhal': 513,
 'charlotte_brontë': 521,
 'charlotte_perkins_gilman': 543,
 'henrik_ibsen': 554,
 'sinclair_lewis': 558,
 'voltaire': 579,
 'ambrose_bierce': 597,
 'friedrich_wilhelm_nietzsche': 600,
 'edgar_rice_burroughs': 601,
 'bertrand_russell': 642,
 'oscar_wilde': 700,
 'alphonse_daudet': 735,
 'j._m._(james_matthew)_barrie': 785,
 'gustave_flaubert': 804,
 'dante_alighieri': 819,
 'bram_stoker': 864,
 'm