I will create a code able to read several books in .pdf format and calculate the amount of unique words and their frequencies by language.

In [1]:
import os
import pandas as pd

# specify the folder's directory where the book files are located
book_dir = './Books'

# create two empty Dataframes to later store the info collected from every file
count_result = pd.DataFrame(columns=['lang', 'author', 'book_title', 'words', 'freq'])
stat_result = pd.DataFrame(columns=['lang', 'author', 'book_title', 'uniq_words', 'total_words'])

# iterate and read every file by language, author, and title
for language in os.listdir(book_dir):
    for author in os.listdir(book_dir + '/' + language):
        for title in os.listdir(book_dir + '/' + language + '/' + author):
            
            # this is the resulting path...
            title_path = book_dir + '/' + language + '/' + author + '/' + title
            
            # now it will read on every file
            with open(title_path, 'r', encoding='utf8') as current_file:
                text = current_file.read()
                
                # the following lines clean the book's content for the further analysis
                text = text.replace('\n', ' ').replace('\r', ' ') # remove the backspaces
                text = text.lower()    # turn every letter into lower case
                
                # remove the most common symbols, marks, and numbers
                skip_list = [',', '.', ':', ';', '¿', '?', '¡', '!', '#' '"', "'", '-', '(', ')', '{', '}',
                            '1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
                for ch in skip_list:
                    text = text.replace(ch, '')
                
                # create a temporary dataframe for every book title to store and isolate the stats collected
                temp_df = pd.DataFrame(columns=['lang', 'author', 'book_title', 'words', 'freq'])
                
                # this loop will count the frequency for every unique word
                for word in text.split(' '):
                    if word in temp_df['words'].values:
                        temp_df.loc[temp_df.words == word, 'freq'] += 1
                    else:
                        temp_df.loc[len(temp_df)] = language, author, title.replace('.txt', ''), word, 1
                
                temp_df = temp_df.drop(temp_df.index[0]) # remove the counted empty spaces
                temp_df = temp_df.sort_values('freq', ascending=False) # sort the dataframe in descending order
                
                # collect the data from the current file before moving to the next one
                stat_result.loc[len(stat_result)] = language, author, title.replace('.txt', ''), len(temp_df), sum(temp_df['freq'].values)
            
            # this will save and accumulate the info collected from the temporary dataframe into a different table
            count_result = pd.concat([count_result, temp_df], axis=0, ignore_index=True)

print('\n-----------------------------')
print(stat_result)
print('\n-----------------------------')
print(count_result)


                      words  freq
0                   ﻿hamlet     1
1                            3613
2                     drama     8
3                        em   237
4                     cinco     5
5                     actos    14
6                   william     2
7               shakespeare     2
8                    hamlet   495
9                 traducção     1
10               portugueza     1
11                  segunda     6
12                   edição     1
13                   lisboa     1
14                 imprensa     1
15                 nacional     1
16           interlocutores     1
17               claudiorei     1
18                       de   752
19                dinamarca    29
20              hamletfilho     1
21                       do   302
22                  defunto     5
23                      rei   239
24                        e   888
25                 sobrinho     7
26                 reinante     1
27         poloniocamareiro     1
28            