# Literature Processor
* Jacob Yousif

## Importing the libraries

In [3]:
import warnings
# No category or module filter is given, so every warning is suppressed from here on.
warnings.filterwarnings('ignore')

In [4]:
%%capture
!pip install chardet

In [5]:
import os
import re
import chardet
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder

## The path for the source of the literature

In [6]:
# Folder holding the book files, given relative to the current working directory.
directory = 'Literature/'

## Predefined functions

In [7]:
def extract_file(file_path):
    """Load a JSON document from disk and return the parsed object.

    Parameters
    ----------
    file_path : str
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's content.
    """
    # Pin the encoding: without it open() uses the platform's locale encoding,
    # which can garble non-ASCII text on systems where that is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [8]:
def count_words(text):
    """Return how many whitespace-separated tokens ``text`` contains.

    Values that ``pd.isna`` reports as missing count as zero words.
    """
    word_total = 0 if pd.isna(text) else len(text.split())
    return word_total

In [9]:
def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace disappears as well, because the text is
    split into tokens before being re-joined.
    """
    tokens = text.split()
    separator = " "
    return separator.join(tokens)

In [10]:
def filename_to_title(filename):
    """Turn a CamelCase book file name into a space-separated title.

    The name is split in front of every ASCII capital letter and the pieces
    are joined with single spaces, e.g. ``"AdamBede.txt"`` -> ``"Adam Bede"``.

    Parameters
    ----------
    filename : str
        File name, optionally ending in ``.txt``.

    Returns
    -------
    str
        The derived title; an empty string if nothing is left after the
        extension is removed.
    """
    # Strip the extension only when it really is a suffix; str.replace would
    # also delete a ".txt" that happens to occur inside the name.
    stem = filename[:-len(".txt")] if filename.endswith(".txt") else filename
    # First alternative: a capital plus everything up to the next capital.
    # Second alternative: a leading run with no capital at all (digits,
    # lowercase text), which would otherwise be silently discarded.
    chunks = re.findall(r"[A-Z][^A-Z]*|[^A-Z]+", stem)
    title = " ".join(
        # The first chunk is kept verbatim; later chunks are normalised with
        # str.capitalize().
        chunk if position == 0 else chunk.capitalize()
        for position, chunk in enumerate(chunks)
    )
    return title

In [11]:
def read_text_file(file_path):
    """Read a text file whose encoding is not known in advance.

    The raw bytes are handed to ``chardet`` to guess the encoding, then the
    file is re-opened in text mode with that encoding.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    str
        The decoded content of the file.

    Raises
    ------
    UnicodeDecodeError
        If the content cannot be decoded with the chosen encoding.
    """
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    # chardet reports None as the encoding when it cannot make a guess (for
    # example on empty input). Passing None to open() would silently select
    # the platform's locale encoding, so fall back to UTF-8 explicitly to keep
    # the result the same on every machine.
    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

## The processing

In [12]:
# Parsed content of books.json; the printed output shows a dict that maps each
# book file name to its author.
books_info = extract_file('Datasets/books.json')

# Dump the whole mapping for a visual check.
print(books_info)

{'AChristmasCarolInProse.txt': 'Charles Dickens', "AConnecticutYankeeinKingArthur'sCourt.txt": 'Mark Twain', 'AdamBede.txt': 'George Eliot', 'AdventuresOfHuckleberryFinn.txt': 'Mark Twain', 'AJournalOfImpressionsInBelgium.txt': 'May Sinclair', 'ALittleCountryGirl.txt': 'Susan Coolidge', 'AllCatsAreGray.txt': 'Andre Norton', 'AnneOfAvonlea.txt': 'L. M. Montgomery', 'AnneOfGreenGables.txt': 'L. M. Montgomery', 'AnneOfTheIsland.txt': 'L. M. Montgomery', 'ArgonautStories.txt': 'Jack London', 'ARoundDozen.txt': 'Susan Coolidge', 'BleakHouse.txt': 'Charles Dickens', 'Clover.txt': 'Susan Coolidge', 'Curious-IfTrue.txt': 'Elizabeth Gaskell', 'DanielDeronda.txt': 'George Eliot', 'DavidCopperfield.txt': 'Charles Dickens', 'DefenseMech.txt': 'Ray Bradbury', 'EightCousins.txt': 'Louisa May Alcott', 'EmilyOfNewMoon.txt': 'L. M. Montgomery', 'Emma.txt': 'Jane Austen', 'EthanFrome.txt': 'Edith Wharton', 'FelixHolt-TheRadical.txt': 'George Eliot', 'FlowerFables.txt': 'Louisa May Alcott', 'FuturiaFanta

In [13]:
# Names of the regular files found directly inside the source folder
# (sub-directories are filtered out).
files = [
    entry
    for entry in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, entry))
]

In [14]:
# Sort the list in place so that every later cell walks the files in the same order.
files.sort()

In [15]:
# Start from an empty frame; the following cells add its columns one at a time.
df = pd.DataFrame()

In [16]:
# One title per file, derived from the file name and kept in the order of `files`.
df['Book'] = list(map(filename_to_title, files))

In [17]:
# Read every book in the order of `files`. os.path.join builds the path the
# same way the directory listing does and does not depend on `directory`
# ending with a separator.
books = [read_text_file(os.path.join(directory, item)) for item in files]

In [18]:
# Attach the raw text of each book; `books` was built in the same order as `files`.
df['Text'] = books

In [19]:
# NOTE(review): this reads the same JSON file that was already loaded into
# `books_info` earlier in the notebook, so the call looks redundant.
books_info = extract_file('Datasets/books.json')

In [20]:
# Fail up front with the complete list of files that have no author entry,
# instead of stopping at the first missing key with an uninformative KeyError.
unmapped = [name for name in files if name not in books_info]
if unmapped:
    raise KeyError(f"No author entry in books_info for: {unmapped}")
# Look the author up by file name, keeping the order of `files`.
df['Author'] = [books_info[name] for name in files]

In [21]:
# Overwrite each text with its whitespace-normalised form (runs of whitespace become one space).
df['Text'] = df['Text'].apply(remove_whitespace)

In [22]:
# Size = number of whitespace-separated words in each book.
df['Size'] = df['Text'].apply(count_words)

In [23]:
# Total word count over all books.
_sum = df['Size'].sum()

print(f"Sum of all words in all books: {_sum} words.")

Sum of all words in all books: 10316742 words.


In [24]:
# Fit the encoder on the author names and store the resulting integer code per row.
# The mapping displayed further down shows the codes following the alphabetical
# order of the names.
label_encoder = LabelEncoder()
df['AuthorCode'] = label_encoder.fit_transform(df['Author'])

In [25]:
# Distinct author names, in order of first appearance in the frame.
unique_classes = df['Author'].unique()
# Number of distinct authors, i.e. the number of classes.
num_classes = len(unique_classes)

print(f"Unique Authors: {unique_classes}\n")
print(f"Number of unique classes: {num_classes}")

Unique Authors: ['Charles Dickens' 'Mark Twain' 'May Sinclair' 'Susan Coolidge'
 'George Eliot' 'Andre Norton' 'L. M. Montgomery' 'Jack London'
 'Elizabeth Gaskell' 'Ray Bradbury' 'Louisa May Alcott' 'Jane Austen'
 'Edith Wharton' 'Joseph Conrad' 'Isaac Asimov' 'Victor Hugo'
 'Agatha Christie' 'Ellen Glasgow']

Number of unique classes: 18


In [26]:
# Author name -> integer code. Each code is converted with int() so the mapping
# holds built-in ints and can be passed to json.dump even if pandas hands back
# NumPy integers. Keys keep the order in which authors first appear in `df`.
author_dict = {
    author: int(code)
    for author, code in zip(df['Author'], df['AuthorCode'])
}

In [27]:
# Serialise the author -> code mapping first, then write the JSON text in one go.
serialized_codes = json.dumps(author_dict)
with open('Datasets/author_codes.json', 'w') as json_file:
    json_file.write(serialized_codes)

In [28]:
# Rebuild the mapping with its entries ordered by ascending author code.
author_dict = {
    name: code
    for name, code in sorted(author_dict.items(), key=lambda pair: pair[1])
}

In [29]:
# Display the author -> code mapping.
author_dict

{'Agatha Christie': 0,
 'Andre Norton': 1,
 'Charles Dickens': 2,
 'Edith Wharton': 3,
 'Elizabeth Gaskell': 4,
 'Ellen Glasgow': 5,
 'George Eliot': 6,
 'Isaac Asimov': 7,
 'Jack London': 8,
 'Jane Austen': 9,
 'Joseph Conrad': 10,
 'L. M. Montgomery': 11,
 'Louisa May Alcott': 12,
 'Mark Twain': 13,
 'May Sinclair': 14,
 'Ray Bradbury': 15,
 'Susan Coolidge': 16,
 'Victor Hugo': 17}

In [30]:
# First .any() reduces each column, the second reduces across columns:
# the result is truthy if any cell in the frame is missing.
has_nan = df.isna().any().any()
print(f"DataFrame has NaN values: {has_nan}")

DataFrame has NaN values: False


In [31]:
# Inspect the titles derived from the file names.
df['Book']

0                             A Christmas Carol In Prose
1             A Connecticut Yankeein King Arthur's Court
2                    A Journal Of Impressions In Belgium
3                                  A Little Country Girl
4                                          A Round Dozen
                             ...                        
103                                  Wives And Daughters
104    Worlds Within Worlds- The Story Of Nuclear Ene...
105    Worlds Within Worlds- The Story Of Nuclear Ene...
106    Worlds Within Worlds- The Story Of Nuclear Ene...
107                                                Youth
Name: Book, Length: 108, dtype: object

In [32]:
# Display the assembled frame (the rendered output is abbreviated by pandas).
df

Unnamed: 0,Book,Text,Author,Size,AuthorCode
0,A Christmas Carol In Prose,STAVE I: MARLEY'S GHOST MARLEY was dead: to be...,Charles Dickens,28435,2
1,A Connecticut Yankeein King Arthur's Court,A WORD OF EXPLANATION It was in Warwick Castle...,Mark Twain,117491,13
2,A Journal Of Impressions In Belgium,"This is a ""Journal of Impressions,"" and it is ...",May Sinclair,68153,14
3,A Little Country Girl,"CHAPTER I. ON THE ""EOLUS."" IT was on one of th...",Susan Coolidge,49179,16
4,A Round Dozen,THE LITTLE WHITE DOOR. I SUPPOSE that most boy...,Susan Coolidge,47504,16
...,...,...,...,...,...
103,Wives And Daughters,To begin with the old rigmarole of childhood. ...,Elizabeth Gaskell,269696,4
104,Worlds Within Worlds- The Story Of Nuclear Ene...,"INTRODUCTION In a way, nuclear energy has been...",Isaac Asimov,11830,7
105,Worlds Within Worlds- The Story Of Nuclear Ene...,MASS AND ENERGY In 1900 it began to dawn on ph...,Isaac Asimov,10725,7
106,Worlds Within Worlds- The Story Of Nuclear Ene...,NUCLEAR FISSION New Elements In 1934 Enrico Fe...,Isaac Asimov,11129,7


In [33]:
# Destination of the exported dataset.
csv_file_path = 'Datasets/Literature.csv'
# index=False leaves the row index out of the CSV file.
df.to_csv(csv_file_path, index=False)