In [None]:
#Author: Jonathan Loyd
#Description: Python3 Getting Word Counts for a Chapter Book
#CSE590-59 Project 2

In [1]:
from collections import Counter
import pandas as pd
import re

In [2]:
# Read in metamorphosis text and get the string
alice_txt = open("alice.txt", "r", encoding="utf8")
content_string = alice_txt.read()

# Remove the start and ending pieces of the book
start_index = content_string.find("CHAPTER I.\nDown the Rabbit-Hole")
end_index = content_string.find(r"*** END OF THE PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***")
content_string = content_string[start_index:end_index].strip()

# Get one+ digits with . or, and more digits or any nonalphanumeric characters
# Group the numbers and make nonalphanumeric characters into spaces
content_string = re.sub(r'(\d+(?:[,.]\d+)+)|[\W_]+', lambda x: x.group(1) if x.group(1) else ' ', content_string).strip()
# print(content_string)

# Split the string
word_list = content_string.split()
# print(word_list)

In [3]:
# Count the frequency and create a dict for the DataFrame
string_counter = Counter(word_list)
key_list = list()
value_list = list()
for key, value in string_counter.items():
    key_list.append(key)
    value_list.append(value)
df_dict = {'Word': key_list, 'Count' : value_list}

In [4]:
# Create the Dataframe
df = pd.DataFrame(df_dict, index=range(1, len(value_list)+1))
df

Unnamed: 0,Word,Count
1,CHAPTER,12
2,I,545
3,Down,3
4,the,1533
5,Rabbit,45
...,...,...
2837,sorrows,1
2838,joys,1
2839,remembering,1
2840,happy,1


In [5]:
# In case you want the DataFrame printed out without formatting
print(df)

             Word  Count
1         CHAPTER     12
2               I    545
3            Down      3
4             the   1533
5          Rabbit     45
...           ...    ...
2837      sorrows      1
2838         joys      1
2839  remembering      1
2840        happy      1
2841          END      1

[2841 rows x 2 columns]


In [6]:
# Bring in the stop words variable from the python file
import stop_words

# Get the indices of all of the stop_words to remove
indices = list()
for word in stop_words.stop_words:
    for i in range(1, len(df.Word)+1):
        if word == df.Word[i]:
            indices.append(i)

# Deep copy the dataframe and then
# Remove stop-words from the dataframe and sort by descending count frequency
df2 = df.copy(deep=True)
df2.drop(indices, inplace=True)
df2 = df2.sort_values(by=['Count'], ascending=False)
df2

Unnamed: 0,Word,Count
2,I,545
7,Alice,397
150,The,108
299,know,87
159,herself,83
...,...,...
911,soothing,1
1917,Visit,1
919,lazily,1
920,sits,1


In [7]:
#Again printing in plain format just in case
print(df2)

          Word  Count
2            I    545
7        Alice    397
150        The    108
299       know     87
159    herself     83
...        ...    ...
911   soothing      1
1917     Visit      1
919     lazily      1
920       sits      1
2841       END      1

[2468 rows x 2 columns]


In [8]:
# Split the book into chapters and remove the chapter number/roman numerals
chapters = re.split("CHAPTER ", content_string)
chapters.pop(0)
for i, chapter in enumerate(chapters):
    if chapter.startswith("I.\n") or chapter.startswith("V.\n") or chapter.startswith("X.\n"):
        chapters[i] = chapter[3:]
    elif chapter.startswith("II.\n") or chapter.startswith("IV.\n") or chapter.startswith("VI.\n")\
    or chapter.startswith("IX.\n") or chapter.startswith("XI.\n"):
        chapters[i] = chapter[4:]
    elif chapter.startswith("III.\n") or chapter.startswith("VII.\n") or chapter.startswith("XII.\n"):
        chapters[i] = chapter[5:]
    elif chapter.startswith("VIII.\n"):
        chapters[i] = chapter[6:]

In [17]:
dataframe_list = list()
for chapter in chapters:
    # Create the dataframe for the chapter
    chapter_string = chapter.strip()
    chapter_string = re.sub(r'(\d+(?:[,.]\d+)+)|[\W_]+', lambda x: x.group(1) if x.group(1) else ' ', chapter_string).strip()
    chapter_word_list = chapter_string.split()
    string_counter = Counter(chapter_word_list)
    key_list = list()
    value_list = list()
    for key, value in string_counter.items():
        key_list.append(key)
        value_list.append(value)
    df_dict = {'Word': key_list, 'Count' : value_list}
    chapter_df = pd.DataFrame(df_dict, index=range(1, len(value_list)+1))
    
    # Get indices to remove
    indices = list()
    for word in stop_words.stop_words:
        for i in range(1, len(chapter_df.Word)+1):
            if word == chapter_df.Word[i]:
                indices.append(i)
    # Remove those indices
    chapter_df.drop(indices, inplace=True)
    chapter_df = chapter_df.sort_values(by=['Count'], ascending=False)
    dataframe_list.append(chapter_df)

In [18]:
for i, dataframe in enumerate(dataframe_list):
    print(f'Chapter: {i+1}\n{dataframe}')

Chapter: 1
        Word  Count
1          I     36
6      Alice     28
158  herself     13
135      see     10
105     time      9
..       ...    ...
328      cat      1
326    night      1
323  talking      1
320  written      1
637     work      1

[434 rows x 2 columns]
Chapter: 2
        Word  Count
28         I     73
10     Alice     26
498    Mouse     11
131     dear      9
87   thought      9
..       ...    ...
334   repeat      1
331      lap      1
329  crossed      1
326    wrong      1
651    party      1

[438 rows x 2 columns]
Chapter: 3
         Word  Count
81          I     37
58      Alice     23
105     Mouse     18
241      Dodo     12
87       know     11
..        ...    ...
277    paused      1
270   audibly      1
269  tittered      1
268     other      1
597    coming      1

[408 rows x 2 columns]
Chapter: 4
        Word  Count
50         I     64
55     Alice     31
134  herself     17
8       Bill     14
3     Rabbit     12
..       ...    ...
393    Fetch