<a href="https://colab.research.google.com/github/jjgamez1/project_gutenberg_analysis/blob/main/project_gutenberg_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Reading material

* [Official Python documentation for lists (arrays)](https://docs.python.org/3/tutorial/datastructures.html)

* [Slide deck from 4/15/25](https://docs.google.com/presentation/d/13rru5EpI_SUUn8CQgLH_irb2QcGOTDYoLsWGKr7cqRQ/edit?usp=sharing)

In [None]:
#################################
#                               #
# DO NOT MODIFY THIS CODE BLOCK #
#                               #
#################################

from google.colab import drive
import os
import requests
from datetime import datetime

# Mount Google Drive at /content/drive so the files stored there can be opened
# with ordinary file paths. This must run before metadata.txt is read below.
drive.mount('/content/drive')

# Define path inside your Google Drive: folder_path is the root of the mounted
# drive and gdrive_folder is the folder (with trailing slash) holding the books
gdrive_folder = 'COMP150_PG_books/'
folder_path = '/content/drive/MyDrive/'

# Full path of the metadata file, plus the two pieces used to build a book's
# file name: FILE_BASE.format(n) + FILE_SUFFIX gives "pg<n>.txt"
META = folder_path + gdrive_folder + "metadata.txt"
FILE_BASE = "pg{0}"
FILE_SUFFIX = ".txt"

# Read the whole metadata file into a list of lines
with open(META, "r", encoding="utf-8") as file:
    lines = file.readlines()

# Only the first four lines are used; fail early if any of them is missing
if len(lines) < 4:
    raise ValueError("metadata.txt must contain at least 4 lines")

# Strip whitespace and parse: lines 1-2 are the first and last book numbers
# (int() raises ValueError if they are not whole numbers), lines 3-4 are the
# user's first and last name
FROM_BOOK = int(lines[0].strip())
TO_BOOK = int(lines[1].strip())
FIRST_NAME = lines[2].strip()
LAST_NAME = lines[3].strip()

# Greet the user and confirm the range of books that was read from the
# metadata file (\033[1m and \033[0m are the ANSI codes for bold on / reset)
print(f'\nHello \033[1m{FIRST_NAME} {LAST_NAME}.\033[0m ')
print(f'You\'ll be processing books {FROM_BOOK} through {TO_BOOK}\n')

def cleanup(string):
    """Keep only the ASCII letters of a string and drop everything else.

    Input
    -----
    string : the text to filter

    Returns
    -------
    output_string: the characters of the input that fall in a-z or A-Z, in
                   their original order. Digits, punctuation, whitespace and
                   non-ASCII characters (accented letters included) are
                   discarded.
    """

    # Code points that delimit the two ranges of ASCII letters
    lower_first, lower_last = ord('a'), ord('z')
    upper_first, upper_last = ord('A'), ord('Z')

    # Collect the letters we keep, then glue them together once at the end
    kept = []
    for character in string:
        code = ord(character)
        is_lower = lower_first <= code <= lower_last
        is_upper = upper_first <= code <= upper_last
        if is_lower or is_upper:
            kept.append(character)

    output_string = "".join(kept)
    return output_string

Mounted at /content/drive

Hello [1mJesus Gamez.[0m 
You'll be processing books 233 through 261



In [None]:
################################################################################
#                                                                              #
#                           W  O  R  K  S  P  A  C  E                          #
#                                                                              #
################################################################################

# Word whose neighbours we want to collect
target = "river"

# Number of positions on each side of the target that count as neighbours
radius = 5

# Text that identifies the lines marking the beginning and end of a book
BOUNDARY_MARKER = "***"

# Master list for all the books in your collection: it receives one list of
# neighbouring words for every occurrence of the target word
co_occurrence = list()

# The range of books is specified by variables FROM_BOOK and TO_BOOK. These
# variables were provided in part 1. In this part, they are automatically
# recovered by the program; you don't have to enter them again. The loop below
# will go through every book assigned to you
for book_num in range(FROM_BOOK, TO_BOOK + 1):
    # Using the book number from the loop above, build the file name for the
    # book to scan, including the path to the Google drive folder where it's
    # stored
    filename = (folder_path + gdrive_folder +
                FILE_BASE.format(book_num) + FILE_SUFFIX)

    # Attempt to read the book -- in some cases, the book may not be available.
    try:

        # Initialize array for every word in the book in the order it appears.
        book_words = []

        # Flag to signal when we are inside the book. We begin from the outside.
        within_book = False

        # Open the book as a file and prepare to read it line-by-line
        with open(filename, "r", encoding="utf-8") as file:

            # Print an encouraging note
            print(f"Scanning {filename}...")

            # Read the file, line-by-line
            for line in file:

                # A line that contains the boundary marker opens or closes the
                # body of the book. Reverse the flag and skip the marker line
                # itself: it is header/footer text, so its words must not be
                # recorded as words of the book.
                if BOUNDARY_MARKER in line:
                    within_book = not within_book
                    continue

                # Only lines inside the body of the book are processed
                if within_book:

                    # Split the line into pieces (tokens) separated by
                    # whitespace and process them one at a time
                    for token in line.split():

                        # Remove numbers and punctuation using the cleanup
                        # function from earlier, then convert to lower case
                        # for consistency.
                        cleanedup_token = cleanup(token).lower()

                        # A token with no letters at all (e.g. "1850" or
                        # "--") cleans up to an empty string. Skip it so it
                        # neither takes up a slot inside the radius nor gets
                        # counted as a co-occurring word.
                        if cleanedup_token:
                            book_words.append(cleanedup_token)

        # At this point array book_words has all the words for the book
        # listed in the order they appear.

        ########################################################################
        # NOW WHAT? ...                                                        #
        # You need to go over the contents of array book_words to look for a   #
        # specific target word and a radius, find the words that occur within  #
        # the given radius, save them in an array (list), and then add this    #
        # list to the co_occurrence list.                                      #
        #                                                                      #
        # Select a relatively small radius, maybe 4 or 5, and a word with a    #
        # frequency between 500 and 1000 in your csv file.                     #
        #                                                                      #
        # Once you have your co_occurrence array, process it to discover words #
        # that appear frequently with the target word.                         #
        ########################################################################

        # Process book_words to find co-occurrences with the target word
        for i, word in enumerate(book_words):
            if word == target:
                # Neighbours are the words up to `radius` positions before and
                # after position i, excluding the target itself. max(0, ...)
                # keeps the left edge from wrapping around to the end of the
                # list; slicing past the right end is clamped automatically.
                local_occurrence = (book_words[max(0, i - radius):i] +
                                    book_words[i + 1:i + radius + 1])
                # Add this occurrence list to the master list
                co_occurrence.append(local_occurrence)

    except FileNotFoundError:
        # Some book numbers in the range have no file; report and move on
        print(f"File {filename} not found. Skipping.")

# Analyzing co-occurrence data: report how many occurrences of the target word
# were collected
print(f'There are {len(co_occurrence)} lists in co_occurrence for "{target}"')

# Tally how many times each neighbouring word shows up across all the lists.
# dict.get supplies 0 the first time a word is seen.
dictionary = {}
for list_of_words in co_occurrence:
    for word in list_of_words:
        dictionary[word] = dictionary.get(word, 0) + 1

print(dictionary)


Scanning /content/drive/MyDrive/COMP150_PG_books/pg233.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg234.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg235.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg236.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg237.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg238.txt...
File /content/drive/MyDrive/COMP150_PG_books/pg239.txt not found. Skipping.
Scanning /content/drive/MyDrive/COMP150_PG_books/pg240.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg241.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg242.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg243.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg244.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg245.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg246.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg247.txt...
Scanning /content/drive/MyDrive/COMP150_PG_books/pg248.t