<a href="https://colab.research.google.com/github/jesswu1555/Imaginary-Inventions-Database/blob/main/scifi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


example of NER fine tuning using bert: https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT.ipynb

potential database: http://technovelgy.com/


---


Design Questions:
* do we want to classify every single instance of invention?
* what size tokens? Bigger the better?
* chunking / sliding window to preserve context?
* what model to use, start with bert, but maybe longformer? bigbird?


Overall questions:
* What input sequence of words leads to the output sequence of invention descriptions?
* If there is even such a thing, what is the “formula” for an invention in literary studies?

Patterns we hope to find -
*   Neologism
*   Compound word
*   key words like Invented, Created

In [None]:
# Google colab already has pandas, numpy, sklearn, pytorch, transformers
!pip install transformers seqeval[gpu]
!pip install python-docx pandas
!pip install requests beautifulsoup4 tqdm

Collecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=a91b479f0f927db914b50d50f76a8309871e7adfa7a3962cb37cb9df888687be
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling 

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification
from torch import cuda
from docx import Document
from tqdm import tqdm
import re
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:
# Load the reading list from the Word document; the texts themselves are
# looked up online further down.
document = Document('/content/drive/MyDrive/Colab Notebooks/ListOf50SciFi.docx')
tables = []
# NOTE: `df` is rebuilt on every iteration, so only the LAST table in the
# document is kept once the loop finishes.
for index, table in enumerate(document.tables):
    row_count = len(table.rows)
    # Pre-fill a rows x columns grid of empty strings, then copy the cell text in.
    df = [['' for _ in range(len(table.columns))] for _ in range(row_count)]
    for row_pos, row in enumerate(table.rows):
        for col_pos, cell in enumerate(row.cells):
            df[row_pos][col_pos] = cell.text.strip()

raw_table = pd.DataFrame(df)
header_row = raw_table.iloc[0]  # first table row holds the column names
texts = raw_table[1:].reset_index(drop=True)
texts.columns = header_row
texts['content'] = None  # filled in later with the downloaded book text
# print(texts.head)

def get_gutenberg_text(title, author):
  """Download the text of `title` from Project Gutenberg, verified against `author`.

  Searches gutenberg.org for `title`, takes the first search result, downloads
  its cached plain-text file and keeps only the lines between the
  "*** START OF ..." and "*** END OF ..." markers. The "Author:" line found in
  the file header must contain every space/period-separated part of `author`
  (case-insensitive); otherwise the book is rejected.

  Args:
    title: Book title used as the search query.
    author: Expected author name, e.g. "H. G. Wells".

  Returns:
    The book body as one string, or None when the search has no hits, a
    request fails, the author does not match, or no book body was found.
  """
  try:
    # Passing the title through `params` lets requests URL-encode it, so
    # titles containing '&', '#', '?' or quotes do not corrupt the query.
    response = requests.get("https://www.gutenberg.org/ebooks/search/",
                            params={"query": title}, timeout=10)
  except requests.RequestException as e:
    print(f"search request failed: {e}")
    return None
  soup = BeautifulSoup(response.text, "html.parser")

  # Find first result link (looked up once, and guarded against a missing <a>)
  first_hit = soup.find("li", class_="booklink")
  anchor = first_hit.find("a") if first_hit is not None else None
  if anchor is None or not anchor.get("href"):
    print("no entries found")
    return None
  book_id = anchor["href"].split("/")[-1]

  # Construct text file URL
  text_url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"
  print("text_url", text_url)
  try:
    text_response = requests.get(text_url, timeout=30)
  except requests.RequestException as e:
    print(f"text request failed: {e}")
    return None
  if text_response.status_code != 200:
    print("no entries found")
    return None

  # Split the file into header metadata and book contents. Older Gutenberg
  # files say "START OF THIS PROJECT GUTENBERG EBOOK", newer ones "START OF THE".
  start_marker = re.compile(r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK")
  end_marker = re.compile(r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK")
  metadata = []
  book_content = []
  inside_book = False
  for line in text_response.text.split("\n"):
    line = line.strip()
    if start_marker.search(line):
      inside_book = True
      continue
    if end_marker.search(line):
      break
    if inside_book:
      book_content.append(line)
    else:
      metadata.append(line)

  # Extract author from metadata, check that it matches what we expect
  author_pattern = re.compile(r"Author:\s*(.+)", re.IGNORECASE)
  author_match = next((m for m in map(author_pattern.search, metadata) if m), None)
  actual_author = author_match.group(1) if author_match else "Unknown"
  actual_lower = actual_author.lower()
  # Every name part (initials included) must appear somewhere in the header's author line.
  if not all(name in actual_lower for name in re.split(r"[ .]+", author.lower())):
    print(f"Skipping: Author '{actual_author}' does not match expected '{author}'.")
    return None

  text = "\n".join(book_content).strip()
  if not text:
    # No START marker (or nothing after it): report a miss instead of
    # handing the caller an empty string that looks like a found book.
    print("no book content found between the START/END markers")
    return None
  return text

def get_internet_archive_text(title, author):
    """Find `title` by `author` on Open Library and download a .txt from the Internet Archive.

    Queries the Open Library search API by title, then walks the results looking
    for an entry that has an Internet Archive identifier and whose author list
    contains `author` (compared case-insensitively, with spaces and periods
    collapsed). For the first such entry that exposes a file ending in ".txt",
    that file is downloaded and returned.

    Args:
        title: Book title to search for.
        author: Expected author name, e.g. "H. G. Wells".

    Returns:
        The text of the first matching .txt file, or None if nothing usable
        was found or a request failed.
    """
    def normalise(name):
        # "H. G. Wells" -> "h g wells"; strip() drops the stray space a trailing "." leaves.
        return " ".join(re.split(r"[ .]+", name.lower())).strip()

    try:
        # `params` lets requests URL-encode the title (spaces, '&', '#', ...).
        response = requests.get("https://openlibrary.org/search.json",
                                params={"title": title}, timeout=10)
    except requests.RequestException as e:
        print(f"Error: Failed to fetch search results ({e})")
        return None
    if response.status_code != 200:
        print("Error: Failed to fetch search results")
        return None
    response_data = response.json()
    if "docs" not in response_data or not response_data["docs"]:
        print("No books found.")
        return None

    search_author = normalise(author)  # loop-invariant, so computed once
    for book in response_data["docs"]:
        # `or [None]` also covers an "ia" key that is present but an empty list,
        # which would otherwise raise IndexError.
        archive_id = (book.get("ia") or [None])[0]
        if not archive_id:
            continue  # Skip books without an IA identifier
        # Check if author matches (ignoring case & handling initials)
        book_authors = [normalise(a) for a in book.get("author_name", [])]
        if search_author not in book_authors:
            continue

        # Check if full text is available in TXT format
        metadata_url = f"https://archive.org/metadata/{archive_id}"
        print(f"Checking metadata: {metadata_url}")
        try:
            metadata_response = requests.get(metadata_url, timeout=10)
        except requests.RequestException:
            print("Error: Could not fetch metadata.")
            continue
        if metadata_response.status_code != 200:
            print("Error: Could not fetch metadata.")
            continue
        files = metadata_response.json().get("files", [])
        txt_file = next((f["name"] for f in files if f.get("name", "").endswith(".txt")), None)
        if not txt_file:
            continue

        text_url = f"https://archive.org/download/{archive_id}/{txt_file}"
        try:
            text_response = requests.get(text_url, timeout=30)
        except requests.RequestException:
            continue  # try the next candidate edition
        if text_response.status_code == 200:
            print("Full text found!")
            return text_response.text
    print("No full-text version available for the given author.")
    return None


# for index, row in texts.iterrows():
#   # if row.Title in ["The Man in the Moone"]:
#   #   continue
#   print(row.Title)
#   text = get_gutenberg_text(row.Title, row.Author)
#   if text is None:
#     text = get_internet_archive_text(row.Title, row.Author)
#     if text is None:
#       print("no text found for ", row.Title)
#     continue
#   else:
#      texts.at[index, 'content'] = text
# print(texts[texts['content'].notna()])

# ONLY 4 BOOKS found in project gutenberg

In [None]:
# read the database csv file and clean up data
SCIFI_DB_PATH = '/content/drive/MyDrive/Colab Notebooks/scifidb.csv'

actual_title_names = ["author_first_middle",
                      "author_last",
                      "creation_date",
                      "ID",
                      "inventions_category",
                      "inventions_comments",
                      "inventions_description",
                      "inventions_fictional",
                      "inventions_ID",
                      "inventions_invention_name",
                      "inventions_pages",
                      "modification_date",
                      "notes",
                      "publisher",
                      "title",
                      "year"]

try:
  # The file's first line is a data record, not a header. Reading it with
  # header=None keeps that record intact; letting pandas parse it as a header
  # and re-inserting it afterwards would turn blank cells into "Unnamed: n"
  # and suffix repeated values with ".1".
  df = pd.read_csv(SCIFI_DB_PATH, header=None, names=actual_title_names)
except FileNotFoundError:
  print(f"Error: '{SCIFI_DB_PATH}' not found. Please upload the file or provide the correct path.")
  raise  # nothing below can run without df
except pd.errors.ParserError:
  print("Error: Could not parse the CSV file. Please check the file format.")
  raise
except Exception as e:
  print(f"An unexpected error occurred: {e}")
  raise

df['author'] = df['author_first_middle'] + ' ' + df['author_last']
df['ID'] = pd.to_numeric(df['ID'], errors='coerce')
df['inventions_ID'] = pd.to_numeric(df['inventions_ID'], errors='coerce')

important_data_values = ["author", "title",
                      "inventions_category",
                      "inventions_fictional",
                      "inventions_ID",
                      "inventions_invention_name",
                      "inventions_pages"]
data = df[important_data_values]

# .copy() so the title clean-up below edits an independent frame rather than
# a slice of `data` (avoids SettingWithCopyWarning).
core_data = data[["title", 'inventions_invention_name', "inventions_pages"]].copy()
core = core_data.dropna()  # rows with every essential field present
print("\nRows where there aren't NaN in essential fields:")
print(len(core))

# find out how much of the data is usable (hint: none)
# Strip quotes and surrounding whitespace so 'evidence' and 'evidence ' count as one title.
core_data['title'] = (core_data['title']
                      .str.replace('"', '', regex=False)
                      .str.strip()
                      .str.lower())
unique_titles = sorted(core_data['title'].dropna().unique())
print(len(unique_titles), " unique titles. ", unique_titles)
# x.lower() must be CALLED; a list of bound methods would never match a title.
found_titles = [x.strip().lower() for x in texts[texts['content'].notna()]["Title"]]
core_data[core_data['title'].isin(found_titles)]



Rows where there aren't NaN in essential fields:
193
46  unique titles.  ['the man who saved the earth', 'kindred', 'the house on the moon', 'the scientific adventures of mr. fosdick: mr. fosdick invents the seidlitzmobile', 'state tectonics', 'the token superhero', nan, 'the frequency of compassion', 'birthday girl', 'revolution shuffle', 'black angel', 'sanford and sun', 'infomocracy', 'trail of lightning', 'manhunters', 'evidence', 'exhalation', 'fire on the mountain', 'station x', 'the stars above', 'the man from the atom', '22xx: one-shot', 'a columbus of space', 'abigail dreams of weather', 'hollow', 'an open letter to the family', 'the things i miss the most', 'impossible facts', '...and other disasters', 'lalibela', 'evidence ', 'aftermath', 'null states', 'the moon metal', "kafka's last laugh", 'revolution shuffle ', 'amazing stories', 'the chaos', 'disconnect', "a connecticut yankee in king arthur's court", 'the new accelerator ', 'by degrees and dilatory time', 'off a comet

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  core_data['title'] = core_data['title'].str.replace('"', '').str.lower()


Unnamed: 0,title,inventions_invention_name,inventions_pages


In [None]:
BASE_URL = "https://www.gutenberg.org"
SCI_FI_URL = "https://www.gutenberg.org/ebooks/bookshelf/68"

# Get all of the books from the sci-fi bookshelf
def get_all_scifi_books(page_size=25, timeout=10):
    """Collect the URL of every book listed on the Gutenberg sci-fi bookshelf.

    Walks the bookshelf listing page by page (sorted by title) and stops at the
    first failed request, non-200 response, empty page, or short page.

    Args:
        page_size: Number of entries expected on a full listing page; a page
            with fewer entries is treated as the last one.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        List of absolute book-page URLs, in listing order.
    """
    book_index = 1  # the listing's start_index is 1-based
    books = []
    while True:
        url = f"{SCI_FI_URL}?sort_order=title&start_index={book_index}"
        try:
            # A timeout keeps a stalled connection from hanging the notebook forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            break  # Stop if we hit an error
        if response.status_code != 200:
            break  # Stop if we hit an error

        soup = BeautifulSoup(response.text, "html.parser")
        book_links = soup.select("li.booklink a.link")
        if not book_links:
            break  # Stop when there are no more books to process

        # Store full book URLs
        books.extend(BASE_URL + link["href"] for link in book_links)
        if len(book_links) < page_size:
            break  # short page => last page
        book_index += page_size
    return books

# scifi_books = get_all_scifi_books()
# print(f"Found {len(scifi_books)} sci-fi books on project gutenberg")


In [None]:
BASE_URL = "https://www.gutenberg.org"

# Function to get the title, author, language, and plain text URL from a book page
def get_book_metadata(book_url):
    """Scrape title, author, language and plain-text link from a Gutenberg book page.

    Args:
        book_url: Absolute URL of the book's page.

    Returns:
        Tuple (title, author, language, text_url). If the request fails, all
        four are None. Otherwise title/author/language fall back to "Unknown"
        when the page lacks them, and text_url is None when no
        "Plain Text UTF-8" download link is present.
    """
    try:
        response = requests.get(book_url, timeout=10)
        response.raise_for_status()  # Raise error if request fails
    except requests.RequestException as e:
        print(f"Request failed for {book_url}: {e}")
        return None, None, None, None

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract Title. The heading reads "<title> by <author>", so split on the
    # LAST " by ": a title such as "Stand by for Mars!" must not be cut at its own "by".
    heading = soup.find("h1")
    title = heading.get_text(strip=True).rsplit(" by ", 1)[0] if heading else "Unknown"

    # Extract Author
    author_tag = soup.select_one("a[href*='/author/']")
    author = author_tag.get_text(strip=True) if author_tag else "Unknown"
    author = author.split(" by ", 1)[-1]  # Ensure format
    if "," in author:
        # "Last, First ..." -> "First ... Last"
        last, first = author.split(",", 1)
        author = f"{first.strip()} {last.strip()}"

    # Extract Language. `:-soup-contains()` is Soup Sieve's current spelling of
    # the deprecated `:contains()` pseudo-class.
    lang_tag = soup.select_one("table.bibrec tr:-soup-contains('Language') td")
    language = lang_tag.get_text(strip=True) if lang_tag else "Unknown"

    # Extract Plain Text URL
    text_url = None
    for link in soup.select("table.files a"):
        href = link.get("href")
        if href and "Plain Text UTF-8" in link.get_text(strip=True):
            text_url = BASE_URL + href
            break

    return title, author, language, text_url

# Function to parse content from a book's text
def parse_content(content):
    """Return the lines between the Project Gutenberg START and END markers.

    Each line is whitespace-stripped. The marker lines themselves are dropped;
    everything before the START marker and from the END marker onward is
    discarded. Both the current wording ("START OF THE PROJECT GUTENBERG
    EBOOK") and the older one ("START OF THIS PROJECT GUTENBERG EBOOK") are
    recognised.

    Args:
        content: Full text of a Gutenberg plain-text file.

    Returns:
        The book body joined with "\n"; an empty string if no START marker
        is found.
    """
    start_marker = re.compile(r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK")
    end_marker = re.compile(r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK")

    book_content = []
    inside_book = False
    for line in content.split("\n"):
        line = line.strip()  # also removes the "\r" of CRLF line endings
        if start_marker.search(line):
            inside_book = True
            continue
        if end_marker.search(line):
            break
        if inside_book:
            book_content.append(line)

    return "\n".join(book_content)

# Fetch the raw text of a book from its plain-text URL
def get_book_content(text_url):
    """Download `text_url` and return the whitespace-stripped response text.

    Returns None (after printing the reason) when the request fails or the
    server answers with an error status.
    """
    try:
        reply = requests.get(text_url, timeout=10)
        reply.raise_for_status()
        body = reply.text.strip()
    except requests.RequestException as err:
        print(f"Failed to fetch content from {text_url}: {err}")
        return None
    return body

# Remove life dates and parenthesised name expansions from an author string
def clean_author_name(author):
    """Strip ", YYYY-YYYY"-style life dates and "(...)" groups from `author`.

    Handles closed ranges (", 1866-1946"), open-ended ones (", 1934-"),
    uncertain years (", 1828?-1905") and 3-digit years (", 120-180").

    Args:
        author: Author string, e.g. "Wells, H. G. (Herbert George), 1866-1946".

    Returns:
        The cleaned string with surrounding whitespace removed,
        e.g. "Wells, H. G.".
    """
    # Remove birth/death years; the death year is optional so living or
    # open-ended entries are cleaned too.
    author = re.sub(r",\s*\d{3,4}\??-(?:\d{3,4}\??)?", "", author)
    # Remove anything in parentheses (together with the whitespace before it)
    author = re.sub(r"\s*\(.*?\)", "", author)
    return author.strip()

# Process Books
# data = []
# for book_url in tqdm(scifi_books, desc="Processing Books"):
#     title, author, language, text_url = get_book_metadata(book_url)
#     if title and author and "English" in language and text_url:
#         content = get_book_content(text_url)
#         if content:
#             parsed_content = parse_content(content)
#             data.append({"title": title, "author": author, "text_url": text_url, "content": parsed_content})
#             # print(f"✅ {title} by {author}")
#         else:
#             print(f"⚠️ No text content found for {title} by {author}")
#     else:
#         print(f"🚫 Skipping {title} by {author} (No text or wrong language)")


# # Convert to DataFrame and Save
# df = pd.DataFrame(data)
# df = df.drop_duplicates(subset="title", keep="first")
# df.to_csv("/content/drive/MyDrive/Colab Notebooks/gutenberg_book_contents.csv", index=False)

# if already processed, load from here
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/gutenberg_book_contents.csv")
print("size of database: ", len(df))
# Cap n at the number of rows: DataFrame.sample raises when asked for more
# rows than exist (sampling without replacement). The fixed seed keeps the
# sample reproducible.
df_sample = df.sample(n=min(100, len(df)), random_state=42)
print("size of sample: ", len(df_sample))

df_sample.to_csv("/content/drive/MyDrive/Colab Notebooks/sample_gutenberg_book_contents.csv", index=False)

size of database:  1283
size of database:  100


In [None]:
# Display the sampled books; pandas elides the middle rows of a long frame.
df_sample

Unnamed: 0,title,author,text_url,content
1243,Whiskaboom,Alan Arkin,https://www.gutenberg.org/ebooks/51132.txt.utf-8,"\n\n\n\nProduced by Greg Weeks, Mary Meehan an..."
1245,The White Feather Hex,Don Peterson,https://www.gutenberg.org/ebooks/23308.txt.utf-8,"\n\n\n\nProduced by Greg Weeks, Mary Meehan an..."
270,Disturbing Sun,Robert S. Richardson,https://www.gutenberg.org/ebooks/24150.txt.utf-8,"\n\n\n\nProduced by Greg Weeks, Bruce Albrecht..."
221,Cry Snooker,Andrew Fetler,https://www.gutenberg.org/ebooks/51570.txt.utf-8,"\n\n\n\nProduced by Greg Weeks, Mary Meehan an..."
543,Join Our Gang?,Sterling E. Lanier,https://www.gutenberg.org/ebooks/29987.txt.utf-8,\n\n\n\nProduced by Greg Weeks and the Online ...
...,...,...,...,...
630,The Man Who Hated Mars,Randall Garrett,https://www.gutenberg.org/ebooks/25644.txt.utf-8,"\n\n\n\n_To escape from Mars, all Clayton had ..."
58,Asteroid of Fear,Raymond Z. Gallun,https://www.gutenberg.org/ebooks/32780.txt.utf-8,"\n\n\n\nProduced by Greg Weeks, Mary Meehan an..."
978,The Short Life,Francis Donovan,https://www.gutenberg.org/ebooks/23928.txt.utf-8,"\n\n\n\nProduced by Greg Weeks, David Garcia a..."
825,Planet of Dreams,James McKimmey,https://www.gutenberg.org/ebooks/30045.txt.utf-8,\n[Illustration]\n\n\n_The climate was perfect...
