## Replace Title

##### Assistant

In [1]:
from pathlib import Path
from bs4 import BeautifulSoup as bs
import requests
from fastcore.net import urlsave
from fastcore.utils import Path

##### User

In [20]:
!ls static/307cfee0a3f8f7d76b7646960ea599f0

cover.png       long_tail.html  long_tail.ipynb nb.yml


In [37]:
! cat static/307cfee0a3f8f7d76b7646960ea599f0/long_tail.html | head -n10

<!DOCTYPE html>

<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml"><head>
<meta charset="utf-8"/>
<meta content="quarto-1.5.57" name="generator"/>
<meta content="width=device-width, initial-scale=1.0, user-scalable=yes" name="viewport"/>
<title>Stitch Fix, Jupyter, GitHub, and the Long Tail</title>
<style>
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
cat: stdout: Broken pipe


In [44]:
def update_meta(html_path: str|Path, 
                image_path:str,
                title:str,
                default_title: str = "nbsanity | Jupyter Notebook Viewer"):
    meta_tags="""<meta property="og:image" content="{{image_path}}">
<meta property="og:site_name" content="nbsanity">
<meta property="og:image:type" content="image/png">
<meta property="og:image:width" content="1200">
<meta property="og:image:height" content="630">
<meta property="og:type" content="website">
<meta property="og:url" content="https://nbsanity.com">
<meta property="og:title" content="{{title}}">
<meta property="og:description" content="nbsanity: A modern way to view public Jupyter notebooks on GitHub">
<meta name="twitter:image" content="{{image_path}}">
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:title" content="{{title}}">
<meta name="twitter:description" content="nbsanity: A modern way to view public Jupyter notebooks on GitHub">
"""
    doc = Path(html_path)
    soup = bs(doc.read_text(encoding='utf-8'), 'html.parser')
    head = soup.find('head')
    new_html = bs(meta_tags, 'html.parser')
    for element in new_html:
        head.insert(0, element)
    doc.write_text(str(soup), encoding='utf-8')

In [45]:
# update_meta('static/307cfee0a3f8f7d76b7646960ea599f0/long_tail.html', 'myimage.png', 'My title')

In [46]:
! cat static/307cfee0a3f8f7d76b7646960ea599f0/long_tail.html | head -n20

<!DOCTYPE html>

<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml"><head><meta content="nbsanity: A modern way to view public Jupyter notebooks on GitHub" name="twitter:description"/><meta content="{{title}}" name="twitter:title"/><meta content="summary_large_image" name="twitter:card"/><meta content="{{image_path}}" name="twitter:image"/><meta content="nbsanity: A modern way to view public Jupyter notebooks on GitHub" property="og:description"/><meta content="{{title}}" property="og:title"/><meta content="https://nbsanity.com" property="og:url"/><meta content="website" property="og:type"/><meta content="630" property="og:image:height"/><meta content="1200" property="og:image:width"/><meta content="image/png" property="og:image:type"/><meta content="nbsanity" property="og:site_name"/><meta content="{{image_path}}" property="og:image"/>
<meta charset="utf-8"/>
<meta content="quarto-1.5.57" name="generator"/>
<meta content="width=device-width, initial-scale=1.0, use

## Test shotscraper

In [57]:
def gist_raw(gist_url):
    # Extract gist ID from URL
    gist_id = gist_url.split('/')[-1]
    
    # Get gist info from GitHub API
    api_url = f"https://api.github.com/gists/{gist_id}"
    response = requests.get(api_url)
    data = response.json()
    
    # Check if 'files' or 'raw_url' is missing and raise HTTPError
    if 'files' not in data or not data['files']:
        raise urllib.error.HTTPError(api_url, 404, "No files found in the gist", None, None)
    
    raw_url = list(data['files'].values())[0].get('raw_url')
    if not raw_url:
        raise urllib.error.HTTPError(api_url, 404, "No raw_url found for the file", None, None)
    
    return raw_url


def git2raw(git_url: str): 
    if git_url.startswith('https://gist.github.com/'): return gist_raw(git_url)
    if git_url.startswith('https://gist.githubusercontent.com') and 'raw' in git_url: return git_url
    return git_url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")


def escape_filename(filename):
    # Replace HTML-like characters and other problematic characters
    invalid_chars = '<>:"/\\|?*%#&{}+=`@!^;[]()$'
    for char in invalid_chars:
        filename = filename.replace(char, '_')
    return filename

Let's test it:

In [76]:
!rm -rf temp/
!mkdir temp

In [77]:

full_url='https://gist.github.com/93degree/f979a600bd6ef75044e83c3d9bfd6022'
nm = urlsave(git2raw(full_url), 'temp')
nm

Path('temp/AoC_2023%2C_day_1.ipynb')

In [80]:
nm.rename(nm.parent/escape_filename(nm.name))

Path('temp/AoC_2023_2C_day_1.ipynb')

In [81]:
!ls temp/

AoC_2023_2C_day_1.ipynb


In [85]:
!ls static/90372d112b04739ed2fd5e545ea62112/AoC_2023_2C_day_1.ipynb

static/90372d112b04739ed2fd5e545ea62112/AoC_2023_2C_day_1.ipynb


In [101]:
rm -rf static/*

## Let's see if we can fix the nb format 

In [102]:
import nbformat
# Read the notebook

def fix_nb(nbpath):
    "Mutate notebook to right version for Quarto."
    nb = nbformat.read(nbpath, as_version=4)
    nbformat.write(nb, nbpath)

In [94]:
! quarto render static/90372d112b04739ed2fd5e545ea62112/AoC_2023_2C_day_1.ipynb --no-execute --to html

[1mpandoc [22m
  to: html
  output-file: AoC_2023_2C_day_1.html
  standalone: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
[1mmetadata[22m
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 'AoC 2023, day 1'
  
Output created: AoC_2023_2C_day_1.html



In [103]:
def fix_nb(nbpath):
    "Mutate notebook to right version for Quarto."
    nb = nbformat.read(nbpath, as_version=4)
    nbformat.write(nb, nbpath)

In [104]:
fix_nb('static/90372d112b04739ed2fd5e545ea62112/AoC_2023_2C_day_1.ipynb')