In [None]:
import feedparser

from bs4 import BeautifulSoup

from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser, FuzzyTermPlugin
from whoosh import highlight

# NOTE(review): keep the standard-library imports BELOW the `whoosh.fields`
# star import. That star import may expose a module-level name `datetime`;
# importing the class afterwards guarantees `datetime` means the class here.
import calendar
import os
from datetime import datetime

In [None]:
import feedparser

# Download and parse one Substack feed so we can look at the structure of a
# single entry before building anything on top of it.
NewsFeed = feedparser.parse("https://thegeneralist.substack.com/feed")

# Take the first entry of the parsed feed as our sample.
entry = NewsFeed["entries"][0]
print(entry)

In [None]:
# Show, one per line, the entry fields we care about.
for field in ('title', 'author', 'summary', 'link', 'published_parsed'):
    print(entry[field])

# The article body is the `value` of the first item in the entry's `content` list.
content = entry['content'][0]['value']
print(content)

In [None]:
# Turn HTML into clean text for nice searching

from bs4 import BeautifulSoup

# Name the parser explicitly. Without a second argument BeautifulSoup picks
# whichever parser happens to be installed and emits a GuessedAtParserWarning,
# so the same notebook could build different trees on different machines.
# "html.parser" ships with Python, so no extra install is needed.
tree = BeautifulSoup(content, "html.parser")

# This validates and cleans HTML but Substack HTML is already fine
# pretty = tree.prettify()
# print(pretty)

# Extract only the text, with a newline between adjacent pieces of text.
pure_text = tree.get_text("\n")
print(pure_text)

In [None]:
# Next, build a Whoosh database and search engine
# https://whoosh.readthedocs.io/en/latest/quickstart.html#a-quick-introduction

# Define schema (pretty simple)
# Add documents from all top Substacks
# Build a searcher
# Show results
# Highlight bits from the thinkpieces that match
# Optional: add stemming for better searching
# Later, add support for other newsletters with an RSS feed
# Maybe, later, offer `more_like_this` so people can rabbit-hole in

# Use case: I want to find all articles from my favorite writers that include the term
# "BNPL" since I want to research "Buy Now, Pay Later" in fintech.
# Or I want to find all thinkpieces that mentioned Google in the last year

In [None]:
# Collect the URL of every RSS feed we want to search.

# Substacks first: their feed URLs are systematic, so listing the subdomain
# is enough (e.g. "thegeneralist" -> https://thegeneralist.substack.com/feed).
substack_domains = [
    "thegeneralist",
    "danco",
    "diff",
    "nbt",
    "platformer",
    "notboring",
    "sariazout",
    "digitalnative",
    "jamesonstartups",
    "breakingsmart",
    "artofgig",
    "theskip",
    "gwern"
]
substack_feeds = list(map("https://{0}.substack.com/feed".format, substack_domains))

# Everything that is not a Substack is listed by its full feed URL.
# Medium feeds are medium.com/feed/@user or medium.com/feed/publication
custom_feeds = [
    "https://stratechery.com/feed/",
    "https://www.profgalloway.com/feed",
    "https://eugene-wei.squarespace.com/blog?format=rss",
    "https://medium.com/feed/@superwuster",
    "https://commoncog.com/blog/rss",
    "https://www.lennyrachitsky.com/feed",
    "https://medium.com/feed/bloated-mvp",
    "https://daringfireball.net/feeds/main",
    "https://wongmjane.com/api/feed/rss",
    "https://fourweekmba.com/feed",
]

# One combined list, Substacks first. Each URL can be handed to
# feedparser.parse(), e.g.
# NewsFeed = feedparser.parse("https://thegeneralist.substack.com/feed")
all_feeds = [*substack_feeds, *custom_feeds]
all_feeds

In [None]:
# Convenience function to safely get an item from a dict.
# If the key doesn't exist, just returns None
def safe_get(obj, key):
    """Return ``obj[key]`` when ``key`` is present, otherwise ``None``.

    Membership is tested with the ``in`` operator instead of ``has_key()``.
    Built-in dicts have no ``has_key()`` method in Python 3, so the previous
    check raised AttributeError for any plain dict.

    Args:
        obj: A mapping-like object supporting ``in`` and ``[]`` lookup, or
            ``None``.
        key: The key to look up.

    Returns:
        The value stored under ``key``, or ``None`` if ``obj`` is ``None`` or
        does not contain ``key``.
    """
    # Tolerate a missing container so lookups can be chained safely.
    if obj is None:
        return None
    if key in obj:
        return obj[key]
    return None

In [None]:
# Allows optional case-sensitive searches. all-lowercase is case insensitive,
# any capital letters makes it case sensitive

class CaseSensitivizer(analysis.Filter):
    """Token filter that emits both the original and the lowercase form.

    Every incoming token is passed through unchanged. In "index" mode, a
    token whose lowercase form differs from its text is additionally yielded
    a second time with its text lowercased. In any other mode nothing extra
    is produced.
    """

    def __call__(self, tokens):
        for token in tokens:
            yield token
            if token.mode != "index":
                continue
            lowered = token.text.lower()
            if lowered != token.text:
                # The same token object is mutated and yielded again.
                token.text = lowered
                yield token


# Analyzer used for the `content` field: regex tokenizer followed by the filter above.
ana = analysis.RegexTokenizer() | CaseSensitivizer()
# [t.text for t in ana("The new SuperTurbo 5000", mode="index")]
# ["The", "the", "new", "SuperTurbo", "superturbo", "5000"]



In [None]:
# Build the database
schema = Schema(
    title=TEXT(stored=True),
    author=TEXT(stored=True),
    publication=TEXT(stored=True),
    summary=TEXT(stored=True),
    url=TEXT(stored=True),
    published=DATETIME(stored=True, sortable=True),
    content=TEXT(stored=True, analyzer=ana))

# The Whoosh quickstart creates the index directory before calling
# create_in(); do the same so a fresh checkout can run this cell.
INDEX_DIR = "whoosh_index2"
os.makedirs(INDEX_DIR, exist_ok=True)
index = create_in(INDEX_DIR, schema)

# Read every item from our RSS feeds into the index, one add_document() call
# per entry. Used as a context manager, the writer is committed when the block
# finishes and cancelled if it raises, so a failed run does not leave the
# index locked.
with index.writer() as writer:
    for feed_url in all_feeds:
        print(feed_url)
        news_feed = feedparser.parse(feed_url)

        # NOTE: we can only get the last few entries from this RSS feed.
        # Substack doesn't seem to show anything older than the last 20.
        # So we should build in a system to start caching these.

        # Publication name lives in the feed's `feed` field, along with other
        # metadata. It is the same for every entry, so look it up once per feed.
        publication = None
        metadata = safe_get(news_feed, 'feed')
        if metadata is not None:
            publication = safe_get(metadata, 'title')

        for entry in news_feed.entries:

            # Clean up the date into a normal datetime. timegm() reads the
            # parsed time tuple as UTC; fromtimestamp() then yields a naive
            # local-time datetime. Not every feed provides a publish date, so
            # fall back to the update date and otherwise leave the field unset.
            published_struct = safe_get(entry, 'published_parsed')
            if published_struct is None:
                published_struct = safe_get(entry, 'updated_parsed')
            clean_datetime = None
            if published_struct is not None:
                clean_datetime = datetime.fromtimestamp(calendar.timegm(published_struct))

            # Most feeds put the main content in `content`,
            # but a rare few like Eugene Wei put it in `summary`
            # (in which case `content` is empty). Pick whichever is available
            # so we end up with a single body text.
            content_holder = safe_get(entry, 'content')
            if content_holder:
                raw_html = content_holder[0]['value']
            else:
                # No content provided. `summary` must hold all the text; treat
                # a missing summary as empty rather than crashing the parser.
                raw_html = safe_get(entry, 'summary') or ""

            # Explicit parser: avoids GuessedAtParserWarning and keeps results
            # independent of which optional parsers are installed.
            body_text = BeautifulSoup(raw_html, "html.parser").get_text(" ", strip=True)

            writer.add_document(
                title=safe_get(entry, 'title'),
                author=safe_get(entry, 'author'),
                publication=publication,
                summary=safe_get(entry, 'summary'),
                url=safe_get(entry, 'link'),
                published=clean_datetime,
                content=body_text)

print("DONE!")

In [None]:
# For convenience, we're overriding the standard fragment formatter
class BracketFormatter(highlight.Formatter):
    """Highlight formatter that wraps each matched term in double square brackets."""

    def format_token(self, text, token, replace=False):
        # highlight.get_text() supplies the text corresponding to the token;
        # the returned string is how the match appears in the highlighted output.
        return "[[%s]]" % highlight.get_text(text, token, replace)

In [None]:
# Try searching
from whoosh.qparser import QueryParser, MultifieldParser
import whoosh.qparser as qparser

search_term = "Notion"


def extract_hit_info(hit):
    """Convert a search Hit into a plain dict, including highlighted excerpts."""
    info = {name: hit.get(name) for name in ('title', 'publication', 'author', 'url')}
    info['highlights'] = hit.highlights("content", top=3)
    info['published'] = hit.get('published')
    info['score'] = hit.score
    return info


with index.searcher() as searcher:
    # Query the `content` field.
    # Fuzzy matching (FuzzyTermPlugin) was tried and kinda screws things up,
    # so it is deliberately not enabled.
    parser = QueryParser("content", index.schema)
    # Allow searching for entire phrases w/ single quotes, like 'microsoft teams'
    parser.add_plugin(qparser.SingleQuotePlugin())

    query = parser.parse(search_term)
    results = searcher.search(query, limit=None)

    # Highlighting: more context characters around the searched-for text ...
    results.fragmenter.surround = 50
    results.fragmenter.maxchars = 500
    # ... and matched terms surrounded with brackets.
    results.formatter = BracketFormatter()

    # Hits are converted while the searcher is still open.
    hit_list = [extract_hit_info(h) for h in results]

    print(hit_list)

In [None]:
# Experimental: inspect the feed-level metadata of a single feed.

news_feed = feedparser.parse("https://diff.substack.com/feed")
feed_metadata = news_feed['feed']
print(feed_metadata)

In [None]:
# TODO: read from the thinkpiecer module and use it here, instead of writing custom code

In [None]:
# Small scratch dict for trying out dict lookups.
z = dict(a=5, b=3)

In [None]:
# dict.get() hands back None for an absent key instead of raising KeyError.
missing_value = z.get('c')
print(missing_value)

In [None]:
# Search for the newest items.

from whoosh.qparser import QueryParser, MultifieldParser
import whoosh.qparser as qparser
from whoosh.qparser.dateparse import DateParserPlugin

# Date range expression, wrapped in single quotes so it is parsed as one phrase.
search_term = "'-26 weeks to now'"


def extract_hit_info(hit):
    """Convert a search Hit into a plain dict (no highlights for date queries)."""
    info = {
        name: hit.get(name)
        for name in ('title', 'publication', 'author', 'url', 'published')
    }
    info['score'] = hit.score
    return info


with index.searcher() as searcher:
    # Query the `published` field.
    # Fuzzy matching (FuzzyTermPlugin) kinda screws things up, so it stays off.
    parser = QueryParser("published", index.schema)
    # SingleQuotePlugin: allow entire phrases w/ single quotes, like 'microsoft teams'.
    # DateParserPlugin: added so the parser can handle the date expression above.
    for plugin in (qparser.SingleQuotePlugin(), DateParserPlugin()):
        parser.add_plugin(plugin)

    query = parser.parse(search_term)
    # Sort on `published` with reverse=True, capped at 50 hits.
    results = searcher.search(query, limit=50, sortedby="published", reverse=True)

    # The fragmenter/formatter highlighting settings are intentionally not
    # applied in this cell.

    # Hits are converted while the searcher is still open.
    hit_list = [extract_hit_info(h) for h in results]

    print(hit_list)

In [8]:
from thinkpiecer import *

# Let's try the real code: load the real index and look at what is in it.
ix = load_index()

# Figure out the distribution of word counts. My theory is that there are a lot of
# low-quality, "preview-only / paywalled" pieces contained here and they should be
# filtered out. Is there a breakpoint from bad to good?
recs = get_recent_articles(ix)
wcs = [rec['content_words'] for rec in recs]
print(sorted(wcs))

# Inspect the articles with word counts strictly between 150 and 250.
midquals = [rec for rec in recs if 150 < rec['content_words'] < 250]
print(midquals)

[11, 12, 12, 13, 13, 14, 16, 18, 21, 23, 29, 35, 43, 51, 52, 53, 55, 55, 55, 55, 55, 55, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, 59, 59, 60, 60, 60, 60, 60, 60, 61, 61, 62, 62, 63, 63, 63, 64, 64, 65, 65, 65, 65, 65, 65, 65, 67, 67, 67, 68, 69, 70, 70, 71, 72, 72, 73, 73, 79, 80, 118, 136, 138, 168, 204, 218, 252, 258, 297, 303, 323, 358, 368, 419, 445, 446, 467, 522, 525, 556, 571, 646, 654, 660, 665, 678, 683, 687, 695, 730, 738, 739, 779, 792, 793, 811, 843, 853, 868, 881, 889, 893, 895, 901, 924, 929, 938, 948, 956, 958, 1008, 1024, 1040, 1088, 1090, 1093, 1096, 1103, 1111, 1124, 1136, 1141, 1172, 1182, 1183, 1193, 1194, 1194, 1195, 1209, 1219, 1222, 1245, 1272, 1274, 1285, 1301, 1302, 1311, 1315, 1325, 1355, 1359, 1369, 1373, 1373, 1381, 1383, 1394, 1394, 1395, 1399, 1413, 1419, 1422, 1436, 1466, 1483, 1490, 1497, 1511, 1514, 1525, 1546, 1554, 1557, 1558, 1571, 1571, 1573, 1589, 1591, 1594, 1646, 1653, 1659, 1668, 1670, 1671, 1671, 1678, 1683, 1695, 1704, 1711, 1712, 1718, 1720, 1

In [None]:
# Word count
import re

line = " I am having a very -- nice @ 'day'."
# Each match of \w+ counts as one word, so punctuation such as "--", "@" and
# the quotes is not counted.
words = re.findall(r'\w+', line)
count = len(words)
print(count)