In [None]:
import feedparser

from bs4 import BeautifulSoup

from whoosh.index import create_in
from whoosh.fields import *
from whoosh.qparser import QueryParser, FuzzyTermPlugin
from whoosh import highlight

import calendar
from datetime import datetime

In [None]:
import feedparser

# Fetch one feed so we can inspect what a parsed entry looks like.
NewsFeed = feedparser.parse("https://thegeneralist.substack.com/feed")

# feedparser reports download/parse problems through its `bozo` fields rather
# than raising, so a failed fetch shows up here as an empty `entries` list.
# Stop with a readable message instead of an IndexError on `entries[0]`.
if not NewsFeed.entries:
    raise RuntimeError(
        "Feed returned no entries: {0!r}".format(NewsFeed.get('bozo_exception')))

entry = NewsFeed.entries[0]
print(entry)

In [None]:
# Print the individual fields of the sample entry, one per line.
for field in ('title', 'author', 'summary', 'link', 'published_parsed'):
    print(entry[field])

# `content` is a list; keep the `value` of its first item for the next cell.
content = entry['content'][0]['value']
print(content)

In [None]:
# Turn HTML into clean text for nice searching

from bs4 import BeautifulSoup
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the extracted text could
# differ from one machine to the next.
tree = BeautifulSoup(content, "html.parser")

# This validates and cleans HTML but Substack HTML is already fine
# pretty = tree.prettify()
# print(pretty)

# Join every text node with a newline separator.
pure_text = tree.get_text("\n")
print(pure_text)

In [None]:
# Next, build a Whoosh database and search engine
# https://whoosh.readthedocs.io/en/latest/quickstart.html#a-quick-introduction

# Define schema (pretty simple)
# Add documents from all top Substacks
# Build a searcher
# Show results
# Highlight bits from the thinkpieces that match
# Optional: add stemming for better searching
# Later, add support for other newsletters with an RSS feed
# Maybe, later, offer `more_like_this` so people can rabbit-hole in

# Use case: I want to find all articles from my favorite writers that include the term
# "BNPL" since I want to research "Buy Now, Pay Later" in fintech.
# Or I want to find all thinkpieces that mentioned Google in the last year

In [None]:
# Collect every RSS feed URL we want to index and search.

# Substack newsletters all expose their feed at the same path, so we only
# need to list each publication's subdomain.
substack_domains = [
    "thegeneralist",
    "danco",
    "diff",
    "nbt",
    "platformer",
    "notboring",
    "sariazout",
    "digitalnative",
    "jamesonstartups",
    "breakingsmart",
    "artofgig",
    "theskip",
    "gwern"
]

# e.g. "thegeneralist" -> https://thegeneralist.substack.com/feed
substack_feed_template = "https://{0}.substack.com/feed"
substack_feeds = [substack_feed_template.format(name) for name in substack_domains]

# Feeds hosted elsewhere are listed by their full URL.
# Medium feeds are medium.com/feed/@user or medium.com/feed/publication
custom_feeds = [
    "https://stratechery.com/feed/",
    "https://www.profgalloway.com/feed",
    "https://eugene-wei.squarespace.com/blog?format=rss",
    "https://medium.com/feed/@superwuster",
    "https://commoncog.com/blog/rss",
    "https://www.lennyrachitsky.com/feed",
    "https://medium.com/feed/bloated-mvp",
    "https://daringfireball.net/feeds/main",
    "https://wongmjane.com/api/feed/rss",
    "https://fourweekmba.com/feed",
]

# One combined list: the Substack feeds first, then the custom ones.
all_feeds = substack_feeds + custom_feeds
all_feeds

# Each URL in `all_feeds` is later passed to feedparser.parse(), e.g.
# feedparser.parse("https://thegeneralist.substack.com/feed")

In [None]:
# Convenience function to safely get an item from a dict.
# If the key doesn't exist, just returns None.
def safe_get(obj, key):
    """Return ``obj[key]``, or ``None`` when the key is missing.

    Args:
        obj: A dict-like object (plain ``dict`` or a feedparser result), or
            ``None``.
        key: The key to look up.

    Returns:
        The stored value, or ``None`` if ``obj`` is ``None`` or does not
        contain ``key``.
    """
    if obj is None:
        return None
    # Use `in` rather than `has_key()`: Python 3 dicts have no `has_key`
    # method, so the old call raised AttributeError on a plain dict.
    if key in obj:
        return obj[key]
    return None

In [None]:
# Allows optional case-sensitive searches: an all-lowercase query is case-insensitive,
# while a query containing any capital letter only matches that exact casing.

class CaseSensitivizer(analysis.Filter):
    """Token filter that also emits a lower-cased copy of tokens while indexing.

    Every incoming token is passed through unchanged. In "index" mode, a token
    whose text is not already lower-case is then emitted a second time with
    its text lower-cased. In any other mode nothing extra is emitted.
    """

    def __call__(self, tokens):
        for token in tokens:
            # Always emit the token exactly as it arrived.
            yield token

            if token.mode != "index":
                continue

            lowered = token.text.lower()
            if lowered == token.text:
                # Already lower-case: a second copy would just be a duplicate.
                continue

            # The same token object is reused, so overwrite its text in place.
            token.text = lowered
            yield token

# Analyzer for the `content` field: tokenize the text, then pass the tokens
# through CaseSensitivizer so that, in index mode, tokens containing
# upper-case letters are emitted in both their original and lower-cased form.
ana = analysis.RegexTokenizer() | CaseSensitivizer()
# Example:
# [t.text for t in ana("The new SuperTurbo 5000", mode="index")]
# ["The", "the", "new", "SuperTurbo", "superturbo", "5000"]



In [None]:
# Build the database
import os

INDEX_DIR = "whoosh_index2"

schema = Schema(
    title=TEXT(stored=True),
    author=TEXT(stored=True),
    publication=TEXT(stored=True),
    summary=TEXT(stored=True),
    url=TEXT(stored=True),
    published=DATETIME(stored=True, sortable=True),
    content=TEXT(stored=True, analyzer=ana))

# create_in() expects the directory to exist already, so create it on first run.
os.makedirs(INDEX_DIR, exist_ok=True)
index = create_in(INDEX_DIR, schema)


def _entry_published(entry):
    """Return the entry's publication time as a datetime, or None if it has none."""
    parsed = safe_get(entry, 'published_parsed')
    if parsed is None:
        # Missing or unparseable date: index the entry without a `published` value.
        return None
    # timegm() reads the struct_time as UTC; fromtimestamp() then yields a naive
    # datetime in the local timezone of the machine running the notebook.
    return datetime.fromtimestamp(calendar.timegm(parsed))


def _entry_body_text(entry):
    """Return the entry's main text with the HTML markup stripped."""
    # Most feeds put the main content in `content`,
    # but a rare few like Eugene Wei put it in `summary`
    # (in which case `content` is empty).
    content_holder = safe_get(entry, 'content')
    if content_holder:
        html = content_holder[0]['value']
    else:
        # No content provided. `summary` must hold all the text; fall back to
        # an empty string when even that is missing.
        html = safe_get(entry, 'summary') or ""
    return BeautifulSoup(html, "html.parser").get_text(" ", strip=True)


# Read every item from our RSS feeds into the index.
# Used as a context manager, the writer commits when the block finishes and
# cancels if an exception escapes, so a failed run does not leave a
# half-finished writer holding the index.
with index.writer() as writer:
    for feed_url in all_feeds:
        print(feed_url)
        news_feed = feedparser.parse(feed_url)

        # NOTE: we can only get the last few entries from this RSS feed.
        # Substack doesn't seem to show anything older than the last 20.
        # So we should build in a system to start caching these.

        # Get publication name. This is in the feed's `feed` field, along with
        # other metadata. It is the same for every entry, so look it up once
        # per feed.
        publication = None
        metadata = safe_get(news_feed, 'feed')
        if metadata is not None:
            publication = safe_get(metadata, 'title')

        for entry in news_feed.entries:
            writer.add_document(
                title=safe_get(entry, 'title'),
                author=safe_get(entry, 'author'),
                publication=publication,
                summary=safe_get(entry, 'summary'),
                url=safe_get(entry, 'link'),
                published=_entry_published(entry),
                content=_entry_body_text(entry))

# Only reached after the writer has committed.
print("DONE!")

In [None]:
# Custom highlight formatter used instead of Whoosh's standard one.
class BracketFormatter(highlight.Formatter):
    """Highlight formatter that wraps each matched term in double square brackets."""

    def format_token(self, text, token, replace=False):
        # get_text() returns the text that corresponds to this token; wrap it
        # in [[ ]] so the match stands out in the highlighted string.
        return "[[%s]]" % highlight.get_text(text, token, replace)

In [None]:
# Try searching
from whoosh.qparser import QueryParser, MultifieldParser
import whoosh.qparser as qparser

search_term = "Notion"


def extract_hit_info(hit):
    """Copy one search hit's stored fields, highlights and score into a dict."""
    info = {field: hit.get(field) for field in ('title', 'publication', 'author', 'url')}
    # Up to three highlighted fragments taken from the article body.
    info['highlights'] = hit.highlights("content", top=3)
    info['published'] = hit.get('published')
    info['score'] = hit.score
    return info


with index.searcher() as searcher:
    # Search the article body by default.
    parser = QueryParser("content", index.schema)
    # Fuzzy matching is left off on purpose (EDIT: kinda screws things up)
    # parser.add_plugin(FuzzyTermPlugin())
    # Single quotes allow searching for whole phrases, like 'microsoft teams'
    parser.add_plugin(qparser.SingleQuotePlugin())

    query = parser.parse(search_term)
    # limit=None returns every matching document.
    results = searcher.search(query, limit=None)

    # Highlighting settings: show more context characters around each match...
    results.fragmenter.surround = 50
    results.fragmenter.maxchars = 500
    # ...and surround the matched terms with brackets.
    results.formatter = BracketFormatter()

    # Turn the hits into plain dicts while the searcher is still open.
    hit_list = list(map(extract_hit_info, results))

print(hit_list)

In [None]:
# Experimental

# Look at the feed-level metadata (everything under `feed`) of a single feed.
news_feed = feedparser.parse("https://diff.substack.com/feed")
print(news_feed.feed)

In [None]:
# TODO: read from the thinkpiecer module and use it here, instead of writing custom code

In [None]:
# Small throwaway dict; note that it has no 'c' key.
z = {'a':5, 'b':3}

In [None]:
# dict.get() returns None for a missing key instead of raising KeyError.
print(z.get('c'))

In [None]:
# Search for the newest items.

# Try searching
from whoosh.qparser import QueryParser, MultifieldParser
import whoosh.qparser as qparser
from whoosh.qparser.dateparse import DateParserPlugin

# Date range expression, wrapped in single quotes so it is parsed as one term.
search_term = "'-26 weeks to now'"


def extract_hit_info(hit):
    """Copy one search hit's stored fields and score into a dict."""
    info = {field: hit.get(field)
            for field in ('title', 'publication', 'author', 'url', 'published')}
    info['score'] = hit.score
    return info


with index.searcher() as searcher:
    # Query the `published` field by default.
    parser = QueryParser("published", index.schema)
    # Fuzzy matching is left off on purpose (EDIT: kinda screws things up)
    # parser.add_plugin(FuzzyTermPlugin())
    # Single quotes allow searching for whole phrases, like 'microsoft teams'
    parser.add_plugin(qparser.SingleQuotePlugin())
    # DateParserPlugin teaches the parser to read date expressions.
    parser.add_plugin(DateParserPlugin())

    query = parser.parse(search_term)
    # At most 50 hits, sorted by `published` in descending order (newest first).
    results = searcher.search(query, limit=50, sortedby="published", reverse=True)

    # Highlighting is not configured for this date-only search, so the hits
    # carry no `highlights` entry.
    hit_list = list(map(extract_hit_info, results))

print(hit_list)

In [1]:
# Import the module itself rather than `from thinkpiecer import *`: a star
# import copies every public name from thinkpiecer into the notebook, which can
# silently overwrite variables defined by earlier cells.
import thinkpiecer

# Let's try the real code
# Load the real index and search it

ix = thinkpiecer.load_index()

# Figure out the distribution of word counts. My theory is that there are a lot of
# low-quality, "preview-only / paywalled" pieces contained here and they should be filtered out.
# Is there a breakpoint from bad to good?
recs = thinkpiecer.get_recent_articles(ix)
print(recs)

[{'title': 'EHR interoperability: why it matters', 'publication': 'In Silico', 'author': 'Scott Xiao', 'url': 'https://insilico.substack.com/p/ehr-interoperability-why-it-matters', 'published': datetime.datetime(2020, 12, 14, 17, 44, 23), 'score': 63743564663000000}, {'title': 'Everybody Hates Facebook', 'publication': 'Not Boring by Packy McCormick', 'author': 'Packy McCormick', 'url': 'https://notboring.substack.com/p/everybody-hates-facecbook', 'published': datetime.datetime(2020, 12, 14, 9, 7, 21), 'score': 63743533641000000}, {'title': 'Everybody Hates Facebook', 'publication': 'Not Boring by Packy McCormick', 'author': 'Packy McCormick', 'url': 'https://notboring.substack.com/p/everybody-hates-facebook', 'published': datetime.datetime(2020, 12, 14, 8, 54, 46), 'score': 63743532886000000}, {'title': 'What comes after smartphones?', 'publication': 'Essays - Benedict Evans', 'author': 'Benedict Evans', 'url': 'https://www.ben-evans.com/benedictevans/2020/12/13/what-comes-after-smart

In [None]:
import calendar
# Bare expression on the cell's last line, so the notebook echoes the function object.
calendar.timegm

In [3]:
import thinkpiecer
import feedparser
from bs4 import BeautifulSoup
from whoosh import index
from whoosh.fields import *
from whoosh.writing import AsyncWriter
from whoosh.qparser import QueryParser
import whoosh.qparser as qparser
from whoosh.qparser.dateparse import DateParserPlugin
from whoosh import highlight
import calendar
from datetime import datetime
import os, os.path

# Local modules
import utilities
from utilities import safe_get
import feeds


In [6]:
# Try searching for recent high quality articles
ix = thinkpiecer.load_index()


def extract_hit_info(hit):
    """Copy one search hit's stored fields and score into a dict."""
    stored_fields = ('title', 'publication', 'author', 'url',
                     'published', 'content', 'content_word_count')
    info = {field: hit.get(field) for field in stored_fields}
    info['score'] = hit.score
    return info


# We have to search for SOMETHING, so match every article with a word count of
# at least 250; that also filters out the articles that are too short.
search_term = "content_word_count:>=250"

with ix.searcher() as searcher:
    parser = QueryParser("content_word_count", ix.schema)
    # GtLtPlugin is the "greater than, less than" plugin for numerical comparisons.
    parser.add_plugin(qparser.GtLtPlugin())
    query = parser.parse(search_term)

    # An alternative that was tried: query the `published` field for
    # '-26 weeks to now' with DateParserPlugin, to limit the amount of sorting.

    # Keep only the 10 most recently published matches.
    results = searcher.search(query,
        limit=10, sortedby="published", reverse=True)

    # Turn the hits into plain dicts while the searcher is still open.
    hit_list = list(map(extract_hit_info, results))

# Show only the article bodies.
print([h['content'] for h in hit_list])

['One of the clearest examples of healthcare’s backwardness is the lack of control patients have over their own health data. In today’s world, we take data access for granted in other industries — being able to send documents from one person to another, to aggregate meetings from multiple calendars into a single view, or to manage several bank accounts using a single app. It’s hard to imagine a similar experience in healthcare. Just the other day, a friend of mine was trying to figure out if she’d gotten all her tetanus shots when she was younger. It should’ve been a simple question, but the only answer I could think of was for her to call every single primary care provider she’s had since childhood and ask for her vaccination records. At that point, it would’ve been a coin flip as to whether they would fax the info or use the groundbreaking new technology of email. We’re going to dive into this issue today and look at how things work today as well as the slow changes taking place. If 

In [11]:
z = "Putting it All Together. Facebook is an advertising juggernaut on pace to do more than $80 billion in revenue this year at 80+% gross margins, which has only become more critical for advertisers during the pandemic. Because it’s hated, you get all of that at a slight discount without accounting for Facebook’s wild bets on the future. Just to be extra clear: this is not a value judgement. Facebook may very well be evil. As I was writing this post, I hopped on to Twitter, and this is the first tweet that popped into my feed: But because we all think Facebook is evil, we don’t spend much time trying to understand the full scope of the business behind the products on which we spend 65 minutes per day. What is Facebook? Facebook is so omnipresent that the analysis of the company is all trees, no forest. When I started researching this piece, I realized that despite reading Stratechery daily and spending a lot of time on tech twitter, I don’t fully understand everything the company does and how it all fits together. So let’s all admit that we’ve lost track, hit reset, zoom out, and admire the forest. Nearly half of the world’s population uses one or more of Facebook’s four main social media properties - Facebook , Messenger , Instagram , and WhatsApp . Facebook is also building what it hopes is the next computing platform through Facebook Reality Labs, which houses Oculus, Portal, Spark AR, CTRL-labs, and more. The main Facebook product had 1.8 billion Daily Active Users (DAUs) and 2.7 billion Monthly Active Users"
# First 250 characters of the string (a character slice, not a word count).
z[:250]

'Putting it All Together. Facebook is an advertising juggernaut on pace to do more than $80 billion in revenue this year at 80+% gross margins, which has only become more critical for advertisers during the pandemic. Because it’s hated, you get all of'