In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# Pre-process the scraped data

## Reading

In [None]:
# Load every scraped mail into one frame.
df = pd.read_csv(filepath_or_buffer="scraped_newsletters.csv")

In [None]:
# Peek at the first five scraped mails.
df.head(n=5)

## Cleaning and filtering the sender

In [None]:
# DataScienceWeekly issues: the sender mentions "datascienceweekly" and the
# subject mentions "Issue". na=False counts rows with a missing sender/subject
# as non-matching instead of leaving NaN in the mask; regex=False matches the
# text literally.
datascienceweekly_newsletter_df = df[
    df["from"].str.contains("datascienceweekly", na=False, regex=False)
    & df["subject"].str.contains("Issue", na=False, regex=False)
].reset_index(drop=True)
datascienceweekly_newsletter_df["newsletter"] = "datascienceweekly"

In [None]:
# TLDR issues: the sender mentions "tldr" but not "crypto".
# na=False keeps both masks strictly boolean, so a missing sender neither
# breaks the `~` negation nor slips through the filter; regex=False matches
# the text literally.
tldr_newsletter_df = df[
    df["from"].str.contains("tldr", na=False, regex=False)
    & ~df["from"].str.contains("crypto", na=False, regex=False)
].reset_index(drop=True)
tldr_newsletter_df["newsletter"] = "tldr"

In [None]:
# Box of Amazing issues: mails from this sender with a date after 2022-04-01.
# NOTE(review): the date filter compares against a string, so it presumably
# relies on `date` being stored in ISO (YYYY-MM-DD...) form — confirm against the CSV.
box_of_amazing_newsletter_df = (
    df.loc[(df["from"] == "rahim@rahimhirji.com") & (df["date"] > "2022-04-01")]
    .reset_index(drop=True)
    .assign(newsletter="box of amazing")
)

## Save the original HTMLs for later lookup

In [97]:
# Write one lookup file per newsletter holding the untouched HTML.
# quoting=1 is csv.QUOTE_ALL, i.e. every field is quoted.
datascienceweekly_newsletter_df.loc[:, ["newsletter", "date", "content"]].to_csv(
    path_or_buf="datascienceweekly_lookup.csv", quoting=1, index=False
)
tldr_newsletter_df.loc[:, ["newsletter", "date", "content"]].to_csv(
    path_or_buf="tldr_lookup.csv", quoting=1, index=False
)
box_of_amazing_newsletter_df.loc[:, ["newsletter", "date", "content"]].to_csv(
    path_or_buf="box_of_amazing_lookup.csv", quoting=1, index=False
)

## Processing the HTMLs

### DataScienceWeekly

In [None]:
def get_split_stories_datascienceweekly(html_string, date):
    """Split one DataScienceWeekly issue into its individual stories.

    Every ``<ul>`` in the mail is treated as a candidate story: the first child
    of its ``<font>`` tag is used as the headline and the children from the
    third one onwards are joined into the body.

    Args:
        html_string: Raw HTML of a single newsletter issue.
        date: Date of the issue; copied unchanged into every story record.

    Returns:
        A list of dicts with the keys ``newsletter``, ``date``, ``headline``
        and ``body`` -- one per ``<ul>`` that contains a non-empty ``<font>``
        tag. Lists without one are skipped.
    """
    data = BeautifulSoup(html_string, "html.parser")
    processed_data = []
    # Last section heading seen. Only needed once the commented-out "topic"
    # field below is enabled; initialised so enabling it cannot hit an
    # unbound name.
    current_topic = None

    for ul in data.find_all("ul"):
        # Step back twice to reach the previous tag and not the filler between
        # tags, see https://www.crummy.com/software/BeautifulSoup/bs4/doc/#next-sibling-and-previous-sibling
        # Either hop can be None (e.g. the list is the first child of its parent).
        previous = ul.previous_sibling
        if previous is not None:
            previous = previous.previous_sibling

        # Get the topic header; tolerate a missing tag or a missing style attribute
        if (
            getattr(previous, "name", None) == "h2"
            and "#34495e" in (previous.get("style") or "")
        ):
            current_topic = previous.text.strip()

        # Deprecated HTML tag, but they use it anyway
        font = ul.find("font")
        children = list(font.children) if font is not None else []
        if not children:
            # Not a story list: no <font> wrapper, or an empty one.
            continue

        headline = children[0].text.strip()
        # <br> tags and whitespace-only text nodes yield empty text; dropping
        # the empty fragments keeps the joined body free of doubled spaces.
        fragments = (child.text.strip() for child in children[2:])
        body = " ".join(fragment for fragment in fragments if fragment)

        # TODO maybe add the hyperlink to the article here
        processed_data.append({
            "newsletter" : "datascienceweekly",
            "date" : date,
            #"topic" : current_topic,
            "headline" : headline,
            "body" : body
        })

    return processed_data

In [None]:
# Smoke test: split one randomly drawn issue and show its stories
example = datascienceweekly_newsletter_df.sample(n=1)
pd.DataFrame(
    get_split_stories_datascienceweekly(
        html_string=example["content"].item(),
        date=example["date"].item(),
    )
)

In [None]:
# Run the splitter on every issue; the result holds one entry (a list of story
# dicts) per full newsletter
temp_df = datascienceweekly_newsletter_df.apply(
    lambda row: get_split_stories_datascienceweekly(row["content"], row["date"]),
    axis=1,
)

In [None]:
# Flatten the per-newsletter story lists into the final form: one row per story.
# The Series is iterated directly instead of via Series.ravel(), which is
# deprecated since pandas 2.2.
final_df = pd.DataFrame([story for stories in temp_df for story in stories])

In [None]:
# Again some random checking if everything looks good.
# The sample size is capped so the cell also works when fewer than 10 stories
# were extracted (sample() raises if asked for more rows than exist).
final_df.sample(min(10, len(final_df)))

In [None]:
# Persist the DataScienceWeekly stories; quoting=1 (csv.QUOTE_ALL) quotes every field
final_df.to_csv(path_or_buf="datascienceweekly_stories.csv", quoting=1, index=False)

### TLDR

In [None]:
def get_split_stories_tldr(html_string, date):
    """Split one TLDR issue into its individual stories.

    A candidate story is a ``<div class="text-block">`` whose first ``<span>``
    contains a link. Inside that span, the first nested ``<span>`` is taken as
    the headline and the second one as the body.

    Args:
        html_string: Raw HTML of a single newsletter issue.
        date: Date of the issue; copied unchanged into every story record.

    Returns:
        A list of dicts with the keys ``newsletter``, ``date``, ``headline``
        and ``body``. Blocks without a span, or without a headline/body span
        pair, are skipped instead of raising.
    """
    data = BeautifulSoup(html_string, "html.parser")

    linked_spans = []
    for block in data.find_all("div", {"class": "text-block"}):
        outer_span = block.find("span")
        # A block without any <span> cannot hold a story; skip it rather than
        # calling .find() on None.
        if outer_span is not None and outer_span.find("a") is not None:
            linked_spans.append(outer_span)

    # The last two linked blocks are dropped.
    # NOTE(review): presumably footer/sponsor links rather than stories — confirm.
    linked_spans = linked_spans[:-2]

    articles = []
    for outer_span in linked_spans:
        parts = outer_span.find_all("span")
        if len(parts) < 2:
            # Headline/body pair is incomplete; skip this block.
            continue
        articles.append({
            "newsletter": "TLDR",
            "date": date,
            # "topic" : "",
            "headline": parts[0].text,
            "body": parts[1].text
        })

    return articles

In [None]:
# Smoke test: split one randomly drawn issue and show its stories
example = tldr_newsletter_df.sample(n=1)
pd.DataFrame(
    get_split_stories_tldr(
        html_string=example["content"].item(),
        date=example["date"].item(),
    )
)

In [None]:
# Run the splitter on every issue; the result holds one entry (a list of story
# dicts) per full newsletter
temp_df = tldr_newsletter_df.apply(
    lambda row: get_split_stories_tldr(row["content"], row["date"]),
    axis=1,
)

In [None]:
# Flatten the per-newsletter story lists into the final form: one row per story.
# The Series is iterated directly instead of via Series.ravel(), which is
# deprecated since pandas 2.2.
final_df = pd.DataFrame([story for stories in temp_df for story in stories])

In [None]:
# Again some random checking if everything looks good.
# The sample size is capped so the cell also works when fewer than 10 stories
# were extracted (sample() raises if asked for more rows than exist).
final_df.sample(min(10, len(final_df)))

In [None]:
# Persist the TLDR stories; quoting=1 (csv.QUOTE_ALL) quotes every field
final_df.to_csv(path_or_buf="tldr_stories.csv", quoting=1, index=False)

### Box of Amazing

In [None]:
def get_split_stories_box_of_amazing(html_string, date):
    """Split one Box of Amazing issue into its individual stories.

    Headlines are the texts of the ``<a>`` tags carrying the newsletter's link
    style; bodies are the texts of the ``<div class="link-description">`` tags.
    The two lists are paired by position.

    Args:
        html_string: Raw HTML of a single newsletter issue.
        date: Date of the issue; copied unchanged into every story record.

    Returns:
        A list of dicts with the keys ``newsletter``, ``date``, ``headline``
        and ``body`` (body stripped of surrounding whitespace), one per headline.

    Raises:
        IndexError: If the issue has fewer descriptions than headlines.
    """
    soup = BeautifulSoup(html_string, "html.parser")
    link_tags = soup.find_all("a", {"style": "color: #3498DB; text-decoration: none;"})
    description_tags = soup.find_all("div", {"class": "link-description"})

    headlines = [tag.text for tag in link_tags]
    texts = [tag.text for tag in description_tags]

    # Positional pairing: the n-th headline belongs to the n-th description.
    return [
        {
            "newsletter": "Box of Amazing",
            "date": date,
            "headline": headline,
            "body": texts[position].strip(),
        }
        for position, headline in enumerate(headlines)
    ]

In [None]:
# Smoke test: split one randomly drawn issue and show its stories
example = box_of_amazing_newsletter_df.sample(n=1)
pd.DataFrame(
    get_split_stories_box_of_amazing(
        html_string=example["content"].item(),
        date=example["date"].item(),
    )
)

In [None]:
# Run the splitter on every issue; the result holds one entry (a list of story
# dicts) per full newsletter
temp_df = box_of_amazing_newsletter_df.apply(
    lambda row: get_split_stories_box_of_amazing(row["content"], row["date"]),
    axis=1,
)

In [None]:
# Flatten the per-newsletter story lists into the final form: one row per story.
# The Series is iterated directly instead of via Series.ravel(), which is
# deprecated since pandas 2.2.
final_df = pd.DataFrame([story for stories in temp_df for story in stories])

In [None]:
# Again some random checking if everything looks good.
# The sample size is capped so the cell also works when fewer than 10 stories
# were extracted (sample() raises if asked for more rows than exist).
final_df.sample(min(10, len(final_df)))

In [None]:
# Persist the Box of Amazing stories; quoting=1 (csv.QUOTE_ALL) quotes every field
final_df.to_csv(path_or_buf="box_of_amazing_stories.csv", quoting=1, index=False)

## Put it all together

In [3]:
# Reload the three per-newsletter story files written by the sections above.
# quoting=1 is csv.QUOTE_ALL, matching how the files were written.
box_of_amazing_stories = pd.read_csv(
    filepath_or_buffer="box_of_amazing_stories.csv", quoting=1
)
tldr_stories = pd.read_csv(
    filepath_or_buffer="tldr_stories.csv", quoting=1
)
datascienceweekly_stories = pd.read_csv(
    filepath_or_buffer="datascienceweekly_stories.csv", quoting=1
)

In [10]:
# Stack the three story frames and renumber the rows 0..n-1 in one step.
all_stories = pd.concat(
    [box_of_amazing_stories, tldr_stories, datascienceweekly_stories],
    ignore_index=True,
)

In [12]:
# Use the row position as a story ID.
all_stories["ID"] = all_stories.index.tolist()

In [14]:
# Persist the combined stories; quoting=1 (csv.QUOTE_ALL) quotes every field.
all_stories.to_csv(path_or_buf="all_newsletter_stories.csv", quoting=1, index=False)