In [None]:
import csv

import pandas as pd
from bs4 import BeautifulSoup

# Pre-process the scraped data

## Reading

In [None]:
# Load the scraped newsletters; the cells below read its "from", "date" and "content" columns
df = pd.read_csv("../scraped_newsletters.csv")

In [None]:
# Preview the first rows of the raw data
df.head()

## Cleaning and filtering the sender

In [None]:
# TODO Running this we still got confirmation emails in the data, add a second filter!
# regex=False: the sender fragment is matched literally, not as a regular expression.
# na=False: a missing sender counts as "no match", so the mask is always strictly boolean.
is_datascienceweekly = df["from"].str.contains("datascienceweekly", regex=False, na=False)
datascienceweekly_newsletter_df = (
    df[is_datascienceweekly]
    .reset_index(drop=True)
    .assign(newsletter="datascienceweekly")
)

In [None]:
# TODO Running this we still got a crypto newsletter in the data, add a second filter!
# regex=False: the sender fragment is matched literally, not as a regular expression.
# na=False: a missing sender counts as "no match", so the mask is always strictly boolean.
is_tldr = df["from"].str.contains("tldr", regex=False, na=False)
tldr_newsletter_df = (
    df[is_tldr]
    .reset_index(drop=True)
    .assign(newsletter="tldr")
)

In [None]:
# TODO Running this we still got confirmation emails in the data, add a second filter!
# Box of Amazing is identified by an exact match on the sender address.
is_box_of_amazing = df["from"].eq("rahim@rahimhirji.com")
box_of_amazing_newsletter_df = (
    df.loc[is_box_of_amazing]
    .reset_index(drop=True)
    .assign(newsletter="box of amazing")
)

## Save the original HTMLs for later lookup

In [None]:
# Stack the three filtered frames and keep only the columns needed for the HTML lookup
newsletter_frames = [
    datascienceweekly_newsletter_df,
    tldr_newsletter_df,
    box_of_amazing_newsletter_df,
]
lookup_columns = ["newsletter", "date", "content"]
lookup_df = pd.concat(newsletter_frames)[lookup_columns]

In [None]:
# csv.QUOTE_ALL (the named form of quoting=1) wraps every field in quotes
lookup_df.to_csv("html_lookup.csv", index=False, quoting=csv.QUOTE_ALL)

## Processing the HTMLs

### DataScienceWeekly

In [None]:
def get_split_stories_datascienceweekly(html_string, date):
    """Split one DataScienceWeekly newsletter into its individual stories.

    Every <ul> tag that contains a <font> tag is treated as one story.

    NOTE(review): the headline/body split assumes that the first non-empty
    child of the <font> tag is the headline and that everything after it is
    the body -- TODO confirm against a few real newsletters.

    Args:
        html_string: Raw HTML of a single newsletter issue.
        date: Date of the issue; copied unchanged into every story.

    Returns:
        A list of dicts with the keys "newsletter", "date", "headline" and
        "body", one per story found (empty if none were found).
    """
    data = BeautifulSoup(html_string, "html.parser")

    processed_data = []

    for ul in data.find_all("ul"):
        font = ul.find("font")
        if font is None:
            # A <ul> without a <font> tag has nothing to extract; skip it.
            continue

        # Text of each direct child: NavigableString children are plain
        # strings (str subclass), Tag children expose get_text().
        child_texts = [
            (str(child) if isinstance(child, str) else child.get_text()).strip()
            for child in font.children
        ]
        # Drop whitespace-only children such as line breaks between elements.
        child_texts = [text for text in child_texts if text]
        if not child_texts:
            continue

        headline, *body_parts = child_texts

        processed_data.append({
            "newsletter": "datascienceweekly",
            "date": date,
            "headline": headline,
            "body": " ".join(body_parts),
        })

    return processed_data

In [None]:
# Spot check: run the splitter on one randomly drawn newsletter
example = datascienceweekly_newsletter_df.sample(1)
example_html = example["content"].item()
example_date = example["date"].item()
pd.DataFrame(get_split_stories_datascienceweekly(example_html, example_date))

In [None]:
# Split every newsletter into its stories: the result has one row per full
# newsletter, holding the list of story dicts returned by the splitter
temp_df = datascienceweekly_newsletter_df.apply(
    lambda x: get_split_stories_datascienceweekly(x["content"], x["date"]),
    axis=1,
)

In [None]:
# Flatten the per-newsletter lists of stories into one row per story.
# Series.to_list() replaces Series.ravel(), which is deprecated since pandas 2.2.
final_df = pd.DataFrame([story for stories in temp_df.to_list() for story in stories])

In [None]:
# Again some random checking if everything looks good
# (capped at the frame's length: sample() raises when asked for more rows than exist)
final_df.sample(min(10, len(final_df)))

In [None]:
# Save it to disk; csv.QUOTE_ALL (the named form of quoting=1) wraps every field in quotes
final_df.to_csv("datascienceweekly_stories.csv", index=False, quoting=csv.QUOTE_ALL)

### TLDR

In [None]:
def get_split_stories_tldr(html_string, date):
    """Split one TLDR newsletter into its individual stories.

    A story is a <div class="text-block"> whose first <span> contains a link.
    The headline is the text of the first <span> nested inside that span.
    Blocks without a <span>, without a link or without a nested <span> are
    skipped instead of raising.

    NOTE(review): the body is taken from the second nested <span>, mirroring
    the headline lookup -- TODO confirm against a few real newsletters.
    Blocks without a second nested <span> get an empty body.

    Args:
        html_string: Raw HTML of a single newsletter issue.
        date: Date of the issue; copied unchanged into every story.

    Returns:
        A list of dicts with the keys "newsletter", "date", "headline" and
        "body", one per story found (empty if none were found).
    """
    data = BeautifulSoup(html_string, "html.parser")

    articles = []
    # TODO check if all text blocks make sense or if we want to drop some of them
    for block in data.find_all("div", {"class": "text-block"}):
        outer_span = block.find("span")
        # find() returns None when nothing matches, so guard before chaining.
        if outer_span is None or outer_span.find("a") is None:
            continue

        inner_spans = outer_span.find_all("span")
        if not inner_spans:
            # No nested <span> means there is no headline to extract.
            continue

        articles.append({
            "newsletter": "TLDR",
            "date": date,
            "headline": inner_spans[0].text,
            "body": inner_spans[1].text if len(inner_spans) > 1 else "",
        })

    return articles

In [None]:
# Spot check: run the splitter on one randomly drawn newsletter
example = tldr_newsletter_df.sample(1)
pd.DataFrame(get_split_stories_tldr(example["content"].item(), example["date"].item()))

In [None]:
# Split every newsletter into its stories: the result has one row per full
# newsletter, holding the list of story dicts returned by the splitter
temp_df = tldr_newsletter_df.apply(
    lambda row: get_split_stories_tldr(html_string=row["content"], date=row["date"]),
    axis=1,
)

In [None]:
# Flatten the per-newsletter lists of stories into one row per story.
# Series.to_list() replaces Series.ravel(), which is deprecated since pandas 2.2.
final_df = pd.DataFrame([story for stories in temp_df.to_list() for story in stories])

In [None]:
# Again some random checking if everything looks good
# (capped at the frame's length: sample() raises when asked for more rows than exist)
final_df.sample(min(10, len(final_df)))

In [None]:
# Save it to disk; csv.QUOTE_ALL (the named form of quoting=1) wraps every field in quotes
final_df.to_csv("tldr_stories.csv", index=False, quoting=csv.QUOTE_ALL)

### Box of Amazing

In [None]:
def get_split_stories_box_of_amazing(html_string, date):
    """Split one Box of Amazing newsletter into its individual stories.

    Headlines are the texts of the <a> tags carrying the headline link style.

    NOTE: the body lookup is still a placeholder. find_all() without a filter
    returns every tag of the document, so body i is currently the stripped
    text of the i-th tag in the document, not the text that belongs to
    headline i.

    Args:
        html_string: Raw HTML of a single newsletter issue.
        date: Date of the issue; copied unchanged into every story.

    Returns:
        A list of dicts with the keys "newsletter", "date", "headline" and
        "body", one per headline found.
    """
    soup = BeautifulSoup(html_string, "html.parser")

    # Headlines are links, recognisable by the inline style they carry.
    headline_style = {"style": "color: #3498DB; text-decoration: none;"}
    headlines = [link.text for link in soup.find_all("a", headline_style)]

    # TODO look for similar attributes to extract the texts and find the relevant HTML tag
    texts = [tag.text for tag in soup.find_all()]

    # Every headline link is itself among the tags in `texts`, so pairing by
    # position never runs out of texts before it runs out of headlines.
    return [
        {
            "newsletter": "Box of Amazing",
            "date": date,
            "headline": headline,
            "body": text.strip(),
        }
        for headline, text in zip(headlines, texts)
    ]

In [None]:
# Spot check: run the splitter on one randomly drawn newsletter
example = box_of_amazing_newsletter_df.sample(1)
example_html = example["content"].item()
example_date = example["date"].item()
pd.DataFrame(get_split_stories_box_of_amazing(example_html, example_date))

In [None]:
# Split every newsletter into its stories: the result has one row per full
# newsletter, holding the list of story dicts returned by the splitter
temp_df = box_of_amazing_newsletter_df.apply(
    lambda row: get_split_stories_box_of_amazing(html_string=row["content"], date=row["date"]),
    axis=1,
)

In [None]:
# Flatten the per-newsletter lists of stories into one row per story.
# Series.to_list() replaces Series.ravel(), which is deprecated since pandas 2.2.
final_df = pd.DataFrame([story for stories in temp_df.to_list() for story in stories])

In [None]:
# Again some random checking if everything looks good
# (capped at the frame's length: sample() raises when asked for more rows than exist)
final_df.sample(min(10, len(final_df)))

In [None]:
# Save it to disk; csv.QUOTE_ALL (the named form of quoting=1) wraps every field in quotes
final_df.to_csv("box_of_amazing_stories.csv", index=False, quoting=csv.QUOTE_ALL)

## Put it all together

In [None]:
# Read the per-newsletter story files back in, using the same quoting mode
# (csv.QUOTE_ALL, the named form of quoting=1) that was used to write them
box_of_amazing_stories = pd.read_csv("box_of_amazing_stories.csv", quoting=csv.QUOTE_ALL)
tldr_stories = pd.read_csv("tldr_stories.csv", quoting=csv.QUOTE_ALL)
datascienceweekly_stories = pd.read_csv("datascienceweekly_stories.csv", quoting=csv.QUOTE_ALL)

In [None]:
# Stack the three story frames; ignore_index=True renumbers the rows 0..n-1 in one step
story_frames = [box_of_amazing_stories, tldr_stories, datascienceweekly_stories]
all_stories = pd.concat(story_frames, ignore_index=True)

In [None]:
# Use each story's index label as its ID
all_stories["ID"] = all_stories.index.to_list()

In [None]:
# csv.QUOTE_ALL (the named form of quoting=1) wraps every field in quotes
all_stories.to_csv("all_newsletter_stories.csv", index=False, quoting=csv.QUOTE_ALL)