* Adds scraped.jsonl together with chunks and articles from the download_chunks step.
* Also cleans up scraped articles to match the same format.

In [144]:
import json 
import pandas as pd

In [145]:
# Pipeline parameters. All three are interpolated into the names of the
# parquet files this notebook reads and writes.
MIN_COUNT = 25  # also the minimum `count` a scraped article needs to be kept
STRIDE = 20  # presumably the chunk stride used by the upstream step — TODO confirm
LENGTH = 40  # presumably the chunk length used by the upstream step — TODO confirm

In [146]:
# Load the previously downloaded articles for the current parameter set.
articles_path = f'data/articles_top{MIN_COUNT}_len{LENGTH}_stride{STRIDE}.parquet'
articles = pd.read_parquet(articles_path)

# Highest id already in use; scraped articles are numbered after it.
max_article_id = articles['article_id'].max()

In [147]:
def truncate_text(text):
    """
    Cut an article off at its trailing references / notes section.

    The text is searched for the marker 'References' first and 'Notes'
    second. For the first marker that is present, everything from its
    first occurrence onwards is discarded. When neither marker occurs the
    text is returned unchanged. The match is case-sensitive.

    A 'Citations' marker is intentionally not used (it was tried and
    disabled).
    """
    # Order matters: 'References' takes priority over 'Notes'.
    for marker in ('References', 'Notes'):
        if marker in text:
            return text.partition(marker)[0]
    return text

In [148]:
# Load the scraped articles: one JSON object per line. The file is decoded
# explicitly as UTF-8 so non-ASCII article text does not depend on the
# platform's default encoding, and blank lines are skipped instead of
# crashing json.loads.
with open('data/scraped.jsonl', 'r', encoding='utf-8') as file:
    scraped = pd.DataFrame([json.loads(line) for line in file if line.strip()])

# scraped.jsonl may be cached from an earlier pipeline run.
scraped = scraped[scraped['count'] >= MIN_COUNT]

# Exclude user pages ('user:' is matched literally, anywhere in the title)
scraped = scraped[~scraped['lower_title'].str.contains('user:', regex=False)]

# For URLs that were scraped more than once, keep the row with the highest count.
scraped = scraped.sort_values('count', ascending=False)
scraped = scraped.drop_duplicates(subset=['url'], keep='first')

# Renumber rows 0..n-1 (discarding the old index) so the new article ids
# continue directly after the largest id of the downloaded articles.
scraped = scraped.reset_index(drop=True)
scraped['article_id'] = scraped.index + max_article_id + 1

# Scraped rows reuse the lower-cased title as their title.
scraped['title'] = scraped.lower_title

# Truncation of the trailing references/notes section is currently disabled.
# scraped['text'] = scraped['text'].apply(truncate_text)

In [149]:
# Row count of the downloaded articles before anything is added.
n_downloaded = len(articles)
print(n_downloaded)

# Stack the scraped rows underneath the downloaded ones.
articles = pd.concat([articles, scraped], axis=0)
n_combined = len(articles)
print(n_combined)

# The scraped frame is no longer needed once it has been appended.
del scraped

# Normalise surrounding whitespace so titles compare equal when deduplicating.
articles['lower_title'] = articles['lower_title'].map(lambda title: title.strip())

# Sorting by article_id first means that, for a repeated title, the row with
# the lowest id is the one that is kept.
articles = articles.sort_values('article_id')
articles = articles.drop_duplicates('lower_title', keep='first')
print(len(articles))

combined_path = f'data/combined_articles_top{MIN_COUNT}_len{LENGTH}_stride{STRIDE}.parquet'
articles.to_parquet(combined_path)

82040
91723
91228
