In [71]:
import json

def read_from_storage(filename: str) -> list:
    """Load the parsed JSON content of ``../Data/extract/<filename>.json``.

    Args:
        filename: Base name of the JSON file, without the ``.json`` extension.

    Returns:
        The deserialised JSON document stored in that file.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(f"../Data/extract/{filename}.json", 'r', encoding='utf-8') as f:
        return json.load(f)

In [72]:
# Load the two scraped article lists from ../Data/extract/.
science_articles, conspiracy_articles = (
    read_from_storage(corpus_name) for corpus_name in ('science', 'conspiracy')
)

In [73]:
# Truncate both lists to the length of the shorter one so that they hold
# the same number of articles.
min_len = min(map(len, (science_articles, conspiracy_articles)))
science_articles, conspiracy_articles = (
    science_articles[:min_len],
    conspiracy_articles[:min_len],
)

In [74]:
min_len

280

In [75]:
import pandas as pd
import numpy as np

In [76]:
# Build one DataFrame per article list.
df_science, df_conspiracy = (
    pd.DataFrame.from_dict(records)
    for records in (science_articles, conspiracy_articles)
)

In [77]:
# Label column: 1 marks rows from the science list, 0 rows from the conspiracy list.
df_science['article_type'], df_conspiracy['article_type'] = 1, 0

In [78]:
df_science

Unnamed: 0,title,link,article_type
0,\n \n \n Quercetin for Seasonal Allergies?\n \...,https://www.consumerlab.com/reviews/quercetin-...,1
1,\n \n \n How Many Times Should You Test for CO...,https://www.consumerlab.com/answers/how-and-wh...,1
2,\n 08/08/2022\n Woman Pleads Guilty for Sellin...,https://www.consumerlab.com/recalls/14684/woma...,1
3,\n 08/08/2022\n Seller of CBD Warned for COVID...,https://www.consumerlab.com/recalls/14683/sell...,1
4,\n \n Product Reviews and Answers to Questions...,https://www.consumerlab.com/topic/coronavirus/,1
...,...,...,...
275,David Oliver: The overwhelming reaction to my ...,https://www.bmj.com/content/378/bmj.o2017,1
276,Investigating the monkeypox outbreak,https://www.bmj.com/content/377/bmj.o1314,1
277,Monkeypox: what we know about the 2022 outbrea...,https://www.bmj.com/content/378/bmj.o2058,1
278,Risk of preterm birth and stillbirth after cov...,https://www.bmj.com/content/378/bmj-2022-071416,1


In [79]:
df_conspiracy

Unnamed: 0,title,link,article_type
0,Fauci to Step Down in December \xe2\x80\x94 Wi...,https://www.sgtreport.com/2022/08/fauci-to-ste...,0
1,\n\t\t\t\t\t\t\t\t\t\tJudging the Covid-19 Pan...,https://americanfreepress.net/judging-the-covi...,0
2,Is the COVID-19 Vaccine a Miracle?,https://biologos.org/post/is-the-covid-19-vacc...,0
3,Is the COVID-19 vaccine safe?,https://biologos.org/resources/is-the-covid-19...,0
4,A Christian Statement on Science for Pandemic ...,https://biologos.org/post/a-christian-statemen...,0
...,...,...,...
275,Most people infected with Omicron weren't even...,https://www.sott.net/article/471164-Most-peopl...,0
276,"Zoonotic Langya virus found in China, CDC says",https://www.sott.net/article/470863-Zoonotic-L...,0
277,CDC quietly removes 'claim' that spike protein...,https://www.sott.net/article/471150-CDC-quietl...,0
278,9/11 vs. COVID: The Sorry Tale of a Nation Gon...,https://www.veteranstoday.com/2022/08/12/9-11-...,0


In [80]:
# Stack the science rows on top of the conspiracy rows (row order is kept)
# and renumber the index from 0.
df = pd.concat(objs=(df_science, df_conspiracy), ignore_index=True)

In [81]:
df

Unnamed: 0,title,link,article_type
0,\n \n \n Quercetin for Seasonal Allergies?\n \...,https://www.consumerlab.com/reviews/quercetin-...,1
1,\n \n \n How Many Times Should You Test for CO...,https://www.consumerlab.com/answers/how-and-wh...,1
2,\n 08/08/2022\n Woman Pleads Guilty for Sellin...,https://www.consumerlab.com/recalls/14684/woma...,1
3,\n 08/08/2022\n Seller of CBD Warned for COVID...,https://www.consumerlab.com/recalls/14683/sell...,1
4,\n \n Product Reviews and Answers to Questions...,https://www.consumerlab.com/topic/coronavirus/,1
...,...,...,...
555,Most people infected with Omicron weren't even...,https://www.sott.net/article/471164-Most-peopl...,0
556,"Zoonotic Langya virus found in China, CDC says",https://www.sott.net/article/470863-Zoonotic-L...,0
557,CDC quietly removes 'claim' that spike protein...,https://www.sott.net/article/471150-CDC-quietl...,0
558,9/11 vs. COVID: The Sorry Tale of a Nation Gon...,https://www.veteranstoday.com/2022/08/12/9-11-...,0


In [82]:
# Split the rows into 30 consecutive, near-equal chunks.
# The split is computed on the row positions and applied with .iloc instead of
# calling np.array_split on the DataFrame itself: splitting a DataFrame directly
# relies on DataFrame.swapaxes, which pandas has deprecated.
split_df = [
    df.iloc[row_positions]
    for row_positions in np.array_split(np.arange(len(df)), 30)
]

In [83]:
split_df[0]

Unnamed: 0,title,link,article_type
0,\n \n \n Quercetin for Seasonal Allergies?\n \...,https://www.consumerlab.com/reviews/quercetin-...,1
1,\n \n \n How Many Times Should You Test for CO...,https://www.consumerlab.com/answers/how-and-wh...,1
2,\n 08/08/2022\n Woman Pleads Guilty for Sellin...,https://www.consumerlab.com/recalls/14684/woma...,1
3,\n 08/08/2022\n Seller of CBD Warned for COVID...,https://www.consumerlab.com/recalls/14683/sell...,1
4,\n \n Product Reviews and Answers to Questions...,https://www.consumerlab.com/topic/coronavirus/,1
5,"\n \n Latest Reviews of N95, KN95 and Other Ma...",https://www.consumerlab.com/answers/how-to-mak...,1
6,Coronavirus,https://nejm.org/coronavirus,1
7,\n Japanese Encephalitis in Australia\n C. Wal...,https://nejm.org/doi/full/10.1056/NEJMc2207004...,1
8,\n Case Series of Children with Acute Hepatiti...,https://nejm.org/doi/full/10.1056/NEJMoa220629...,1
9,\n \n \n \n \n Coronavirus\n,https://nejm.org/coronavirus,1


In [84]:
from newspaper import Article
import time

In [85]:
def parse_chunk(articles: list) -> list:
    """Download and parse every article of a chunk and collect the body text.

    Also prints how many seconds the whole chunk took.

    Args:
        articles: Rows of the article table as positional sequences; the URL
            is read from position 1 of each row.

    Returns:
        One string per input row, in input order: the extracted article text,
        or the placeholder ``"N/A"`` when downloading or parsing raised.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments while the chunk is being fetched.
    start = time.perf_counter()
    extracted_text = []
    for article in articles:
        try:
            current_article = Article(article[1])
            current_article.download()
            current_article.parse()
            extracted_text.append(current_article.text)
        except Exception:
            # Best effort: always emit one entry per row so the results stay
            # aligned with the input rows.
            extracted_text.append("N/A")
    print(time.perf_counter() - start, "seconds elapsed")
    return extracted_text

In [86]:
len(split_df)

30

In [87]:
all_text = []

In [88]:
# Start from an empty list every time this cell runs, so that re-running it
# cannot append the same chunks a second time.
all_text = []
for chunk_number, chunk in enumerate(split_df, start=1):
    print(f'---- Chunk #{chunk_number} ----')
    all_text.append(parse_chunk(chunk.values))

---- Chunk #1 ----
11.690568923950195 seconds elapsed
---- Chunk #2 ----
8.243860960006714 seconds elapsed
---- Chunk #3 ----
8.018892765045166 seconds elapsed
---- Chunk #4 ----
6.338898181915283 seconds elapsed
---- Chunk #5 ----
4.923300266265869 seconds elapsed
---- Chunk #6 ----
10.99209189414978 seconds elapsed
---- Chunk #7 ----
17.96299695968628 seconds elapsed
---- Chunk #8 ----
47.86184501647949 seconds elapsed
---- Chunk #9 ----
37.89288783073425 seconds elapsed
---- Chunk #10 ----
12.749147176742554 seconds elapsed
---- Chunk #11 ----
13.294353008270264 seconds elapsed
---- Chunk #12 ----
8.166202783584595 seconds elapsed
---- Chunk #13 ----
9.209846019744873 seconds elapsed
---- Chunk #14 ----
21.25211811065674 seconds elapsed
---- Chunk #15 ----
25.579724073410034 seconds elapsed
---- Chunk #16 ----
13.44348692893982 seconds elapsed
---- Chunk #17 ----
23.935818910598755 seconds elapsed
---- Chunk #18 ----
9.723777055740356 seconds elapsed
---- Chunk #19 ----
15.402298927

In [89]:
df.shape

(560, 3)

In [91]:
# Sanity check: the extracted texts across all chunks add up to one per row of df.
assert sum(map(len, all_text)) == len(df)

In [93]:
sum(len(elem) for elem in all_text)

560

In [94]:
df

Unnamed: 0,title,link,article_type
0,\n \n \n Quercetin for Seasonal Allergies?\n \...,https://www.consumerlab.com/reviews/quercetin-...,1
1,\n \n \n How Many Times Should You Test for CO...,https://www.consumerlab.com/answers/how-and-wh...,1
2,\n 08/08/2022\n Woman Pleads Guilty for Sellin...,https://www.consumerlab.com/recalls/14684/woma...,1
3,\n 08/08/2022\n Seller of CBD Warned for COVID...,https://www.consumerlab.com/recalls/14683/sell...,1
4,\n \n Product Reviews and Answers to Questions...,https://www.consumerlab.com/topic/coronavirus/,1
...,...,...,...
555,Most people infected with Omicron weren't even...,https://www.sott.net/article/471164-Most-peopl...,0
556,"Zoonotic Langya virus found in China, CDC says",https://www.sott.net/article/470863-Zoonotic-L...,0
557,CDC quietly removes 'claim' that spike protein...,https://www.sott.net/article/471150-CDC-quietl...,0
558,9/11 vs. COVID: The Sorry Tale of a Nation Gon...,https://www.veteranstoday.com/2022/08/12/9-11-...,0


In [95]:
# Add a placeholder text column, then re-append article_type so that it
# ends up as the last column.
df['text'] = 'undetermined'
df['article_type'] = df.pop('article_type')

In [96]:
# Flatten the per-chunk results into a single list; chunks were built from
# consecutive rows, so the flattened order matches the row order of df.
flattened_text = [
    current_text for current_chunk in all_text for current_text in current_chunk
]
# Assigning a list fills the column positionally in one step. It does not
# depend on the index labels being 0..n-1, and a length mismatch raises
# instead of silently leaving placeholder values behind.
df['text'] = flattened_text

In [97]:
# Number of rows carrying the "N/A" placeholder. Comparing and summing also
# works when no such row exists, where value_counts()['N/A'] raises KeyError.
(df['text'] == 'N/A').sum()

80

In [98]:
df

Unnamed: 0,title,link,text,article_type
0,\n \n \n Quercetin for Seasonal Allergies?\n \...,https://www.consumerlab.com/reviews/quercetin-...,Save to favorites\n\nThis feature is restricte...,1
1,\n \n \n How Many Times Should You Test for CO...,https://www.consumerlab.com/answers/how-and-wh...,"Answer:\n\nBefore we get into specifics, let's...",1
2,\n 08/08/2022\n Woman Pleads Guilty for Sellin...,https://www.consumerlab.com/recalls/14684/woma...,"On July 27, 2022, Diana Daffin, owner of Savvy...",1
3,\n 08/08/2022\n Seller of CBD Warned for COVID...,https://www.consumerlab.com/recalls/14683/sell...,"On August 4, 2022, the FDA sent a warning lett...",1
4,\n \n Product Reviews and Answers to Questions...,https://www.consumerlab.com/topic/coronavirus/,Save to favorites\n\nThis feature is restricte...,1
...,...,...,...,...
555,Most people infected with Omicron weren't even...,https://www.sott.net/article/471164-Most-peopl...,The lack of public awareness about being infec...,0
556,"Zoonotic Langya virus found in China, CDC says",https://www.sott.net/article/470863-Zoonotic-L...,© Daily PRABHAT/simplifay\n\n\n\nThe 26 patien...,0
557,CDC quietly removes 'claim' that spike protein...,https://www.sott.net/article/471150-CDC-quietl...,The mRNA vaccines cannot give you COVID-19. Th...,0
558,9/11 vs. COVID: The Sorry Tale of a Nation Gon...,https://www.veteranstoday.com/2022/08/12/9-11-...,By John Kaminski\n\nWhat goes around comes aro...,0


In [99]:
# Drop all rows whose text column holds the "N/A" placeholder.
# .copy() makes the result an independent frame, so later in-place writes
# do not operate on a slice of the unfiltered data.
df = df[df['text'] != "N/A"].copy()

In [100]:
df.shape

(480, 4)

In [101]:
df

Unnamed: 0,title,link,text,article_type
0,\n \n \n Quercetin for Seasonal Allergies?\n \...,https://www.consumerlab.com/reviews/quercetin-...,Save to favorites\n\nThis feature is restricte...,1
1,\n \n \n How Many Times Should You Test for CO...,https://www.consumerlab.com/answers/how-and-wh...,"Answer:\n\nBefore we get into specifics, let's...",1
2,\n 08/08/2022\n Woman Pleads Guilty for Sellin...,https://www.consumerlab.com/recalls/14684/woma...,"On July 27, 2022, Diana Daffin, owner of Savvy...",1
3,\n 08/08/2022\n Seller of CBD Warned for COVID...,https://www.consumerlab.com/recalls/14683/sell...,"On August 4, 2022, the FDA sent a warning lett...",1
4,\n \n Product Reviews and Answers to Questions...,https://www.consumerlab.com/topic/coronavirus/,Save to favorites\n\nThis feature is restricte...,1
...,...,...,...,...
555,Most people infected with Omicron weren't even...,https://www.sott.net/article/471164-Most-peopl...,The lack of public awareness about being infec...,0
556,"Zoonotic Langya virus found in China, CDC says",https://www.sott.net/article/470863-Zoonotic-L...,© Daily PRABHAT/simplifay\n\n\n\nThe 26 patien...,0
557,CDC quietly removes 'claim' that spike protein...,https://www.sott.net/article/471150-CDC-quietl...,The mRNA vaccines cannot give you COVID-19. Th...,0
558,9/11 vs. COVID: The Sorry Tale of a Nation Gon...,https://www.veteranstoday.com/2022/08/12/9-11-...,By John Kaminski\n\nWhat goes around comes aro...,0


- remove all escape sequences and non-ASCII Unicode characters (like \\xe2)
- `.strip()` to remove leading and trailing whitespace
- collapse runs of whitespace to a single space between words

In [102]:
import re

def clean_str(unfiltered_str: str) -> str:
    r"""Remove literal escape sequences from a string and normalise its whitespace.

    Only the escape sequence itself is removed, so characters that follow it
    are kept (e.g. the ``t`` in ``don\xe2\x80\x99t``).

    Args:
        unfiltered_str: Raw text that may contain literal (backslash-spelled)
            escape sequences and irregular whitespace.

    Returns:
        The text with literal ``\xNN`` / ``\uNNNN`` escapes deleted, literal
        whitespace escapes (``\n``, ``\r``, ``\t``, ``\f``, ``\v``) treated as
        a space, surrounding whitespace stripped and every run of whitespace
        collapsed to a single space.
    """
    # Byte / code point escapes carry no readable content: delete exactly
    # the escape and nothing after it.
    without_codes = re.sub(r'\\(?:x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4})', '', unfiltered_str)
    # Literal whitespace escapes separate words, so they become a space.
    without_escapes = re.sub(r'\\[nrtfv]', ' ', without_codes)
    # split() with no arguments also drops leading and trailing whitespace.
    return ' '.join(without_escapes.split())

In [103]:
# Clean both free-text columns in one vectorised pass; assign() returns a new
# frame with the two columns replaced in their existing positions.
df = df.assign(
    title=df['title'].map(clean_str),
    text=df['text'].map(clean_str),
)

In [104]:
df

Unnamed: 0,title,link,text,article_type
0,Quercetin for Seasonal Allergies? A recent stu...,https://www.consumerlab.com/reviews/quercetin-...,Save to favorites This feature is restricted t...,1
1,How Many Times Should You Test for COVID? It m...,https://www.consumerlab.com/answers/how-and-wh...,"Answer: Before we get into specifics, let's st...",1
2,08/08/2022 Woman Pleads Guilty for Selling and...,https://www.consumerlab.com/recalls/14684/woma...,"On July 27, 2022, Diana Daffin, owner of Savvy...",1
3,08/08/2022 Seller of CBD Warned for COVID-19 C...,https://www.consumerlab.com/recalls/14683/sell...,"On August 4, 2022, the FDA sent a warning lett...",1
4,Product Reviews and Answers to Questions About...,https://www.consumerlab.com/topic/coronavirus/,Save to favorites This feature is restricted t...,1
...,...,...,...,...
555,Most people infected with Omicron weren't even...,https://www.sott.net/article/471164-Most-peopl...,The lack of public awareness about being infec...,0
556,"Zoonotic Langya virus found in China, CDC says",https://www.sott.net/article/470863-Zoonotic-L...,© Daily PRABHAT/simplifay The 26 patients deve...,0
557,CDC quietly removes 'claim' that spike protein...,https://www.sott.net/article/471150-CDC-quietl...,The mRNA vaccines cannot give you COVID-19. Th...,0
558,9/11 vs. COVID: The Sorry Tale of a Nation Gon...,https://www.veteranstoday.com/2022/08/12/9-11-...,By John Kaminski What goes around comes around...,0


In [105]:
# Write the cleaned records to disk as a JSON array of row objects.
# DataFrame.to_json returns None when it is given a path, so the result is
# not bound to a name.
df.to_json('../Data/processed/processed_data.json', orient='records', indent=4)