accessing previously extracted data from scraping

In [1]:
import json

# creating a function to read in stored JSON files
def read_articles_from_storage(filename: str) -> list:
    """Load the articles stored as JSON under ``Data/extract_articles``.

    Args:
        filename: Base name of the stored file, without the directory or
            the ``.json`` extension (e.g. ``'science'``).

    Returns:
        The deserialized content of the JSON file.

    Raises:
        FileNotFoundError: If no stored file exists for ``filename``.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # building the location of the stored file from its base name
    path = f"Data/extract_articles/{filename}.json"

    # opening the stored file read-only and parsing its JSON content;
    # the encoding is pinned so the result does not depend on the platform default
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

combining both kinds of articles and converting the combination to a DataFrame

In [2]:
# loading both article collections that were stored during the data_collection process
science_articles, conspiracy_articles = (
    read_articles_from_storage(kind) for kind in ('science', 'conspiracy')
)

TODO: figure out the correct way to handle the imbalance between the two article types in this data set

In [3]:
import pandas as pd

In [4]:
# turning each list of stored articles into its own pandas dataframe
df_science = pd.DataFrame(science_articles)
df_conspiracy = pd.DataFrame(conspiracy_articles)

In [5]:
# labelling every row with its class: 1 for scientific articles, 0 for conspiracy ones
for frame, label in ((df_science, 1), (df_conspiracy, 0)):
    frame['article_type'] = label

In [6]:
df_science

Unnamed: 0,title,link,article_type
0,Sound Environment during Dental Treatment in R...,https://www.mdpi.com/journal/acoustics/2624-59...,1
1,Factors Associated with the Prevalence and Tre...,https://www.mdpi.com/journal/adolescents/2673-...,1
2,The Impact of Comprehensive Rehabilitation on ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
3,COVID-19 Acute Respiratory Distress Syndrome: ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
4,The Clinical Significance of Aspergillus Detec...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
...,...,...,...
684,\xe2\x80\x9cDowngrade your mask before enterin...,https://www.bmj.com/content/378/bmj.o1929,1
685,What can we learn from the first 100 days of t...,https://www.bmj.com/content/378/bmj.o1966,1
686,Re: Paediatric \xe2\x80\x9cvirtual wards\xe2\x...,https://www.bmj.com/content/378/bmj.o1856/rr,1
687,Covid-19: is omicron less lethal than delta?Co...,https://www.bmj.com/content/378/bmj.o1806,1


In [7]:
df_conspiracy

Unnamed: 0,title,link,article_type
0,Italian Health Minister Under Investigation fo...,https://www.abovetopsecret.com/forum/thread134...,0
1,COVID vaccination rates \xe2\x80\x98alarmingly...,https://www.abovetopsecret.com/forum/thread134...,0
2,"25% of COVID Vaxxed Now Have VAIDS, Cambridge ...",https://www.abovetopsecret.com/forum/thread134...,0
3,54% of US Youth are Chronically Ill* America's...,https://childrenshealthdefense.org/follow-the-...,0
4,lobbied for COVID-19,https://www.leefang.com/p/pfizer-quietly-finan...,0
...,...,...,...
1129,New\n Zealand Covid deaths soar to RECORD high...,https://www.dailymail.co.uk/health/article-110...,0
1130,COVID vaccines deaths now exceed the HOLOCAUST,https://www.brighteon.com/4e3b7ea1-346c-416a-a...,0
1131,(Natural News)\n If your child is suffering fr...,https://www.naturalnews.com/2022-08-16-childre...,0
1132,\n\n\n\n \n\n\n\nHealth3 weeks ago\n\nTips on ...,https://www.getholistichealth.com/120321/tips-...,0


In [8]:
# stacking the science rows on top of the conspiracy rows (order kept) under a fresh 0..n-1 index
df = pd.concat((df_science, df_conspiracy), ignore_index=True)

In [9]:
df

Unnamed: 0,title,link,article_type
0,Sound Environment during Dental Treatment in R...,https://www.mdpi.com/journal/acoustics/2624-59...,1
1,Factors Associated with the Prevalence and Tre...,https://www.mdpi.com/journal/adolescents/2673-...,1
2,The Impact of Comprehensive Rehabilitation on ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
3,COVID-19 Acute Respiratory Distress Syndrome: ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
4,The Clinical Significance of Aspergillus Detec...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
...,...,...,...
1818,New\n Zealand Covid deaths soar to RECORD high...,https://www.dailymail.co.uk/health/article-110...,0
1819,COVID vaccines deaths now exceed the HOLOCAUST,https://www.brighteon.com/4e3b7ea1-346c-416a-a...,0
1820,(Natural News)\n If your child is suffering fr...,https://www.naturalnews.com/2022-08-16-childre...,0
1821,\n\n\n\n \n\n\n\nHealth3 weeks ago\n\nTips on ...,https://www.getholistichealth.com/120321/tips-...,0


scraping to obtain the textual content in all short-listed articles

In [10]:
import numpy as np

In [11]:
# splitting the df into 30 consecutive chunks so that scraping can be run and timed chunk by chunk
# (the row positions are split rather than the frame itself: calling np.array_split
# directly on a DataFrame goes through the deprecated DataFrame.swapaxes and emits a warning)
split_df = [df.iloc[positions] for positions in np.array_split(np.arange(len(df)), 30)]

  return bound(*args, **kwds)


In [12]:
split_df[0]

Unnamed: 0,title,link,article_type
0,Sound Environment during Dental Treatment in R...,https://www.mdpi.com/journal/acoustics/2624-59...,1
1,Factors Associated with the Prevalence and Tre...,https://www.mdpi.com/journal/adolescents/2673-...,1
2,The Impact of Comprehensive Rehabilitation on ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
3,COVID-19 Acute Respiratory Distress Syndrome: ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
4,The Clinical Significance of Aspergillus Detec...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
...,...,...,...
56,COVID-19The latest news related\r\nto COVID-19...,https://www.eurekalert.org/covid,1
57,\n \n \n \n \n GPCR structure: Research reveal...,https://www.eurekalert.org/news-releases/1029787,1
58,The Get-Together: Psoriasis and Psoriatic Arth...,https://www.everydayhealth.com/psoriatic-arthr...,1
59,Coronavirus / COVID-19,https://www.everydayhealth.com/coronavirus/,1


In [13]:
from newspaper import Article
import time

# obtaining the texts for the articles present in the current chunk
def parse_chunk(articles: list, link_index: int = 1, failure_value: str = "N/A") -> list:
    """Download and parse every article of a chunk and collect the texts.

    Args:
        articles: Iterable of row-like records (the notebook passes
            ``chunk.values``); position ``link_index`` of each record is
            handed to ``Article`` as the article link.
        link_index: Position of the link inside each record. Defaults to 1,
            the position used by the original implementation.
        failure_value: Sentinel stored in place of the text when an article
            cannot be processed. Defaults to ``"N/A"``.

    Returns:
        A list with one entry per record, in input order: the parsed
        article text, or ``failure_value`` if any step raised an exception.
    """
    # creating a variable to store the article texts for the current chunk
    extracted_text = []

    # looping over all of the articles
    for article in articles:
        try:
            # attempting to download and parse the current article
            current_article = Article(article[link_index])
            current_article.download()
            current_article.parse()
            current_text = current_article.text
        except Exception:
            # deliberate best-effort: any failure on one article is recorded
            # with the sentinel so the rest of the chunk is still processed
            current_text = failure_value

        # adding the article text (or the sentinel) to storage
        extracted_text.append(current_text)

    return extracted_text

In [14]:
len(split_df)

30

In [15]:
# creating a variable to store all the text for all of the articles; the scraping
# loop appends one list of texts per chunk, so this ends up as a list of lists
all_text = []

In [16]:
# resetting the storage so that re-running this cell does not append every chunk a second time
all_text = []

# looping over all of the chunks, scraping each one and reporting how long it took
for index, chunk in enumerate(split_df):
    # printing out a status message - specifying the current chunk number
    print(f'---- Chunk #{index+1} ----')

    # creating a variable to store the time execution is started
    # (perf_counter is a monotonic clock, so the difference is a reliable duration)
    start = time.perf_counter()

    # parsing the current chunk
    parsed_current_chunk = parse_chunk(chunk.values)

    # creating a variable to store the time execution is ended
    end = time.perf_counter()

    # printing out a status message - specifying the number of seconds elapsed
    print(end-start, "seconds elapsed")

    # adding all of the articles in the current chunk to storage
    all_text.append(parsed_current_chunk)

---- Chunk #1 ----
44.82429003715515 seconds elapsed
---- Chunk #2 ----
69.06636595726013 seconds elapsed
---- Chunk #3 ----
78.94835996627808 seconds elapsed
---- Chunk #4 ----
67.07026982307434 seconds elapsed
---- Chunk #5 ----
114.39912605285645 seconds elapsed
---- Chunk #6 ----
81.1420259475708 seconds elapsed
---- Chunk #7 ----
69.78620886802673 seconds elapsed
---- Chunk #8 ----
38.096014976501465 seconds elapsed
---- Chunk #9 ----
71.31576776504517 seconds elapsed
---- Chunk #10 ----
75.67331409454346 seconds elapsed
---- Chunk #11 ----
79.27014422416687 seconds elapsed
---- Chunk #12 ----
55.794458866119385 seconds elapsed
---- Chunk #13 ----
43.28125786781311 seconds elapsed
---- Chunk #14 ----
29.59796977043152 seconds elapsed
---- Chunk #15 ----
63.08168005943298 seconds elapsed
---- Chunk #16 ----
113.0541307926178 seconds elapsed
---- Chunk #17 ----
156.42103815078735 seconds elapsed
---- Chunk #18 ----
26.70067286491394 seconds elapsed
---- Chunk #19 ----
42.20689606666

In [17]:
df.shape

(1823, 3)

In [18]:
# ensuring exactly one text was collected per article, i.e. the texts across
# all chunks add up to the number of rows in df
assert sum(len(elem) for elem in all_text) == df.shape[0]

In [19]:
df

Unnamed: 0,title,link,article_type
0,Sound Environment during Dental Treatment in R...,https://www.mdpi.com/journal/acoustics/2624-59...,1
1,Factors Associated with the Prevalence and Tre...,https://www.mdpi.com/journal/adolescents/2673-...,1
2,The Impact of Comprehensive Rehabilitation on ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
3,COVID-19 Acute Respiratory Distress Syndrome: ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
4,The Clinical Significance of Aspergillus Detec...,https://www.mdpi.com/journal/arm/2543-6031/91/...,1
...,...,...,...
1818,New\n Zealand Covid deaths soar to RECORD high...,https://www.dailymail.co.uk/health/article-110...,0
1819,COVID vaccines deaths now exceed the HOLOCAUST,https://www.brighteon.com/4e3b7ea1-346c-416a-a...,0
1820,(Natural News)\n If your child is suffering fr...,https://www.naturalnews.com/2022-08-16-childre...,0
1821,\n\n\n\n \n\n\n\nHealth3 weeks ago\n\nTips on ...,https://www.getholistichealth.com/120321/tips-...,0


In [20]:
# adding a placeholder 'text' column that will hold the scraped text of each article
df['text'] = 'undetermined'

# popping article_type and assigning it straight back appends it as the last column,
# which is where the label column should be
df['article_type'] = df.pop('article_type')

In [21]:
# flattening the per-chunk lists into one list of texts, in the same order as the rows of df
flattened_text = [current_text for current_chunk in all_text for current_text in current_chunk]

# replacing the whole text column at once; the assignment is positional (df carries a
# fresh 0..n-1 index from the concat) and raises a ValueError if the number of texts
# does not match the number of rows instead of silently misaligning them
df['text'] = flattened_text

In [22]:
df

Unnamed: 0,title,link,text,article_type
0,Sound Environment during Dental Treatment in R...,https://www.mdpi.com/journal/acoustics/2624-59...,,1
1,Factors Associated with the Prevalence and Tre...,https://www.mdpi.com/journal/adolescents/2673-...,,1
2,The Impact of Comprehensive Rehabilitation on ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,,1
3,COVID-19 Acute Respiratory Distress Syndrome: ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,,1
4,The Clinical Significance of Aspergillus Detec...,https://www.mdpi.com/journal/arm/2543-6031/91/...,,1
...,...,...,...,...
1818,New\n Zealand Covid deaths soar to RECORD high...,https://www.dailymail.co.uk/health/article-110...,Advertisement\n\nCovid death rates have reache...,0
1819,COVID vaccines deaths now exceed the HOLOCAUST,https://www.brighteon.com/4e3b7ea1-346c-416a-a...,0:00 Intro\n\n1:50 Anti-White Racism\n\n9:25 B...,0
1820,(Natural News)\n If your child is suffering fr...,https://www.naturalnews.com/2022-08-16-childre...,Warning: Young children now prescribed dangero...,0
1821,\n\n\n\n \n\n\n\nHealth3 weeks ago\n\nTips on ...,https://www.getholistichealth.com/120321/tips-...,"When you purchase any type of product online, ...",0


In [23]:
# counting how many articles failed to scrape (their text is the "N/A" sentinel);
# comparing and summing also works when there are no failures, whereas indexing
# value_counts() with a missing key raises a KeyError
(df['text'] == "N/A").sum()

547

In [24]:
# drop all rows with N/A in their text column; .copy() makes the result an independent
# frame, so later writes into it do not target a filtered view of the original
df = df[df['text'] != "N/A"].copy()

In [25]:
df

Unnamed: 0,title,link,text,article_type
5,Children and COVID-19 Vaccination Trends,https://www.aap.org/en/pages/2019-novel-corona...,Summary of data publicly reported by the Cente...,1
6,COVID-19 State-Level Data Reports,https://www.aap.org/en/pages/2019-novel-corona...,"On May 11, 2023, the United States ended the P...",1
7,\n\t\t\n\t\t\t\t\n\t\t\t\t\n \n\n \n \n \n \n ...,https://www.cancer.org/cancer/risk-prevention/...,Our highly trained specialists are available 2...,1
8,COVID-19,https://www.lung.org/lung-health-diseases/lung...,Can we help you find more info? Start by selec...,1
9,\n \n End Youth Vaping\n Let\'s join together ...,https://www.lung.org/quit-smoking/end-youth-va...,Research – Youth Vaping and Lung Health\n\nThe...,1
...,...,...,...,...
1817,Jacinda Ardern will bring BACK\n hated Covid r...,https://www.dailymail.co.uk/news/article-11008...,New Zealand is set to tighten its Covid restri...,0
1818,New\n Zealand Covid deaths soar to RECORD high...,https://www.dailymail.co.uk/health/article-110...,Advertisement\n\nCovid death rates have reache...,0
1819,COVID vaccines deaths now exceed the HOLOCAUST,https://www.brighteon.com/4e3b7ea1-346c-416a-a...,0:00 Intro\n\n1:50 Anti-White Racism\n\n9:25 B...,0
1820,(Natural News)\n If your child is suffering fr...,https://www.naturalnews.com/2022-08-16-childre...,Warning: Young children now prescribed dangero...,0


cleaning the data obtained to ensure <i>only</i> textual content is taken

- remove all escape sequences and non-ascii unicode characters (like \\xe2)
- .strip() to remove any unnecessary spaces
- standardize to a single space between words

In [26]:
import re

# remove escape sequences that were written out as literal text and normalize whitespace
def clean_str(unfiltered_str: str) -> str:
    """Strip literal escape sequences from a string and collapse its whitespace.

    Only the escape code itself is removed, so letters that directly follow
    it (e.g. the ``t`` in ``don\\xe2\\x80\\x99t``) are kept.

    Args:
        unfiltered_str: The raw title or article text.

    Returns:
        The string without literal hex/unicode escape codes, with literal
        whitespace escapes turned into spaces, stripped at both ends and
        with every run of whitespace reduced to a single space.
    """
    # hex / unicode escape codes written out as text (\xe2, \u2019, ...) are dropped;
    # the fixed digit counts keep the match from swallowing the characters after them
    without_codes = re.sub(
        r'\\(?:x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8})', '', unfiltered_str
    )

    # whitespace escapes written out as text (\n, \r, \t, ...) stand for a gap between words
    without_escapes = re.sub(r'\\[nrtfv]', ' ', without_codes)

    # split() without arguments trims both ends and collapses runs of real whitespace
    return ' '.join(without_escapes.split())

In [27]:
# replacing the title and text of every article with their cleaned-up versions;
# clean_str is mapped over each column instead of writing cell by cell, and assign()
# builds a new frame rather than writing into the filtered df
df = df.assign(
    title=df['title'].map(clean_str),
    text=df['text'].map(clean_str),
)

In [28]:
df

Unnamed: 0,title,link,text,article_type
5,Children and COVID-19 Vaccination Trends,https://www.aap.org/en/pages/2019-novel-corona...,Summary of data publicly reported by the Cente...,1
6,COVID-19 State-Level Data Reports,https://www.aap.org/en/pages/2019-novel-corona...,"On May 11, 2023, the United States ended the P...",1
7,Prevention Papillomavirus can cause 6 types of...,https://www.cancer.org/cancer/risk-prevention/...,Our highly trained specialists are available 2...,1
8,COVID-19,https://www.lung.org/lung-health-diseases/lung...,Can we help you find more info? Start by selec...,1
9,End Youth Vaping Let\'s join together to end t...,https://www.lung.org/quit-smoking/end-youth-va...,Research – Youth Vaping and Lung Health The Am...,1
...,...,...,...,...
1817,Jacinda Ardern will bring BACK hated Covid res...,https://www.dailymail.co.uk/news/article-11008...,New Zealand is set to tighten its Covid restri...,0
1818,New Zealand Covid deaths soar to RECORD high a...,https://www.dailymail.co.uk/health/article-110...,Advertisement Covid death rates have reached p...,0
1819,COVID vaccines deaths now exceed the HOLOCAUST,https://www.brighteon.com/4e3b7ea1-346c-416a-a...,0:00 Intro 1:50 Anti-White Racism 9:25 Biden 1...,0
1820,(Natural News) If your child is suffering from...,https://www.naturalnews.com/2022-08-16-childre...,Warning: Young children now prescribed dangero...,0


saving the DataFrame for short-listed articles as JSON

In [44]:
# saving the dataframe under the 'Data' directory for further use
# NOTE(review): pandas documents to_json as returning None when a path is passed, so
# processed_json presumably holds None rather than the JSON text - confirm and drop it if unused
processed_json = df.to_json('Data/processed_data.json', orient='records', indent=4)