loading the article metadata that was previously extracted by scraping

In [3]:
import json

with open("data/article_metadata/science.json", 'r') as science_file:
    # parse the JSON document straight from the open file handle
    science_articles = json.load(science_file)

In [4]:
with open("data/article_metadata/conspiracy.json", 'r') as conspiracy_file:
    # parse the JSON document straight from the open file handle
    conspiracy_articles = json.load(conspiracy_file)

combining both kinds of articles and converting the combination to a DataFrame

In [5]:
import pandas as pd

In [8]:
# build one DataFrame per article category from the loaded metadata
df_science, df_conspiracy = (
    pd.DataFrame.from_dict(articles)
    for articles in (science_articles, conspiracy_articles)
)

In [9]:
# label every row with its category so the two kinds of articles stay
# distinguishable after they are combined
for frame, label in ((df_science, "science"), (df_conspiracy, "conspiracy")):
    frame['article_type'] = label

In [10]:
# preview the first rows of the science articles
df_science.head()

Unnamed: 0,title,link,article_type
0,Sound Environment during Dental Treatment in R...,https://www.mdpi.com/journal/acoustics/2624-59...,science
1,Factors Associated with the Prevalence and Tre...,https://www.mdpi.com/journal/adolescents/2673-...,science
2,The Impact of Comprehensive Rehabilitation on ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science
3,COVID-19 Acute Respiratory Distress Syndrome: ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science
4,The Clinical Significance of Aspergillus Detec...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science


In [11]:
# preview the first rows of the conspiracy articles
df_conspiracy.head()

Unnamed: 0,title,link,article_type
0,Italian Health Minister Under Investigation fo...,https://www.abovetopsecret.com/forum/thread134...,conspiracy
1,COVID vaccination rates \xe2\x80\x98alarmingly...,https://www.abovetopsecret.com/forum/thread134...,conspiracy
2,"25% of COVID Vaxxed Now Have VAIDS, Cambridge ...",https://www.abovetopsecret.com/forum/thread134...,conspiracy
3,54% of US Youth are Chronically Ill* America's...,https://childrenshealthdefense.org/follow-the-...,conspiracy
4,lobbied for COVID-19,https://www.leefang.com/p/pfizer-quietly-finan...,conspiracy


In [14]:
# stack the science rows on top of the conspiracy rows; ignore_index renumbers
# the combined rows 0..n-1 while keeping their original order
df = pd.concat(
    objs=[df_science, df_conspiracy],
    axis="index",
    ignore_index=True,
)

In [15]:
# preview the first rows of the combined DataFrame
df.head()

Unnamed: 0,title,link,article_type
0,Sound Environment during Dental Treatment in R...,https://www.mdpi.com/journal/acoustics/2624-59...,science
1,Factors Associated with the Prevalence and Tre...,https://www.mdpi.com/journal/adolescents/2673-...,science
2,The Impact of Comprehensive Rehabilitation on ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science
3,COVID-19 Acute Respiratory Distress Syndrome: ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science
4,The Clinical Significance of Aspergillus Detec...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science


scraping to obtain the textual content in all short-listed articles

In [16]:
import numpy as np

In [17]:
# splitting the df into 30 roughly equal, order-preserving chunks so that
# scraping can be run and timed one chunk at a time
# np.array_split is applied to the row positions instead of the DataFrame itself:
# splitting a DataFrame directly goes through the deprecated DataFrame.swapaxes
# and emits a FutureWarning
row_position_groups = np.array_split(np.arange(len(df)), 30)
split_df = [df.iloc[positions] for positions in row_position_groups]

  return bound(*args, **kwds)


In [18]:
# inspect the first chunk
split_df[0]

Unnamed: 0,title,link,article_type
0,Sound Environment during Dental Treatment in R...,https://www.mdpi.com/journal/acoustics/2624-59...,science
1,Factors Associated with the Prevalence and Tre...,https://www.mdpi.com/journal/adolescents/2673-...,science
2,The Impact of Comprehensive Rehabilitation on ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science
3,COVID-19 Acute Respiratory Distress Syndrome: ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science
4,The Clinical Significance of Aspergillus Detec...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science
...,...,...,...
57,\n \n \n \n \n GPCR structure: Research reveal...,https://www.eurekalert.org/news-releases/1029787,science
58,The Get-Together: Psoriasis and Psoriatic Arth...,https://www.everydayhealth.com/psoriatic-arthr...,science
59,Coronavirus / COVID-19,https://www.everydayhealth.com/coronavirus/,science
60,Original Research \n Accepted on 22 Dec 2023\n...,https://www.frontiersin.org/articles/10.3389/f...,science


In [19]:
from newspaper import Article
import time

# obtaining the texts for the articles in current chunk
def parse_chunk(articles: list) -> list:
    """Download and parse every article of a chunk and return the extracted texts.

    Parameters
    ----------
    articles
        Iterable of row-like sequences whose element at index 1 is the article
        URL (for example ``chunk.values`` of a DataFrame whose second column
        holds the link).

    Returns
    -------
    list
        One entry per input row, in input order: the ``text`` attribute of the
        parsed ``Article``, or the placeholder ``"N/A"`` when downloading or
        parsing raised an exception.
    """
    # marker stored for articles that could not be scraped
    failure_placeholder = "N/A"
    extracted_text = []

    for article in articles:
        try:
            # attempting to obtain the article text for the current article
            current_article = Article(article[1])
            current_article.download()
            current_article.parse()
            text = current_article.text
        except Exception:
            # best effort: one failing article must not abort the whole chunk,
            # so it is recorded with the placeholder instead
            text = failure_placeholder

        extracted_text.append(text)

    return extracted_text

In [20]:
# number of chunks produced by the split
len(split_df)

30

In [21]:
# accumulator for the scraped texts of all articles, filled chunk by chunk
all_text = list()

In [22]:
# scrape every chunk in order, reporting how long each one took
# starting from an empty list keeps this cell safe to re-run: otherwise a second
# run would append its results after the old ones and duplicate every chunk
all_text = []

for chunk_number, chunk in enumerate(split_df, start=1):
    # printing out a status message - specifying the current chunk number
    print(f'---- Chunk #{chunk_number} ----')

    # perf_counter is a monotonic clock meant for measuring intervals
    start = time.perf_counter()

    # parsing the current chunk
    parsed_current_chunk = parse_chunk(chunk.values)

    elapsed = time.perf_counter() - start

    # printing out a status message - specifying the number of seconds elapsed
    print(elapsed, "seconds elapsed")

    # one list of texts per chunk, in the same order as the rows of the chunk
    all_text.append(parsed_current_chunk)

---- Chunk #1 ----
45.489850997924805 seconds elapsed
---- Chunk #2 ----
64.76089906692505 seconds elapsed
---- Chunk #3 ----
68.44151020050049 seconds elapsed
---- Chunk #4 ----
65.22908592224121 seconds elapsed
---- Chunk #5 ----
93.50036597251892 seconds elapsed
---- Chunk #6 ----
78.28867483139038 seconds elapsed
---- Chunk #7 ----
70.02627515792847 seconds elapsed
---- Chunk #8 ----
44.25364112854004 seconds elapsed
---- Chunk #9 ----
71.53936171531677 seconds elapsed
---- Chunk #10 ----
85.68718695640564 seconds elapsed
---- Chunk #11 ----
69.15550994873047 seconds elapsed
---- Chunk #12 ----
54.429301023483276 seconds elapsed
---- Chunk #13 ----
30.052423000335693 seconds elapsed
---- Chunk #14 ----
26.814396858215332 seconds elapsed
---- Chunk #15 ----
60.601370096206665 seconds elapsed
---- Chunk #16 ----
157.3115737438202 seconds elapsed
---- Chunk #17 ----
177.0445899963379 seconds elapsed
---- Chunk #18 ----
20.673630952835083 seconds elapsed
---- Chunk #19 ----
37.57565903

In [23]:
# (rows, columns) of the combined DataFrame
df.shape

(1848, 3)

In [24]:
# sanity check: the chunks together hold exactly one scraped text (or placeholder) per row of df
assert sum(map(len, all_text)) == len(df)

In [25]:
# preview the DataFrame before the text column is added
df.head()

Unnamed: 0,title,link,article_type
0,Sound Environment during Dental Treatment in R...,https://www.mdpi.com/journal/acoustics/2624-59...,science
1,Factors Associated with the Prevalence and Tre...,https://www.mdpi.com/journal/adolescents/2673-...,science
2,The Impact of Comprehensive Rehabilitation on ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science
3,COVID-19 Acute Respiratory Distress Syndrome: ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science
4,The Clinical Significance of Aspergillus Detec...,https://www.mdpi.com/journal/arm/2543-6031/91/...,science


In [26]:
# add a placeholder text column; the scraped texts are written into it afterwards
df['text'] = 'undetermined'

# take article_type out and append it again so it ends up as the last column,
# after the new text column
article_type_values = df.pop('article_type')
df['article_type'] = article_type_values

In [27]:
# all_text holds one list of texts per chunk; flattening it chunk by chunk
# yields the texts in the same order in which they were scraped
flattened_text = [
    current_text
    for current_chunk in all_text
    for current_text in current_chunk
]

# a single column assignment replaces the row-by-row .at loop; it raises a
# ValueError if the number of texts does not match the number of rows instead of
# silently leaving placeholders behind or appending extra rows
df['text'] = flattened_text

In [28]:
# preview the DataFrame with the text column filled in
df.head()

Unnamed: 0,title,link,text,article_type
0,Sound Environment during Dental Treatment in R...,https://www.mdpi.com/journal/acoustics/2624-59...,,science
1,Factors Associated with the Prevalence and Tre...,https://www.mdpi.com/journal/adolescents/2673-...,,science
2,The Impact of Comprehensive Rehabilitation on ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,,science
3,COVID-19 Acute Respiratory Distress Syndrome: ...,https://www.mdpi.com/journal/arm/2543-6031/91/...,,science
4,The Clinical Significance of Aspergillus Detec...,https://www.mdpi.com/journal/arm/2543-6031/91/...,,science


In [29]:
# count how many articles could not be scraped (their text is the "N/A" placeholder)
# comparing and summing gives 0 when nothing failed, whereas looking up 'N/A'
# in value_counts() raises a KeyError in that case
(df['text'] == "N/A").sum()

570

In [30]:
# keep only the rows whose text is not the "N/A" failure placeholder
successfully_scraped = df['text'].ne("N/A")
df = df[successfully_scraped]

In [31]:
# preview the rows that remain after dropping the failed articles
df.head()

Unnamed: 0,title,link,text,article_type
5,Children and COVID-19 Vaccination Trends,https://www.aap.org/en/pages/2019-novel-corona...,Summary of data publicly reported by the Cente...,science
6,COVID-19 State-Level Data Reports,https://www.aap.org/en/pages/2019-novel-corona...,"On May 11, 2023, the United States ended the P...",science
7,\n\t\t\n\t\t\t\t\n\t\t\t\t\n \n\n \n \n \n \n ...,https://www.cancer.org/cancer/risk-prevention/...,Our highly trained specialists are available 2...,science
8,COVID-19,https://www.lung.org/lung-health-diseases/lung...,Can we help you find more info? Start by selec...,science
9,\n \n End Youth Vaping\n Let\'s join together ...,https://www.lung.org/quit-smoking/end-youth-va...,Research – Youth Vaping and Lung Health\n\nThe...,science


cleaning the data obtained to ensure <i>only</i> textual content is taken

- remove all escape sequences and non-ascii unicode characters (like \\xe2)
- .strip() to remove any unnecessary spaces
- standardize to a single space between words

In [32]:
import re

# remove literal escape sequences and normalise whitespace in a scraped string
def clean_str(unfiltered_str: str) -> str:
    r"""Strip literal backslash escapes and collapse whitespace.

    Every backslash that is directly followed by one or more word characters is
    deleted together with those characters, so the two-character text ``\n`` and
    byte escapes such as ``\xe2`` disappear. A backslash followed by a non-word
    character (e.g. ``\'``) is left untouched. Afterwards every run of whitespace
    is replaced by a single space and leading/trailing whitespace is removed.

    NOTE(review): the match is greedy over word characters, so letters glued to
    an escape are removed as well (``\x99s`` loses the ``s``, ``\x98alarmingly``
    loses the whole word). The matching is deliberately kept unchanged here
    because cleaned titles are compared against titles stored earlier; tightening
    the pattern should be done together with re-cleaning the stored data.
    """
    # raw string: the previous non-raw literal contained the invalid escape "\w"
    formatted_str = re.sub(r'\\\w+', '', unfiltered_str)

    # split() without arguments already drops leading/trailing whitespace
    return ' '.join(formatted_str.split())

In [33]:
# replace the title and text of every row with their cleaned-up versions
# Series.map applies clean_str to each value, and assign() returns a new frame
# with both columns replaced in their existing positions
df = df.assign(
    title=df['title'].map(clean_str),
    text=df['text'].map(clean_str),
)

In [34]:
# preview the cleaned titles and texts
df.head()

Unnamed: 0,title,link,text,article_type
5,Children and COVID-19 Vaccination Trends,https://www.aap.org/en/pages/2019-novel-corona...,Summary of data publicly reported by the Cente...,science
6,COVID-19 State-Level Data Reports,https://www.aap.org/en/pages/2019-novel-corona...,"On May 11, 2023, the United States ended the P...",science
7,Prevention Papillomavirus can cause 6 types of...,https://www.cancer.org/cancer/risk-prevention/...,Our highly trained specialists are available 2...,science
8,COVID-19,https://www.lung.org/lung-health-diseases/lung...,Can we help you find more info? Start by selec...,science
9,End Youth Vaping Let\'s join together to end t...,https://www.lung.org/quit-smoking/end-youth-va...,Research – Youth Vaping and Lung Health The Am...,science


saving the DataFrame for short-listed articles as JSON

In [35]:
# load the articles that are already stored in the database file
with open("data/articles.json") as database_file:
    current_data = json.loads(database_file.read())

In [36]:
# number of articles currently stored in the database
len(current_data)

1276

In [37]:
# check whether the existing database contains any duplicates (it should not)

# def remove_dup(curr):
#     titles = set()
#     rev = []
#     for item in curr:
#         if item['title'] in titles:
#             continue
#         rev.append(item)
#         titles.add(item['title'])
#     return rev

# a = remove_dup(current_data)
# print(len(a))

In [60]:
# serialise the DataFrame to a JSON string of row records, then parse that
# string back so that every article becomes a plain dict
records_as_json = df.to_json(orient='records')
extracted_articles = json.loads(records_as_json)

In [51]:
# preview the DataFrame that is being exported
df.head()

Unnamed: 0,title,link,text,article_type
5,Children and COVID-19 Vaccination Trends,https://www.aap.org/en/pages/2019-novel-corona...,Summary of data publicly reported by the Cente...,science
6,COVID-19 State-Level Data Reports,https://www.aap.org/en/pages/2019-novel-corona...,"On May 11, 2023, the United States ended the P...",science
7,Prevention Papillomavirus can cause 6 types of...,https://www.cancer.org/cancer/risk-prevention/...,Our highly trained specialists are available 2...,science
8,COVID-19,https://www.lung.org/lung-health-diseases/lung...,Can we help you find more info? Start by selec...,science
9,End Youth Vaping Let\'s join together to end t...,https://www.lung.org/quit-smoking/end-youth-va...,Research – Youth Vaping and Lung Health The Am...,science


In [61]:
# peek at the first three extracted records
extracted_articles[:3]

[{'title': 'Children and COVID-19 Vaccination Trends',
  'link': 'https://www.aap.org/en/pages/2019-novel-coronavirus-covid-19-infections/children-and-covid-19-vaccination-trends/',
  'text': 'Summary of data publicly reported by the Centers for Disease Control and Prevention Date: 5/3/23 [Note: We will be updating this report every month, depending on the continued availability of the data.] The American Academy of Pediatrics recommends COVID-19 vaccination for all children and adolescents 6 months of age and older who do not have contraindications using a vaccine authorized for use for their age. See AAP policy. In this report, drawing on data posted by the Centers for Disease Control and Prevention (CDC) as of 5.3.23, the AAP reports progress in vaccinating US children, including the 50 States and District of Columbia. The report covers the vaccine data available for children 6 months-4 years, and the longer-term data for children 5-11 years and for children 12-17 years of age. See 

In [64]:
# adds new articles discovered to existing database (extending the database)
def consolidate_data(existing_data, new_data):
    """Combine existing and newly extracted articles, keeping one article per title.

    Parameters
    ----------
    existing_data : list of dict or None
        Articles already in the database; each must have a ``'title'`` key.
        When ``None``, the articles are read from ``data/articles.json``.
    new_data : list of dict
        Newly extracted articles; each must have a ``'title'`` key.

    Returns
    -------
    list of dict
        Existing articles followed by new ones, in their original order, where
        only the first article of every title is kept. Neither input list is
        modified.

    Raises
    ------
    KeyError
        If an article has no ``'title'`` key.
    """
    # the caller's data is used as passed in (previously the argument was ignored
    # and always overwritten with a fresh read of the file); reading the file
    # remains available by passing None
    if existing_data is None:
        with open("data/articles.json") as file:
            existing_data = json.load(file)

    print(f"Before combining: {len(existing_data)}")

    # a single pass over both lists; building a new list leaves the inputs untouched
    seen_titles = set()
    combined = []
    for item in [*existing_data, *new_data]:
        title = item['title']
        if title in seen_titles:
            continue
        seen_titles.add(title)
        combined.append(item)

    print(f"After combining and removing duplicates: {len(combined)}")

    return combined

In [65]:
# merge the freshly extracted articles into the articles loaded from the database
rev_data = consolidate_data(current_data, extracted_articles)

Before combining: 1276
After combining and removing duplicates: 1262


In [40]:
# persist the consolidated articles, overwriting the database file
with open("data/articles.json", 'w') as database_file:
    json.dump(rev_data, database_file, indent=4)