## Imports

In [1]:
import requests
import json
from bs4 import BeautifulSoup

## Reading input files

In [2]:
# Tag -> URL mapping consumed by scrape_Ghost_API (one entry per tag).
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('scraping_urls/ghost_apis.json', 'r', encoding='utf-8') as file:
    ghost_apis = json.load(file)

In [3]:
# URLs looked up later under the 'issues', 'letters' and 'ai-society' keys.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('scraping_urls/tag_urls.json', 'r', encoding='utf-8') as file:
    tag_urls = json.load(file)

## Helper functions

In [4]:
def extract(post):
    """Build the summary record kept for a single post.

    Args:
        post: Mapping describing one post; it must provide the ``title``,
            ``slug`` and ``feature_image`` keys.

    Returns:
        dict: ``title``, ``slug`` and ``feature_image`` copied from ``post``,
        plus ``link``, the post URL built from its slug.
    """
    summary = {field: post[field] for field in ('title', 'slug', 'feature_image')}
    summary['link'] = f'https://www.deeplearning.ai/the-batch/{post["slug"]}'
    return summary
    

In [5]:
def scrape_Ghost_API(url : str, timeout : float = 30):
    """Collect post summaries from every page of a paginated API listing.

    Pages are requested as ``f'{url}{page}'`` starting at page 1. Paging stops
    at the first page whose ``posts`` entry is empty.

    Args:
        url: Endpoint prefix; the page number is appended to it directly.
        timeout: Seconds passed to ``requests.get`` for each request, so that
            a stalled connection raises instead of blocking forever.

    Returns:
        list: One ``extract`` record per post, in the order the pages
        returned them.
    """
    titles = []
    pg_num = 1

    while True:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(f'{url}{pg_num}', timeout=timeout)
        posts_info = response.json()['posts']

        # An empty (or null) page marks the end of the listing.
        if not posts_info:
            break

        titles.extend(extract(post) for post in posts_info)
        pg_num += 1

    return titles


In [6]:
def scrape_issues(base_url : str, max_arcticles_num, timeout : float = 30):
    """Collect post summaries for issues numbered 1 through ``max_arcticles_num``.

    Each issue is fetched from ``base_url.format(n, n)``. Numbers whose
    response is a ``notFound`` payload are skipped.

    Args:
        base_url: ``str.format`` template that receives the issue number as
            both of its positional arguments.
        max_arcticles_num: Highest issue number to request (inclusive).
        timeout: Seconds passed to ``requests.get`` for each request, so that
            a stalled connection raises instead of blocking forever.

    Returns:
        list: One ``extract`` record per issue found, in ascending order of
        issue number.
    """
    titles_issues = []
    pg_num = 1

    # Check the bound before fetching, so no request is spent on issue
    # ``max_arcticles_num + 1`` (or on anything when the bound is below 1).
    while pg_num <= max_arcticles_num:
        response = requests.get(base_url.format(pg_num, pg_num), timeout=timeout)
        data = response.json()

        # Gaps in the numbering come back as a payload flagged ``notFound``.
        is_missing = isinstance(data, dict) and bool(data.get('notFound'))
        if not is_missing:
            titles_issues.append(extract(data['pageProps']['cmsData']['post']))

        pg_num += 1

    return titles_issues


def scrape_letters(base_url : str, timeout : float = 30):
    """Collect post summaries from every page of the letters listing.

    Pages are fetched from ``base_url.format(page, page)`` starting at page 1.
    Paging stops at the first page whose ``pageProps.posts`` is empty.

    Args:
        base_url: ``str.format`` template that receives the page number as
            both of its positional arguments.
        timeout: Seconds passed to ``requests.get`` for each request, so that
            a stalled connection raises instead of blocking forever.

    Returns:
        list: One ``extract`` record per post, in the order the pages
        returned them.
    """
    titles_letters = []
    pg_num = 1

    while True:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = requests.get(base_url.format(pg_num, pg_num), timeout=timeout)
        posts_info = response.json()['pageProps']['posts']

        # An empty (or null) page marks the end of the listing.
        if not posts_info:
            break

        titles_letters.extend(extract(post) for post in posts_info)
        pg_num += 1

    return titles_letters


def scrape_ai_society(base_url, timeout : float = 30):
    """Fetch the single post served at ``base_url`` and summarize it.

    Args:
        base_url: URL whose JSON response holds the post under
            ``pageProps.cmsData.post``.
        timeout: Seconds passed to ``requests.get``, so that a stalled
            connection raises instead of blocking forever.

    Returns:
        list: A one-element list holding the ``extract`` record of the post.
        A list is returned so the result has the same shape as the other
        scrapers' output.
    """
    response = requests.get(base_url, timeout=timeout)
    post = response.json()['pageProps']['cmsData']['post']

    return [extract(post)]

## Scraping the titles, images and links


In [7]:
# One list of post summaries per tag listed in ghost_apis.
dct_titles = {
    tag: scrape_Ghost_API(url)
    for tag, url in ghost_apis.items()
}


In [8]:
# Report how many posts were collected for each tag in ghost_apis.
for tag, url in ghost_apis.items():
    print(f'{tag} {len(dct_titles[tag])}')


data-points 58
research 359
business 173
science 75
culture 22
hardware 28
ai-careers 19


In [9]:
# Issues are requested by number; 300 is the highest issue number tried.
dct_titles['issues'] = scrape_issues(tag_urls['issues'], 300)
print(f"{len(dct_titles['issues'])}")

268


In [10]:
# Re-check the number of issues collected (repeats the count printed by the scraping cell).
print(str(len(dct_titles['issues'])))

268


In [11]:
# Letters come from a paginated listing that is walked until an empty page.
dct_titles['letters'] = scrape_letters(tag_urls['letters'])
print(f"{len(dct_titles['letters'])}")

256


In [12]:
# The 'ai-society' URL yields a single post, stored as a one-element list.
dct_titles['ai-society'] = scrape_ai_society(tag_urls['ai-society'])
print(f"{len(dct_titles['ai-society'])}")

1


## Scrape article texts

In [17]:
# List every section key collected so far, one per line.
for i in dct_titles.keys():
    print(i)

data-points
research
business
science
culture
hardware
ai-careers
issues
letters
ai-society


In [19]:
# Per-tag lists of posts whose article text was successfully extracted.
data = dict()

In [20]:
# Download every post page and attach the article body text to its record.
for tag in dct_titles:
    data[tag] = []
    for post in dct_titles[tag]:
        # A timeout keeps one stalled page from hanging the whole crawl.
        reqs = requests.get(post['link'], timeout=30)
        soup = BeautifulSoup(reqs.text, 'html.parser')

        article_div = soup.find('div', class_="prose--styled justify-self-center post_postContent__wGZtc")

        if article_div is None:
            print("post: ", post)
            print("Div with the specified class not found")
            # Skip only this post: breaking out of the loop here would silently
            # drop every remaining post of the current tag.
            continue

        # Article text = contents of all <p> and <li> tags in the div, joined by spaces.
        article_text = ' '.join(element.get_text() for element in article_div.find_all(['p', 'li']))
        # `post` is the same dict object held in dct_titles, so the text is
        # added to the dct_titles entry as well.
        post['text'] = article_text
        data[tag].append(post)

## Save to JSON file

In [21]:
# Persist the collected records as one JSON document.
# NOTE(review): this writes dct_titles, not the `data` dict filled by the
# scraping loop; posts whose text could not be extracted are therefore saved
# too (without a 'text' key). Confirm that this is intended.
with open('raw_data.json', 'w') as json_file:
    json_file.write(json.dumps(dct_titles))