# Table of Contents

1. [Wired](#wired)
2. [Dexigner](#dexigner)
3. [Dezeen](#dezeen)
4. [Techcrunch](#techcrunch)
5. [The Verge](#theverge)
6. [Euronews](#euronews)
7. [Design Milk](#designmilk)
8. [Creative Review](#creativereview)
9. [Creative Bloq](#creativebloq)
10. [AWN](#awn)
11. [Architectural Digest](#architecturaldigest)
12. [Cartoon Brew](#cartoonbrew)

# Importing Dependencies

In [1]:
!pip --quiet install selenium
!pip --quiet install beautifulsoup4
!apt-get update -y                    # update 
!apt install chromium-chromedriver -y #install chrome driver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyterlab-git 0.11.0 requires nbdime<2.0.0,>=1.1.0, but you have nbdime 2.1.0 which is incompatible.
earthengine-api 0.1.261 requires google-api-python-client<2,>=1.12.1, but you have google-api-python-client 1.8.0 which is incompatible.[0m
Get:1 http://packages.cloud.google.com/apt gcsfuse-bionic InRelease [5004 B]
Get:2 http://packages.cloud.google.com/apt cloud-sdk-bionic InRelease [6396 B] 
Get:3 http://packages.cloud.google.com/apt cloud-sdk InRelease [6361 B]        
Err:1 http://packages.cloud.google.com/apt gcsfuse-bionic InRelease            
  The following signatures couldn't be verified because the public key is not available: NO_PUBKEY B53DC80D13EDEF05
Err:2 http://packages.cloud.google.com/apt cloud-sdk-bionic InRelease          
  The following signatures couldn't be verified because the publ

In [2]:
import pandas as pd
import time
from datetime import datetime
from tqdm import tqdm
from string import capwords
import re

from selenium import webdriver #import webDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup as bs

# Summarize

1. Load finetuned t5-small model
2. Prepare function to create summaries of News articles 

In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the saved model and tokenizer from a specific path
load_directory = '/kaggle/input/t5s-summarization'
model = T5ForConditionalGeneration.from_pretrained(load_directory)
tokenizer = T5Tokenizer.from_pretrained(load_directory)

In [4]:
class Summarize:
    # returns list of summaries
    def summarize(self, articles, model, tokenizer):
        summaries = []
        print("||| Generating Summaries... |||")
        try:
            for input_text in tqdm(articles):
                # Tokenize the input text
                input_ids = tokenizer.encode(input_text, truncation=True, padding='longest', return_tensors='pt')

                # Generate the summary
                summary_ids = model.generate(input_ids, max_length=2048, num_beams=2, early_stopping=True)
                summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

                summaries.append(summary)
    #             print(summary)

        except Exception as e:
            print(f"Unable to summarize() due to {e}")
        return summaries
    
    # print articles and summaries randomly 
    def print_summaries(self, df, t):
        sample = df.sample(n=t)
        for i in sample.index:
            print('\n\n|| Article ||\n')
            print(df['article'].loc[i])
            print('\n|| Summary ||\n')
            print(df['summary'].loc[i])
            
sm = Summarize()

# Web scraping

In [5]:
class WebScrap:
    # selectortype mapping
    st_dict = {
        0: By.CLASS_NAME,
        1: By.ID, 
        2: By.TAG_NAME,
        3: By.XPATH
    }
    
    # returns chrome webdriver element using url
    def get_wd(self, url):
        try:
            options = Options()

            options.add_argument('--headless')        # remove this for easy debbuing
            options.add_argument('--incognito')
            options.add_argument('--no-sandbox')                             
            options.add_argument('--disable-dev-shm-usage')

            wd = webdriver.Chrome(options=options)

            wd.get(url)
            return wd
        except Exception as e:
            print(f"Unable to get_wd() due to {e}")
            
    # st: selctor type : class:0/id:1/tag:2/xpath:3
    # sname: selectorname : name of class/id/tag/xpath
    # multi: True => find_elements, False => find_element
    def get_elements(self, wd, st, sname, multi=True):
        stype = self.st_dict[st]
        try:
            if multi:
                elements = wd.find_elements(stype, sname)
            else:
                elements = wd.find_element(stype, sname)
            return elements
        except NoSuchElementException:
            print("Element Exception Encountered.")
        return None
    
    # checks if entered link matches a desired url pattern
    def checklink(self, link, pattern):
        if not link or type(link)!=str:
            return False

        match = re.compile(pattern).match(link)
        if match:
            return True
        else:
            return False  
    
    # create data frame from populated dictionary and save csv file
    def get_df(self, df, sitename="df"):
        df = pd.DataFrame.from_dict(df) # `pd.DataFrame(df)` will also work
        df.drop_duplicates().reset_index(inplace=True, drop=True)
        df.to_csv(sitename+'.csv', index=False)
        return df
    
    # datanames: names of data we have to scrap
    def webscrap(self, sitename, sitefunction, datanames, news_cards):
        # 1.create data dictionary
        data = {}
        for dn in datanames:
            data[dn.lower()] = []

        # 2.scrap data from news_cards
        try:
            data = sitefunction(news_cards, data, self.st_dict)
            print(f"||| Web Scraping Complete for {sitename.upper()}. |||")

            # 3. summarize articles
            try:
                data['summary'] = sm.summarize(data['article'], model, tokenizer)
                print("||| Articles Summarized. |||")
            except Exception as e:
                print(f"Unable to summarize() in webscrap() due to {e}")
                return None

        except Exception as e:
            print(f"Unable to sitefunction() in webscrap() due to {e}")
            return None

        # 4. Creating and Saving Dataframe
        try:
            print("||| Creating Dataframe, Saving as CSV |||")
            df = ws.get_df(data, sitename)
        except Exception as e:
            print(f"Unable to get_df() in webscrap() due to {e}")
            return None
        
        # return webscraped data
        return df
        
ws = WebScrap()

---

<a id='wired'></a>
# Wired

In [6]:
def ws_wired(news_cards, data, st_dict):
    print('||| Beginning Web Scraping for Wired |||')
    try:
        for news in tqdm(news_cards):
            sname = ".//div[@class = 'RubricWrapper-dKmCNX eiIpgC rubric rubric--discovery SummaryItemRubric-dguGKN lapGFj summary-item__rubric']"
            topic = ws.get_elements(news, 3, sname, False).text.lower()
#             print(topic)
            
            sname = ".//a[@class = 'SummaryItemHedLink-civMjp ejgyuy summary-item-tracking__hed-link summary-item__hed-link']"
            link = ws.get_elements(news, 3, sname, False).get_attribute('href')
            wd = ws.get_wd(link)
            p = ws.get_elements(wd, 2, "p")
            sp = ws.get_elements(wd, 0, "paywall")
            txt = p[5].text 
            for t in sp:
                txt += t.text
            article = ''.join(txt.split('\n'))
            wd.quit()
#             print(link)
#             print(article)
            
            sname = ".//h3[@class = 'SummaryItemHedBase-hiFYpQ cJGBzK summary-item__hed']"
            headline = ws.get_elements(news, 3, sname, False).text
#             print(headline)
    
            # crosscheck list index
            sname = ".//span[@class = 'BylineName-kwmrLn cYaBaU byline__name']"
            author = ws.get_elements(news, 3, sname, False).text[3:].title()
#             print(author)

            data['topic'].append(topic)
            data['link'].append(link)
            data['headline'].append(headline)
            data['author'].append(author)
            data['article'].append(article)
        print("||| Tabulation Complete. |||")

    except Exception as e:
        print(f"Unable to ws_wired() due to {e}")
    
    return data

In [7]:
url = "https://www.wired.com/tag/design"
sname = "summary-item__content"
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 0, sname)

In [8]:
datanames = ['topic', 'link', 'headline', 'author', 'article', 'summary']
# takes about 2-5 mins
df_wired = ws.webscrap(sitename="wired", sitefunction=ws_wired, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/24 [00:00<?, ?it/s]

||| Beginning Web Scraping for Wired |||


100%|██████████| 24/24 [05:06<00:00, 12.76s/it]
  0%|          | 0/24 [00:00<?, ?it/s]

||| Tabulation Complete. |||
||| Web Scraping Complete for WIRED. |||
||| Generating Summaries... |||


100%|██████████| 24/24 [01:31<00:00,  3.81s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [9]:
df_wired.sample(5)

Unnamed: 0,topic,link,headline,author,article,summary
9,ideas,https://www.wired.com/story/cities-architectur...,Cities Need to Realize the Value of Emotional ...,Thomas Heatherwick,"Modern buildings have become boring—flat, plai...","buildings have become boring—flat, plain, shin..."
19,ideas,https://www.wired.com/story/pulsar-digital-wat...,The Watch That Made Everything Now,Charlotte Kent,THE PULSAR EMERGED in the era of the space rac...,"space, and a future imagined as sleek, glossy,..."
18,ideas,https://www.wired.com/story/robot-dog-artifici...,Who Killed the Robot Dog?,Britt H. Young,END USERThat’s how we have imagined the robot ...,"the robot dog, and animaloids in general, for ..."
13,gear,https://www.wired.com/story/ikea-turntable-and...,Ikea's New Turntable and 9 More Standouts From...,Jeremy White,"Save your jokes about prison toilets, please, ...",", this hard-wearing design features stylish, i..."
5,gear,https://www.wired.com/story/tomtex-chitosan-le...,Replace the Leather in Your Wallet With Seafoo...,Alden Wicker,"It looked, quite frankly, like the future. For...",", the future. It looked, quite frankly, like t..."


In [10]:
sm.print_summaries(df_wired, 1)



|| Article ||

Likewise, amid the back and forth about Elon Musk’s plan to buy Twitter, many people who use the platform have expressed concerns over his bid to forefront algorithmic content moderation and other design changes on the whim of his $44 billion fancy. Bringing in recommendations from someone with no framework of risk and harms to highly marginalized people leads to proclamations of “authenticating all humans.” This seems to be a push to remove online anonymity, something I’ve written about very personally. It is ill-thought-through, harmful to those most at risk, and backed by no actual methodology or evidence. Beyond his unclear outbursts for changes, Musk’s previous actions combined with the existing harms from Twitter’s current structures have made it clear that we’re heading toward further impacts on marginalized groups, such as Black and POC Twitter users and trans folks. Meanwhile, lack of safety infrastructure is hitting home hard in the US since the leak of the d

---

<a id='dexigner'></a>
# Dexigner

In [11]:
def ws_dexigner(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    try:
        for news in tqdm(news_cards):
            link = ws.get_elements(news, 3, ".//a", False).get_attribute('href')
            wd = ws.get_wd(link)
            article = ws.get_elements(wd, 1, "article-body", False)
            article = ''.join(article.text.split('\n'))
            wd.quit()
#             print(article)
#             print(link)
    
            headline = ws.get_elements(news, 3, ".//h3", False).text
#             print(headline)
    
            subline = ws.get_elements(news, 3, ".//p", False).text
#             print(subline)
    
            date = ws.get_elements(news, 3, ".//time", False).get_attribute('datetime')
            date = datetime.strptime(date, '%Y-%m-%d').date()          
#             print(date)

            data['link'].append(link)
            data['headline'].append(headline)
            data['subline'].append(subline)
            data['date'].append(date)
            data['article'].append(article)
        print("||| Tabulation Complete. |||")
            
    except Exception as e:
        print(f"Unable to ws_dexigner() due to {e}")
    
    return data

In [12]:
url = "https://www.dexigner.com"
sname = "item" #class
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 0, sname)

In [13]:
datanames = ['subline', 'link', 'headline', 'date', 'article', 'summary']
df_dexigner = ws.webscrap(sitename="dexigner", sitefunction=ws_dexigner, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/29 [00:00<?, ?it/s]

||| Beginning Web Scraping |||


100%|██████████| 29/29 [00:54<00:00,  1.88s/it]
  0%|          | 0/29 [00:00<?, ?it/s]

||| Tabulation Complete. |||
||| Web Scraping Complete for DEXIGNER. |||
||| Generating Summaries... |||


100%|██████████| 29/29 [01:21<00:00,  2.81s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [14]:
df_dexigner.sample(5)

Unnamed: 0,subline,link,headline,date,article,summary
19,The Butrint National Park Visitor Center Inter...,https://www.dexigner.com/news/33923,Butrint National Park Visitor Center Internati...,2022-09-22,The Butrint National Park Visitor Center Inter...,The butrint National Park Visitor Center Inter...
12,The Ceramics of Italy Tile Competition - the l...,https://www.dexigner.com/news/33930,Ceramics of Italy Tile Competition 2023,2022-09-24,The Ceramics of Italy Tile Competition - the l...,is accepting submissions for the 2023 edition....
28,Morrama recently developed a more sustainable ...,https://www.dexigner.com/news/33914,Morrama Develops Sustainable Packaging Solutio...,2022-09-15,Morrama recently developed a more sustainable ...,wagamama has developed a more sustainable pack...
26,Material Matters 2022 is a new fixture on the ...,https://www.dexigner.com/news/33916,Material Matters 2022,2022-09-15,Material Matters 2022 is a new fixture on the ...,2022 is a new fixture on the London design cal...
2,Foster + Partners has designed a state-of-the-...,https://www.dexigner.com/news/33940,Foster + Partners Designs Coral-inspired Museu...,2022-09-27,Foster + Partners has designed a state-of-the-...,", the state-of-the-art marine life institute i..."


In [15]:
sm.print_summaries(df_dexigner, 1)



|| Article ||

Organized by Red Dot, the Contemporary Good Design Award allows companies and designers to present their products to an internationally renowned and independent jury for assessment. Successful participants are permitted to use the label, which ensures product visibility on the Chinese market and provides guidance for Chinese customers.The international jury for the Contemporary Good Design Award comprises twelve internationally respected experts, including journalists, professors and consultants. They assess the entries on the basis of criteria such as level of innovation, functionality, longevity and formal quality.Entry is open to manufacturers and designers from all over the world whose products are made using an industrial manufacturing process. The products must have already been launched on the market, and the market launch must have taken place within the past three years.The deadline for submissions is October 14, 2022.more: cgdaward.com (205)

|| Summary ||

t

---

<a id='dezeen'></a>
# [Dezeen](https://www.dezeen.com/news/)

In [16]:
def ws_dezeen(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    try:
        for news in tqdm(news_cards):
            sname = ".//article//header//h4[@class='category-type-foreground']"
            topic = ws.get_elements(news, 3, sname, False).text.lower()
            
            sname = ".//article//header//h3//a"
            headline = ws.get_elements(news, 3, sname, False).text
            
            sname = ".//article//p"
            subline = ws.get_elements(news, 3, sname, False).text[:-5]
            
            sname = ".//article//footer//a"
            author = ws.get_elements(news, 3, sname, False).text
            
            sname = ".//article//header//a"
            link = ws.get_elements(news, 3, sname, False).get_attribute("href")
            
            try:
                sname = ".//article//footer//time"
                date = ws.get_elements(news, 3, sname, False).text
                date = datetime.strptime(date, '%d %B %Y').date()
            except Exception as e:
                print(f"Some Exception: {e}")
                continue
            
            try:
                wd = ws.get_wd(link)
                news = ws.get_elements(wd, 0, "main-article-body", False)
                text = ws.get_elements(news, 2, "p")
                article = ''
                for t in text:
                    article += t.text
                article = ''.join(article.split('\n'))
                wd.quit()
    
            except NoSuchElementException:
                print("No Such Element Exception")
                
            data['topic'].append(topic)
            data['headline'].append(headline)
            data['subline'].append(subline)
            data['author'].append(author)
            data['link'].append(link)
            data['date'].append(date)
            data['article'].append(article)

        print("||| Tabulation Complete. |||")

    except Exception as e:
        print(f"Unable to ws_dezeen() due to: {e}")
    
    return data

In [17]:
url = "https://www.dezeen.com/news/"
sname = "main-story-list"
wd = ws.get_wd(url)

mainlist = ws.get_elements(wd, 0, sname, False)
news_cards = ws.get_elements(mainlist, 2, "li")
news_cards.pop(1)

<selenium.webdriver.remote.webelement.WebElement (session="09b3ce5e11c8362479447eca5117e73c", element="7e1af937-43ab-4416-9d7b-53135d9aa6e5")>

In [18]:
datanames = ['topic', 'headline', 'subline', 'author', 'link', 'date', 'article', 'summary']
df_dezeen = ws.webscrap(sitename="dezeen", sitefunction=ws_dezeen, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/19 [00:00<?, ?it/s]

||| Beginning Web Scraping |||


 11%|█         | 2/19 [00:08<01:01,  3.63s/it]

Some Exception: time data '8 hours ago' does not match format '%d %B %Y'


 16%|█▌        | 3/19 [00:08<00:33,  2.09s/it]

Some Exception: time data '17 hours ago' does not match format '%d %B %Y'


100%|██████████| 19/19 [02:35<00:00,  8.16s/it]
  0%|          | 0/17 [00:00<?, ?it/s]

||| Tabulation Complete. |||
||| Web Scraping Complete for DEZEEN. |||
||| Generating Summaries... |||


100%|██████████| 17/17 [00:47<00:00,  2.81s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [19]:
df_dezeen.sample(5)

Unnamed: 0,topic,headline,subline,author,link,date,article,summary
0,architecture,"""World's largest wooden city"" set to be built ...",Scandinavian studios Henning Larsen and White ...,Cajsa Carlson,https://www.dezeen.com/2023/06/22/worlds-large...,2023-06-22,Scandinavian studios Henning Larsen and White ...,", the ""world's largest wooden city"" is being d..."
13,architecture,Heatherwick Studio reveals design for Shanghai...,British architecture firm Heatherwick Studio h...,Christina Yao,https://www.dezeen.com/2023/06/09/heatherwick-...,2023-06-09,British architecture firm Heatherwick Studio h...,has revealed the design of the West Bund Orbit...
7,3 days of design,"NINE promises ""more female designers in the pi...",Recently launched furniture brand NINE has unv...,Amy Frearson,https://www.dezeen.com/2023/06/16/nine-furniture/,2023-06-16,Recently launched furniture brand NINE has unv...,", Sine, a family of pendant lamps formed of tw..."
10,architecture,MAD wraps China Philharmonic Concert Hall in t...,"The translucent, wavy facade of China Philharm...",Cajsa Carlson,https://www.dezeen.com/2023/06/13/china-philha...,2023-06-13,"The translucent, wavy facade of China Philharm...","of the building, MAD says. facade of the build..."
2,news,High-tech pioneer Michael Hopkins dies aged 88,RIBA Royal Gold Medal-winning architect Michae...,Tom Ravenscroft,https://www.dezeen.com/2023/06/19/michael-hopk...,2023-06-19,RIBA Royal Gold Medal-winning architect Michae...,and architect Michael Hopkins died aged 88 sur...


In [20]:
sm.print_summaries(df_dezeen, 1)



|| Article ||

Scandinavian studios Henning Larsen and White Arkitekter are designing Stockholm Wood City, which will become the world's largest mass-timber development and have the "serenity of a forest".Set to be built in the Stockholm neighbourhood of Sickla, the project was dubbed the "world's largest wooden city" by developer Atrium Ljungberg as it will use more timber that any other project in development.Stockholm Wood City, which will have 7,000 office spaces and 2,000 homes and cover 250,000 square metres, is being designed by Danish studio Henning Larsen and Swedish firm White Arkitekter.According to the developer, the district will feature nature-informed elements and was designed to have the feel of a forest."We sought to create an urban environment infused with the serenity of a forest, resulting in a dense, open space that bears the distinctively minimalistic and functional aesthetic of Scandinavian design," Atrium Ljungberg told Dezeen."The architects innovatively inco

---

<a id='techcrunch'></a>
# [Techcrunch](https://techcrunch.com)

In [11]:
def ws_techcrunch(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    try:
        for news in tqdm(news_cards):
            sname = ".//h2[@class='post-block__title']//a"
            link = ws.get_elements(news, 3, sname, False).get_attribute('href')
#             print("link: ", link)
            
            sname = ".//div[@class = 'article__primary-category']"
            topic = ws.get_elements(news, 3, sname, False)
            if not topic:
                continue
            topic = topic.text.lower()
#             print("topic: ", topic)
            
            sname = ".//h2//a[@class = 'post-block__title__link']"
            headline = ws.get_elements(news, 3, sname, False)
            if not headline:
                continue
            headline = headline.text
#             print("headline: ", headline)
    
            sname = ".//div//span[@class = 'river-byline__authors']"
            author = ws.get_elements(news, 3, sname, False)
            if not author:
                continue
            author = author.text
#             print("author: ", author)

            sname = ".//div//time"
            date = ws.get_elements(news, 3, sname, False).get_attribute('datetime')
            if not date:
                continue
            date = date[:10]
            date = datetime.strptime(date, '%Y-%m-%d').date()
#             print("date: ", date)
            
            # error
            wd = ws.get_wd(link)
            sname = ".//div[@class='article-content']"
            doc = ws.get_elements(wd, 3, sname, False)
            if not doc:
                continue
            spsummary = ws.get_elements(doc, 1, "speakable-summary", False).text
#             print("spsummary: ", spsummary)
            
            p = ws.get_elements(doc, 2, "p", st_dict)
            p.pop(0)
            txt = ""
            for t in p:
                txt += t.text
            article = ''.join(txt.split('\n'))
            wd.quit()
#             print("article: ", article)
    
            data['topic'].append(topic)
            data['link'].append(link)
            data['headline'].append(headline)
            data['author'].append(author)
            data['date'].append(date)
            data['article'].append(article)
            data['spsummary'].append(spsummary)

#             print("topic: ", topic)
#             print("headline: ", headline)
#             print("spsummary: ", spsummary)
#             print("link: ", link)
#             print("author: ", author)
#             print("date: ", date)
#             print("article: ", article)
        
        print("||| Tabulation Complete. |||")
            
    except Exception as e:
        print(f"Unable to ws_techcrunch() due to {e}")
    
    return data

In [17]:
url = "https://techcrunch.com"
wd = ws.get_wd(url)

# accept privacy policy
try:
    sname = "accept-all"
    acceptall = WebDriverWait(wd, 5).until(EC.element_to_be_clickable((By.CLASS_NAME, sname)))  
    acceptall.click()
except Exception as e:
    print("Privacy Setting already selected.")

sname = ".//header[@class='post-block__header']"
news_cards = ws.get_elements(wd, 3, sname)

Privacy Setting already selected.


In [18]:
datanames = ['topic', 'link', 'headline', 'author', 'date', 'article', 'summary', 'spsummary']
df_techcrunch = ws.webscrap(sitename="techcrunch", sitefunction=ws_techcrunch, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/20 [00:00<?, ?it/s]

||| Beginning Web Scraping |||


 25%|██▌       | 5/20 [00:32<01:37,  6.47s/it]

Element Exception Encountered.


100%|██████████| 20/20 [02:05<00:00,  6.28s/it]
  0%|          | 0/19 [00:00<?, ?it/s]

||| Tabulation Complete. |||
||| Web Scraping Complete for TECHCRUNCH. |||
||| Generating Summaries... |||


100%|██████████| 19/19 [01:02<00:00,  3.31s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [19]:
sm.print_summaries(df_techcrunch, 1)



|| Article ||

The promise of DuckDB is that as an in-process SQL-compatible analytics process database, developers will get the benefits of an analytics database without the complexity of managing it. Unlike databases like Postgres, there’s very little to set up and since it’s embedded in the application, data transfers are very fast. This makes DuckDB ideally suited for local data analysis. MotherDuck itself likens DuckDB to “SQLite for analytics workloads.”Image Credits: MotherDuckImage Credits: MotherDuck“As nearly every organization, large and small, strives to figure out how to get value out of their data, scale becomes much less interesting than making it easy to derive insights. While today is a momentous day for our company, it is just a first flap of our wings on this journey,” said MotherDuck CEO Tigani. “We’re confident that the combined innovation between MotherDuck’s platform and DuckDB Labs’s database will be a critical piece of how the modern data stack evolves over t

In [20]:
df_techcrunch.sample(5)

Unnamed: 0,topic,link,headline,author,date,article,summary,spsummary
16,venture,https://techcrunch.com/2023/06/22/kubik/,Plastic upcycling startup Kubik closes $3.34M ...,Annie Njanja,2023-06-22,Plastic pollution is already a menace said to ...,", removing 45,000 kgs of plastic waste from la...",The amount of plastic waste produced globally ...
8,startups,https://techcrunch.com/2023/06/22/soulcycle-to...,SoulCycle to offer trainer-guided virtual work...,Mike Butcher,2023-06-22,It’s now partnering with iconic exercise brand...,is partnering with SoulCycle to bring its “ene...,FLOWN is a startup that took the idea of the s...
12,social,https://techcrunch.com/2023/06/22/reddit-is-ba...,Reddit is battling against moderators marking ...,Ivan Mehta,2023-06-22,"Protesting API changes made by Reddit, moderat...",", r/birthcontrol, r/perfectlycutscreams, r/gam...",After adopting different forms of protest like...
9,transportation,https://techcrunch.com/2023/06/22/kodiak-loads...,"Kodiak, Loadsmith to put 800 trucks on new aut...",Rebecca Bellan,2023-06-22,"Loadsmith says its newly formed entity, Loadsm...",the Kodiak partnership with Kodiak will serve ...,Autonomous trucking startup Kodiak Robotics ha...
13,startups,https://techcrunch.com/2023/06/22/bridge-2-tec...,Bridge 2 Technologies is making it easier for ...,Ron Miller,2023-06-22,"Today, the company announced the official laun...",the platform was launched with around 500 corp...,Bridge 2 Technologies began with the notion th...


---

<a id='theverge'></a>
# [The Verge]()

In [21]:
def ws_theverge(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    # desired pattern for theverge article links
    mainpattern = r"https?:\/\/(www\.)?theverge\.com/[0-9][0-9][0-9][0-9]/"
#     mainpattern2 = r"https?:\/\/(www\.)?theverge\.com/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]/"
    try:
        for news in tqdm(news_cards):
            sname = ".//h2//a"
            link = ws.get_elements(news, 3, sname, False)
            if not link:
                continue

            link = link.get_attribute('href')
            if not ws.checklink(link, mainpattern):
                continue

            wd = ws.get_wd(link)
            sname = ".//article[@id='content']//div"
            maincard = ws.get_elements(wd, 3, sname, False)
            if not maincard:
                continue

            sname = ".//ul"
            topic = ws.get_elements(maincard, 3, sname, False).text
            topics = ", ".join([x.lower() for x in topic.split('/')])

            sname = ".//div//div//div" 
            card = ws.get_elements(maincard, 3, sname)
            group1 = (card[0].text).split("/")
            if len(group1)%2:
                continue

#             print(link)
            headline,subline = group1[0],group1[1]
#             print(headline, subline)

            sname = ".//p[@class='duet--article--article-byline']"
            group2 = ws.get_elements(card[1], 2, "p", False).text
            author = group2[3:group2.find(",")]
#             print(author)
            
            authordesc = group2[group2.find(",")+2:]
#             print(authordesc)
            
            sname = ".//time"
            date = ws.get_elements(maincard, 3, sname, False).text[:12]
            try:
                date = datetime.strptime(date, '%b %d, %Y').date()
            except Exception as e:
                continue
                    
            sname = ".//div[@class='duet--article--article-body-component']//p"
            divs = ws.get_elements(wd, 3, sname)
#             article = get_elements(articlecard, 3, "", False).text
            article = ''
            for d in divs:
                article+=d.text
            article = ''.join(article.split('\n'))
#             print(article)
            
            wd.quit()

            data['topic'].append(topics) #done
            data['headline'].append(headline) #done
            data['link'].append(link) #done
            data['author'].append(author)
            data['date'].append(date)
            data['article'].append(article)
            data['subline'].append(subline) #done
            data['authordesc'].append(authordesc)
        print("||| Tabulation Complete. |||")

    except Exception as e:
        print(f"Unable to ws_theverge() due to {e}")
    
    return data

In [22]:
url = "https://www.theverge.com"
sname = "duet--content-cards--content-card" #class
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 0, sname)

In [23]:
datanames = ['topic', 'headline', 'link', 'subline', 'author', 'authordesc', 'date', 'article', 'summary']
df_theverge = ws.webscrap(sitename="theverge", sitefunction=ws_theverge, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/46 [00:00<?, ?it/s]

||| Beginning Web Scraping |||


  4%|▍         | 2/46 [00:08<03:11,  4.36s/it]

Element Exception Encountered.


 17%|█▋        | 8/46 [00:34<02:55,  4.61s/it]

Element Exception Encountered.


 41%|████▏     | 19/46 [01:21<02:01,  4.50s/it]

Element Exception Encountered.


 48%|████▊     | 22/46 [01:36<01:54,  4.76s/it]

Element Exception Encountered.


 57%|█████▋    | 26/46 [01:53<01:33,  4.68s/it]

Element Exception Encountered.


 72%|███████▏  | 33/46 [02:07<00:32,  2.50s/it]

Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.


 80%|████████  | 37/46 [02:15<00:20,  2.25s/it]

Element Exception Encountered.


 87%|████████▋ | 40/46 [02:23<00:14,  2.39s/it]

Element Exception Encountered.
Element Exception Encountered.


100%|██████████| 46/46 [02:39<00:00,  3.46s/it]
  0%|          | 0/15 [00:00<?, ?it/s]

Element Exception Encountered.
||| Tabulation Complete. |||
||| Web Scraping Complete for THEVERGE. |||
||| Generating Summaries... |||


100%|██████████| 15/15 [00:47<00:00,  3.19s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [24]:
df_theverge.sample(5)

Unnamed: 0,topic,headline,link,subline,author,authordesc,date,article,summary
4,"transpo, policy, tech",Heavy-duty trucks would be required to have au...,https://www.theverge.com/2023/6/22/23769791/he...,"Vehicles over 10,000 pounds would be required...",Andrew J. Hawkins,transportation editor with 10+ years of experi...,2023-06-22,Semi trucks and other heavy-duty vehicles woul...,", the biden administration said. “We have been..."
6,"deals, tech, amazon prime day 2023",PSA: your friends at Verge Deals are here to g...,https://www.theverge.com/2023/6/22/23768680/ve...,"This year’s coverage will include roundups, a...",Antonio G. Di Benedetto,a writer covering tech deals and The Verge’s D...,2023-06-22,Amazon Prime Day is coming around once again o...,", and we’ll be highlighting quality deals acro..."
8,"deals, tech, tvs",LG’s brilliant C3 OLED just hit its lowest pri...,https://www.theverge.com/2023/6/22/23768363/lg...,Bose’s portable SoundLink Flex speaker is als...,Sheena Vasani,"a writer covering commerce, e-readers, and tec...",2023-06-22,Memorial Day weekend is known for being one of...,Anker’s mini wall charger is capable of chargi...
11,"virtual reality, tech, meta",Meta is increasing the performance of its Ques...,https://www.theverge.com/2023/6/21/23769053/me...,The new version 55 software update for the Me...,Umar Shakir,a news writer fond of the electric vehicle lif...,2023-06-21,Mark Zuckerberg announced the Meta Quest 3 VR ...,", Meta says. The v55 update is rolling out “gr..."
5,"google, tech, google pixel",iPhone is dumb and besotted with Pixel in Goog...,https://www.theverge.com/2023/6/22/23769713/go...,"Do it, Google. Make ‘em kiss.",Jess Weatherbed,"a news writer focused on creative industries, ...",2023-06-22,Google has found a fun new way to dunk on Appl...,"the iPhone is portrayed as an outdated, inexpe..."


In [25]:
sm.print_summaries(df_theverge, 1)



|| Article ||

Memorial Day weekend is known for being one of the best times of the year to buy a TV, so it was no surprise when LG’s new C3 OLED dropped to a new record low last month. What is surprising, though, is that the TV is even cheaper than it was in May less than a month later.Right now, you can buy LG’s 55-inch C3 OLED at an all-time low of around $1,497 ($403 off) at Amazon, Best Buy, and direct from LG. Both larger and smaller sizes are on sale as well, including the 48-inch configuration, which is available for a new low of $1,249.99 ($250 off) from Best Buy and LG.In a nutshell, LG’s new C3 boasts the kind of impressive black levels, vivid colors, and rich contrast that have come to characterize OLED TVs in recent years. It also features a number of other welcome features, including support for Dolby Vision, AMD FreeSync, Nvidia G-Sync, and a speedy 120Hz refresh rate that gamers will surely appreciate. Plus, it features several new picture modes and more processing po

---

<a id='euronews'></a>
# [Euronews](https://www.euronews.com/tag/design)

In [26]:
def ws_euronews(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    mainpattern = r"https?:\/\/(www\.)?euronews\.com/[0-9][0-9][0-9][0-9]/"
    try:
        for news in tqdm(news_cards):
            sname = "m-object__title__link"
            link = ws.get_elements(news, 0, sname, False).get_attribute('href')
            
            if ws.checklink(link, mainpattern):
                continue
            
            sname = "m-object__title__link"
            headline = ws.get_elements(news, 0, sname, False).text
            
            sname = "program-name"
            topic = ws.get_elements(news, 0, sname, False).text

            wd = ws.get_wd(link)
            
            sname = ".//div[@class='c-article-contributors']//a"
            author = ws.get_elements(wd, 3, sname)[1].text

            sname = ".//div[@class = 'c-article-contributors']//time"
            date = ws.get_elements(wd, 3, sname, False).get_attribute('datetime')
            date = datetime.strptime(date, '%Y-%m-%d').date()
            
            sname = ".//p[@class='c-article-summary']"
            spsummary = ws.get_elements(wd, 3, sname)[1].text
            
            group = ws.get_elements(wd, 3, ".//div[@id='poool-content']//p")
            article = ''
            for p in group:
                article += p.text
            article = ''.join(article.split('\n'))
#             print(article)

            wd.quit()

            data['topic'].append(topic)
            data['link'].append(link)
            data['headline'].append(headline)
            data['article'].append(article)
            data['author'].append(author)
            data['date'].append(date)
            data['spsummary'].append(spsummary)
            
        print("||| Tabulation Complete. |||")
            
    except Exception as e:
        print(f"Unable to ws_euronews() due to {e}")
    
    return data

In [27]:
url = "https://www.euronews.com/tag/design"
sname = ".//div[@class='o-block-listing__articles']//article" #class
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 3, sname)

In [28]:
datanames = ['topic', 'link', 'headline', 'author', 'date', 'article', 'summary', 'spsummary']
df_euronews = ws.webscrap(sitename="euronews", sitefunction=ws_euronews, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/20 [00:00<?, ?it/s]

||| Beginning Web Scraping |||


100%|██████████| 20/20 [01:19<00:00,  3.99s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

||| Tabulation Complete. |||
||| Web Scraping Complete for EURONEWS. |||
||| Generating Summaries... |||


100%|██████████| 20/20 [01:00<00:00,  3.02s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [29]:
df_euronews.sample(5)

Unnamed: 0,topic,link,headline,author,date,article,summary,spsummary
18,SEE,https://www.euronews.com/culture/2023/04/19/cu...,Culture Re-View: A look back at this day in hi...,Jonny Walfisz,2023-04-19,19 April 1999: How Norman Foster changed Berli...,", the Reichstag is one of the most visited att...","On this day, the newly designed Reichstag was ..."
13,TOUCH,https://www.euronews.com/culture/2023/04/22/fo...,How a ceramics brand is keeping things traditi...,Saskia O'Donoghue,2023-04-22,"If you’re in Milan for Design Week, don’t miss...","the brand founded in the 1940s, famous for its...","Italian design house Fornasetti, famed for its..."
19,TOUCH,https://www.euronews.com/culture/2023/04/17/ev...,Everything you need to know about Milan Design...,Saskia O'Donoghue,2023-04-17,The Milan Furniture Fair - or Salone Internazi...,",,, furniture and accessories for the home, in...",Euronews Culture is heading to Milan Design We...
15,TOUCH,https://www.euronews.com/culture/2023/04/20/re...,Revolutionising a classic - Dior and Starck up...,,2023-04-20,"The Salone del Mobile in Milan, otherwise know...",",, is a tribute to Dior's late founder, Christ...",French designer Phillipe Starck and Dior Maiso...
4,TOUCH,https://www.euronews.com/culture/2023/05/18/de...,Look inside Alexander Lervik's renovated kit h...,Saskia O'Donoghue,2023-05-18,High-profile Scandinavian designer Alexander L...,",,,,,,, was originally a kit house from 1961 -...","As part of our series on designers' homes, Eur..."


In [30]:
sm.print_summaries(df_euronews, 1)



|| Article ||

The largest retrospective of works by British architect, Norman Foster has opened at the Centre Pompidou in Paris, spanning the last six decades of his career.Drawings, workbooks, scale models, prototypes and videos enable visitors to delve into 130 major projects on display in the centre, which itself is considered one of the first examples of the "High Tech" architectural trend that Foster helped pioneer.The exhibition reviews the different periods of the architect’s work and highlights cutting-edge creations, such as the headquarters of the Hongkong and Shanghai Banking Corporation (Hong Kong, 1979-1986), the Carré d’Art (Nîmes, 1984-1993), Hong Kong International Airport (1992-1998) and Apple Park (Cupertino, United States, 2009-2017).The architect’s work is explored through seven themes: Nature and Urbanity; Skin and Bones; Vertical City; History and Tradition; Planning and Place; Networks and Mobility and Future.Sources of inspiration for Foster, including Fernan

---

<a id='designmilk'></a>
# [Design Milk](https://design-milk.com/category/technology/)

In [31]:
# tabulate data dictionary
def ws_designmilk(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    try:
        for news in tqdm(news_cards):
            # from news element
            # topic, headline, subline, link
            link = ws.get_elements(news, 0, "post-title", False).get_attribute('href')
            
            headline = ws.get_elements(news, 0, "post-title", False).text
            
            subline = ws.get_elements(news, 0, "post-excerpt", False).text
            
            topic = ws.get_elements(news, 3, ".//a", False).text.lower()
            
            # from link
            # author, date, article
            wd = ws.get_wd(link)
            
            sname = ".//div[@class='date-and-author']//a"
            author = ws.get_elements(wd, 3, sname, False).text
            
            sname = ".//div[@class='date-and-author']"
            date = ws.get_elements(wd, 3, sname, False).text[:8]
            date = datetime.strptime(date, "%m.%d.%y").date()
            
            sname = ".//div//p"
            p = ws.get_elements(wd, 3, sname)
            article = ''
            for a in p:
                if a.text[:5] != "Photo":
                    article += a.text
            article = ''.join(article.split('\n'))
            
            wd.quit()
            
            data['topic'].append(topic)
            data['headline'].append(headline)
            data['subline'].append(subline)
            data['link'].append(link)
            data['author'].append(author)
            data['date'].append(date)
            data['article'].append(article)

#             print("headline: ", headline)
#             print("subline: ", subline)
#             print("link: ", link)
#             print("author: ", author)
#             print("date: ", date)
#             print("topic: ", topic)
#             print("article: ", article)
            
        print("||| Tabulation Complete. |||")

    except Exception as e:
        print(f"Unable to ws_designmilk() due to {e}")
    
    return data

In [32]:
url = "https://design-milk.com/category/technology/"
sname = "post-list-item"
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 0, sname)

In [33]:
datanames = ['topic', 'date', 'author', 'headline', 'link', 'subline', 'article', 'summary']
# takes about 2-5 mins
df_designmilk = ws.webscrap(sitename="designmilk", sitefunction=ws_designmilk, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/15 [00:00<?, ?it/s]

||| Beginning Web Scraping |||


100%|██████████| 15/15 [01:49<00:00,  7.32s/it]
  0%|          | 0/15 [00:00<?, ?it/s]

||| Tabulation Complete. |||
||| Web Scraping Complete for DESIGNMILK. |||
||| Generating Summaries... |||


100%|██████████| 15/15 [00:46<00:00,  3.10s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [34]:
df_designmilk.sample(5)

Unnamed: 0,topic,date,author,headline,link,subline,article,summary
5,technology,2023-06-15,Gregory Han,Google's Holographic Project Starline Adds a N...,https://design-milk.com/googles-holographic-pr...,Google's desk-sized video chat technology adds...,Google recognizes many of us are still working...,",,,,, and musings can be found at gregoryhan.c..."
10,technology,2023-06-05,Gregory Han,Teenage Engineering TP–7 Offers the Reel Sensa...,https://design-milk.com/teenage-engineering-tp...,Teenage Engineering's master class audio devic...,The touchscreen display is one of the greatest...,– and polarizing – inventions in modern times....
4,technology,2023-06-15,Gregory Han,Samsung New Bespoke Family Hub Refrigerator Do...,https://design-milk.com/samsung-new-bespoke-fa...,Have you ever considered adding a large screen...,The earliest versions of Samsung’s Family Hub ...,"is available in full depth ($4,999) and counte..."
3,technology,2023-06-16,Gregory Han,Rivian Wants to Redesign EV Shopping Into an I...,https://design-milk.com/rivian-wants-to-redesi...,Rivian wants to make buying one of their EV tr...,“Disruptive” is thrown reflexively and regular...,is the first of several brick-and-mortar retai...
2,technology,2023-06-19,Gregory Han,TELO is a Tiny Electric Truck With Big Load Ca...,https://design-milk.com/telo-electric-truck/,"As small as a MINI, as capacious as a Tacoma, ...",Trucks are hugely popular across the United St...,", TELO, a new electric truck arrives as the an..."


In [35]:
sm.print_summaries(df_designmilk, 1)



|| Article ||

It’s unofficially time for summer to begin, and we’re celebrating with a bold, geometric Designer Desktop by Daniel Ramirez Perez! A true creative, Daniel works in illustration, design, and motion, launching his own illustration studio in 2014. A background in fashion – he interned at Vivienne Westwood! – helped to quickly advance Daniel’s style and put his career on the fast track. Since then, he’s worked with clients throughout advertising, editorial, music, and culture, such as Lufthansa, Thom Yorke, The Wall Street Journal, Ballhaus Ost, and TBWA/Chiat/Day.Artwork for Thom YorkeLove Is Love ProjectFruitsJoy Breeze Mall InstallationsDaniel Ramirez PerezDownload yours with the links below!DESKTOP: 1024×768 \\\ 1280×1024 \\\ 1680×1050 \\\ 1900×1200 \\\ 2560×1440MOBILE: iPhone XS \\\ iPhone XS Max \\\ iPad ProLearn more about Daniel Ramirez Perez here and follow on IG here.View and download past Designer Desktops here.Kelly Beall is senior editor at Design Milk. The Pi

---

<a id='creativereview'></a>
# [Creative Review](https://www.creativereview.co.uk)

In [36]:
def ws_creativereview(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    try:
        for news in tqdm(news_cards):
            sname = ".//h2[@class='hentry-title medium']//a"
            link = ws.get_elements(news, 3, sname, False)
            if not link:
                continue
            link = link.get_attribute('href')
    
            sname = ".//footer//time"
            date = ws.get_elements(news, 3, sname, False)
            if not date:
                continue
            date = date.get_attribute('datetime')[:10]
            date = datetime.strptime(date, "%Y-%m-%d").date()

            wd = ws.get_wd(link)
            
            sname = ".//a[@class='term-link-taxonomy-type']"
            topic = ws.get_elements(wd, 3, sname, False)
            if not topic:
                continue
            if not len(topic.text):
                continue
            topic = topic.text.lower()
            
            sname = ".//h1[@class='page-title']"
            headline = ws.get_elements(wd, 3, sname, False)
            if not headline:
                continue
            if not len(headline.text):
                continue
            headline = headline.text
            
            sname = ".//div[@class='standfirst']//p"
            subline = ws.get_elements(wd, 3, sname, False)
            if not subline:
                continue
            subline = subline.text
            
            sname = ".//span[@class='author vcard']"
            author = ws.get_elements(wd, 3, sname, False)
            if not author:
                continue
            author = author.text
            
            sname = ".//div[@class='article-body']//p"
            p = ws.get_elements(wd, 3, sname)
            if not p:
                continue
            article = ''
            for a in p:
                article += a.text
            article = ''.join(article.split('\n'))
            
            wd.quit()
            
            data['topic'].append(topic)
            data['headline'].append(headline)
            data['subline'].append(subline)
            data['link'].append(link)
            data['date'].append(date)
            data['author'].append(author)
            data['article'].append(article)
            
#             print("topic: ", topic)
#             print("headline: ", headline)
#             print("link: ", link)
#             print("date: ", date)
#             print("subline: ", subline)
#             print("author", author)
#             print("article", article)

        print("||| Tabulation Complete. |||")

    except Exception as e:
        print(f"Unable to ws_() due to {e}")
    
    return data

In [37]:
url = "https://www.creativereview.co.uk"
sname = "block-lead"
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 0, sname)

In [38]:
datanames = ['topic', 'link', 'headline', 'date', 'subline', 'author', 'article', 'summary']
# takes about 2-5 mins
df_creativereview = ws.webscrap(sitename="creativereview", sitefunction=ws_creativereview, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/74 [00:00<?, ?it/s]

||| Beginning Web Scraping |||


  1%|▏         | 1/74 [00:08<10:31,  8.65s/it]

Element Exception Encountered.


  4%|▍         | 3/74 [00:23<08:55,  7.54s/it]

Element Exception Encountered.


  5%|▌         | 4/74 [00:33<09:56,  8.52s/it]

Element Exception Encountered.


 15%|█▍        | 11/74 [00:46<02:15,  2.15s/it]

Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.


 27%|██▋       | 20/74 [01:57<07:16,  8.08s/it]

Element Exception Encountered.
Element Exception Encountered.


 31%|███       | 23/74 [02:13<05:47,  6.82s/it]

Element Exception Encountered.


 32%|███▏      | 24/74 [02:21<05:53,  7.06s/it]

Element Exception Encountered.


 34%|███▍      | 25/74 [02:30<06:13,  7.61s/it]

Element Exception Encountered.


 36%|███▋      | 27/74 [02:50<06:56,  8.86s/it]

Element Exception Encountered.


 38%|███▊      | 28/74 [02:56<06:08,  8.01s/it]

Element Exception Encountered.


 39%|███▉      | 29/74 [03:04<06:00,  8.01s/it]

Element Exception Encountered.


 41%|████      | 30/74 [03:10<05:32,  7.55s/it]

Element Exception Encountered.


 47%|████▋     | 35/74 [03:47<05:01,  7.74s/it]

Element Exception Encountered.


 49%|████▊     | 36/74 [03:55<05:05,  8.05s/it]

Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.


 54%|█████▍    | 40/74 [04:09<02:56,  5.20s/it]

Element Exception Encountered.


 58%|█████▊    | 43/74 [04:36<03:47,  7.34s/it]

Element Exception Encountered.


 62%|██████▏   | 46/74 [04:58<03:22,  7.24s/it]

Element Exception Encountered.


 66%|██████▌   | 49/74 [05:26<03:21,  8.06s/it]

Element Exception Encountered.


 69%|██████▉   | 51/74 [05:45<03:18,  8.61s/it]

Element Exception Encountered.


 73%|███████▎  | 54/74 [05:52<01:28,  4.41s/it]

Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.


 74%|███████▍  | 55/74 [05:58<01:34,  4.95s/it]

Element Exception Encountered.


 76%|███████▌  | 56/74 [06:08<01:50,  6.14s/it]

Element Exception Encountered.


 77%|███████▋  | 57/74 [06:14<01:46,  6.26s/it]

Element Exception Encountered.


 78%|███████▊  | 58/74 [06:19<01:33,  5.83s/it]

Element Exception Encountered.
Element Exception Encountered.


 81%|████████  | 60/74 [06:26<01:05,  4.65s/it]

Element Exception Encountered.


 82%|████████▏ | 61/74 [06:33<01:08,  5.28s/it]

Element Exception Encountered.


 88%|████████▊ | 65/74 [07:08<01:10,  7.80s/it]

Element Exception Encountered.


 89%|████████▉ | 66/74 [07:14<00:59,  7.38s/it]

Element Exception Encountered.


 91%|█████████ | 67/74 [07:19<00:46,  6.58s/it]

Element Exception Encountered.


 95%|█████████▍| 70/74 [07:27<00:15,  3.76s/it]

Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.


100%|██████████| 74/74 [07:27<00:00,  6.05s/it]
  0%|          | 0/27 [00:00<?, ?it/s]

Element Exception Encountered.
Element Exception Encountered.
Element Exception Encountered.
||| Tabulation Complete. |||
||| Web Scraping Complete for CREATIVEREVIEW. |||
||| Generating Summaries... |||


100%|██████████| 27/27 [01:18<00:00,  2.91s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [39]:
df_creativereview.sample(5)

Unnamed: 0,topic,link,headline,date,subline,author,article,summary
8,creative inspiration,https://www.creativereview.co.uk/creativity-su...,Creativity Sucks! podcast asks what Gen Z real...,2023-06-07,"In our latest edition of the CR podcast, guest...",Creative Review,It doesn’t feel very long ago that brands and ...,",,,, and share, it really helps. and share, it..."
17,creative inspiration,https://www.creativereview.co.uk/wes-anderson-...,Wes Anderson’s Asteroid City comes to life in ...,2023-06-19,"An array of original sets, props, miniatures, ...",Aimee Mclaughlin,Set in the fictional American desert town of A...,is at 180 Studios until July 8; 180studios.com...
7,creative insight,https://www.creativereview.co.uk/creative-awar...,What do the D&AD Awards reveal about the state...,2023-06-09,No D&AD Black Pencils were awarded in advertis...,Patricia McDonald,"More than any other creative industry award, a...",",,,,, was absolute consensus. What there wasn’..."
0,creative inspiration,https://www.creativereview.co.uk/pilo-branding/,"Youth hostel Pilo gets a fun, fluffy visual id...",2023-06-22,"Designed by 5.5, the branding embodies the you...",Daniel Milroy Maher,"Pilo, a new generation youth hostel in Lyon, F...","Pilo, a new generation youth hostel in Lyon, F..."
24,sponsored content,https://www.creativereview.co.uk/get-the-most-...,Get the most from any shoot location,2022-09-08,Once you’ve found the ideal location for your ...,Creative Review,"It takes planning, vision, and research to sou...",", but with the right location and enough creat..."


In [40]:
sm.print_summaries(df_creativereview, 1)



|| Article ||

As art and design graduates around the UK gear up to present their very best work, we’ve assembled a guide to undergraduate shows taking place across the country in the next few months. We suggest checking the links for specific details on presentation formats and opening times, as many showcases will be closed over the late May bank holiday weekend.There are more shows yet to be confirmed so we’ll be updating this list on a rolling basis. If your BA degree show or one you know of isn’t listed, let us know through social media or email a member of the editorial team so we can add it in.Anglia Ruskin University (May 19-June 23)Arts University Bournemouth (June 30-July 7, and online from July 31)Arts University Plymouth (June 23-29)Belfast School of Art (June 10-17)Birmingham City University, Inspired Festival (June 10-25)Buckinghamshire New University (June 16-21)Camberwell College of Arts, UAL (June 9-17, and online)Cardiff School of Art and Design (June 7-13)Central S

---

<a id='creativebloq'></a>
# [Creative Bloq](https://www.creativebloq.com/)

In [41]:
def ws_creativebloq(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    try:
        for news in tqdm(news_cards):
            group = ws.get_elements(news, 0, "listingResult")
            
            for g in tqdm(group):
                # from news element
                # link, headline, subline
                sname = "article-link"
                link = ws.get_elements(g, 0, sname, False)
                if not link:
                    continue
                link = link.get_attribute('href')
                
                sname = "article-name"
                headline = ws.get_elements(g, 0, sname, False)
                if not headline:
                    continue
                headline = headline.text
            
                sname = "synopsis"
                subline = ws.get_elements(news, 0, sname, False)
                if not subline:
                    continue
                subline = subline.text
            
                # from link
                # topic, author
                wd = ws.get_wd(link)
            
                sname = "author-byline__author-name"
                author = ws.get_elements(wd, 0, sname, False)
                if not author:
                    continue
                author = author.text
            
                sname = ".//nav[@class='breadcrumb']//ol//li"
                topic = ws.get_elements(wd, 3, sname)[-1]
                if not topic:
                    continue
                topic = topic.text.lower()
                
                sname = ".//div[@id='article-body']//p"
                p = ws.get_elements(wd, 3, sname)
                article = ''
                for t in p:
                    article += t.text
                article = ''.join(article.split('\n'))
            
                wd.quit()

                data['topic'].append(topic)
                data['headline'].append(headline)
                data['subline'].append(subline)
                data['link'].append(link)
                data['author'].append(author)
                data['article'].append(article)
            
#                 print("topic: ", topic)
#                 print("headline: ", headline)
#                 print("subline: ", subline)
#                 print("link: ", link)
#                 print("author: ", author)
#                 print("article: ", article)

        print("||| Tabulation Complete. |||")

    except Exception as e:
        print(f"Unable to ws_creativebloq() due to {e}")
    
    return data

In [42]:
url = "https://www.creativebloq.com/"
sname = "listingResults"
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 0, sname)

In [43]:
datanames = ['topic', 'link', 'headline', 'author', 'subline', 'article', 'summary']
# takes about 2-5 mins
df_creativebloq = ws.webscrap(sitename="creativebloq", sitefunction=ws_creativebloq, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A

||| Beginning Web Scraping |||
Element Exception Encountered.



 22%|██▏       | 2/9 [00:11<00:40,  5.73s/it][A
 33%|███▎      | 3/9 [00:22<00:48,  8.12s/it][A
 44%|████▍     | 4/9 [00:32<00:43,  8.72s/it][A

Element Exception Encountered.



 67%|██████▋   | 6/9 [00:35<00:14,  4.98s/it][A
 78%|███████▊  | 7/9 [00:38<00:08,  4.42s/it][A
100%|██████████| 9/9 [00:46<00:00,  5.17s/it][A
 17%|█▋        | 1/6 [00:46<03:52, 46.59s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A

Element Exception Encountered.



 33%|███▎      | 1/3 [00:37<01:14, 37.16s/it][A
 67%|██████▋   | 2/3 [00:46<00:20, 20.91s/it][A
100%|██████████| 3/3 [00:49<00:00, 16.44s/it][A
 33%|███▎      | 2/6 [01:35<03:12, 48.20s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:08<00:25,  8.57s/it][A
 50%|█████     | 2/4 [00:46<00:51, 25.89s/it][A
 75%|███████▌  | 3/4 [00:55<00:18, 18.29s/it][A
100%|██████████| 4/4 [01:34<00:00, 23.63s/it][A
 50%|█████     | 3/6 [03:10<03:28, 69.37s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:09<00:18,  9.01s/it][A
 67%|██████▋   | 2/3 [00:19<00:09,  9.68s/it][A
100%|██████████| 3/3 [00:57<00:00, 19.24s/it][A
 67%|██████▋   | 4/6 [04:08<02:09, 64.78s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:37<01:14, 37.30s/it][A
 67%|██████▋   | 2/3 [00:40<00:17, 17.14s/it][A
100%|██████████| 3/3 [00:43<00:00, 14.56s/it][A
 83%|████████▎ | 5/6 [04:51<00:57, 57.18s/it]
  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌      

||| Tabulation Complete. |||
||| Web Scraping Complete for CREATIVEBLOQ. |||
||| Generating Summaries... |||


100%|██████████| 23/23 [01:14<00:00,  3.24s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [44]:
df_creativebloq.sample(5)

Unnamed: 0,topic,link,headline,author,subline,article,summary
16,digital art,https://www.creativebloq.com/reviews/c-smash-v...,C-Smash VRS review: art and arcade collide on ...,Ian Dean,"Serving up a Dreamcast cult classic in VR, is ...",If we’re talking about video gaming with style...,"is a remake of a cult Sega Dreamcast game, now..."
2,digital art,https://www.creativebloq.com/features/video-ga...,I'm inspired by the best video games of the 80s,Martyn Carroll,The best FMV games include retro revivals and ...,When looking back at the best video games of t...,: Atari 2600I’m obviously not championing this...
6,news,https://www.creativebloq.com/news/colour-gradi...,These clever colour gradient food stickers are...,Joseph Foley,And they've just won at the Cannes Lions.,"Apparently, we chuck out a ridiculous amount o...","stickers are for a range of produce, from toma..."
7,digital art,https://www.creativebloq.com/news/kitbash-3d-l...,KitBash3D's Cargo is an eye-opening asset libr...,Ian Dean,And they've just won at the Cannes Lions.,"So, this is quite remarkable. The newly launch...",", Cargo from KitBash3D enables creatives to ac..."
0,digital art,https://www.creativebloq.com/features/best-fmv...,"Wildly inventive and a little cheesy, these ar...",Jess Kinghorn,The best FMV games include retro revivals and ...,The best FMV games offer a mix of invention an...,",,,,: 'What is FMV?': 'What is FMV?' 'What is ..."


In [45]:
sm.print_summaries(df_creativebloq, 1)



|| Article ||

McDonald's is known for its eye-catching adverts. It has an extremely recognisable colour scheme, those Golden Arches and the courage to create playful marketing campaigns that sometimes don't even mention its product.But which are the best McDonald's adverts ever? There are a lot to choose from, from drunken McDonald's messages to an ingenious campaign in France where doors were made to look like fries. One user on Twitter has rounded up what he thinks are the best ones ever, and we've highlighted a few below. Did your favourite make the cut?In the number one spot is this ad for chicken nuggets, which is a simple idea, executed well. It shows the product, and riffs on the cravings people have for said chicken nugget. And the ketchup plus nugget is a subtle nod to the McDonald's colours. (See our why McDonald's uses red and yellow post for more about the brand's colour palette.)At number two, there's a father's day ad, which shows a cup holding hands with two smaller c

---

<a id='awn'></a>
# [AWN](https://www.awn.com/news)

In [46]:
def ws_awn(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    try:
        for news in tqdm(news_cards):
            sname = "descriptive-tags"
            topic = ws.get_elements(news, 0, sname, False)
            if not topic:
                continue
            topic = topic.text.lower()
            
            sname = "title-with-tag"
            headline = ws.get_elements(news, 0, sname, False)
            if not headline:
                continue
            headline = headline.text

            sname = "field-item"
            subline = ws.get_elements(news, 0, sname)[-1]
            if not subline:
                continue
            subline = subline.text
            
            sname = ".//span[@class='title-with-tag']//a"
            link = ws.get_elements(news, 3, sname, False)
            if not link:
                continue
            link = link.get_attribute('href')
            
            sname = "username"
            author = ws.get_elements(news, 0, sname, False)
            if not author:
                continue
            author = author.text
            
            # unclean text on the website
            sname = "submitted"
            s = ws.get_elements(news, 0, sname, False).text
            d = s[s.find(',')+2 : s.find('at')-1]
            if not d:
                continue
            date = datetime.strptime(d, "%B %d, %Y").date()
            
            sname = ".//span[@class='taxonomy-term']//a"
            geoloc = ws.get_elements(news, 3, sname)[-1]
            if not geoloc:
                continue
            geoloc = geoloc.text.lower()
            
            wd = ws.get_wd(link)
            
            p = ws.get_elements(wd, 0, "field-name-body", False)
            if not p:
                continue
            article = ''.join(p.text.split('\n'))

            wd.quit()
        
            data['topic'].append(topic)
            data['headline'].append(headline)
            data['subline'].append(subline)
            data['link'].append(link)
            data['author'].append(author)
            data['date'].append(date)
            data['geoloc'].append(geoloc)
            data['article'].append(article)

#             print("topic: ", topic)
#             print("headline: ", headline)
#             print("subline: ", subline)
#             print("link: ", link)
#             print("author: ", author)
#             print("date: ", date)
#             print("geoloc: ", geoloc)
#             print("article: ", article)
            
        print("||| Tabulation Complete. |||")

    except Exception as e:
        print(f"Unable to ws_awn() due to {e}")
    
    return data

In [47]:
url = "https://www.awn.com/news"
sname = "node"
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 0, sname)

In [48]:
datanames = ['topic', 'headline', 'subline', 'link', 'author', 'date', 'geoloc', 'article', 'summary']
# takes about 2-5 mins
df_awn = ws.webscrap(sitename="awn", sitefunction=ws_awn, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/25 [00:00<?, ?it/s]

||| Beginning Web Scraping |||


100%|██████████| 25/25 [01:49<00:00,  4.37s/it]
  0%|          | 0/23 [00:00<?, ?it/s]

||| Tabulation Complete. |||
||| Web Scraping Complete for AWN. |||
||| Generating Summaries... |||


100%|██████████| 23/23 [01:09<00:00,  3.00s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [49]:
df_awn.sample(5)

Unnamed: 0,topic,headline,subline,link,author,date,geoloc,article,summary
1,awards,‘Meneath: The Mirrors of Ethics’ Wins ‘New Voi...,Based on Terril Calder’s award-winning 2021 an...,https://www.awn.com/news/meneath-mirrors-ethic...,Debbie Diamond Sarto,2023-06-20,north america,The National Film Board of Canada’s (NFB) inst...,"and,,, of Gail Maurice (Cardinal, Tricksters) ..."
19,more ip gold,Sarah Polley to Direct Disney’s ‘Bambi’ Live-A...,The Oscar winning ‘Women Talking’ writer/direc...,https://www.awn.com/news/sarah-polley-direct-d...,Laurén Alexa,2023-06-16,all,"Disney’s animated classic, ‘Bambi,’ (1942). Im...",", Sarah Polley (Women Talking) is set to helm ..."
15,business,Adult Swim Greenlights ‘Common Side Effects’ f...,"Announced at Annecy 2023, the new animated com...",https://www.awn.com/news/adult-swim-greenlight...,Debbie Diamond Sarto,2023-06-16,all,"Today at Annecy 2023, Adult Swim announced it ...",",,, and executive producers.com.com.com.com.co..."
18,the new dcu,Andy Muschietti to Direct New DCU Batman Film ...,‘The Flash’ director will helm the upcoming Da...,https://www.awn.com/news/andy-muschietti-direc...,Laurén Alexa,2023-06-16,all,The upcoming Batman film The Brave and the Bol...,", and we couldn’t have better or more inspirin..."
21,we want more old disney shorts!,Disney+ Releasing 28 Newly Restored Classic Sh...,The streamer will debut a trove of films featu...,https://www.awn.com/news/disney-releasing-28-n...,Debbie Diamond Sarto,2023-06-15,all,"'Aquamania' (1961), one of 28 newly restored c...","(1961), one of 28 newly restored classic Disne..."


In [50]:
sm.print_summaries(df_awn, 1)



|| Article ||

Last week at Annecy 2023, Piece of Magic Entertainment (POM) launched its new subsidiary, POM Anime – coinciding with the acquisition of distribution rights to the anime film, The First Slam Dunk, which screened at the festival. The film, to be released under the POM Anime banner, is slated for theatrical release across Scandinavia, Poland, Baltics, Greece, and the Central European region late Summer 2023.In recent years, POM has distributed popular anime titles, including Demon Slayer: Mugen Train, Jujutsu Kaisen 0, and One-Piece Film: Red. With the launch of POM Anime, the company plans to bring the genre to a larger global audience. Additional anime title announcements are expected in the coming weeks.POM Anime’s first release, The First Slam Dunk, according to POM, has already achieved box-office success, grossing over $260 million worldwide in Asian markets to date, making it the fifth highest-grossing anime film of all time. The film is based on the Takehiko Inou

---

<a id='architecturaldigest'></a>
# [Architectural Digest](https://www.architecturaldigest.com)

In [51]:
def ws_architecturaldigest(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    try:
        for news in tqdm(news_cards):
            # from news
            # headline, link, subline 
            sname = "SummaryItemHedLink-civMjp"
            headline = ws.get_elements(news, 0, sname, False)
            if not headline:
                continue
            headline = headline.text
            
            sname = "SummaryItemHedLink-civMjp"
            link = ws.get_elements(news, 0, sname, False)
            if not link:
                continue
            link = link.get_attribute('href')
            
            sname = "BaseWrap-sc-gjQpdd"
            subline = ws.get_elements(news, 0, sname, False)
            if not subline:
                continue
            subline = subline.text
            
            # from link
            # author, date, topic, article
            wd = ws.get_wd(link)
            
            sname = "BylineName-kwmrLn"
            author = ws.get_elements(wd, 0, sname, False)
            if not author:
                continue
            author = author.text[3:]
            
            sname = ".//time[@class='BaseWrap-sc-gjQpdd BaseText-ewhhUZ ContentHeaderPublishDate-eIBicG iUEiRd ehKhdr gqCBkE']"
            date = ws.get_elements(wd, 3, sname, False)
            if not date:
                continue
            date = date.get_attribute("datetime")[:10]
            date = datetime.strptime(date, "%Y-%m-%d").date()
            
            sname = ".//span[@class='RubricName-fVtemz cLxcNi']"
            topic = ws.get_elements(wd, 3, sname, False)
            if not topic:
                continue
            topic = topic.text.lower()
            
            sname = ".//div[@class='BodyWrapper-kufPGa lguTLT body GalleryPageIntroBody-jRNmRA ePDgJv body__container']"
            p = ws.get_elements(wd, 3, sname, False)
            if not p:
                continue
            article = ''.join(p.text.split('\n'))
            
            wd.quit()

            data['topic'].append(topic)
            data['link'].append(link)
            data['headline'].append(headline)
            data['subline'].append(subline)
            data['author'].append(author)
            data['date'].append(date)
            data['article'].append(article)
            
#             print("topic: ", topic)
#             print("link: ", link)
#             print("headline: ", headline)
#             print("subline: ", subline)
#             print("author: ", author)
#             print("date: ", date)
#             print("article: ", article)
            
        print("||| Tabulation Complete. |||")

    except Exception as e:
        print(f"Unable to ws_architecturaldigest() due to {e}")
    
    return data

In [52]:
url = "https://www.architecturaldigest.com"
sname = "SummaryItemWrapper-iwvBff"
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 0, sname)

In [53]:
datanames = ['topic', 'link', 'headline', 'subline', 'author', 'date', 'article', 'summary']
# takes about 2-5 mins
df_architecturaldigest = ws.webscrap(sitename="architecturaldigest", sitefunction=ws_architecturaldigest, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/53 [00:00<?, ?it/s]

||| Beginning Web Scraping |||


  8%|▊         | 4/53 [00:35<07:05,  8.67s/it]

Element Exception Encountered.


 11%|█▏        | 6/53 [00:52<06:41,  8.53s/it]

Element Exception Encountered.


 13%|█▎        | 7/53 [01:01<06:39,  8.69s/it]

Element Exception Encountered.


 15%|█▌        | 8/53 [01:10<06:37,  8.82s/it]

Element Exception Encountered.


 17%|█▋        | 9/53 [01:21<06:47,  9.26s/it]

Element Exception Encountered.


 19%|█▉        | 10/53 [01:30<06:39,  9.29s/it]

Element Exception Encountered.


 21%|██        | 11/53 [01:41<06:47,  9.69s/it]

Element Exception Encountered.


 23%|██▎       | 12/53 [01:50<06:28,  9.48s/it]

Element Exception Encountered.


 26%|██▋       | 14/53 [04:24<24:57, 38.39s/it]

Element Exception Encountered.


 28%|██▊       | 15/53 [04:34<18:47, 29.66s/it]

Element Exception Encountered.


 32%|███▏      | 17/53 [04:56<12:08, 20.23s/it]

Element Exception Encountered.


 34%|███▍      | 18/53 [05:20<12:28, 21.38s/it]

Element Exception Encountered.


 36%|███▌      | 19/53 [05:27<09:43, 17.16s/it]

Element Exception Encountered.


 38%|███▊      | 20/53 [05:38<08:22, 15.22s/it]

Element Exception Encountered.


 42%|████▏     | 22/53 [05:56<06:20, 12.26s/it]

Element Exception Encountered.


 43%|████▎     | 23/53 [06:06<05:43, 11.46s/it]

Element Exception Encountered.


 45%|████▌     | 24/53 [06:16<05:19, 11.02s/it]

Element Exception Encountered.


 49%|████▉     | 26/53 [06:35<04:37, 10.29s/it]

Element Exception Encountered.


 53%|█████▎    | 28/53 [06:53<03:59,  9.57s/it]

Element Exception Encountered.


 55%|█████▍    | 29/53 [07:01<03:34,  8.96s/it]

Element Exception Encountered.


 57%|█████▋    | 30/53 [07:10<03:31,  9.20s/it]

Element Exception Encountered.


 58%|█████▊    | 31/53 [07:20<03:26,  9.38s/it]

Element Exception Encountered.


 60%|██████    | 32/53 [07:32<03:30, 10.01s/it]

Element Exception Encountered.


 62%|██████▏   | 33/53 [07:43<03:28, 10.42s/it]

Element Exception Encountered.


 64%|██████▍   | 34/53 [07:56<03:32, 11.21s/it]

Element Exception Encountered.


 66%|██████▌   | 35/53 [08:07<03:19, 11.07s/it]

Element Exception Encountered.


 68%|██████▊   | 36/53 [08:15<02:53, 10.22s/it]

Element Exception Encountered.


 70%|██████▉   | 37/53 [08:23<02:34,  9.64s/it]

Element Exception Encountered.


 72%|███████▏  | 38/53 [08:33<02:23,  9.58s/it]

Element Exception Encountered.


 74%|███████▎  | 39/53 [08:41<02:07,  9.13s/it]

Element Exception Encountered.


 75%|███████▌  | 40/53 [08:51<02:02,  9.41s/it]

Element Exception Encountered.


 77%|███████▋  | 41/53 [09:00<01:53,  9.42s/it]

Element Exception Encountered.


 79%|███████▉  | 42/53 [09:15<02:01, 11.04s/it]

Element Exception Encountered.


 81%|████████  | 43/53 [09:25<01:46, 10.68s/it]

Element Exception Encountered.


 83%|████████▎ | 44/53 [09:34<01:32, 10.24s/it]

Element Exception Encountered.


 87%|████████▋ | 46/53 [09:54<01:09,  9.91s/it]

Element Exception Encountered.


 91%|█████████ | 48/53 [10:13<00:47,  9.49s/it]

Element Exception Encountered.


 92%|█████████▏| 49/53 [10:22<00:37,  9.39s/it]

Element Exception Encountered.


 94%|█████████▍| 50/53 [10:31<00:27,  9.15s/it]

Element Exception Encountered.


 98%|█████████▊| 52/53 [10:48<00:08,  8.91s/it]

Element Exception Encountered.


100%|██████████| 53/53 [10:59<00:00, 12.45s/it]
  0%|          | 0/13 [00:00<?, ?it/s]

||| Tabulation Complete. |||
||| Web Scraping Complete for ARCHITECTURALDIGEST. |||
||| Generating Summaries... |||


100%|██████████| 13/13 [00:36<00:00,  2.78s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [54]:
sm.print_summaries(df_architecturaldigest, 1)



|| Article ||

When it comes to creating a high-drama environment to exchange vows, it’s difficult to rival the grandiosity of a castle wedding venue. Palaces present an ideal location for couples aiming to cast all notions of humility aside on such a momentous occasion in favor of some real pomp and circumstance. The majestic structures are the stuff of real-life fairytales, where royal titleholders like the Windsors—and de facto royals, like Priyanka Chopra and Nick Jonas—have gotten hitched in the company of hundreds of their closest friends and relatives. Throwing a party inside of towering stone walls with centuries-old battlements or among ornate banquet halls with high-vaulted ceilings is a surefire way to give guests an experience they’ll never forget. While the term royal wedding may have established some very Great Britain–specific associations in recent years, there are a number of luxe castle properties functioning as wedding venues all across the globe, from Italy’s Lazi

In [55]:
df_architecturaldigest.sample(5)

Unnamed: 0,topic,link,headline,subline,author,date,article,summary
2,magazine,https://www.architecturaldigest.com/gallery/in...,Step Inside a Jeremiah Brent and William Hefne...,A young family enlisted their two design crush...,Catherine Hong,2023-06-22,How many of us swore to ourselves that when we...,"and Kevin Danesh, both born in Iran, are marri..."
6,travel,https://www.architecturaldigest.com/gallery/it...,9 Italian Islands Only Locals Know About,From Venice’s lagoon in the north to the warm ...,Marco Morello,2023-06-22,"When picturing Italy, many think of the wonder...",", these nine islands should be on your must-vi..."
3,travel,https://www.architecturaldigest.com/gallery/wo...,The World’s 11 Most Famous Artist Homes That Y...,From Pablo Picasso’s first home to Jackson Pol...,Katherine McGrath,2023-06-21,To enter into an artist’s home is indisputably...,"their work, and for good reason—what better wa..."
9,culture + lifestyle,https://www.architecturaldigest.com/gallery/op...,9 Opulent Castle Wedding Venues Around the World,These storybook settings create a royal weddin...,Charlotte Collins,2023-06-16,When it comes to creating a high-drama environ...,"s are the stuff of real-life fairytales, where..."
5,innovative design,https://www.architecturaldigest.com/gallery/mo...,10 Most Beautifully Repurposed Buildings,These innovative structures honor their past l...,Charlotte Collins,2023-05-04,Modern architects are undoubtedly under more p...,", the, and simply inhabiting a space ill-suite..."


---

<a id='cartoonbrew'></a>
# [Cartoon Brew](https://www.cartoonbrew.com)

In [89]:
def ws_cartoonbrew(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    try:
        for news in tqdm(news_cards):
            # from news
            # topic, headline, link, subline, date
            sname = "category-slug"
            topic = ws.get_elements(news, 0, sname, False)
            if not topic:
                continue
            topic = topic.text.lower()
            
            sname = "entry-title"
            headline = ws.get_elements(news, 0, sname, False)
            if not headline:
                continue
            headline = headline.text
            
            sname = ".//h2[@class='entry-title']//a"
            link = ws.get_elements(news, 3, sname, False)
            if not link:
                continue
            link = link.get_attribute('href')
#             print("link: ", link)
            
            sname = "entry-summary"
            subline = ws.get_elements(news, 0, sname, False)
            if not subline:
                continue
            subline = subline.text
            
            sname = ".//p[@class='byline author vcard']//time"
            date = ws.get_elements(news, 3, sname, False)
            if not date:
                continue
            date = date.get_attribute("datetime")[:10]
            date = datetime.strptime(date, "%Y-%m-%d").date()
            
            # from link
            # article, author
            wd = ws.get_wd(link)
            
            sname = ".//a[@class='fn byline']"
            author = ws.get_elements(wd, 3, sname, False)
            if not author:
                continue
            author = author.text.title()
    
            sname = ".//div[@id='pico']//p"
            p = ws.get_elements(wd, 3, sname)
            if not p:
                wd.quit()
                continue
            article = ''
            for t in p:
                article += t.text
            article = ''.join(article.split('\n'))
            
            wd.quit()
            
#             print("topic: ", topic)
#             print("headline: ", headline)
#             print("subline: ", subline)
#             print("link: ", link)
#             print("author: ", author)
#             print("date: ", date)
#             print("article: ", article)

            data['topic'].append(topic)
            data['headline'].append(headline)
            data['subline'].append(subline)
            data['link'].append(link)
            data['author'].append(author)
            data['date'].append(date)
            data['article'].append(article)

        print("||| Tabulation Complete. |||")

    except Exception as e:
        print(f"Unable to ws_cartoonbrew() due to {e}")
    
    return data

In [90]:
url = "https://www.cartoonbrew.com"
sname = "cb-post-content"
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 0, sname)

In [91]:
datanames = ['topic', 'link', 'headline', 'subline', 'author', 'date', 'article', 'summary']
# takes about 2-5 mins
df_cartoonbrew = ws.webscrap(sitename="cartoonbrew", sitefunction=ws_cartoonbrew, datanames=datanames, news_cards=news_cards)
wd.quit()

  0%|          | 0/18 [00:00<?, ?it/s]

||| Beginning Web Scraping |||


 39%|███▉      | 7/18 [00:45<01:12,  6.56s/it]
  0%|          | 0/7 [00:00<?, ?it/s]

Unable to ws_cartoonbrew() due to Message: stale element reference: element is not attached to the page document
  (Session info: headless chrome=112.0.5615.49); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
#0 0x5c84d39d6b23 <unknown>
#1 0x5c84d36ebb21 <unknown>
#2 0x5c84d36ef24c <unknown>
#3 0x5c84d36eefbf <unknown>
#4 0x5c84d36ef2c7 <unknown>
#5 0x5c84d3724610 <unknown>
#6 0x5c84d374dd02 <unknown>
#7 0x5c84d371e474 <unknown>
#8 0x5c84d374e1ce <unknown>
#9 0x5c84d3767eba <unknown>
#10 0x5c84d374e173 <unknown>
#11 0x5c84d371c662 <unknown>
#12 0x5c84d371dd7e <unknown>
#13 0x5c84d3a03c73 <unknown>
#14 0x5c84d39b9dde <unknown>
#15 0x5c84d39b97b0 <unknown>
#16 0x5c84d39ba5f5 <unknown>
#17 0x5c84d39ffabb <unknown>
#18 0x5c84d39ba9ae <unknown>
#19 0x5c84d399a9a4 <unknown>
#20 0x5c84d39c5728 <unknown>
#21 0x5c84d39c58d5 <unknown>
#22 0x5c84d39d08ff <unknown>
#23 0x7ae3db3616

100%|██████████| 7/7 [00:19<00:00,  2.84s/it]

||| Articles Summarized. |||
||| Creating Dataframe, Saving as CSV |||





In [92]:
sm.print_summaries(df_cartoonbrew, 1)



|| Article ||

Pixar’s Elemental debuted over the weekend with the second-lowest three-day opening – for a wide release – in studio history at an estimated $29.5 million while Warner Bros.’s latest cg-heavy DC hero flick bombed with just $55.1 million grossed over the frame.The only Pixar film to ever make less than Elemental over a three-day opening was the original Toy Story, which made $29.1 million over its first three-day weekend. The debuts of the two films cannot really be compared, however, and Elemental being second-worst is merely a technicality. There is simply no historical precedent for a Pixar film opening as poorly as Elemental.Toy Story, for starters, opened on a Wednesday, so it had already made $9.9m before its $29.1m weekend. It also played in 1,500 fewer theaters than Elemental. And Toy Story opened multiple generations ago, in 1995, when the average U.S. movie ticket price was less than half of what it is today. Adjusted for inflation, Toy Story’s three-day weeke

In [93]:
df_cartoonbrew.sample(5)

Unnamed: 0,topic,link,headline,subline,author,date,article,summary
5,festivals,https://www.cartoonbrew.com/festivals/indie-an...,Independent Animation Producers Share Tips On ...,Without billions in backing or decades-old IPs...,Ryan Gaur,2023-06-19,With popular IP productions dominating box off...,",,, Mad Entertainment, Lindsey Adams from Irel..."
4,cartoon brew pick,https://www.cartoonbrew.com/cartoon-brew-pick/...,Ben Meinhardt’s Meta Short ‘Living The Dream’ ...,An animated animator smiles and sings through ...,Jamie Lang,2023-06-19,"Sometimes, in the face of adversity, all you c...",the world. Ben Meinhardt is a 2d animator with...
0,business,https://www.cartoonbrew.com/executives/tonic-d...,Montreal’s Tonic DNA Acquires Ngenious Studio,The company has also appointed Ngenious founde...,Jamie Lang,2023-06-21,Montreal-based Tonic DNA has acquired Ngenious...,"the acquisition of Ngenious Studio, all of its..."
6,box office report,https://www.cartoonbrew.com/box-office-report/...,‘Elemental’ Flops With Second-Worst Debut In P...,Warner Bros.’s cg-heavy superhero epic ‘The Fl...,Jamie Lang,2023-06-19,Pixar’s Elemental debuted over the weekend wit...,", Elemental debuts at around $70 million and E..."
3,executives,https://www.cartoonbrew.com/executives/buck-op...,"Buck Opens New London Office, Appoints James B...",Producer of several entries in Netflix’s Emmy-...,Jamie Lang,2023-06-20,Global production outfit Buck is growing its E...,Buck is growing its European team with a new o...


---

<a id='webname'></a>
# CompanyName

In [None]:
def ws_(news_cards, data, st_dict):
    print('||| Beginning Web Scraping |||')
    try:
        for news in tqdm(news_cards):
            # from news
            sname = ""
             = ws.get_elements(news, , sname, False)
            if not :
                continue
             = .text
            
            # from link
            wd = ws.get_wd(link)
            
#             article = ''
#             for t in p:
#                 article += t.text
#             article = ''.join(article.split('\n'))
            wd.quit()

#             data['topic'].append(topic) # append items to the dictionary like this
        print("||| Tabulation Complete. |||")

    except Exception as e:
        print(f"Unable to ws_() due to {e}")
    
    return data

In [None]:
url = ""
sname = ""
wd = ws.get_wd(url)

news_cards = ws.get_elements(wd, 0, sname)

In [None]:
datanames = ['', '', '', '', '', '']
# takes about 2-5 mins
df_ = ws.webscrap(sitename="", sitefunction=ws_, datanames=datanames, news_cards=news_cards)
wd.quit()

In [None]:
sm.print_summaries(df_, 1)

In [None]:
df_.sample(5)