In [1]:
from bs4 import BeautifulSoup
import requests 
import re
import pandas as pd

In [2]:
# Listing page of the NBC Bay Area local-news section that the notebook scrapes.
url = 'https://www.nbcbayarea.com/news/local/'

In [3]:
# Fetch the listing page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of letting an error page be parsed as if it were the listing.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [4]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [5]:
# Collect every headline anchor: <a> tags carrying the
# "story-card__title-link" CSS class, in document order.
art_content = soup.find_all(name='a', attrs={'class': "story-card__title-link"})

art_content

[<a class="story-card__title-link" href="https://www.nbcbayarea.com/news/local/bay-area-attorney-who-backs-trump-describes-scene-at-capitol-riot/2439998/">
 									Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot											</a>,
 <a class="story-card__title-link" href="https://www.nbcbayarea.com/news/local/only-1-reported-flu-case-in-santa-clara-county-so-far-this-season/2439972/">
 									Only 1 Reported Flu Case in Santa Clara County So Far This Season											</a>,
 <a class="story-card__title-link" href="https://www.nbcbayarea.com/news/california/california-sees-two-day-record-of-coronavirus-deaths/2439173/">
 									California Virus Deaths Rocket Higher as ICU Space Tightens											</a>,
 <a class="story-card__title-link" href="https://www.nbcbayarea.com/news/local/lemur-theft-suspect-ordered-to-stay-away-from-san-francisco-zoo/2437850/">
 									Lemur Theft Suspect Ordered to Stay Away From San Francisco Zoo											</a>,
 <a class="story-card__tit

In [6]:
# Turn every matched anchor into a record of the form
# {"article_name": <headline text, whitespace-trimmed>, "link": <href value>}.
all_articls = [
    {"article_name": anchor.text.strip(), "link": anchor.attrs['href']}
    for anchor in art_content
]

all_articls

[{'article_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
  'link': 'https://www.nbcbayarea.com/news/local/bay-area-attorney-who-backs-trump-describes-scene-at-capitol-riot/2439998/'},
 {'article_name': 'Only 1 Reported Flu Case in Santa Clara County So Far This Season',
  'link': 'https://www.nbcbayarea.com/news/local/only-1-reported-flu-case-in-santa-clara-county-so-far-this-season/2439972/'},
 {'article_name': 'California Virus Deaths Rocket Higher as ICU Space Tightens',
  'link': 'https://www.nbcbayarea.com/news/california/california-sees-two-day-record-of-coronavirus-deaths/2439173/'},
 {'article_name': 'Lemur Theft Suspect Ordered to Stay Away From San Francisco Zoo',
  'link': 'https://www.nbcbayarea.com/news/local/lemur-theft-suspect-ordered-to-stay-away-from-san-francisco-zoo/2437850/'},
 {'article_name': 'Dog Stolen, Owner Attacked in San Francisco’s Russian Hill Neighborhood',
  'link': 'https://www.nbcbayarea.com/news/local/san-francisco/dog-st

In [7]:
# Keep only the first few listed stories for the per-article downloads.
# TOP_STORY_COUNT is the single knob to change when more articles are wanted.
TOP_STORY_COUNT = 3

# The slice is shallow: the dicts in top_stories are the same objects that
# all_articls holds.
top_stories = all_articls[:TOP_STORY_COUNT]

In [8]:
# Download each selected article and attach its paragraph text to the story
# dict under the "content" key (one cleaned string per <p> element).
for story in top_stories:
    # Use names local to this loop: rebinding `response`/`soup` here would
    # replace the listing-page objects and break a re-run of the earlier cells.
    story_response = requests.get(story['link'], timeout=30)
    # Fail loudly on an HTTP error rather than tokenising an error page.
    story_response.raise_for_status()
    story_soup = BeautifulSoup(story_response.text, "html.parser")

    # Each character outside a-z/A-Z (digits, punctuation, accented letters)
    # is replaced by one space, so "didn't" becomes "didn t". Paragraphs that
    # end up empty are kept, so the list has one entry per <p> element.
    story["content"] = [
        re.sub("[^a-zA-Z]", " ", paragraph.text)
        for paragraph in story_soup.find_all('p')
    ]

top_stories[0]

{'article_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
 'link': 'https://www.nbcbayarea.com/news/local/bay-area-attorney-who-backs-trump-describes-scene-at-capitol-riot/2439998/',
 'content': ['A Bay Area attorney who supports President Donald Trump was among those at the U S  Capitol Wednesday when a mob stormed the building ',
  'Maria Rutenberg said she didn t go inside and insisted she  and most of those in attendance  did nothing wrong  ',
  'Before the chaos  Rutenberg described a peaceful scene in the nation s capital ',
  ' There was a whole bunch of kids  families  people singing anthem  people shaking hands  praying  all kinds of peaceful activities   she said   Nobody was drunk  Nobody had any weapons  Everybody was peaceful  normal  ',
  '',
  'Rutenberg said after spending a month helping investigate allegations of voter fraud in Pennsylvania  she felt she had to travel to Washington  D C ',
  ' It was a decisive day for us   she said   We al

In [21]:
# Flatten the stories into one record per word occurrence:
# {"art_name": <article title>, "word": <token>}.
# Tokens come from splitting each cleaned paragraph on whitespace; repeated
# words are kept as separate records (no counting happens here).
final_result = [
    {"art_name": story["article_name"], "word": token}
    for story in top_stories
    for paragraph in story['content']
    for token in paragraph.split()
]

final_result

[{'art_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
  'word': 'A'},
 {'art_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
  'word': 'Bay'},
 {'art_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
  'word': 'Area'},
 {'art_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
  'word': 'attorney'},
 {'art_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
  'word': 'who'},
 {'art_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
  'word': 'supports'},
 {'art_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
  'word': 'President'},
 {'art_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
  'word': 'Donald'},
 {'art_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Capitol Riot',
  'word': 'Trump'},
 {'art_name': 'Bay Area Attorney Who Backs Trump Describes Scene at Cap

In [22]:
# Load the word records into a DataFrame for further manipulation.
# Naming the columns explicitly keeps the schema stable even when no words
# were collected (an empty list would otherwise give a column-less frame).
word_study_df = pd.DataFrame(final_result, columns=["art_name", "word"])
word_study_df

Unnamed: 0,art_name,word
0,Bay Area Attorney Who Backs Trump Describes Sc...,A
1,Bay Area Attorney Who Backs Trump Describes Sc...,Bay
2,Bay Area Attorney Who Backs Trump Describes Sc...,Area
3,Bay Area Attorney Who Backs Trump Describes Sc...,attorney
4,Bay Area Attorney Who Backs Trump Describes Sc...,who
...,...,...
1563,California Virus Deaths Rocket Higher as ICU S...,Francisco
1564,California Virus Deaths Rocket Higher as ICU S...,contributed
1565,California Virus Deaths Rocket Higher as ICU S...,to
1566,California Virus Deaths Rocket Higher as ICU S...,this


In [None]:
# Popular filler words (articles, conjunctions, pronouns) to clean out of the DF.
# NOTE(review): most entries are lower-case while the scraped words keep their
# original capitalisation (e.g. 'A'), so matching presumably needs case
# folding -- confirm when the filter is written.
word_for_remove = [
    'a', 'an', 'the',
    'and', 'for',
    'I', 'we', 'you', 'he', 'she', 'it', 'they',
]

