In [15]:
from bs4 import BeautifulSoup
import requests 
import re
import pandas as pd
from sqlalchemy import create_engine

In [None]:
from config import postrepass

In [2]:
# Target page: the local-news index on nbcbayarea.com.
url = (
    "https://www.nbcbayarea.com"
    "/news/local/"
)

In [3]:
# Fetch the listing page. requests applies no timeout by default, so without one a
# stalled connection would hang the kernel; raise_for_status() stops the notebook on an
# HTTP error instead of letting the following cells parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [4]:
# Build a parse tree from the downloaded HTML using Python's built-in parser.
soup = BeautifulSoup(markup=response.text, features="html.parser")

In [5]:
# Collect the headline anchors: every <a> tag carrying the story-card title class.
art_content = soup.find_all(name="a", attrs={"class": "story-card__title-link"})

art_content

[<a class="story-card__title-link" href="https://www.nbcbayarea.com/news/local/drive-thru-covid-19-vaccination-clinic-opens-in-san-mateo-county/2442702/">
 									Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County											</a>,
 <a class="story-card__title-link" href="https://www.nbcbayarea.com/news/california/12-children-rescued-from-chilly-waters-after-boat-capsize/2442645/">
 									12 Children Rescued From Chilly Waters After Boats Capsize in Santa Cruz											</a>,
 <a class="story-card__title-link" href="https://www.nbcbayarea.com/news/sports/nbcsports/jerry-rice-thinks-49ers-must-go-for-deshaun-watson-trade-if-qb-wants-out/2442476/">
 									Jerry Rice Thinks 49ers Must ‘Go For' Deshaun Watson Trade If QB Wants Out											</a>,
 <a class="story-card__title-link" href="https://www.nbcbayarea.com/news/local/1-arrested-after-high-speed-hit-and-run-crash-in-san-francisco-police/2442463/">
 									1 Arrested After High-Speed Hit-and-Run Crash in San Fran

In [6]:
# Turn each headline anchor into a record of the form {"article_name": ..., "link": ...}.
all_articls = [
    {"article_name": anchor.text.strip(), "link": anchor.attrs['href']}
    for anchor in art_content
]

all_articls

[{'article_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
  'link': 'https://www.nbcbayarea.com/news/local/drive-thru-covid-19-vaccination-clinic-opens-in-san-mateo-county/2442702/'},
 {'article_name': '12 Children Rescued From Chilly Waters After Boats Capsize in Santa Cruz',
  'link': 'https://www.nbcbayarea.com/news/california/12-children-rescued-from-chilly-waters-after-boat-capsize/2442645/'},
 {'article_name': "Jerry Rice Thinks 49ers Must ‘Go For' Deshaun Watson Trade If QB Wants Out",
  'link': 'https://www.nbcbayarea.com/news/sports/nbcsports/jerry-rice-thinks-49ers-must-go-for-deshaun-watson-trade-if-qb-wants-out/2442476/'},
 {'article_name': '1 Arrested After High-Speed Hit-and-Run Crash in San Francisco: Police',
  'link': 'https://www.nbcbayarea.com/news/local/1-arrested-after-high-speed-hit-and-run-crash-in-san-francisco-police/2442463/'},
 {'article_name': "SFPD Braced for Protest at Twitter HQ, But it Didn't Materialize",
  'link': 'https://w

In [7]:
# Number of stories, counted from the top of the listing, that are analysed below.
TOP_STORY_COUNT = 3

# The slice is a new list, but its items are the same dict objects held by all_articls.
top_stories = all_articls[:TOP_STORY_COUNT]

In [8]:
# Download each selected story, take the text of every <p> tag, replace every character
# that is not an ASCII letter (digits and punctuation included) with a space, and store
# the cleaned paragraphs on the story dict under "content".

for story in top_stories:
    # Dedicated names keep the listing page's `response` and `soup` intact, so the
    # earlier cells still give the same result if they are re-run after this one.
    story_response = requests.get(story['link'], timeout=30)
    # Stop on an HTTP error rather than counting the words of an error page.
    story_response.raise_for_status()
    story_soup = BeautifulSoup(story_response.text, "html.parser")

    # Paragraphs without any letters stay in the list as blank strings.
    story["content"] = [
        re.sub("[^a-zA-Z]", " ", paragraph.text)
        for paragraph in story_soup.find_all('p')
    ]

top_stories[0]

{'article_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
 'link': 'https://www.nbcbayarea.com/news/local/drive-thru-covid-19-vaccination-clinic-opens-in-san-mateo-county/2442702/',
 'content': ['Starting Monday  San Mateo County is offering a drive thru COVID    vaccination clinic for eligible health care workers and vulnerable residents ',
  'The clinic    which uses the Moderna vaccine    begins at   p m  at the San Mateo County Event Center at      Saratoga Drive in San Mateo ',
  'Vaccinations will be available to workers and residents eligible under Phase  A of the state s vaccine distribution guidelines  That phase includes the following groups  those at risk of COVID    through work at health care or long term care settings  including non clinical staff   and residents of nursing facilities  assisted living facilities and long term care settings for older or vulnerable individuals ',
  '',
  'San Mateo County s Board of Supervisors president David Can

In [9]:
# Flatten the stories into one record per word occurrence: {"art_name": ..., "word": ...}.
# Words are the whitespace-separated tokens of each cleaned paragraph; repeats are kept.

final_result = [
    {"art_name": story["article_name"], "word": token}
    for story in top_stories
    for paragraph in story['content']
    for token in paragraph.split()
]

final_result

[{'art_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
  'word': 'Starting'},
 {'art_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
  'word': 'Monday'},
 {'art_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
  'word': 'San'},
 {'art_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
  'word': 'Mateo'},
 {'art_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
  'word': 'County'},
 {'art_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
  'word': 'is'},
 {'art_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
  'word': 'offering'},
 {'art_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
  'word': 'a'},
 {'art_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
  'word': 'drive'},
 {'art_name': 'Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County',
 

In [10]:
# Load the word records into a DataFrame for the analysis below.
# Naming the columns explicitly keeps "art_name" and "word" present even when
# final_result is empty, so later column lookups do not fail with a KeyError.

word_study_df = pd.DataFrame(final_result, columns=["art_name", "word"])
word_study_df

Unnamed: 0,art_name,word
0,Drive-Thru COVID-19 Vaccination Clinic Opens i...,Starting
1,Drive-Thru COVID-19 Vaccination Clinic Opens i...,Monday
2,Drive-Thru COVID-19 Vaccination Clinic Opens i...,San
3,Drive-Thru COVID-19 Vaccination Clinic Opens i...,Mateo
4,Drive-Thru COVID-19 Vaccination Clinic Opens i...,County
...,...,...
1170,Jerry Rice Thinks 49ers Must ‘Go For' Deshaun ...,to
1171,Jerry Rice Thinks 49ers Must ‘Go For' Deshaun ...,the
1172,Jerry Rice Thinks 49ers Must ‘Go For' Deshaun ...,ers
1173,Jerry Rice Thinks 49ers Must ‘Go For' Deshaun ...,Talk


In [11]:
# Summary statistics: total words per article, then the number of distinct values
# in each column, separated by a dashed rule.
print(
    word_study_df.groupby(['art_name'])['word'].count(),
    '-'*10,
    word_study_df.nunique(axis=0),
    sep='\n',
)

# How often each distinct word occurs across all articles, most frequent first.
word_counter = word_study_df['word'].value_counts()
print(word_counter)


art_name
12 Children Rescued From Chilly Waters After Boats Capsize in Santa Cruz      158
Drive-Thru COVID-19 Vaccination Clinic Opens in San Mateo County              327
Jerry Rice Thinks 49ers Must ‘Go For' Deshaun Watson Trade If QB Wants Out    690
Name: word, dtype: int64
----------
art_name      3
word        488
dtype: int64
the         60
to          47
a           27
is          24
of          24
            ..
chances      1
staffing     1
cause        1
poor         1
Football     1
Name: word, Length: 488, dtype: int64


In [12]:
# Count how often each distinct word occurs, most frequent first.
word_appearence = word_study_df['word'].value_counts()

# Build a two-column frame: "word" and "appearence".
# The columns are named explicitly rather than by renaming reset_index() defaults:
# pandas 2.0+ labels the value_counts() result "count" and its index "word", so a
# rename of 'index'/'word' would put the words under "appearence" and leave no
# "word" column for the cells that follow.
word_df = word_appearence.rename_axis('word').reset_index(name='appearence')
word_df

Unnamed: 0,word,appearence
0,the,60
1,to,47
2,a,27
3,is,24
4,of,24
...,...,...
483,chances,1
484,staffing,1
485,cause,1
486,poor,1


In [13]:
# Lookup lists used to label words by category.

# Words labelled 'article' when categories are assigned.
articles = [
    'a', 'an', 'the',
    'and', 'for',
    'I', 'we', 'you', 'he', 'she', 'it', 'they',
]
# Day names, capitalised.
weekdays = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
# Note that 'for' also appears in `articles`.
preposition = [
    'for', 'aboard', 'about', 'above', 'as',
    'around', 'before', 'behind', 'below', 'along',
]


In [14]:
# Label each word with a category; words that appear in none of the lists get ''.
# Matching is exact and case-sensitive. A word found in several lists takes the label
# applied last (article, then weekday, then preposition), so 'for' ends up 'preposition'.
category_by_word = {
    **dict.fromkeys(articles, 'article'),
    **dict.fromkeys(weekdays, 'weekday'),
    **dict.fromkeys(preposition, 'preposition'),
}

# One vectorised lookup replaces the per-row loop; unlike looking rows up by their
# position number, it does not depend on word_df having a 0..n-1 index.
word_df['category'] = word_df['word'].map(category_by_word).fillna('')

word_df
    

Unnamed: 0,word,appearence,category
0,the,60,article
1,to,47,
2,a,27,article
3,is,24,
4,of,24,
...,...,...,...
483,chances,1,
484,staffing,1,
485,cause,1,
486,poor,1,
