In [24]:
from bs4 import BeautifulSoup
import requests 
import re
import pandas as pd
from sqlalchemy import create_engine


In [2]:
from config import postgrepass

### About the project

Being an ESL Student (English as Second Language) I think it is important to learn meaning of new words before you read article or book. It is really boring to read anythink if you need to translate every 3rd word in a sentence. There are a lot of apps around that show word translation by clicking on the word, but they show only one-two meaning and not always the right one. Like in any other languages, one english word can have multiple meaning depending of context. In this project I take article names and their content and provide the list of unique words that ESL student can learn before read news.

#### Resources

<ul>
    <li>News:  https://www.nbcbayarea.com/news/local/</li>
    <li>Word Category: https://www.englishclub.com/</li>
</ul>



### Extract Article names and links on them

In [3]:
# set url to a target page on nbcbayarea.com
url = "https://www.nbcbayarea.com/news/local/"

In [4]:
response = requests.get(url)

In [5]:
soup = BeautifulSoup(response.text, 'html.parser')

In [6]:
# Find articles 
art_content = soup.find_all('a', class_="story-card__title-link")

art_content

[<a class="story-card__title-link" href="https://www.nbcbayarea.com/news/local/east-bay/da-says-no-charges-will-be-filed-against-former-bart-officer-in-oscar-grants-death/2442864/">
 									DA Says No Charges Will Be Filed Against Former BART Officer in Oscar Grant's Death											</a>,
 <a class="story-card__title-link" href="https://www.nbcbayarea.com/news/local/east-bay/as-public-health-officials-in-talks-to-open-drive-thru-covid-19-vaccination-site-at-coliseum/2442858/">
 									A's, Public Health Officials in Talks to Open Drive-Thru COVID-19 Vaccination Site at Coliseum											</a>,
 <a class="story-card__title-link" href="https://www.nbcbayarea.com/news/local/push-focuses-on-speeding-up-covid-19-vaccinations/2442855/">
 									Push Focuses on Speeding Up COVID-19 Vaccinations											</a>,
 <a class="story-card__title-link" href="https://www.nbcbayarea.com/news/local/we-are-concerned-covid-19-cases-overwhelm-sf-general-hospitals-icu-staff/2442805/">
 									‘We Ar

In [7]:
# Create a list for dictionaries {"article_name": '', "link": ''}.  
all_articls = []

for ind in range(len(art_content)):
    title = art_content[ind].text.strip()
    link = art_content[ind].attrs['href']
    all_articls.append({"article_name": title, "link": link})
    
all_articls

[{'article_name': "DA Says No Charges Will Be Filed Against Former BART Officer in Oscar Grant's Death",
  'link': 'https://www.nbcbayarea.com/news/local/east-bay/da-says-no-charges-will-be-filed-against-former-bart-officer-in-oscar-grants-death/2442864/'},
 {'article_name': "A's, Public Health Officials in Talks to Open Drive-Thru COVID-19 Vaccination Site at Coliseum",
  'link': 'https://www.nbcbayarea.com/news/local/east-bay/as-public-health-officials-in-talks-to-open-drive-thru-covid-19-vaccination-site-at-coliseum/2442858/'},
 {'article_name': 'Push Focuses on Speeding Up COVID-19 Vaccinations',
  'link': 'https://www.nbcbayarea.com/news/local/push-focuses-on-speeding-up-covid-19-vaccinations/2442855/'},
 {'article_name': "‘We Are Concerned': COVID-19 Cases Overwhelm SF General Hospital's ICU, Staff",
  'link': 'https://www.nbcbayarea.com/news/local/we-are-concerned-covid-19-cases-overwhelm-sf-general-hospitals-icu-staff/2442805/'},
 {'article_name': '1 Arrested After High-Speed H

### Extract content for first 3 articles and upload it to postrgesql

In [8]:
# Take only first 3 stories

top_stories = all_articls[:3]
top_stories

[{'article_name': "DA Says No Charges Will Be Filed Against Former BART Officer in Oscar Grant's Death",
  'link': 'https://www.nbcbayarea.com/news/local/east-bay/da-says-no-charges-will-be-filed-against-former-bart-officer-in-oscar-grants-death/2442864/'},
 {'article_name': "A's, Public Health Officials in Talks to Open Drive-Thru COVID-19 Vaccination Site at Coliseum",
  'link': 'https://www.nbcbayarea.com/news/local/east-bay/as-public-health-officials-in-talks-to-open-drive-thru-covid-19-vaccination-site-at-coliseum/2442858/'},
 {'article_name': 'Push Focuses on Speeding Up COVID-19 Vaccinations',
  'link': 'https://www.nbcbayarea.com/news/local/push-focuses-on-speeding-up-covid-19-vaccinations/2442855/'}]

In [9]:
# extract content from each article, remove non letters sylabals and add content to dictionary

for story in top_stories:
    response = requests.get(story['link'])
    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.find_all('p')
    
    story_content = []
    
    for ind in range(len(content)):
        temp = content[ind].text
        temp = re.sub("[^a-zA-Z]", " ", str(temp))
        story_content.append(temp)

    story.update({"content": story_content})
    
top_stories_pd = pd.DataFrame(top_stories)


### Upload first 3 stories to sql

In [10]:
# Create connection to DB
db_path = f'postgresql://postgres:{postgrepass}@localhost:5432/new_word_study'
engine = create_engine(db_path)
conn = engine.connect()

In [11]:
# Upload DF to postrge
top_stories_pd.to_sql('stories', conn, if_exists='append')

In [12]:
# extract words from content 

words_in_story = []

for i in range(3):
    query = f'SELECT * FROM stories WHERE index={i}'
    story = pd.read_sql(query, conn)
    for ind in range(len(story['content'])):
        words = story['content'][ind].split()
        
        for word in words:
            words_in_story.append({'story_index': i, 'word': re.sub("[^a-zA-Z]", " ",word)})
        
words_in_story 
        


[{'story_index': 0, 'word': '  Alameda'},
 {'story_index': 0, 'word': 'County'},
 {'story_index': 0, 'word': 'District'},
 {'story_index': 0, 'word': 'Attorney'},
 {'story_index': 0, 'word': 'Nancy'},
 {'story_index': 0, 'word': 'O'},
 {'story_index': 0, 'word': 'Malley'},
 {'story_index': 0, 'word': 'announced'},
 {'story_index': 0, 'word': 'Monday'},
 {'story_index': 0, 'word': 'that'},
 {'story_index': 0, 'word': 'her'},
 {'story_index': 0, 'word': 'office'},
 {'story_index': 0, 'word': 'will'},
 {'story_index': 0, 'word': 'not'},
 {'story_index': 0, 'word': 'file'},
 {'story_index': 0, 'word': 'any'},
 {'story_index': 0, 'word': 'charges'},
 {'story_index': 0, 'word': 'against'},
 {'story_index': 0, 'word': 'former'},
 {'story_index': 0, 'word': 'BART'},
 {'story_index': 0, 'word': 'Police'},
 {'story_index': 0, 'word': 'Officer'},
 {'story_index': 0, 'word': 'Anthony'},
 {'story_index': 0, 'word': 'Pirone'},
 {'story_index': 0, 'word': 'for'},
 {'story_index': 0, 'word': 'the'},
 

In [14]:
# create DF from lists of dictionary with words from article
words_in_story_df = pd.DataFrame(words_in_story)


### Calculate unique words per article

In [15]:
# Count unique words per article

for i in range(3):
    story = words_in_story_df[words_in_story_df['story_index'] == i]
    story_df = pd.DataFrame(story['word'].value_counts())
    story_df['art_index'] = i
    story_df.reset_index(inplace=True)
    story_df.rename(columns={'index': 'word', 'word': 'appearence'}, inplace=True)

### Get English prepositions, pronouns from web 

In [16]:
# get preposition from web

url='https://www.englishclub.com/grammar/prepositions-list.htm'
prep_response = requests.get(url)

In [17]:
# parce responce with BeautifulSoap

prep_soup = BeautifulSoup(prep_response.text, 'html.parser')

In [18]:
prepositions_draft = prep_soup.find_all('li')
prepositions = []

for preposition in prepositions_draft:
    prepositions.append(preposition.text)

prepositions_df = pd.DataFrame(prepositions, columns=['preposition'])
prepositions_df.to_sql('prepositions', conn, if_exists='append')


In [19]:
# get pronoun from web
pronoun_url = 'https://www.englishclub.com/vocabulary/common-pronouns-25.htm'
pronoun_response = requests.get(url)

In [20]:
pronoun_soup = BeautifulSoup(pronoun_response.text, 'html.parser')


In [21]:
pronouns_draft = prep_soup.find('main').find_all('li')
pronouns = []

for pronoun in pronouns_draft:
    pronouns.append(pronoun.text)

pronouns_df = pd.DataFrame(pronouns, columns=['pronoun'])
pronouns_df.to_sql( 'pronouns', conn, if_exists='append')

In [22]:
# Category for words in  DF 

articles = ['a', 'an', 'the']
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
months = ["January", 'February', "March", "April", "May"
          , "June", "July", "August", "September", "October", "November", "December"]


### Add category for  words

In [23]:
story_df['category'] = ''

for ind in range(len(story_df['word'])):
    if story_df['word'][ind] in articles:
        story_df.loc[ind, 'category'] = 'article'
        
    
    if story_df['word'][ind] in weekdays:
        story_df.loc[ind, 'category'] = 'weekday'
        
        
    if story_df['word'][ind] in prepositions:
        story_df.loc[ind, 'category'] = 'preposition'
        
    if story_df['word'][ind] in pronouns:
        story_df.loc[ind, 'category'] = 'pronun'
        

story_df.head(30)
    

Unnamed: 0,word,appearence,art_index,category
0,the,12,2,article
1,to,11,2,pronun
2,in,6,2,pronun
3,said,6,2,
4,and,6,2,
5,Costa,5,2,
6,County,4,2,
7,is,4,2,
8,up,4,2,pronun
9,,4,2,


### Upload result to postgres

In [28]:
story_df.to_sql('story_vocabulary', conn, if_exists='append')

In [35]:
words_for_translation = pd.read_sql("SELECT word FROM story_vocabulary \
                                    WHERE category NOT IN ('pronun', 'preposition', 'article')", conn)

words_for_translation.head(30)

Unnamed: 0,word
0,said
1,and
2,Costa
3,County
4,is
5,
6,have
7,re
8,We
9,Contra
