In [1]:
# Web Scraping Lib
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import lxml
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# Store Object lib
import pickle

# NLP Lib
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# General Lib
import re
from itertools import chain
import pandas as pd
import plotly.express as px

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Required Downloads
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [4]:
# String Matching Algorithm : Tries
class Trie:
  def __init__(self):
    self.head = {}

  def insert(self, word):
    cur = self  
    for char in word:
      if char not in cur.head:
        cur.head[char] = Trie() 
      cur = cur.head[char]       
    cur.head['*'] = True

  def search(self, pattern):
    cur = self
    for char in pattern:
      if char not in cur.head: 
        return False
      cur = cur.head[char]   
    if '*' in cur.head:
      return True

  def insertAll(self, words):
    for i in words:
      self.insert(i)

In [5]:
# Step 1: General Web Scraping
def scrapText(url, header_loc, taglist, filename):
  req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
  webpage = urlopen(req).read()

  soup = BeautifulSoup(webpage, 'lxml')
  text = ''

  headers = soup.find(header_loc[0], class_ = header_loc[1])
  headers = headers.find_all(re.compile('^h[1-6]$'))
  for header in headers:
    text += header.text + '\n'

  for i in range(len(taglist)):
    divisions = soup.find_all(taglist[i][0], class_ = taglist[i][1])
    for division in divisions:
      paras = division.find_all('p')
      for para in paras:
        para = para.text
        text += para + '\n'

  # Save in files
  # Mount to your own drive
  # Change dir when using
  with open('/content/drive/MyDrive/Colab Notebooks/' + filename + '.txt', "w+") as f:
    f.write(text)

In [6]:
# Step 2: Lemmatizing sentences
def nltk_pos_tagger(nltk_tag):
  if nltk_tag.startswith('J'):
    return wordnet.ADJ
  elif nltk_tag.startswith('V'):
    return wordnet.VERB
  elif nltk_tag.startswith('N'):
    return wordnet.NOUN
  elif nltk_tag.startswith('R'):
    return wordnet.ADV
  else:          
    return None

def lemmatize(text):
  lemmatizer = WordNetLemmatizer()
  nltk_tagged = nltk.pos_tag(nltk.word_tokenize(text.lower()))
  wordnet_tagged = map(lambda x: (x[0], nltk_pos_tagger(x[1])), nltk_tagged)

  lemmed_sent = []
  for word,tag in wordnet_tagged:
    if tag is None:
      lemmed_sent.append(word)
    else:
      lemmed_sent.append(lemmatizer.lemmatize(word, tag))
  return lemmed_sent

In [7]:
# Step 3: Get Original Word Count, Distinct Word Count  
# and Ordered list of words based on its frequency
def wordCountFreq(text):
  ori_word_count = len(text.split())
  distinct_word_count = len(set(text.split()))

  wordlist = lemmatize(text)
  wordfreq = [wordlist.count(word) for word in wordlist]
  wordDict = dict(zip(wordlist, wordfreq))
  sort_orders = sorted(wordDict.items(), key = lambda x: x[1], reverse = True)
  return ori_word_count, distinct_word_count, sort_orders

In [8]:
# Scrap stop words, positive words and negative words 
# Execute once
def getStopWords():
  url = 'https://bit.ly/38uiVQH'
  stop_words = requests.get(url).text.split()
  saveObj(stop_words, "Stop Words")
  return stop_words

def getPosWords():
  url = 'https://bit.ly/3l73BfM'
  html_text = requests.get(url).text

  soup = BeautifulSoup(html_text, 'lxml')
  division = soup.find("div", class_ = 'entry-content')
  paras = list(division.find_all('p'))[3:-2]

  pos_words = [para.text.lower().replace('\xa0', '').split(', ') for para in paras]
  pos_words = list(chain.from_iterable(pos_words))
  pos_words = [word.strip() for word in pos_words]
  return pos_words

def getNegWords():
  url = 'https://bit.ly/3FKNvBM'
  html_text = requests.get(url).text

  soup = BeautifulSoup(html_text, 'lxml')
  division = soup.find("div", class_ = 'entry-content') 
  paras = list(division.find_all('p'))[1:-1]

  neg_words = [para.text.lower().replace('\xa0', '').split(', ') for para in paras]
  neg_words = list(chain.from_iterable(neg_words))
  neg_words = [word.strip() for word in neg_words]
  return neg_words
  

In [9]:
# Step 4: Filter Stop words
def filterStopWords(lst, stop_words_list):
  filtered = [i for i in lst if i[0] not in (stop_words_list and nltk.corpus.stopwords.words('english'))]
  stop_word_num = len(lst) - len(filtered)
  return stop_word_num, filtered

In [10]:
def saveObj(obj, filename):
  with open('/content/drive/MyDrive/Colab Notebooks/' + filename, 'wb') as f:
    pickle.dump(obj, f)

In [11]:
# Save positive and negative words in tries object
def savePosNegWords(pos_words_list, neg_words_list):
  pos_trie, neg_trie = Trie(), Trie()
  words_in_common = set(pos_words_list) & set(neg_words_list)
  pos_words_list = [item for item in pos_words_list if item not in words_in_common]
  neg_words_list = [item for item in neg_words_list if item not in words_in_common]

  for i in range(len(pos_words_list)):
    pos_trie.insert(pos_words_list[i])
  for i in range(len(neg_words_list)):
    neg_trie.insert(neg_words_list[i])

  saveObj(pos_trie, "Positive Word Tries")
  saveObj(neg_trie, "Negative Word Tries")
  

In [12]:
# Step 5: Find all positive and negative words in text
def findPosNegWords(lst):
  pos_words_found, neg_words_found = [], []

  with open('/content/drive/MyDrive/Colab Notebooks/Positive Word Tries', 'rb') as pos_file:
    pos_trie = pickle.load(pos_file)

    for i in range(len(lst)):
      if pos_trie.search(lst[i][0]):
        pos_words_found.append(lst[i][0])

  with open('/content/drive/MyDrive/Colab Notebooks/Negative Word Tries', 'rb') as neg_file:
    neg_trie = pickle.load(neg_file)

    for i in range(len(lst)):
      if neg_trie.search(lst[i][0]):
        neg_words_found.append(lst[i][0])
  #print(pos_words_found)
  #print(neg_words_found)
  return pos_words_found, neg_words_found

In [13]:
# Execute once
#scrapText('https://bit.ly/3NexlDz', ['span', 'hed-heading'], [['div', 'content-gated']], 'AR1')
#scrapText('https://bit.ly/3wzao8q', ['article', 'article'], [['article', 'article']], 'AR2')
#scrapText('https://on.cfr.org/3wvlziu', ['div', 'layout-content'], [['div', 'layout-content']], 'AR3')
#scrapText('https://bit.ly/3MD3gxw', ['header', 'article-header'], [['header', 'article-header'], ['div', 'articleLeft']], 'AR4')
#scrapText('https://bit.ly/3NsEJLE', ['div', 'main-column-region'], [['div', 'field-item even']], 'AR5')

#scrapText('https://tgam.ca/3FBXg5n', ['div', 'l-article-title'], [['div', 'l-article-title'], ['article', 'l-article']], 'CA1')
#scrapText('https://bit.ly/3wuToQF', ['div', 'detailMainCol sclt-storycontent'], [['div', 'detailMainCol sclt-storycontent']], 'CA2')
#scrapText('https://bit.ly/3PBHNGS', ['div', 'publication-details-header'], [['div', 'container screen-max-width']], 'CA3')
#scrapText('https://bit.ly/3wIjLTz', ['div', 'c-title'], [['div', 'c-text']], 'CA4')
#scrapText('https://bit.ly/3NiCElE', ['article', 'article'], [['div', 'article-content-body']], 'CA5')

#scrapText('https://bit.ly/3yuB6Az', ['div', 'left_content_article left_content'], [['div', 'left_content_article left_content']], 'CN1')
#scrapText('https://bit.ly/3NoEYXW', ['div', 'l-col l-col--8'], [['div', 'l-col l-col--8']], 'CN2')
#scrapText('https://bit.ly/3N8xOH5', [None, None], [['div', 'clearfix con_main']], 'CN3') 
#scrapText('https://bit.ly/3sTtt2T', ['div', 'news-hd'], [['div', 'news-cut']], 'CN4')
#scrapText('https://bit.ly/3lxy3jj', ['header', 'article-head-wrapper'], [['article', 'container-fluid article']], 'CN5')

#scrapText('https://bit.ly/39OWUMR', ['div', 'featured-heading'], [['div', 'post-content']], 'US1')
#scrapText('https://bit.ly/3wlYc9U', ['div', 'basic-content-wrap cf'], [['div', 'basic-content-wrap cf']], 'US2')
#scrapText('https://brook.gs/3G7erMz', ['div', 'headline-wrapper'], [['div', 'post-body post-body-enhanced']], 'US3')
#scrapText('https://bit.ly/3PsNHu9', ['div', 'col-md-8'], [['div', 'entry-main-content']], 'US4')
#scrapText('https://bit.ly/39FCS7w', ['div', 'hero-1 hero-1__news'], [['div', 'ExternalClass53FC3247563C431F8E5F213BAAB1A668']], 'US5')

#scrapText('https://bit.ly/39OJLmZ', ['section', None], [['div', 'topic-content pt-sm-15']], 'KR1')
#scrapText('https://bit.ly/3LBgRUL', ['header', 'article-header'], [['header', 'article-header'], ['div', 'wysiwyg wysiwyg--all-content css-1ck9wyi']], 'KR2')
#scrapText('https://herit.ag/3wFpY1c', ['div', 'content-container clearfix'], [['div', 'content-container clearfix']], 'KR3')
#scrapText('https://reut.rs/3wK5QLv', ['div', 'article-header__heading__15OpQ'], [['div', 'article-body__container__3ypuX article-body__over-6-para__1Ov64']], 'KR4')
#scrapText('https://bit.ly/39GUgIU', ['div', 'Content-outer'], [['div', 'Content-outer']], 'KR5')

#scrapText('https://bit.ly/3wATftQ', ['div', 'container page ufonts'], [['div', 'entry mar-b-10 mar-t-10 pad-t-10 clearfix']], 'JP1')
#scrapText('https://bit.ly/3sP89vj', ['article', 'clearfix post-280 encyclopedia type-encyclopedia status-publish hentry'], [['article', 'clearfix post-280 encyclopedia type-encyclopedia status-publish hentry']], 'JP2')
#scrapText('https://bit.ly/3sP8NZY', ['section', 'cardHolder open ga-tracking'], [['section', 'cardHolder open ga-tracking']], 'JP3')
#scrapText('https://bit.ly/3wBkWUA', [None, None], [['div', 'article__content']], 'JP4')
#scrapText('https://bit.ly/3MBnWG0', ['div', 'top_wrapper'], [['section', 'composer_content']], 'JP5')

#stop_words_list = getStopWords()
#pos_words_list = getPosWords()
#neg_words_list = getNegWords()
#savePosNegWords(pos_words_list, neg_words_list)

In [14]:
# Main Method
lst = ['AR', 'CA', 'CN', 'JP', 'US']
grade = ['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive']
dataframe = pd.DataFrame(columns = ['Country', 'Avg PWS (%)', 'Overall Sentiment'])

with open('/content/drive/MyDrive/Colab Notebooks/Stop Words', 'rb') as f:
  stop_words_list = pickle.load(f)

for i in range(len(lst)):
  sum_percentage = 0
  df = pd.DataFrame(columns = ['Article', 'Original Words Count', 
                               'Distinct Words Count', 'Stop Words Count', 
                               'Positive Words Count', 'Negative Words Count',
                               'Positive Words Percentage (%)', 'Overall Sentiment'])
  for j in range(1, 6):
    with open('/content/drive/MyDrive/Colab Notebooks/' + lst[i] + str(j) + '.txt', "r") as f:
      text = f.read()

      name = str(lst[i]) + str(j)
      ori_word_count, distinct_word_count, freqlist = wordCountFreq(text)
      stop_word_num, filtered = filterStopWords(freqlist, stop_words_list)
      pos_words_found, neg_words_found = findPosNegWords(filtered)
      pos_words_found_num = len(pos_words_found)
      neg_words_found_num = len(neg_words_found)
      pos_word_percentage = float(pos_words_found_num) * 100 / (pos_words_found_num + neg_words_found_num)
      pos_word_percentage = round(pos_word_percentage, 2)
      sum_percentage += pos_word_percentage
      sentiment = grade[int(pos_word_percentage // 20)]
      
    df = df.append({'Article' : name, 
                    'Original Words Count' : ori_word_count,
                    'Distinct Words Count' : distinct_word_count,
                    'Stop Words Count' : stop_word_num,
                    'Positive Words Count' : pos_words_found_num,
                    'Negative Words Count' : neg_words_found_num,
                    'Positive Words Percentage (%)' : pos_word_percentage,
                    'Overall Sentiment' : sentiment}, ignore_index = True) 
    
  display(df)   

  fig = px.bar(x= df['Article'], y=df['Original Words Count'], labels = dict(x='Article', y = 'Count') ,title=f"Bar Chart of Total Word Count" )
  fig.show()
  fig = px.bar(x= df['Article'], y=df['Distinct Words Count'], labels = dict(x='Article', y = 'Count') ,title=f"Bar Chart of Distinct Words Count" )
  fig.show()
  fig = px.bar(x= df['Article'], y=df['Stop Words Count'], labels = dict(x='Article', y = 'Count') ,title=f"Bar Chart of Stop Words Count" )
  fig.show()
  fig = px.bar(x= df['Article'], y=df['Positive Words Count'], labels = dict(x='Article', y = 'Count') ,title=f"Bar Chart of Positive Words Count" )
  fig.show()
  fig = px.bar(x= df['Article'], y=df['Negative Words Count'], labels = dict(x='Article', y = 'Count') ,title=f"Bar Chart of Negative Words Count" )
  fig.show()
  fig = px.bar(x= df['Article'], y=df['Positive Words Percentage (%)'], labels = dict(x='Article', y = 'Count') ,title=f"Bar Chart of Positive Words Percentage" )
  fig.show()
  
  df.to_csv('/content/drive/MyDrive/Colab Notebooks/' + str(lst[i]) + ' Output')

  average = round(sum_percentage / 5, 2)
  overall_sentiment = grade[int(average // 20)]
  dataframe = dataframe.append({'Country' : str(lst[i]),
                                'Avg PWS (%)' : average,
                                'Overall Sentiment' : overall_sentiment}, ignore_index = True)

dataframe.sort_values(by = 'Avg PWS (%)', ascending = False, inplace = True, ignore_index = True)
display(dataframe)

fig = px.bar(x = dataframe['Country'], y = dataframe['Avg PWS (%)'], labels = dict(x='Country', y = 'Avg PWS (%)') ,title = f"Bar Chart of comparing average PWS (%) between countries" )
fig.show()

dataframe.to_csv('/content/drive/MyDrive/Colab Notebooks/Overall Output')

Unnamed: 0,Article,Original Words Count,Distinct Words Count,Stop Words Count,Positive Words Count,Negative Words Count,Positive Words Percentage (%),Overall Sentiment
0,AR1,1854,833,77,54,46,54.0,Neutral
1,AR2,2243,899,80,59,49,54.63,Neutral
2,AR3,2328,1137,70,61,44,58.1,Neutral
3,AR4,3407,1679,92,91,63,59.09,Neutral
4,AR5,2294,1023,80,80,64,55.56,Neutral


Unnamed: 0,Article,Original Words Count,Distinct Words Count,Stop Words Count,Positive Words Count,Negative Words Count,Positive Words Percentage (%),Overall Sentiment
0,CA1,2928,1305,87,92,42,68.66,Positive
1,CA2,815,431,65,34,8,80.95,Very Positive
2,CA3,795,436,50,29,11,72.5,Positive
3,CA4,325,208,43,15,2,88.24,Very Positive
4,CA5,2794,1305,65,79,38,67.52,Positive


Unnamed: 0,Article,Original Words Count,Distinct Words Count,Stop Words Count,Positive Words Count,Negative Words Count,Positive Words Percentage (%),Overall Sentiment
0,CN1,1151,603,56,38,27,58.46,Neutral
1,CN2,436,267,36,8,18,30.77,Negative
2,CN3,4401,1479,69,87,27,76.32,Positive
3,CN4,1369,666,56,47,25,65.28,Positive
4,CN5,888,436,48,38,7,84.44,Very Positive


Unnamed: 0,Article,Original Words Count,Distinct Words Count,Stop Words Count,Positive Words Count,Negative Words Count,Positive Words Percentage (%),Overall Sentiment
0,JP1,985,537,55,50,9,84.75,Very Positive
1,JP2,5240,1999,76,108,60,64.29,Positive
2,JP3,1675,715,66,49,14,77.78,Positive
3,JP4,800,431,52,48,11,81.36,Very Positive
4,JP5,1028,495,55,44,2,95.65,Very Positive


Unnamed: 0,Article,Original Words Count,Distinct Words Count,Stop Words Count,Positive Words Count,Negative Words Count,Positive Words Percentage (%),Overall Sentiment
0,US1,527,314,58,15,4,78.95,Positive
1,US2,1404,395,43,33,12,73.33,Positive
2,US3,927,491,66,35,22,61.4,Positive
3,US4,1338,624,46,40,2,95.24,Very Positive
4,US5,1877,832,69,73,32,69.52,Positive


Unnamed: 0,Country,Avg PWS (%),Overall Sentiment
0,JP,80.77,Very Positive
1,US,75.69,Positive
2,CA,75.57,Positive
3,CN,63.05,Positive
4,AR,56.28,Neutral
