In [0]:
from nltk.corpus import stopwords
import re
import operator
import requests
import json
from tabulate import tabulate
from bs4 import BeautifulSoup

In [0]:
def getWordsFromUrl(url):
  
  word_list = []
  
  webpage = requests.get(url)
  
  plain_text_webpage = webpage.text
    
  bs = BeautifulSoup(plain_text_webpage, 'lxml')
    
  for data in bs.findAll('p'):
    if data.text is None:
      continue
    
    text = data.text
    
    text = text.lower().split()
    
    for word in text:
      cleaned_word = getCleanWord(word)
      
      if len(cleaned_word) > 0:
        word_list.append(cleaned_word)
  
  return word_list

In [0]:
def getCleanWord(word):
  return re.sub(r'\W+', '', word) # [^a-zA-Z0-9]

In [0]:
def getWordFrequency(words):
  
  word_dict = {}
  
  for word in words:
    if word in word_dict:
      word_dict[word] += 1
    else:
      word_dict[word] = 1
      
  return word_dict

In [0]:
def getSortedWords(words):
  return sorted(words.items(), key=operator.itemgetter(1), reverse=True)

In [0]:
def removeStopWords(words):
  stop_words = set(stopwords.words('english'))
  
  word_list = []
  
  for word, count in words:
    if word not in stop_words:
      word_list.append([word, count])
  
  return word_list

In [0]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
string_query = "ai"

In [0]:
#access wiki API. json format. query it for data. search tyep. shows list of possibilities
wikipedia_api_link = "https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch="
wikipedia_link = "https://en.wikipedia.org/wiki/"

url = wikipedia_api_link + string_query

try:
  response = requests.get(url)

  data = json.loads(response.content.decode('utf-8'))

  page_tag = data['query']['search'][0]['title']

  url = wikipedia_link + page_tag.replace(' ', '_')

  web_page_words = getWordsFromUrl(url)

  word_count = getWordFrequency(web_page_words)

  sorted_words = getSortedWords(word_count)

  final_list = []

  for word, count in removeStopWords(sorted_words)[:20]:
    percentage_of_word = float((count * 100) / len(word_count))
    final_list.append([word, count, percentage_of_word])
    
except requests.exceptions.Timeout:
  print("The server didn't respond. Please, try again later.")
  
print(" +--------+")
print(" > SUMMARY ")
print(" +--------+")
print("- Wikipedia Api Link: ", wikipedia_api_link)
print("- Wikipedia Official site link: ", wikipedia_link)
print("- Query: ", string_query)
print("- Link: ", url)
print("- Total words: ", len(web_page_words))
print("- Total cleaned words: ", len(word_count))

print("+-----------------------------------------------------+")
print(tabulate(final_list, headers=["Word", "Frequency", "Frequency Percentage"], tablefmt='orgtbl'))
print("+-----------------------------------------------------+")

 +--------+
 > SUMMARY 
 +--------+
- Wikipedia Api Link:  https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch=
- Wikipedia Official site link:  https://en.wikipedia.org/wiki/
- Query:  ai
- Link:  https://en.wikipedia.org/wiki/Artificial_intelligence
- Total words:  12646
- Total cleaned words:  3388
+-----------------------------------------------------+
| Word         |   Frequency |   Frequency Percentage |
|--------------+-------------+------------------------|
| ai           |         167 |               4.92916  |
| intelligence |          77 |               2.27273  |
| artificial   |          66 |               1.94805  |
| human        |          62 |               1.82999  |
| learning     |          51 |               1.50531  |
| many         |          50 |               1.4758   |
| machine      |          47 |               1.38725  |
| research     |          40 |               1.18064  |
| knowledge    |          36 |               1.06257