In [1]:
# Pin the release that this notebook was last run with so a fresh kernel
# installs the same version; %pip installs into the running kernel's environment.
%pip install stop-words==2018.7.23

Collecting stop-words
  Downloading https://files.pythonhosted.org/packages/1c/cb/d58290804b7a4c5daa42abbbe2a93c477ae53e45541b1825e86f0dfaaf63/stop-words-2018.7.23.tar.gz
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/75/37/6a/2b295e03bd07290f0da95c3adb9a74ba95fbc333aa8b0c7c78
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23


In [0]:
import json
import operator
import re
import sys
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from stop_words import get_stop_words
from tabulate import tabulate

In [0]:
# get the words
def getWordList(url, timeout=10):
  """Download a web page and return the cleaned words found in its paragraphs.

  The page is parsed with BeautifulSoup's 'lxml' parser. The text of every
  <p> tag is lower-cased and split on whitespace; each token is passed
  through clean_word() and tokens that come back empty are dropped.

  Args:
    url: address of the page to download.
    timeout: seconds to wait for the server before giving up.

  Returns:
    A list of cleaned, lower-case words in document order (duplicates kept).

  Raises:
    requests.exceptions.Timeout: the server did not answer within `timeout`.
    requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
  """
  # requests applies no timeout unless one is given, so pass it explicitly
  # to keep a stalled server from blocking the notebook indefinitely
  response = requests.get(url, timeout=timeout)

  # fail loudly instead of counting the words of an error page
  response.raise_for_status()

  #lxml format
  soup = BeautifulSoup(response.text, 'lxml')

  word_list = []

  #find the words in the paragraph tags
  for paragraph in soup.find_all('p'):
    for word in paragraph.text.lower().split():
      #remove non-chars
      cleaned_word = clean_word(word)
      #keep the word only if something is still there
      if cleaned_word:
        word_list.append(cleaned_word)
  return word_list
def createFrequecyTable(word_list):
  """Count how many times each word occurs in word_list.

  Args:
    word_list: iterable of hashable words.

  Returns:
    A dict mapping each distinct word to its number of occurrences; keys
    appear in the order the words were first seen.
  """
  tally = {}
  for token in word_list:
    # a word not seen yet starts from zero
    tally[token] = tally.get(token, 0) + 1
  return tally

    

In [0]:

#clean words with regex
def clean_word(word):
  """Return word with every character outside A-Z / a-z removed.

  Digits, punctuation, whitespace and non-ASCII letters are all dropped,
  so the result may be an empty string.
  """
  non_letters = '[^A-Za-z]+'
  return re.sub(non_letters, '', word)
#remove stopwords
def remove_stop_words(frequency_list):
  """Filter English stop words out of a list of (word, count) pairs.

  Args:
    frequency_list: iterable of two-item (word, count) entries.

  Returns:
    A new list of [word, count] lists, in the input order, containing only
    the entries whose word is not in get_stop_words('en').
  """
  # build the lookup set once so each membership test is a hash probe
  # rather than a scan of the whole stop-word collection
  stop_words = frozenset(get_stop_words('en'))
  return [[word, count]
          for word, count in frequency_list
          if word not in stop_words]

In [0]:
# Search term sent to the Wikipedia search API; the first hit is the page analysed.
string_query = 'ai'

# When True, English stop words are filtered out of the word counts.
search_mode = True

In [0]:
# Wikipedia search API endpoint (JSON output); the search term is appended after 'srsearch='.
wikipedia_api_link = 'https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch='
# Base URL of article pages; the page title is appended to it.
wikipedia_link = 'https://en.wikipedia.org/wiki/'

In [33]:
# Look up the best-matching Wikipedia page for string_query and print a table
# of its 20 most frequent words (word, count, percentage of counted words).

#percent-encode the search term so characters such as '&', '#' or spaces
#cannot corrupt the query string
url = wikipedia_api_link + quote(string_query, safe='')

try:
  #retrieving raw data from wiki api; requests applies no timeout unless one
  #is passed, so give one explicitly for the Timeout handler below
  response = requests.get(url, timeout=10)

  #treat 4xx/5xx answers as failures instead of parsing an error body
  response.raise_for_status()

  #formatting data as json dictionary
  data = response.json()

  #the result list is empty when nothing matches, and the 'query' key may be
  #missing altogether, so fall back to an empty list instead of indexing blindly
  search_results = data.get('query', {}).get('search', [])

  if not search_results:
    print("No Wikipedia page found for '" + string_query + "'.")
  else:
    #page title, first option
    wikipedia_page_tag = search_results[0]['title']

    #get actual wiki page based on retrieved title; underscores replace
    #spaces and the rest is percent-encoded so titles containing '?' or '#'
    #still resolve to the right page
    url = wikipedia_link + quote(wikipedia_page_tag.replace(' ', '_'))

    #get list of words from that page
    page_word_list = getWordList(url)

    #create table of word counts, dictionary
    page_word_count = createFrequecyTable(page_word_list)

    #sort the table by the frequency count, most frequent first
    sorted_word_frequency_list = sorted(page_word_count.items(), key=operator.itemgetter(1), reverse=True)

    #remove stop words if the user specified
    if search_mode:
      sorted_word_frequency_list = remove_stop_words(sorted_word_frequency_list)

    #sum the total words to calculate frequencies; this is taken after the
    #optional stop-word removal, so percentages are relative to the words kept
    total_words_sum = sum(count for word, count in sorted_word_frequency_list)

    #just get the top 20 words (slicing is a no-op on shorter lists)
    sorted_word_frequency_list = sorted_word_frequency_list[:20]

    #create our final list which contains words, frequency (word count), percentage
    final_list = [
      [word, count, round(float(count * 100) / total_words_sum, 4)]
      for word, count in sorted_word_frequency_list
    ]

    #headers before the table
    print_headers = ['Word', 'Frequency', 'Frequency Percentage']

    #print the table with tabulate
    print(tabulate(final_list, headers=print_headers, tablefmt='orgtbl'))

#report network problems as a short message instead of a traceback
except requests.exceptions.Timeout:
  print("The server didn't respond. Please, try again later.")
except requests.exceptions.RequestException as error:
  print("The request failed: " + str(error))

| Word         |   Frequency |   Frequency Percentage |
|--------------+-------------+------------------------|
| ai           |         166 |                 2.1765 |
| intelligence |          86 |                 1.1276 |
| can          |          83 |                 1.0882 |
| artificial   |          65 |                 0.8522 |
| human        |          63 |                 0.826  |
| learning     |          59 |                 0.7736 |
| many         |          50 |                 0.6556 |
| machine      |          47 |                 0.6162 |
| research     |          44 |                 0.5769 |
| networks     |          39 |                 0.5113 |
| knowledge    |          37 |                 0.4851 |
| neural       |          34 |                 0.4458 |
| use          |          33 |                 0.4327 |
| problems     |          32 |                 0.4196 |
| also         |          31 |                 0.4065 |
| computer     |          30 |                 0