<a href="https://colab.research.google.com/github/lismarcarolinacamacho/PYTHON/blob/main/scrapping_wikipdia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

lista de palabras mas buscadas en un momento dado

In [6]:
from bs4 import BeautifulSoup
import requests
import re
import operator 

import json
from tabulate import tabulate
import sys
from stop_words import get_stop_words

def getWordList(url):
  word_list = []

  #raw data
  source_code = requests.get(url)
  #convert to text
  plain_text = source_code.text
  #lxml format
  soup = BeautifulSoup(plain_text, 'lxml')

  #find the words in the paragraph tag
  for text in soup.findAll('p'):
    if text.text is None:
      continue
    #content
    content = text.text
    #lowercase and split into array
    words = content.lower().split()

    #for each word
    for word in words:
      #remove non-chars
      cleaned_word = clean_word(word)
      #if the is still something here
      if len(cleaned_word)> 0:
        #add it to our words list
        word_list.append(cleaned_word)

  return word_list

#clean word with regex
def clean_word(word):
  cleaned_word = re.sub('[^A-Za-z]+', '', word)
  return cleaned_word

def createFrecuencyTable(word_list):
  #wourd count
  word_count = {}
  for word in word_list:
    #index is the word
    if word in word_count:
      word_count[word] += 1
    else:
      word_count[word] = 1
  return word_count

#remove stop words
def remove_stop_words(frecuency_list):
  stop_words = get_stop_words('en')

  temp_list = []
  for key, value in frecuency_list:
    if key not in stop_words:
      temp_list.append([key, value])
  return temp_list

#access wiki API, JSON format, query it for  data . search type . show list of posibilities

wikipedia_API_link = 'https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch='
wikipedia_link = 'https://en.wikipedia.org/wiki'

# if the search word is too small, throw error

if (len(sys.argv)<2):
  print("Enter valid string: ")
  exit()

#get the search word

string_query = sys.argv[1]
#to remove stop wordss or not
if (len(sys.argv)>2):
  search_mode = True
else:
  search_mode = False

#create our url

url = wikipedia_API_link + string_query
print(url)

#try except block. simple way to deal with exceptions 
# great for HTTPS request

try:
  #use requests to retrieve row data from wiki API URL we just consrtucted

  response = requests.get(url)
  #format that data as Json dictionary
  data = json.loads(response.content.decode("utf-8")) 

  #page title, first option 
  # show this in web browser

  wikipedia_page_tag = data['query']['search'][0]['title']

  #get actual wiki page based in retrieved title 
  url = wikipedia_link + wikipedia_page_tag
  print("url: "+url)
  #get list of words from that page 
  page_word_list = getWordList(url)
  #get list of words counts, dictionary 
  page_word_count = createFrecuencyTable(page_word_list)
  #short the table by frecuency count
  sorted_word_frecuency_list = sorted(page_word_count.items(),
                                 key=operator.itemgetter(1), reverse= True)
  #remove stop words if the user specified
  if(search_mode):
    sorted_word_frecuency_list = remove_stop_words(sorted_word_frecuency_list)

  #sum the total words to calculate frecuencies
  total_words_sum = 0
  for key, value in sorted_word_frecuency_list:
    total_words_sum = total_words_sum + value

  #just get the top 20 words 

  if len(sorted_word_frecuency_list) > 20:
    sorted_word_frecuency_list = sorted_word_frecuency_list[:20]

  #create our final list which contain list, frecuency (word count, percentage)
  final_list = []
  for key, value in sorted_word_frecuency_list:
    percentage_value = float(value*100) / total_words_sum
    final_list.append([key, value, round(percentage_value, 4)])
  
  #header before the table 
  print_headers = ['Palabra', 'Frecuencia', 'Porcentaje de frecuencia' ]

  # print the table with tabulate 
  print (tabulate(final_list, headers = print_headers, tablefmt= 'orgtbl' ))

#throw an exception in case it breaks

except requests.exceptions.Timeout:
  print("The server didn't respond. Please try again later")


https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch=-f
url: https://en.wikipedia.org/wikiNCIS (TV series)
| Palabra      |   Frecuencia |   Porcentaje de frecuencia |
|--------------+--------------+----------------------------|
| garfield     |            4 |                     1.3333 |
| united       |            4 |                     1.3333 |
| states       |            4 |                     1.3333 |
| recorded     |            4 |                     1.3333 |
| painting     |            4 |                     1.3333 |
| also         |            3 |                     1      |
| day          |            3 |                     1      |
| sibelius     |            3 |                     1      |
| written      |            3 |                     1      |
| many         |            3 |                     1      |
| ehrling      |            3 |                     1      |
| milky        |            3 |                     1      |
| way      

In [None]:
pip install stop_words

Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-py3-none-any.whl size=32911 sha256=0c4f50d77b428aa7f466f0cb4ab9a6e448fd8ba38edb664eb411d87d20acfb51
  Stored in directory: /root/.cache/pip/wheels/fb/86/b2/277b10b1ce9f73ce15059bf6975d4547cc4ec3feeb651978e9
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23
