In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

from time import sleep
from random import randint

import regex as re

# 1. MVP (Minimum Viable Product)

### 1.1 Hot List from Listchallenges

In [2]:
url = "https://www.listchallenges.com/the-top-1000-billboard-hot-100-songs-of-all-time"
response = requests.get(url)
response.status_code

200

In [3]:
soup = BeautifulSoup(response.content, 'html.parser')

> The page has 25 pages, so we have to iterate the urls to add all the data to the soup.

In [4]:
iterations = range(0, 25)

pages = []

for i in iterations:
    
    start_at= str(i)
    url = "https://www.listchallenges.com/the-top-1000-billboard-hot-100-songs-of-all-time/list/" + start_at
    response = requests.get(url, headers = {"Accept-Language": "en-US"})
    pages.append(response)

    wait_time = randint(1,4000)
    sleep(wait_time/1000)

> Now that we have a variable with the pages, we can iterate each page and extract the information of every page.
> Each title-artist is separated (items are not in a table). But all of them follow the same pattern, so we can access them with an iterable string.

In [5]:
list1 = []

for i in range(len(pages)):
    soup = BeautifulSoup(pages[i].content, "html.parser")

    for j in range(1,46):
        number=j
        str_number=str(number)
        string = '#repeaterListItems > div:nth-child(' +str_number+ ') > div > div.item-name'
        selection=soup.select(string)
        list1.append(selection)

> The items of list1 are bs4.element.ResultSet, and the item of the item (list1[0][0]), is bs4.element.Tag. BeautifulSoup attributes like item.text, don´t work with resultset, so we need to apply it to the tags.

In [6]:
list2=[]

for k in range(0,1125):
    for item in list1[k]:
        list2.append(item.text)

> List2 items have this format we need to clean: '\r\n                                        The Twist - Chubby Checker\r\n                                    '

In [7]:
list3 = [item.strip() for item in list2[0:1000]]

> Finally, we only need to split the song and the artist, and put it on a df.

In [8]:
songs = []
artists = []
for element in list3:
    parts = element.split(' - ')
    if len(parts) == 2:
        song = parts[0].strip()
        artist = parts[1].strip()
        songs.append(song)
        artists.append(artist)
            
df_list_cha = pd.DataFrame({'song':songs, 'artist':artists})

In [9]:
df_list_cha

Unnamed: 0,song,artist
0,The Twist,Chubby Checker
1,Smooth,Santana
2,MacK the Knife,Bobby Darin
3,Leann Rimes,How Do I Live
4,Party Rock Anthem,LMFAO
...,...,...
930,Wooly Bully,Sam the Sham and the Pharoahs
931,Joyride,Roxette
932,Kansas City,Wilbert Harrison
933,Pumped Up Kicks,Foster the People


In [10]:
df_list_cha.to_csv('hot.csv', index=False)

> I can´t extract all of them because some have this pattern: 'Mark Ronson, "Uptown Funk" (Feat. Bruno Mars)'. I´ll do when I have more time.

# 2  Practice web scraping.

### 2.1  Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: 

url ='https://en.wikipedia.org/wiki/Python'

In [11]:
url='https://en.wikipedia.org/wiki/Python'

response=requests.get(url)
response.status_code

200

In [12]:
soup = BeautifulSoup(response.content, 'html.parser')

In [13]:
list1=[]
for i in range(len(soup.select("a"))):
    list1.append(soup.select("a")[i]['href'])

> The items of list1 are strings. We only want those that start by "/wiki/".

In [14]:
list2 = []

for i in list1:
    links = re.findall(r'/wiki/[\w]+', i)
    list2.extend(links)

In [15]:
list2

['/wiki/Main_Page',
 '/wiki/Wikipedia',
 '/wiki/Portal',
 '/wiki/Special',
 '/wiki/Wikipedia',
 '/wiki/Wikipedia',
 '/wiki/Special',
 '/wiki/Help',
 '/wiki/Help',
 '/wiki/Wikipedia',
 '/wiki/Special',
 '/wiki/Wikipedia',
 '/wiki/Main_Page',
 '/wiki/Special',
 '/wiki/Help',
 '/wiki/Special',
 '/wiki/Special',
 '/wiki/Python',
 '/wiki/Python',
 '/wiki/Python_',
 '/wiki/Python',
 '/wiki/Python_',
 '/wiki/Python',
 '/wiki/Python',
 '/wiki/Pitono_',
 '/wiki/Python_',
 '/wiki/Python',
 '/wiki/Python_',
 '/wiki/Pitono',
 '/wiki/Python',
 '/wiki/Python_',
 '/wiki/Python_',
 '/wiki/Python_',
 '/wiki/Mboma_',
 '/wiki/Python_',
 '/wiki/Python',
 '/wiki/Python_',
 '/wiki/Python',
 '/wiki/Pyton',
 '/wiki/Pyton',
 '/wiki/Python_',
 '/wiki/Python_',
 '/wiki/Python',
 '/wiki/Python',
 '/wiki/Python',
 '/wiki/Pyton',
 '/wiki/Python_',
 '/wiki/Python',
 '/wiki/Python_',
 '/wiki/Special',
 '/wiki/Python',
 '/wiki/Talk',
 '/wiki/Python',
 '/wiki/Python',
 '/wiki/Special',
 '/wiki/Special',
 '/wiki/Wikiped

### 2.2  Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe: 

url = 'https://www.emsc-csem.org/Earthquake/'

In [16]:
url='https://www.emsc-csem.org/Earthquake/'

response=requests.get(url)
response.status_code

200

In [17]:
soup = BeautifulSoup(response.content, 'html.parser')

> I´ve tried to get the results, and I also tried to get the id to navigate to the webpage information. But I couldn´t manage to access it.

### 2.3  Create a Python list with the top ten FBI's Most Wanted names: 

url = 'https://www.fbi.gov/wanted/topten'

In [18]:
url='https://www.fbi.gov/wanted/topten'

response=requests.get(url)
response.status_code

200

In [19]:
soup = BeautifulSoup(response.content, 'html.parser')

In [20]:
list1=[]
for i in range(1,11):
    number=i
    str_number=str(number)
    string = '#query-results-0f737222c5054a81a120bce207b0446a > ul > li:nth-child(' +str_number+ ') > h3 > a'
    selection=soup.select(string)
    list1.append(selection)

In [21]:
list2=[]

for k in range(0,10):
    for item in list1[k]:
        list2.append(item.text)

In [22]:
list2

['WILVER VILLEGAS-PALOMINO',
 'ALEJANDRO ROSALES CASTILLO',
 'RUJA IGNATOVA',
 'DONALD EUGENE FIELDS II',
 'ALEXIS FLORES',
 'ARNOLDO JIMENEZ',
 'OMAR ALEXANDER CARDENAS',
 'YULAN ADONAY ARCHAGA CARIAS',
 'BHADRESHKUMAR CHETANBHAI PATEL',
 'JOSE RODOLFO VILLARREAL-HERNANDEZ']