# __Import__

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from tqdm import tqdm

# __Make Dataset__

In [2]:
lst = []

for x in tqdm(range(0, 8), desc='Downloading list of mountains'):
    wiki = 'https://en.wikipedia.org/wiki/List_of_mountains_by_elevation'

    response = requests.get(wiki).content
    soup = BeautifulSoup(response, 'lxml')

    table = soup.find_all('table')[x]

    for y in table.find_all('tr')[1:]:
        height_meters = y.find_all('td')[1].text
        height_feet = y.find_all('td')[2].text

        cells = y.find_all('td')[0]
        for z in cells.find_all('a', limit=1):
            wiki = z.get('href')

            wiki = [wiki, height_meters, height_feet]

            lst.append(wiki)

df = pd.DataFrame(lst, columns=['Wiki', 'Height (meters)', 'Height (feet)'])

height_columns = ['Height (meters)', 'Height (feet)']
for x in height_columns:
    df[x] = df[x].str.replace(',', '')
    df[x] = pd.to_numeric(df[x])

df.fillna('', inplace=True)

df = df[df['Wiki'].str.startswith('/wiki')]
df['Wiki'] = 'https://en.wikipedia.org' + df['Wiki']

df.reset_index(drop=True, inplace=True)

df

Downloading list of mountains: 100%|██████████| 8/8 [00:08<00:00,  1.02s/it]


Unnamed: 0,Wiki,Height (meters),Height (feet)
0,https://en.wikipedia.org/wiki/Mount_Everest,8848.0,29029
1,https://en.wikipedia.org/wiki/K2,8612.0,28255
2,https://en.wikipedia.org/wiki/Kangchenjunga,8586.0,28169
3,https://en.wikipedia.org/wiki/Lhotse,8516.0,27940
4,https://en.wikipedia.org/wiki/Makalu,8485.0,27838
...,...,...,...
1422,https://en.wikipedia.org/wiki/Buachaille_Etive...,1022.0,3353
1423,https://en.wikipedia.org/wiki/Munboksan,1015.0,3330
1424,https://en.wikipedia.org/wiki/K%C3%A9kes,1014.0,3327
1425,https://en.wikipedia.org/wiki/Mount_Belumut,1010.0,3314


In [3]:
wiki = list(df['Wiki'])

lst = []

for i in tqdm(wiki, desc='Downloading list of heights'):
    response = requests.get(i).content
    soup = BeautifulSoup(response, 'lxml')

    table = soup.find('table', class_='infobox vcard')

    try:
        name = soup.find('h1', class_='firstHeading').text
    except AttributeError:
        name = None

    try:
        for link in table.find_all('a', class_='external text', limit=1):
            geo = link.get('href')
    except AttributeError:
        geo = None

    name_geo = [name, geo]

    lst.append(name_geo)

df_geo = pd.DataFrame(lst, columns=['Name', 'Geo'])
df = pd.concat([df, df_geo], axis=1)

df.fillna('', inplace=True)

df = df[df['Name'] != 'Karakoram']
df['Name'] = df['Name'].apply(lambda x: x.replace('(mountain)', '').replace('Mountain', '').replace('(volcano)', ''))

df = df[df['Geo'].str.startswith('//geohack.toolforge.org/')]
df['Geo'] = 'https:' + df['Geo']

df.reset_index(drop=True, inplace=True)

df

Downloading list of heights: 100%|██████████| 1427/1427 [02:50<00:00,  8.35it/s]


Unnamed: 0,Wiki,Height (meters),Height (feet),Name,Geo
0,https://en.wikipedia.org/wiki/Mount_Everest,8848.0,29029,Mount Everest,https://geohack.toolforge.org/geohack.php?page...
1,https://en.wikipedia.org/wiki/K2,8612.0,28255,K2,https://geohack.toolforge.org/geohack.php?page...
2,https://en.wikipedia.org/wiki/Kangchenjunga,8586.0,28169,Kangchenjunga,https://geohack.toolforge.org/geohack.php?page...
3,https://en.wikipedia.org/wiki/Lhotse,8516.0,27940,Lhotse,https://geohack.toolforge.org/geohack.php?page...
4,https://en.wikipedia.org/wiki/Makalu,8485.0,27838,Makalu,https://geohack.toolforge.org/geohack.php?page...
...,...,...,...,...,...
1380,https://en.wikipedia.org/wiki/Mount_Zagora,1030.0,3379,Mount Zagora,https://geohack.toolforge.org/geohack.php?page...
1381,https://en.wikipedia.org/wiki/Buachaille_Etive...,1022.0,3353,Buachaille Etive Mòr,https://geohack.toolforge.org/geohack.php?page...
1382,https://en.wikipedia.org/wiki/Munboksan,1015.0,3330,Munboksan,https://geohack.toolforge.org/geohack.php?page...
1383,https://en.wikipedia.org/wiki/K%C3%A9kes,1014.0,3327,Kékes,https://geohack.toolforge.org/geohack.php?page...


In [4]:
geo = list(df['Geo'])

lst = []

for x in tqdm(geo, desc='Downloading latitude & longitude'):

    response = requests.get(x).content
    soup = BeautifulSoup(response, 'lxml')

    try:
        latitude = soup.find('span', class_='latitude').text
    except AttributeError:
        latitude = None

    try:
        longitude = soup.find('span', class_='longitude').text
    except AttributeError:
        longitude = None

    latitude_longitude = [latitude, longitude]

    lst.append(latitude_longitude)

df_lat_lon = pd.DataFrame(lst, columns=['Latitude', 'Longitude'])
df = pd.concat([df, df_lat_lon], axis=1)

df = df[['Name', 'Height (meters)', 'Height (feet)', 'Latitude', 'Longitude']]
df.reset_index(drop=True, inplace=True)

df

Downloading latitude & longitude: 100%|██████████| 1385/1385 [17:05<00:00,  1.35it/s]


Unnamed: 0,Name,Height (meters),Height (feet),Latitude,Longitude
0,Mount Everest,8848.0,29029,27.988056,86.925278
1,K2,8612.0,28255,35.8825,76.513333
2,Kangchenjunga,8586.0,28169,27.7025,88.146667
3,Lhotse,8516.0,27940,27.961667,86.933333
4,Makalu,8485.0,27838,27.889722,87.088889
...,...,...,...,...,...
1380,Mount Zagora,1030.0,3379,30.35,-5.75
1381,Buachaille Etive Mòr,1022.0,3353,56.647303,-4.897797
1382,Munboksan,1015.0,3330,35.676,129.034
1383,Kékes,1014.0,3327,47.878889,20.010278


# __Write File__

In [5]:
df.to_csv('../data/processed/mountains.csv', index=False)