In [12]:
# importing libraries
from bs4 import BeautifulSoup as bs
import requests as rq
import pandas as pd
import numpy as np
import re


In [13]:
# getting data from the url
links = []
for page_num in range (1,26):
    if page_num == 1:
        url = 'https://www.outdoorer.co/'
    else:
        url = f'https://www.hikingadventures.net/hikes/page/{page_num}/'
        
    page = rq.get(url, headers={"User-Agent": "Mozilla/5.0"})
    # print(page)
    data = bs(page.text,'html.parser')
    # print(data)

     # getting the article from DOM
    articles = data.find_all('article')
    # print(articles)
    
    # get links in article
    for article in articles:
        a = article.find("a", class_="read-more")
        if a and a.get('href'):
            links.append(a["href"])
# print(links) 
print("Total links scraped:", len(links))

Total links scraped: 279


In [14]:
# getting readmore from the other pages
all_hikes = []
for link in links:
    r = rq.get(link, headers={"User-Agent": "Mozilla/5.0"})
    # print (r)
    s = bs(r.text, "html.parser")
    # print(s)    
    ul = s.find('ul', class_="wp-block-list")  
    if not ul:
        continue 
    # print(ul)

    # getting the hikes from DOM
    hike = {}

    for li in ul.find_all("li"):
        # filter url and get details
        strong = li.find("strong")
        if strong:
            key = strong.get_text(strip=True).replace(":", "")
            strong.extract()
            value = li.get_text(strip=True)
            hike[key] = value
    all_hikes.append(hike)

# print(all_hikes)
print(len(all_hikes))

266


In [15]:
# creating a dataframe from scrapped data
df = pd.DataFrame(all_hikes)
# print(df.head())
# pd.set_option('display.max_columns', None)
# print(df.columns)

# downloding raw csv
df.to_csv('hikes_raw.csv')

In [16]:
# cleaning the data
# filter out locations outside of kenya
df = df[df["Location"].str.contains("Kenya", case=False, na=False)]

# dropping columns with the time records (am & pm)
# time patterns
time_pattern = re.compile(
    r"\b\d{1,2}:\d{2}\s?(am|pm)\b",
    re.IGNORECASE
)

time_columns = []

for col in df.columns:
    if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
        time_columns.append(col)

print(time_columns)

df = df.drop(columns=time_columns)

# dropping columns with no entries
df = df.dropna(axis=1, how="all")

# columns with metadata
columns_to_drop = ['Date','Challenge', 'Peak','Team', 'Calories Burned', 'KICC Staircase Challenge.','Objective',
                  'Plan','Description', 'Total Calories Burned', 'Outcome', 'First Round', 'Break', 'Second Round',
                  'First Loop', 'Second Loop', 'Third Loop', 'Steps', 'Calories', 'Total steps', 'Total Steps', 'Up', 'Down',
                  'Calories burned', 'Moving time', 'Total Calories burned', 'miotoni block']

df = df.drop(columns=columns_to_drop, errors='ignore')


['Start Time', 'Summit', 'Finish', 'Peak', 'Maratini Summit', 'Mugi Summit', 'Table Summit', 'Mount', 'Table Mountain Summit', '1st Summit', '2nd Summit', 'Start time', 'Shoulder', 'Finish ']


  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str.contains(time_pattern, regex=True, na=False).any():
  if df[col].astype(str).str

In [19]:
# haromizing columns
# normalizing column names
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(".", "", regex=False)
    .str.replace("&", "and")
    .str.replace("_", " ")
)
# print(df.head())

# max elevation columns
max_elevation_cols = [
    "max elevation",
    "summit elevation",
    "peak elevation",
    "talio summit elevation",
    "sagalla summit elevation",
    "table summit elevation",
    "mugi hill summit elevation",
    "mount satima summit elevation",
    "lenana peak elevation",
    "kosikosi peak elevation",
    "point peter elevation",
    "point thompson elevation"
]

# elevation gain
elevation_gain_cols = [
    "elevation gain",
    "total elevation gain"
]

# elevation loss
elevation_loss_cols = [
    "elevation loss",
    "total elevation loss"
]

# distance columns
distance_cols = [
    "total distance",
    "distance",
    "distance covered",
    "total distance covered",
    "total distance covered",
    "distance from start to summit",
    "distance from base to summit"
]

# duration columns
duration_cols = [
    "time",
    "duration",
    "total time",
    "moving time",
    "total days"
]

# starting elevation
min_elevation_cols = [
    "min elevation",
    "starting elevation",
    "starting point elevation",
    "wandare trailhead elevation",
    "old moses camp elevation",
    "chogoria gate elevation",
    "met station elevation",
    "maratini elevation",
    "twin rocks elevation",
    "day dash"


]

# routes
route_cols = [
    "route",
    "wamba route",
    "gatare route",
    "summit trail"
]

# starting/finish point
start_point_cols = [
    "start",
    "start and finish point",
    "starting point",
    "starting and finish point",
    "start and finish"
]

# attraction
features_cols = [
    "attractions",
    "attraction",
    "features"
]

# park fees
park_fees_cols = [
    "park fees"
]

In [20]:
# applying harmonization
def coalesce(df, cols):
    return df[cols].bfill(axis=1).iloc[:, 0]

df["elevation_max"] = coalesce(df, max_elevation_cols)
df["elevation_gain"] = coalesce(df, elevation_gain_cols)
df["elevation_loss"] = coalesce(df, elevation_loss_cols)
df["total_distance"] = coalesce(df, distance_cols)
df["total_time"] = coalesce(df, duration_cols)
df["elevation_min"] = coalesce(df, min_elevation_cols)
df["routes"] = coalesce(df, route_cols)
df["starting_point"] = coalesce(df, start_point_cols)
df["features_attractions"] = coalesce(df, features_cols)
df["park_fees"] = coalesce(df, park_fees_cols)

In [21]:
# dropping all source columns
all_source_cols = (
    max_elevation_cols
    + elevation_gain_cols
    + elevation_loss_cols
    + distance_cols
    + duration_cols
    + min_elevation_cols
    + route_cols
    + start_point_cols
    + features_cols
    + park_fees_cols
)

df = df.drop(columns=[c for c in all_source_cols if c in df.columns])


In [22]:
# Further cleaniing for geocoding
# normalizing location column
df["location_clean"] = (
    df["location"]
    .str.strip()
    .str.replace(r"\.$", "", regex=True)
)

df[["place_name", "area"]] = (
    df["location_clean"]
    .str.split("â€“", n=1, expand=True)
    .apply(lambda x: x.str.strip())
)

# extracting county/area name
df["area"] = (
    df["area"]
    .str.replace(", Kenya", "", regex=False)
    .str.strip()
)

df[["place_name", "area"]].head()

df = df.drop(columns=["location", "location_clean"], errors="ignore")


# converting data frame to csv and save
df.to_csv("norm_hikes.csv", index=False)