# Web Scraping Script

## Importing `libraries`

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

## `Web scraping` function

In [None]:
## Cities dataframe (through webscraping)
def get_cities(city_list):
    """Scrape basic facts about each city from its English Wikipedia page.

    Args:
        city_list: Iterable of Wikipedia article titles (e.g. ``"Berlin"``);
            each one is appended to ``https://en.wikipedia.org/wiki/``.

    Returns:
        pandas.DataFrame with one row per city and the columns
        ``city_name``, ``country``, ``latitude``, ``longitude``, ``website``,
        ``elevation``, ``population`` and ``country_code`` (always in that
        order). Optional fields that are not found on a page are left as
        missing values; ``country_code`` is ``""`` for countries that are
        not in the built-in mapping. An empty ``city_list`` yields an empty
        DataFrame with the same columns.

    Raises:
        requests.HTTPError: If Wikipedia answers with an error status.
        IndexError: If a page lacks a heading, infobox data cell or
            coordinates.
    """
    scraped_columns = ['city_name', 'country', 'latitude', 'longitude',
                       'website', 'elevation', 'population']
    country_code = {"Germany": "DE",
                    "United Kingdom": "UK",
                    "Spain": "ES"}

    # One dictionary of scraped values per city
    city_data = []
    for city in city_list:
        url = f'https://en.wikipedia.org/wiki/{city}'

        # A timeout keeps one unresponsive request from hanging the whole
        # run; raise_for_status stops us from parsing an error page.
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'html.parser')

        city_data.append(_parse_city_page(soup))

    # Passing the column list explicitly keeps the schema stable even when an
    # optional field is missing for every city (or city_list is empty).
    cities_df = pd.DataFrame(city_data, columns=scraped_columns)

    # Fix formatting issues in latitude and longitude columns.
    # NOTE(review): this only swaps the degree/minute/second symbols
    # (e.g. '52°31′12″N' -> '52.3112N'); it is not a conversion to decimal
    # degrees. Kept as-is so existing consumers see the same strings.
    for axis in ('latitude', 'longitude'):
        cities_df[axis] = (cities_df[axis]
                           .str.replace('°', '.', regex=False)
                           .str.replace('′', '', regex=False)
                           .str.replace('″', '', regex=False))

    # Look the code up by column name rather than by a hard-coded column
    # position. Surrounding whitespace is ignored for the lookup only; the
    # stored country text is left untouched.
    cities_df["country_code"] = (cities_df["country"]
                                 .str.strip()
                                 .map(country_code)
                                 .fillna(""))

    return cities_df


def _infobox_value(soup, label):
    """Return the text of the infobox data cell following *label*, or None."""
    label_cell = soup.select_one(f'.infobox-label:-soup-contains("{label}")')
    if label_cell is None:
        return None
    value_cell = label_cell.find_next(class_='infobox-data')
    if value_cell is None:
        return None
    return value_cell.get_text()


def _parse_city_page(soup):
    """Extract one city's fields from its parsed Wikipedia page as a dict."""
    # Required fields: indexing with [0] raises IndexError when absent.
    city_info = {
        'city_name': soup.select(".firstHeading")[0].get_text(),
        'country': soup.select(".infobox-data")[0].get_text(),
        'latitude': soup.select(".latitude")[0].get_text(),
        'longitude': soup.select(".longitude")[0].get_text(),
    }

    # Optional infobox rows: only record the ones that are present so the
    # DataFrame shows a missing value for the rest.
    for key, label in (('website', 'Website'), ('elevation', 'Elevation')):
        value = _infobox_value(soup, label)
        if value is not None:
            city_info[key] = value

    # The population figure is the first digit-bearing string in the row that
    # follows the "Population" header row.
    population_header = soup.select_one('th.infobox-header:-soup-contains("Population")')
    if population_header is not None:
        figures_row = population_header.parent.find_next_sibling()
        figure = None
        if figures_row is not None:
            figure = figures_row.find(string=re.compile(r'\d+'))
        if figure is not None:
            # Convert to a plain str so the DataFrame does not keep the whole
            # parse tree alive through the NavigableString.
            city_info['population'] = str(figure)

    return city_info