# Nomadlist Scraper

## Step 0. Importing "Web Scraping Toolkit" libraries

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Step 1. Getting all cities on Nomadlist.com

In [None]:
# Step 1: extract every city slug from the locally saved city listing and
# store the result as a one-column ("city") csv file.

# Parse local cities.xml file with BeautifulSoup. The context manager makes
# sure the file handle is closed once parsing is done.
with open('data/raw/cities.xml', 'r', encoding='utf-8') as cities_file:
    soup = BeautifulSoup(cities_file, 'html.parser')

# Look for all <li class="item"> tags. The attribute filter has to be a
# dict mapping attribute name to value; the earlier set literal
# {'class', 'item'} was not an attribute-to-value mapping.
cities = soup.find_all('li', {'class': 'item'})

# City name is in the "data-slug" attribute; items without it are skipped.
city_names = [city['data-slug'] for city in cities if city.has_attr('data-slug')]

# Build the dataframe in one step. DataFrame.append was removed in
# pandas 2.0 and copied the whole frame for every added row.
df = pd.DataFrame(city_names, columns=['city'])

# Save dataframe to cities csv file
df.to_csv('data/interim/cities.csv', index=False)

## Step 2. Getting all users on Nomadlist.com

In [None]:
# Step 2: visit the "people" page of every city and collect the unique
# user names linked from it, then store them as a one-column ("user") csv.

# Base URL for this scraping task with placeholder
base_url = 'https://nomadlist.com/people/{}'

# Read dataframe from cities csv file
df_cities = pd.read_csv('data/interim/cities.csv')

# A set removes duplicates as they arrive (users can visit multiple cities),
# so the collection does not have to be rebuilt after every city.
unique_users = set()

# Loop over the first column of the dataframe, which holds the city names
for city_name in df_cities.iloc[:, 0]:
    # Request url with city name appended. Scraping is best-effort: a city
    # whose page cannot be fetched is reported and skipped, but only network
    # errors are caught so that programming errors and Ctrl-C still surface.
    try:
        response = requests.get(base_url.format(city_name), timeout=30)
    except requests.RequestException as error:
        print(f'Skipping city {city_name!r}: {error}')
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Look for and loop over all <a class="no-border"> tags
    for user in soup.find_all('a', {'class': 'no-border'}):
        # User name is in the "href" attribute and starts with "/@"
        if user.has_attr('href') and user['href'].startswith('/@'):
            unique_users.add(user['href'][1:])  # Drop the leading "/"

# Sort so that the csv file is reproducible between runs
user_list = sorted(unique_users)

# Create pandas dataframe from list and save to users csv file
df_users = pd.DataFrame(user_list, columns=['user'])
df_users.to_csv('data/interim/users.csv', index=False)

## Step 3. Getting all trips for each user

In [None]:
# Step 3: open the profile page of every user, pull the "tripsCoords"
# JavaScript variable out of its <script> tags and store all trips, tagged
# with the user name, in one csv file.

# Extra libraries
import re
import json

# Base URL for this scraping task with placeholder
base_url = 'https://nomadlist.com/{}'

# Regular expression for the "tripsCoords" JavaScript variable
p_coords = re.compile(r'var tripsCoords=(.*?);')

# The 13 expected columns, in the order they are written to the csv file
trip_columns = ['city', 'city_slug', 'epoch_end', 'epoch_start', 'latitude', 'longitude', 'previous_latitude', 'previous_longitude', 'trip_id', 'trip_length', 'user_image', 'mode', 'user']

# Read dataframe from users csv file
df_users = pd.read_csv('data/interim/users.csv')

# Trips are gathered in a plain list and turned into a dataframe once at the
# end. DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
all_trips = []

# Loop over the first column of the dataframe, which holds the user names
for user_name in df_users.iloc[:, 0]:

    # Request url with user name appended. One unreachable profile should
    # not abort the whole crawl, so it is reported and skipped.
    try:
        response = requests.get(base_url.format(user_name), timeout=30)
    except requests.RequestException as error:
        print(f'Skipping user {user_name!r}: {error}')
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Look for and loop over all <script> tags
    for script in soup.find_all('script'):

        # Search for the regular expression inside the <script> tag
        match = p_coords.search(script.prettify())
        if not match:
            continue

        # If found, turn into Python dictionary. The non-greedy pattern stops
        # at the first ";", so a payload containing that character is cut
        # short and fails to parse; report it instead of crashing.
        try:
            content = json.loads(match.group(1))
        except json.JSONDecodeError as error:
            print(f'Skipping unparsable trips of user {user_name!r}: {error}')
            continue
        if not content:
            continue

        # The trips are the first value of the dictionary
        trips = next(iter(content.values()))

        # Add username to every trip and collect it
        all_trips.extend({**trip, 'user': user_name} for trip in trips)

# Create the dataframe with the expected columns first; any additional keys
# found in the scraped data are kept as extra columns after them.
df_trips = pd.DataFrame(all_trips)
extra_columns = [column for column in df_trips.columns if column not in trip_columns]
df_trips = df_trips.reindex(columns=trip_columns + extra_columns)

# Save dataframe to trips csv file
df_trips.to_csv('data/interim/trips.csv', index=False)