# Kayak Project: Part 2 – store scraped data
For my web scraping I used two tools: Selenium and Beautiful Soup.

Selenium can handle asynchronous, dynamically loaded data such as prices, which was an issue when I tried Beautiful Soup alone. My web scraping bot is located in the bot_selenium folder, and the scraped data are in ./bot_selenium/data/each_city.

Please note that if you want to run the bot, you'll have to install Firefox and its driver.

Then I used Beautiful Soup to loop over all the hotel links and scrape the static data, such as the hotel address and the coordinates.

In this notebook, we will:
- concatenate the best hotel deals for each city that we scraped using Selenium. The data are located in ./bot_selenium/data/each_city
- scrape the coordinates of each hotel using Beautiful Soup.
- send the data to an S3 bucket



In [2]:
import pandas as pd
import glob
import os
import numpy as np
import boto3
from io import StringIO

In [3]:
# Directory holding one "<city>_hotel_deals.csv" file per scraped city
directory = "./bot_selenium/data/each_city"

# glob returns paths in arbitrary, OS-dependent order; sorting makes the row
# order of the combined file reproducible from one run to the next.
csv_files = sorted(glob.glob(os.path.join(directory, "*_hotel_deals.csv")))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list,
# so report the real cause (wrong directory / bot not run yet) instead.
if not csv_files:
    raise FileNotFoundError(
        f"No *_hotel_deals.csv files found in {directory}"
    )

# Read each per-city file into its own DataFrame
data_frames = []
for file in csv_files:
    df = pd.read_csv(file)
    data_frames.append(df)

# Stack all cities into one table with a fresh 0..n-1 index
combined_df = pd.concat(data_frames, ignore_index=True)

# The combined file is written one level above `directory`, so it is never
# picked up by the glob pattern above when this cell is re-run.
output_csv_file = os.path.join("./bot_selenium/data", "all_cities_hotel_deals.csv")

# Save the combined DataFrame to a new CSV file (index not persisted)
combined_df.to_csv(output_csv_file, index=False)

print(f"Combined data saved to {output_csv_file}")

Combined data saved to ./bot_selenium/data/all_cities_hotel_deals.csv


In [4]:
# Reload the combined deals table from disk so the following cells work on
# exactly what was persisted by the concatenation step.
all_cities_df = pd.read_csv(
    './bot_selenium/data/all_cities_hotel_deals.csv'
)

In [5]:
# Load the combined deals table into the data lake (S3 bucket)

# Credentials are read from the environment instead of being written into the
# notebook. When a variable is unset, None is passed and boto3 falls back to
# its default credential chain (shared credentials file, instance role, ...).
session = boto3.Session(
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
)

s3 = session.resource("s3")

# NOTE(review): create_bucket without a CreateBucketConfiguration targets
# us-east-1; confirm the session's region before running elsewhere.
# The call returns the Bucket resource, so no separate s3.Bucket(...) lookup
# is needed afterwards.
bucket = s3.create_bucket(Bucket="holiday-recommendation")

# Serialise the DataFrame to CSV in memory (no temporary file on disk)
csv_buffer = StringIO()
all_cities_df.to_csv(csv_buffer, index=False)

# Upload the CSV text to S3; the returned s3.Object is the cell's output
bucket.put_object(Key="all_cities_info_deals.csv", Body=csv_buffer.getvalue())



s3.Object(bucket_name='holiday-recommendation', key='all_cities_info_deals.csv')

In [9]:
# Number of hotel deals (rows) in the combined table
print(all_cities_df.shape[0])

694


In [10]:
# Preview the first five rows of the combined deals table
all_cities_df.head(n=5)

Unnamed: 0,city,name,hotel_prices,rating,number of review,hotel_link
0,Amiens,Appart'City Confort Amiens Gare,$930,Scored 7.6,"1,743 reviews",https://www.booking.com/hotel/fr/appart-city-a...
1,Amiens,The Nest,$973,Scored 8.2,137 reviews,https://www.booking.com/hotel/fr/the-nest-amie...
2,Amiens,Nemea Appart Hotel Coliseum Amiens Centre,"$1,029",Scored 8.6,332 reviews,https://www.booking.com/hotel/fr/nemea-appart-...
3,Amiens,Odalys City Amiens Blamont,"$1,074",Scored 8.0,"1,817 reviews",https://www.booking.com/hotel/fr/appart-39-oda...
4,Amiens,Le Saint Louis,"$1,078",Scored 8.1,"1,336 reviews",https://www.booking.com/hotel/fr/le-saint-loui...


In [14]:
# Summarise column dtypes and non-null counts of the combined deals table.
# NOTE(review): the saved output of this cell reports `hotel_prices` as int64,
# while the saved `head()` output still shows raw strings such as "$930".
# The output was presumably produced after a cleaning step that is not part of
# this notebook — re-run on a fresh kernel to confirm.
all_cities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 694 entries, 0 to 693
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   city              694 non-null    string 
 1   name              694 non-null    string 
 2   hotel_prices      694 non-null    int64  
 3   rating            694 non-null    object 
 4   number of review  687 non-null    float64
 5   hotel_link        694 non-null    string 
dtypes: float64(1), int64(1), object(1), string(3)
memory usage: 32.7+ KB


In [15]:
# Every hotel link, in table order
url_list = all_cities_df['hotel_link'].tolist()

# A set drops repeats, so a size mismatch with the list means duplicates exist
unique_url_set = set(url_list)

if len(unique_url_set) != len(url_list):
    print("There are duplicate links.")
    # Single pass: the first sighting goes to `seen`, any later sighting of
    # the same link goes to `duplicates`.
    seen = set()
    duplicates = set()
    for link in url_list:
        target = duplicates if link in seen else seen
        target.add(link)

    print("Duplicate links:")
    for repeated_link in duplicates:
        print(repeated_link)
else:
    print("All links are unique.")

All links are unique.


In [16]:
import requests
from bs4 import BeautifulSoup

# Extract hotel_link column into a list
url_list = all_cities_df['hotel_link'].tolist()

# Seconds to wait on connect/read before giving up on one URL. Without a
# timeout a single stalled connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

# Browser-like User-Agent sent with every request; it never changes, so it is
# built once instead of on every loop iteration.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# One dict per successfully fetched hotel page.
# NOTE(review): URLs that fail are skipped, so this list can be shorter than
# url_list and its rows are not keyed by URL — take care when joining the
# result back onto all_cities_df.
results_link = []

# Iterate through the list of URLs
for url in url_list:
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f'Failed to retrieve {url}')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Locate the tags that carry the static hotel information
        hotel_name_tag = soup.find('h2', class_='aceeb7ecbc pp-header__title')
        hotel_address_tag = soup.find('span', class_='hp_address_subtitle js-hp_address_subtitle jq_tooltip')
        map_link_tag = soup.find('a', id='hotel_address', class_='loc_block_link_underline_fix bui-link show_on_map_hp_link show_map_hp_link')

        # A missing tag yields None rather than aborting the hotel
        hotel_name = hotel_name_tag.text.strip() if hotel_name_tag else None
        hotel_address = hotel_address_tag.text.strip() if hotel_address_tag else None

        # Coordinates come from the "lat,lng" attribute of the map link.
        # Only a well-formed pair is accepted; anything else leaves both
        # values as None instead of raising and discarding the whole hotel.
        latitude, longitude = None, None
        if map_link_tag:
            latlng = map_link_tag.get('data-atlas-latlng')
            if latlng:
                parts = [part.strip() for part in latlng.split(',')]
                if len(parts) == 2:
                    latitude, longitude = parts

        # Append extracted data to results_link list
        results_link.append({
            'hotel_name': hotel_name,
            'hotel_address': hotel_address,
            'latitude': latitude,
            'longitude': longitude,
        })
        print(f'this url has ben done {url}')
    except Exception as e:
        # Best-effort scrape: report the failure and carry on with the next URL
        print(f'Error processing {url}: {str(e)}')

# Process results_link as needed
print(results_link)


this url has ben done https://www.booking.com/hotel/fr/appart-city-amiens.html?label=gen173nr-1FCAEoggI46AdIM1gEaKQCiAEBmAExuAEHyAEP2AEB6AEB-AECiAIBqAIDuAK95OyzBsACAdICJDEyYjJmNzkwLWRmZDMtNGM0OC05NTc1LTk4ZTJmNWZhMjc0NtgCBeACAQ&aid=304142&ucfs=1&arphpl=1&checkin=2024-07-20&checkout=2024-07-28&dest_id=-1407447&dest_type=city&group_adults=3&req_adults=3&no_rooms=1&group_children=0&req_children=0&hpos=1&hapos=1&sr_order=price&nflt=class%3D3%3Bclass%3D4%3Bht_id%3D204&srpvid=1a52950274530116&srepoch=1719349895&all_sr_blocks=55821703_340057887_0_2_0&highlighted_blocks=55821703_340057887_0_2_0&matching_block_id=55821703_340057887_0_2_0&sr_pri_blocks=55821703_340057887_0_2_0__86700&from_sustainable_property_sr=1&from=searchresults
this url has ben done https://www.booking.com/hotel/fr/the-nest-amiens.html?label=gen173nr-1FCAEoggI46AdIM1gEaKQCiAEBmAExuAEHyAEP2AEB6AEB-AECiAIBqAIDuAK95OyzBsACAdICJDEyYjJmNzkwLWRmZDMtNGM0OC05NTc1LTk4ZTJmNWZhMjc0NtgCBeACAQ&aid=304142&ucfs=1&arphpl=1&checkin=2024-07-2

In [17]:
# Turn the list of scraped hotel dicts into a table (one row per hotel,
# one column per dict key) and preview the first five rows.
results_link_df = pd.DataFrame(data=results_link)
results_link_df.head(n=5)

Unnamed: 0,hotel_name,hotel_address,latitude,longitude
0,Appart'City Confort Amiens Gare,"80, boulevard d'Alsace-Lorraine, 80000 Amiens,...",49.8943163,2.3097612
1,The Nest,"37 Boulevard Maignan Larivière, 80000 Amiens, ...",49.889307,2.294634
2,Nemea Appart Hotel Coliseum Amiens Centre,"25 Rue Frédéric Petit, 80000 Amiens, France",49.8921749,2.2889352
3,Odalys City Amiens Blamont,"25 rue du Blamont, 80000 Amiens, France",49.88714842,2.3116109
4,Le Saint Louis,"24 Rue Des Otages, 80000 Amiens, France",49.88977885,2.30249941


In [18]:
# results_link_df.to_csv('./bot_selenium/data/scraped_results_link.csv')

In [33]:
# Load the scraped hotel details into the data lake (S3 bucket).
# Relies on the `s3` resource created by the earlier upload cell.

# Target bucket for the upload
bucket = s3.Bucket("holiday-recommendation")

# Serialise the DataFrame to CSV text in memory, without the index column
csv_buffer = StringIO()
results_link_df.to_csv(csv_buffer, index=False)
csv_body = csv_buffer.getvalue()

# Upload the CSV text; the returned s3.Object is the cell's output
bucket.put_object(Key="results_link_df.csv", Body=csv_body)


s3.Object(bucket_name='holiday-recommendation', key='results_link_df.csv')