# Scrape Radarbox for more plane details

In [20]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

In [21]:
df = pd.read_csv("/Users/karinashedrofsky/LEDE_2023/flights-project/csvs/icarus_flights_cleaned.csv")

The first attempt at scraping from Radarbox resulted in a number of blank pages, meaning Radarbox doesn't have webpages or data on some of the ICAOs. I found that, in some cases, it's because some ICAOs are missing a '0' at the front. To maximize the number of pages pulled from Radarbox, I put a '0' in front of every ICAO with fewer than 6 characters. 

#### Add a 0 to the front of every icao < 6 characters

In [11]:
df.loc[df['icao'].str.len() < 6, 'icao'] = '0' + df['icao']

### Pull all unique ICAOs from the DataFrame
These will be used to scrape Radarbox profiles for the planes' tail numbers and make/model  
Radarbox urls are formatted ``https://www.radarbox.com/data/mode-s/{ICAO}``

In [12]:
unique_icaos = df['icao'].unique().tolist()
len(unique_icaos)

1063

### Save the html for each available RadarBox page

In [4]:
directory = "/Users/karinashedrofsky/LEDE_2023/flights-project/radarbox-htmls"
base_url = "https://www.radarbox.com/data/mode-s/"

In [26]:
for icao in unique_icaos:
    try:
        # Construct the URL for each ICAO code by appending it to the base URL
        url = base_url + icao

        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36"}
        )

        # Save HTML content to a separate file in the 'radarbox-htmls' directory
        file_name = icao + ".html"
        file_path = os.path.join(directory, file_name)
        
        if os.path.exists(file_path): #skip scraping webpages for icaos that have already been scraped
            continue

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(response.text)

        print("HTML content saved to:", file_path)

        # Add a pause of 2 seconds between each iteration
        time.sleep(2)

    except Exception as e:
        print(f"Error occurred for ICAO code: {icao}")
        print(f"Error message: {str(e)}")

### Check for all htmls that didn't scrape properly and delete them from the directory 
This is based on file size. All htmls without detailed plane information on RadarBox are < 252,000 bytes

In [23]:
failed_icaos = []

# Get a list of all files in the directory
file_list = os.listdir(directory)


for file_name in file_list:
    file_path = os.path.join(directory, file_name)
    
    
    # Check if the file is smaller than 252000 bytes
    if os.path.getsize(directory + '/' + file_name) < 252000:
        # Make a list of all icaos that don't have populated Radarbox page (append file name without the ".html" extension)
        file_name_without_extension = file_name[:-5] 
        failed_icaos.append(file_name_without_extension)

        try:
            # Delete the file from directory
            os.remove(file_path)
            print(f"Deleted file: {file_path}")
        except OSError as e:
            print(f"Error occurred while deleting {file_path}: {str(e)}")
            
#show the list of all of the icaos that don't have a populated Radarbox page
print(failed_icaos)

[]


### Scrape tail number and plane type from each planes Radarbox page

In [5]:
data_list = []

# Loop through each HTML file in the radarbox_htmls directory
for filename in os.listdir(directory):
    if filename.endswith(".html"):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            html = file.read()

        soup = BeautifulSoup(html)

        # Take the icao from the filename (without the ".html" extension)
        icao = filename.replace(".html", "")

        # Extract the tail number from the HTML content
        try:
            tail = soup.select("#secondary")[0].text
        except IndexError:
            tail = ""
        # Extract the plane model from the HTML content
        try:
            plane = soup.select("#aircraft-info .full-width #value")[0].text
        except IndexError:
            plane = ""

        # Append the extracted data to the data_list
        data_list.append([icao, tail, plane])

# Create a DataFrame from the data_list with appropriate column names
radarbox_details_df = pd.DataFrame(data_list, columns=["icao", "tail", "plane"])

In [6]:
radarbox_details_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052 entries, 0 to 1051
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   icao    1052 non-null   object
 1   tail    1052 non-null   object
 2   plane   1052 non-null   object
dtypes: object(3)
memory usage: 24.8+ KB


In [22]:
radarbox_details_df.to_csv("/Users/karinashedrofsky/LEDE_2023/flights-project/csvs/radarbox_details.csv", index=False)

### Identify the missing data
Some planes have populated RadarBox profiles but are still missing details on the tail number and plane type  
There are other websites other than Radarbox that contain this info, so they can be checked manually

In [23]:
blank = radarbox_details_df[radarbox_details_df['tail'] == '']
blank

Unnamed: 0,icao,tail,plane
193,1418d8,,-
229,14fa0a,,-
304,466b38,,-
312,152c2e,,-
418,142586,,-
436,4b850c,,-
507,140ac5,,-
508,1506a6,,-
514,032091,,-
577,1506aa,,-


### Read in manually checked data
_Since there were only 22, I manually checked all ICAOs without info in Radarbox and created a separate csv_  
The link to the data sources can be found in the csv file in the directory

In [12]:
manual_df = pd.read_csv("/Users/karinashedrofsky/LEDE_2023/flights-project/csvs/manual-plane-data.csv")
manual_df.drop('link', axis=1, inplace=True)
manual_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   icao    22 non-null     object
 1   tail    17 non-null     object
 2   plane   17 non-null     object
dtypes: object(3)
memory usage: 656.0+ bytes


#### Combine the scraped plane info with manually identified plane info

In [13]:
# Create a copy of radarbox_details_df to make a new DataFrame for the updates
updated_plane_df = radarbox_details_df.copy()

# Set 'icao' as the index for both DataFrames
updated_plane_df.set_index('icao', inplace=True)
manual_df.set_index('icao', inplace=True)

# Update the values in updated_plane_df with the values from manual_df where there's an icao match
updated_plane_df.update(manual_df)

# Reset the index to bring 'icao' back as a regular column
updated_plane_df.reset_index(inplace=True)

In [14]:
updated_plane_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052 entries, 0 to 1051
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   icao    1052 non-null   object
 1   tail    1052 non-null   object
 2   plane   1052 non-null   object
dtypes: object(3)
memory usage: 24.8+ KB


We're still left with a few icaos that couldn't be identified manually or by scraping Radarbox...

In [15]:
updated_plane_df[updated_plane_df["tail"] == '']

Unnamed: 0,icao,tail,plane
304,466b38,,-
514,032091,,-
682,600be9,,-
718,ea000f,,-
775,26002b,,-


In [24]:
#save as csv
updated_plane_df.to_csv("/Users/karinashedrofsky/LEDE_2023/flights-project/csvs/all_plane_info.csv", index = False)