# **Forest Fire Prediction**
#### by Yevgenia Zalkind and Andrey Makarenko

### Part 1: Data acquisition
The Data Acquisition part is responsible for retrieving data from a specific source. In our case, it retrieves wildfire location data from the NIFC (National Interagency Fire Center) website using a crawling mechanism. <br>


Imports section:

In [1]:
# Note: when running in a clean environment, install any missing modules first
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import time 
from random import randint
import requests  
import csv
from collections import defaultdict
from selenium import webdriver 
from selenium.webdriver.common.keys import Keys 
from webdriver_manager.firefox import GeckoDriverManager
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By 
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC 

Global variables:

In [3]:
# URL of the dataset page we scrape: point locations for all wildland fires
# in the United States reported by the IRWIN system
URL = "https://data-nifc.opendata.arcgis.com/datasets/nifc::wildland-fire-incident-locations/explore?showTable=true"
# Name of the CSV file the scraped table is written to and later read from
CSV_NAME = "fire_history.csv"

Retrieve page content and save as CSV: <br>
<br>
*We retrieve the page content by passing our URL to the get_page_source_after_scroll function, which uses the scroll_down_element function to scroll down the web page with Selenium. The read_table_and_save_to_csv function then saves the content to a CSV file.*

In [None]:
def scroll_down_element(driver, element):
    """Scroll a lazily loaded element down until all of its rows are shown.

    The element is repeatedly scrolled to its current bottom. Once its scroll
    height stops growing, the text of the ``feature-table-count`` element is
    inspected to decide whether everything has been loaded. If not, the
    function waits for a ``loader`` element; when none appears in time it
    scrolls back to the top before trying again.

    Args:
        driver: Selenium WebDriver that controls the page.
        element: The scrollable WebElement to scroll.
    """
    # Scroll height of the element before the first scroll
    current_height = driver.execute_script("return arguments[0].scrollHeight", element)

    while True:
        # Scroll to the bottom of the element
        driver.execute_script("arguments[0].scrollTo(0, arguments[0].scrollHeight);", element)

        time.sleep(randint(1, 5))  # Wait for some time to let the page load new content

        # Get the new height after scrolling
        new_height = driver.execute_script("return arguments[0].scrollHeight", element)

        if new_height == current_height:  # The height did not change after scrolling
            # NOTE(review): assumes the 2nd and 4th whitespace-separated tokens
            # of the counter text are the loaded and total row counts - confirm
            table_info = driver.find_element(by=By.CLASS_NAME, value='feature-table-count').text.split()
            if table_info[1] == table_info[3]:
                print("found {} from {}".format(table_info[1], table_info[3]))
                break  # We reached the end

            # Not everything is loaded yet: check whether the page is still loading
            try:
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'loader')))
            except TimeoutException:
                # No loader showed up in time: scroll back to the top so the
                # next scroll-to-bottom can trigger loading again
                driver.execute_script("arguments[0].scrollTo(0, 0);", element)
                time.sleep(randint(1, 5))

        current_height = new_height

In [None]:
def get_page_source_after_scroll(url):
    """Open ``url`` in Chrome, scroll its table to the end and return the HTML.

    Args:
        url: Address of the page to load.

    Returns:
        The page source as reported by the WebDriver after scrolling (or
        without scrolling when the status check on ``url`` fails).
    """
    driver = webdriver.Chrome()  # Initialize the WebDriver
    try:
        driver.get(url)  # Load the webpage
        time.sleep(10)  # Give the page time to render before looking for elements
        if requests.get(url).status_code == 200:
            element = driver.find_element(by=By.CLASS_NAME, value='infinite-scroll-container')
            scroll_down_element(driver, element)
        else:
            print("Error - Failed to retrieve page source")
        # The page source must be read while the browser session is still
        # open; after quit() the session no longer exists.
        return driver.page_source
    finally:
        driver.quit()  # Always quit the WebDriver to clean up resources


In [None]:
def read_table_and_save_to_csv(page_source):
    """Extract the first HTML table from ``page_source`` and save it as CSV.

    The text of every ``<td>`` cell is collected row by row, and the result is
    written to the file named by ``CSV_NAME`` without the DataFrame index.

    Args:
        page_source: HTML of the page, as a string.

    Raises:
        ValueError: If the page source contains no ``<table>`` element.
    """
    # Create a BeautifulSoup object with the page source
    soup = BeautifulSoup(page_source, 'html.parser')
    table = soup.find('table')  # Find the table element on the page
    if table is None:
        raise ValueError("No <table> element found in the page source")

    # Extract the table data into a list of lists
    table_data = []
    for row in table.find_all('tr'):
        row_data = [cell.get_text(strip=True) for cell in row.find_all('td')]
        # Rows without any <td> cell (e.g. header rows) would only add an
        # empty line to the CSV, so they are skipped
        if row_data:
            table_data.append(row_data)

    df = pd.DataFrame(table_data)  # Create a DataFrame from the table data
    df.to_csv(CSV_NAME, index=False)  # Save the DataFrame to a CSV file


Implementation section:

In [None]:
# Load the page, scroll its table to the end and fetch the resulting HTML
page_content = get_page_source_after_scroll(URL)
# Parse the table out of the HTML and write it to CSV_NAME
read_table_and_save_to_csv(page_content)

Exploration section:<br>
<br>
*Let's see how our data is looking.*

In [4]:
# Load the saved CSV into a DataFrame
df = pd.read_csv(CSV_NAME)
# A bare expression on the last line of a cell displays the DataFrame
df

  df = pd.read_csv(CSV_NAME)


Unnamed: 0,X,Y,OBJECTID,SourceOID,ABCDMisc,ADSPermissionState,ContainmentDateTime,ControlDateTime,CreatedBySystem,IncidentSize,...,EstimatedFinalCost,OrganizationalAssessment,StrategicDecisionPublishDate,CreatedOnDateTime_dt,ModifiedOnDateTime_dt,IsCpxChild,CpxName,CpxID,SourceGlobalID,GlobalID
0,-118.180712,33.808985,1,7747595,,DEFAULT,,,lacocad,,...,,,,2020/02/28 20:52:36.363+00,2020/02/28 20:52:36.363+00,0,,,{6A311ABB-DF4F-4947-B8DD-3900BDA784F6},48d2c0e2-5e38-4d40-9d5e-066b076c7d98
1,-117.153901,33.176394,2,6384391,,DEFAULT,,,firecode,,...,,,,2019/07/01 20:10:12.737+00,2019/07/01 20:10:12.737+00,0,,,{1AF2C949-B159-4D8F-8D39-90CB58BC5DD5},17d2d66a-d451-4592-a172-7b2c860a2cc9
2,-121.104180,38.834727,3,1383752,,DEFAULT,,,firecode,,...,,,,2016/06/20 22:39:02.410+00,2016/06/20 22:39:02.410+00,0,,,{1B179EA1-97CE-4699-915B-374754BCBC5B},60c471ff-3c85-41b4-9135-e7338d7ec90b
3,-117.228592,33.782442,4,22499589,,DEFAULT,,,cfcad,,...,,,,2021/11/25 15:24:53.120+00,2021/11/25 15:24:53.120+00,0,,,{E61E387B-4ED7-4971-9604-C5D7391FAF77},149237ec-a42e-43d6-9318-22207a705dd9
4,-118.309032,33.941815,5,23869477,,DEFAULT,,,lacocad,,...,,,,2022/11/21 11:28:49.097+00,2022/11/21 11:28:49.097+00,0,,,{AEB6F7A3-A109-4132-9FEB-FB1EE1DF3193},ef7675e3-d5be-412a-a6c1-0d63fc7153c8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263817,-116.073913,43.243246,315324,28035649,,FIREREPORTING,2023/05/20 19:06:00+00,2023/05/20 19:06:00+00,wildcad,0.50,...,,,,2023/06/19 16:23:18.380+00,2023/06/19 16:26:42.300+00,0,,,{1553284C-4F2F-4D1E-8DFF-77F4593289FE},32553b11-17d3-405a-82ff-1cfbdfa6492e
263818,-116.026013,43.184536,315326,28035650,,FIREREPORTING,2023/05/20 19:33:00+00,2023/05/20 19:33:00+00,wildcad,0.50,...,,,,2023/06/19 16:28:15.883+00,2023/06/19 16:28:37.680+00,0,,,{2957B1E7-485A-4BE8-8914-2EEAD0823DF3},09c8f1ca-6e1f-4438-b929-55488216cb74
263819,-116.069113,43.241006,315327,28035652,,FIREREPORTING,2023/05/20 23:30:00+00,2023/05/20 23:30:00+00,wildcad,20.00,...,,,,2023/06/19 16:35:53.457+00,2023/06/19 16:36:22.327+00,0,,,{B0A31C1B-638A-4FBC-AF6F-E8BAC9DFCED5},72a22987-ba12-414c-9b8e-63eaa2587dd9
263820,-151.187739,60.447151,315328,28035653,,DEFAULT,2023/06/19 08:19:42+00,2023/06/19 08:37:23+00,ifm,0.10,...,,,,2023/06/19 16:40:37.620+00,2023/06/19 16:44:40.917+00,0,,,{F8490B1B-82F1-4851-8386-F121978FE268},197b872b-1932-46aa-a7e6-628097227187


In [7]:
print("=== Data Frame description ===")
# include='all' makes describe() report on every column, not only numeric ones
print(df.describe(include='all'))

=== Data Frame description ===
                    X              Y       OBJECTID     SourceOID ABCDMisc  \
count   263822.000000  263822.000000  263822.000000  2.638220e+05    13347   
unique            NaN            NaN            NaN           NaN      512   
top               NaN            NaN            NaN           NaN     EKV5   
freq              NaN            NaN            NaN           NaN      529   
mean      -108.972841      40.020878  148962.076176  1.169931e+07      NaN   
std         14.262137       6.339502   92327.781085  9.373173e+06      NaN   
min       -176.645294      13.380717       1.000000  5.009160e+05      NaN   
25%       -118.311680      34.637724   65977.250000  3.338407e+06      NaN   
50%       -112.035261      39.387641  145334.500000  7.606048e+06      NaN   
75%       -100.660843      44.760283  229138.750000  2.130120e+07      NaN   
max        144.755273      70.330801  315329.000000  2.803565e+07      NaN   

       ADSPermissionState     Co

In [8]:
print("=== Data Frame information ===")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output
df.info()

=== Data Frame information ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263822 entries, 0 to 263821
Data columns (total 96 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   X                                263822 non-null  float64
 1   Y                                263822 non-null  float64
 2   OBJECTID                         263822 non-null  int64  
 3   SourceOID                        263822 non-null  int64  
 4   ABCDMisc                         13347 non-null   object 
 5   ADSPermissionState               263822 non-null  object 
 6   ContainmentDateTime              158269 non-null  object 
 7   ControlDateTime                  141929 non-null  object 
 8   CreatedBySystem                  263822 non-null  object 
 9   IncidentSize                     186120 non-null  float64
 10  DiscoveryAcres                   197376 non-null  float64
 11  DispatchCenterID                 2