In [1]:
import csv
import time

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Path to your local ChromeDriver executable; adjust it for your machine.
driver_path = r"C:\chromedriver-win64\chromedriver.exe"   # alternative form: "/path/to/chromedriver"
# Address of the STORM order page that the browser session opens.
url = "https://storm.pps.eosdis.nasa.gov/storm/data/Service.jsp?serviceName=Order"
# E-mail address for the login form (presumably a placeholder - replace with your registered address).
email = 'test@gmail.com'

In [3]:
def login_to_website(driver, url, email):
    """
    Open the login page and submit the given email address.

    The function does not verify that the login succeeded: it only types the
    address into the input named ``userId`` and sends a newline to submit.

    :param driver: Selenium WebDriver instance.
    :param url: URL of the login page.
    :param email: Email to use for login.
    :raises Exception: whatever went wrong while locating or filling the
        input is printed and then re-raised unchanged.
    """
    driver.get(url)
    try:
        # Block for up to 10 seconds until the input exists in the DOM.
        user_id_locator = (By.NAME, "userId")
        wait = WebDriverWait(driver, 10)
        user_input = wait.until(EC.presence_of_element_located(user_id_locator))

        # First the address itself, then a newline that acts as the Enter key.
        for keystrokes in (email, "\n"):
            user_input.send_keys(keystrokes)

        print("Login attempted.")
    except Exception as exc:
        print("An error occurred during login:", exc)
        raise

In [4]:
def tick_checkbox(driver, checkbox_name):
    """
    Make sure the checkbox with the given ``name`` attribute is ticked.

    Waits up to 10 seconds for the element to be present in the DOM. If it is
    not selected yet, it is clicked through JavaScript; otherwise nothing is
    changed. Any failure is only reported with ``print`` and is NOT
    propagated, so callers cannot tell from the return value whether the
    checkbox ended up ticked.

    :param driver: Selenium WebDriver instance.
    :param checkbox_name: Value of the checkbox's ``name`` attribute.
    """
    try:
        locator = (By.NAME, checkbox_name)
        box = WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))

        if box.is_selected():
            print(f"Checkbox '{checkbox_name}' is already selected.")
            return

        # A JavaScript click is used instead of a native WebElement.click().
        driver.execute_script("arguments[0].click();", box)
        print(f"Checkbox '{checkbox_name}' has been ticked.")
    except Exception as problem:
        print(f"An error occurred while ticking checkbox '{checkbox_name}': {problem}")

In [5]:
def set_coordinates(driver, north, south, east, west, retries=3, time_to_sleep=1):
    """
    Sets the geographical coordinates (north, south, east, west) on the webpage.

    All four inputs (element ids ``north``, ``south``, ``east``, ``west``) are
    located afresh on every attempt, cleared and filled. After ``change`` and
    ``blur`` events have been dispatched on the west input, its value is read
    back; if it no longer matches, the whole sequence is repeated.

    :param driver: Selenium WebDriver instance.
    :param north: Value typed into the ``north`` input.
    :param south: Value typed into the ``south`` input.
    :param east: Value typed into the ``east`` input.
    :param west: Value typed into the ``west`` input; also the value that is
        verified. Compared as text, so numbers are accepted as well.
    :param retries: Maximum number of attempts.
    :param time_to_sleep: Seconds to pause after a stale-element error and
        once more after a successful attempt.
    :raises ValueError: if the west input never holds the requested value
        after ``retries`` attempts.
    :raises Exception: any other error is printed and re-raised unchanged.
    """
    # (label used in log messages, element id, value to type), in entry order.
    targets = (
        ("North", "north", north),
        ("South", "south", south),
        ("East", "east", east),
        ("West", "west", west),
    )
    try:
        for attempt in range(retries):
            try:
                # Locate all fields fresh in each attempt so that references
                # invalidated by a page re-render are not reused.
                located = [
                    (label, driver.find_element(By.ID, element_id), value)
                    for label, element_id, value in targets
                ]

                for label, field, value in located:
                    field.clear()
                    field.send_keys(value)
                    print(f"{label} coordinate set: {field.get_attribute('value')}")

                # Trigger change and blur events for west
                west_field = located[-1][1]
                driver.execute_script("arguments[0].dispatchEvent(new Event('change'));", west_field)
                driver.execute_script("arguments[0].dispatchEvent(new Event('blur'));", west_field)

                # Validate west field
                time.sleep(1)  # Allow scripts to process
                current_value = driver.find_element(By.ID, "west").get_attribute("value")  # Re-locate for validation
                # The input reports text, so compare against the textual form
                # of the requested value (a float argument would never match).
                if current_value == str(west):
                    print("West coordinate set and verified successfully!")
                    break
                print(f"West coordinate reset, retrying... (Attempt {attempt + 1}/{retries})")
            except StaleElementReferenceException:
                print("Stale element encountered, re-locating elements...")
                time.sleep(time_to_sleep)
        else:
            # The loop ran out of attempts without reaching ``break``.
            raise ValueError("Failed to set 'west' coordinate correctly after multiple attempts.")

        # Allow page scripts to process inputs
        time.sleep(time_to_sleep)
        print("All coordinates set successfully!")
    except Exception as e:
        print("Error while setting coordinates:", e)
        raise

In [6]:
def _fill_date_field(driver, element_id, value, label):
    """Clear the input with id ``element_id`` and type ``value``; ``label`` names it in log messages."""
    try:
        field = driver.find_element(By.ID, element_id)
        field.clear()
        field.send_keys(value)
        print(f"{label.capitalize()} set successfully!")
    except Exception as e:
        print(f"Failed to set {label}:", e)
        raise


def set_parameters(driver, time_to_sleep=1, *, product_level="2B",
                   start_date="20140308 00:00", end_date="20181231 23:59",
                   north="40.77", south="40.32", east="44.71", west="43.52"):
    """
    Fill in the order form: checkboxes, product filter, dates and bounding box.

    Steps, in order: tick ``makeGeoLocation`` and ``makeGeoSubset``, choose
    ``product_level`` in the product-type dropdown, click the checkbox image
    that appears afterwards, type the start/end dates and finally enter the
    coordinates through ``set_coordinates``.

    :param driver: Selenium WebDriver instance.
    :param time_to_sleep: Seconds to pause after ticking the first checkbox
        (the later pauses are fixed at 2 seconds).
    :param product_level: ``value`` attribute of the dropdown option to choose.
    :param start_date: Text typed into the ``startDate`` input.
    :param end_date: Text typed into the ``endDate`` input.
    :param north: North bound passed to ``set_coordinates``.
    :param south: South bound passed to ``set_coordinates``.
    :param east: East bound passed to ``set_coordinates``.
    :param west: West bound passed to ``set_coordinates``.
    :raises ValueError: if the dropdown has no option whose value equals
        ``product_level``.
    :raises Exception: any other failure is printed and re-raised unchanged.
    """
    try:
        # Tick the first checkbox (makeGeoLocation)
        tick_checkbox(driver, "makeGeoLocation")
        time.sleep(time_to_sleep)  # Wait for the new page to load

        # Tick the second checkbox (makeGeoSubset)
        tick_checkbox(driver, "makeGeoSubset")
        time.sleep(2)  # Wait for the new page to load

        # Select the requested option in the dropdown menu
        try:
            select_element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "select[title='Select value to narrow down product types displayed']"))
            )
            for option in select_element.find_elements(By.TAG_NAME, "option"):
                if option.get_attribute("value") == product_level:
                    option.click()
                    break
            else:
                # Without this branch a missing option would still be
                # reported as selected and the run would continue unfiltered.
                raise ValueError(f"No option with value '{product_level}' found in the product type dropdown.")
            print(f"Option '{product_level}' selected successfully!")
        except Exception as e:
            print(f"Failed to select '{product_level}' option:", e)
            raise

        time.sleep(2)  # Wait for the new page to load

        # Tick the newly appeared checkbox (rendered as an <img> with an onclick handler)
        try:
            new_checkbox = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "img[onclick*='changeState(true)']"))
            )
            driver.execute_script("arguments[0].click();", new_checkbox)
            print("New checkbox ticked successfully!")
        except Exception as e:
            print("Failed to tick new checkbox:", e)
            raise

        time.sleep(2)  # Wait for the new page to load

        # Input the start and end dates
        _fill_date_field(driver, "startDate", start_date, "start date")
        _fill_date_field(driver, "endDate", end_date, "end date")

        # Input the coordinates
        try:
            set_coordinates(driver, north=north, south=south, east=east, west=west)
        except Exception as e:
            print("Failed to set coordinates using set_coordinates:", e)
            raise

    except Exception as e:
        print("An error occurred while setting parameters:", e)
        raise

In [7]:
def navigate_pages(driver):
    """
    Advance the paginated table by one page.

    The pager cells are read, the active one gives the current page number,
    and the cell labelled with the following number is clicked. When no such
    cell is shown, the arrow cell is clicked instead to load the next range
    of page numbers.

    :param driver: Selenium WebDriver instance.
    :return: True after a pager cell or the arrow was clicked; False when no
        active page cell exists or any error occurred (e.g. no arrow found).
    """
    try:
        pager_cells = driver.find_elements(By.CSS_SELECTOR, "div.dhx_page_skyblue div")

        # The first cell whose class list marks it as active is the current page.
        active_cell = next(
            (cell for cell in pager_cells
             if "dhx_page_active_skyblue" in cell.get_attribute("class")),
            None,
        )
        if active_cell is None:
            print("No active page button found. Possibly an error.")
            return False

        current_page_number = int(active_cell.text.strip())
        print(f"Currently on page: {current_page_number}")
        wanted_number = current_page_number + 1

        # Look for a cell whose label is the next page number.
        for cell in pager_cells:
            try:
                candidate = int(cell.text.strip())
                if candidate != wanted_number:
                    continue
                print(f"Found next page button: {candidate}")
                driver.execute_script("arguments[0].click();", cell)
                print(f"Navigated to page {candidate}.")
                time.sleep(2)  # Allow time for the new page to load
                return True
            except ValueError:
                # Labels that are not numbers (such as arrows) are ignored.
                continue

        # Nothing numbered fits, so fall back to the arrow for the next range.
        print("No next numbered page found. Trying the arrow button for the next range.")
        next_range_arrow = driver.find_element(By.XPATH, "//div[contains(text(), '→')]")
        driver.execute_script("arguments[0].click();", next_range_arrow)
        print("Arrow button clicked to load the next range of pages.")
        time.sleep(2)  # Allow time for the new range of pages to load
        return True

    except Exception as error:
        print(f"Reached the end of pages or encountered an error: {error}")
        return False


In [11]:
def tick_checkbox_in_row(row):
    """
    Click the checkbox image inside one table row.

    The checkbox is the ``<img>`` whose ``onclick`` attribute contains
    ``changeState(true)``; it is clicked with a native WebElement click.

    :param row: WebElement representing the table row.
    :raises Exception: if the image cannot be found or clicked; the error is
        printed and then re-raised unchanged.
    """
    checkbox_selector = "img[onclick*='changeState(true)']"
    try:
        row.find_element(By.CSS_SELECTOR, checkbox_selector).click()
        print("Checkbox ticked successfully!")
    except Exception as failure:
        print(f"Failed to tick checkbox in row: {failure}")
        raise


In [12]:
def collect_data(driver, output_file):
    """
    Walk through every table page, store the date pairs and tick each stored row.

    For each row with at least six cells, the text of cells 4 and 5
    (zero-based) is taken as start and end date. Rows where either is empty
    are skipped; all others are appended to the CSV file immediately and
    their checkbox is ticked. Paging continues until ``navigate_pages``
    reports that it could not advance.

    :param driver: Selenium WebDriver instance.
    :param output_file: Path to the output CSV file (overwritten).
    :raises Exception: any error is printed and re-raised unchanged.
    """
    row_selector = "tr.ev_dhx_skyblue, tr.odd_dhx_skyblue"  # even and odd rows
    try:
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Start Date", "End Date"])

            has_next_page = True
            while has_next_page:
                rows = driver.find_elements(By.CSS_SELECTOR, row_selector)
                print(f"Found {len(rows)} rows.")

                for row in rows:
                    cells = row.find_elements(By.TAG_NAME, "td")
                    print(f"Row content: {[cell.text for cell in cells]}")  # Debug row content

                    # Rows with fewer than six cells carry no date columns.
                    if len(cells) < 6:
                        continue

                    start_date = cells[4].text.strip()  # cell index 4: Start Date
                    end_date = cells[5].text.strip()    # cell index 5: End Date

                    if not (start_date and end_date):
                        print("Skipping empty row.")
                        continue

                    # Written row by row so partial results survive a later failure.
                    writer.writerow([start_date, end_date])
                    print(f"Row written: Start Date: {start_date}, End Date: {end_date}")

                    tick_checkbox_in_row(row)

                has_next_page = navigate_pages(driver)
                if not has_next_page:
                    print("No more pages available or error in navigation.")

        print(f"Data collection complete. Data saved to {output_file}")
    except Exception as error:
        print(f"An error occurred during data collection: {error}")
        raise


In [13]:
def collect_data_add_logs(driver, output_file, debug_output_file):
    """
    Collect datetime data from the table and save it incrementally to two CSV files.

    Every table row (even and odd) is logged to the debug file together with
    a remark: ``Valid Row``, ``Missing Dates`` or ``Insufficient Columns``.
    Only rows with at least six cells and non-empty start/end dates are
    written to the main file. Unlike ``collect_data``, this variant does not
    tick the row checkboxes.

    :param driver: Selenium WebDriver instance.
    :param output_file: Path to the output CSV file for valid rows.
    :param debug_output_file: Path to the debug CSV file for all rows, including problematic ones.
    :raises Exception: any error is printed and re-raised unchanged.
    """
    try:
        with open(output_file, mode='w', newline='', encoding='utf-8') as valid_file, \
             open(debug_output_file, mode='w', newline='', encoding='utf-8') as debug_file:

            valid_writer = csv.writer(valid_file)
            debug_writer = csv.writer(debug_file)

            # Write headers to both CSV files
            valid_writer.writerow(["Start Date", "End Date"])
            debug_writer.writerow(["Full Row Text", "Start Date", "End Date", "Remarks"])

            while True:
                # Get all rows from the current table page. Both row classes
                # are needed: selecting only the even rows drops every
                # odd row from the result.
                rows = driver.find_elements(By.CSS_SELECTOR, "tr.ev_dhx_skyblue, tr.odd_dhx_skyblue")
                print(f"Found {len(rows)} rows.")
                for row in rows:
                    # Extract cells from the row
                    cells = row.find_elements(By.TAG_NAME, "td")
                    full_row_text = [cell.text for cell in cells]  # Collect full row content for debug
                    print(f"Row content: {full_row_text}")

                    if len(cells) >= 6:  # Ensure there are enough cells in the row
                        start_date = cells[4].text.strip()  # cell index 4: Start Date
                        end_date = cells[5].text.strip()    # cell index 5: End Date

                        # Log the row to the debug file (the cell list is
                        # stored as its Python text representation).
                        debug_writer.writerow([full_row_text, start_date, end_date, "Valid Row" if start_date and end_date else "Missing Dates"])

                        # Skip rows with empty start or end dates
                        if not start_date or not end_date:
                            print("Skipping row with missing dates.")
                            continue

                        # Write valid rows to the main CSV file
                        valid_writer.writerow([start_date, end_date])
                        print(f"Valid row written: Start Date: {start_date}, End Date: {end_date}")
                    else:
                        # Log problematic rows with insufficient columns
                        debug_writer.writerow([full_row_text, "", "", "Insufficient Columns"])
                        print("Row skipped due to insufficient columns.")

                # Navigate to the next page
                if not navigate_pages(driver):
                    print("No more pages available or error in navigation.")
                    break

        print(f"Data collection complete. Valid data saved to {output_file}, debug data saved to {debug_output_file}")
    except Exception as e:
        print(f"An error occurred during data collection: {e}")
        raise

In [17]:
# Output locations for the collected table data.
output_path = 'D:/2024_Lightning_Analyzing/GPM/'
output_file =  output_path + 'data_from_GPM.csv'
debug_output_file =  output_path + "debug_data.csv"
time_to_sleep = 1   # Seconds to pause between steps so the page can load

if __name__ == "__main__":
    # Selenium 4 receives the driver location through a Service object; the
    # old `executable_path` keyword was removed in Selenium 4.10.
    driver = webdriver.Chrome(service=Service(driver_path))

    try:
        # Log in with the address from the configuration cell rather than a
        # personal address hard-coded here.
        login_to_website(driver, url, email)
        time.sleep(time_to_sleep)

        # Fill in the order form, then harvest the result table.
        set_parameters(driver, time_to_sleep)
        time.sleep(time_to_sleep)
        collect_data(driver, output_file)

        # Keeps the browser open for a minute after collection
        # (presumably to allow manual inspection - confirm before removing).
        time.sleep(60)

        # Alternative collector that also writes a debug CSV:
        # collect_data_add_logs(driver, output_file, debug_output_file)
    finally:
        # Close the browser session exactly once, even if a step failed.
        driver.quit()

Login attempted.
Checkbox 'makeGeoLocation' has been ticked.
Checkbox 'makeGeoSubset' has been ticked.
Option '2B' selected successfully!
New checkbox ticked successfully!
Start date set successfully!
End date set successfully!
North coordinate set: 40.77
South coordinate set: 40.32
East coordinate set: 44.71
West coordinate set: 43.52
West coordinate reset, retrying... (Attempt 1/3)
North coordinate set: 40.77
South coordinate set: 40.32
East coordinate set: 44.71
West coordinate set: 43.52
West coordinate reset, retrying... (Attempt 2/3)
North coordinate set: 40.77
South coordinate set: 40.32
East coordinate set: 44.71
West coordinate set: 43.52
West coordinate set and verified successfully!
All coordinates set successfully!
Found 18 rows.
Row content: ['', '2B', '2BCMB', 'MULTIPLE', '', '', 'ORBIT', 'GPM', 'DPR', 'Precipitation', 'hdf5', ' ', '', '10.5067/GPM/DPRGMI/CMB/2B/07']
Skipping empty row.
Row content: ['', '2B', '2BCMBT', 'MULTIPLE', '', '', 'ORBIT', 'TRMM', 'MULTIPLE', 'Pr