# Scraping Tests

## Anti-Scraping Capabilities

In [13]:
import requests
import time

# Base URL of the website to test
url = 'https://www.myfxbook.com/forex-market/currencies/EURUSD-historical-data'

# Function to make requests and print response status
def make_request(session, url, headers=None):
    try:
        response = session.get(url, headers=headers)
        print(f'Status Code: {response.status_code}, URL: {url}')
        return response
    except Exception as e:
        print(f'Error: {e}')

def is_captcha_page(content):
    """
    Check if the page content contains a CAPTCHA challenge.
    This function may need to be adjusted based on how the CAPTCHA is implemented on the webpage.
    """
    soup = BeautifulSoup(content, 'html.parser')
    # Look for common CAPTCHA elements (adjust as needed)
    captcha_elements = soup.find_all(string=['CAPTCHA', 'Please prove you are not a robot'])
    return bool(captcha_elements)

# Main function to perform the tests
def test_anti_scraping(url):
    with requests.Session() as session:
        
        print('Testing rate limiting by making rapid requests...')
        for _ in range(10):  # making 10 rapid requests
            make_request(session, url)
            time.sleep(1)  # sleep for 1 second between requests

        print('\nTesting User-Agent restriction by changing User-Agent...')
        # Try with a different user-agent
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        response = make_request(session, url, headers)
        print(f'is captcha: {is_captcha_page(response.content)}')

        # Add more tests as needed, like IP rotation, CAPTCHA detection, etc.

# Run the test
test_anti_scraping(url)


Testing rate limiting by making rapid requests...

Testing User-Agent restriction by changing User-Agent...
Status Code: 200, URL: https://www.myfxbook.com/forex-market/currencies/EURUSD-historical-data
is captcha: False


## Data From HTML

In [1]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage you want to scrape
url = 'https://www.myfxbook.com/forex-market/currencies/EURUSD-historical-data'

# Send a HTTP request to the specified URL and save the response from server
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content of the response using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Print the entire HTML source of the page
    print(soup.prettify())
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)

<!DOCTYPE html>
<html lang="en">
 <head>
  <link href="https://static.mfbcdn.net" rel="preconnect"/>
  <link href="https://securepubads.g.doubleclick.net" rel="preconnect"/>
  <link as="font" crossorigin="" href="https://static.mfbcdn.net/assets/layouts/layout3/css/fonts/Roboto/roboto-v20-latin-regular.woff" rel="preload" type="font/woff2"/>
  <link as="font" crossorigin="" href="https://static.mfbcdn.net/assets/layouts/layout3/css/fonts/Roboto/roboto-v20-latin-regular.woff2" rel="preload" type="font/woff2"/>
  <link as="font" crossorigin="" href="https://static.mfbcdn.net/assets/layouts/layout3/css/fonts/Roboto/roboto-v20-latin-500.woff2" rel="preload" type="font/woff2"/>
  <link as="font" crossorigin="" href="https://static.mfbcdn.net/assets/layouts/layout3/css/fonts/Roboto/roboto-v20-latin-700.woff2" rel="preload" type="font/woff2"/>
  <link as="font" crossorigin="" href="https://static.mfbcdn.net/assets/global/plugins/font-awesome/webfonts/fa-regular-400.woff2" rel="preload" type="

In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the webpage you want to scrape
url = 'https://www.myfxbook.com/forex-market/currencies/EURUSD-historical-data'

# Send a HTTP request to the specified URL
response = requests.get(url)

# Initialize Pandas dataframe
df = pd.DataFrame()

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table with the ID 'symbolMarket'
    table = soup.find('table', id='symbolMarket')

    if table:
        # Extracting table headers
        headers = ['Date', 'Open', 'High', 'Low', 'Close', 'Change (Pips)', 'Change (%)']

        # Extracting rows
        rows = []
        for row in table.find_all('tr')[1:]:  # Skipping the header row
            cols = row.find_all('td')
            cols = [ele.text.strip() for ele in cols]
            rows.append(cols)

        # Creating a DataFrame
        df = pd.DataFrame(rows, columns=headers)

        print(df.head())  # Print the first few rows of the DataFrame
    else:
        print("Table with ID 'symbolMarket' not found.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)


print(df)

                 Date     Open     High      Low    Close Change (Pips)  \
0  Mar 17, 2024 00:00  1.08882  1.08907  1.08812  1.08832          -5.0   
1  Mar 15, 2024 00:00  1.08826     1.09   1.0871   1.0889          +6.4   
2  Mar 14, 2024 00:00   1.0953  1.09549   1.0879  1.08826         -70.4   
3  Mar 13, 2024 00:00   1.0927  1.09637   1.0919  1.09527         +25.7   
4  Mar 12, 2024 00:00  1.09303  1.09432   1.0902  1.09267          -3.6   

  Change (%)  
0     -0.05%  
1     +0.06%  
2     -0.65%  
3     +0.23%  
4     -0.03%  
                  Date     Open     High      Low    Close Change (Pips)  \
0   Mar 17, 2024 00:00  1.08882  1.08907  1.08812  1.08832          -5.0   
1   Mar 15, 2024 00:00  1.08826     1.09   1.0871   1.0889          +6.4   
2   Mar 14, 2024 00:00   1.0953  1.09549   1.0879  1.08826         -70.4   
3   Mar 13, 2024 00:00   1.0927  1.09637   1.0919  1.09527         +25.7   
4   Mar 12, 2024 00:00  1.09303  1.09432   1.0902  1.09267          -3.6   
.. 

## Javascript Interaction

id="timeFramePicker"

To actually choose timescale, go into the <select id="timeScales"> tag
and input a selected="" class for the one you want to select

id="dateRangePicker"
All datepickers are in div with id="xdsoft_datepicker active"

e.g. month picker
<div class="xdsoft_select xdsoft_monthselect xdsoft_scroller_box" style="display: none;"><div style="margin-top: -48px;">

give class "xdsoft_option xdsoft_current" for the month you want

In [31]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
import pandas as pd
import time

# Initialize Selenium WebDriver
driver = webdriver.Chrome()

# URL of the webpage you want to scrape
url = 'https://www.myfxbook.com/forex-market/currencies/EURUSD-historical-data'

# Navigate to the webpage
driver.get(url)

# Wait for the page to load completely
time.sleep(3)
driver.execute_script("""
var optionToSelect = document.getElementById('timeScale240');
var optionToDeselect = document.getElementById('timeScale1440');
if (optionToSelect) {
    optionToSelect.setAttribute('selected', '');
}
if (optionToDeselect) {
    optionToDeselect.removeAttribute('selected');
}
""")
# Click DateButton
filter_button = driver.find_element(By.ID, 'historicalFilterBtn')


# Click the link with id "historicalFilterBtn"
filter_button = driver.find_element(By.ID, 'historicalFilterBtn')
filter_button.click()
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)

# Close the browser
driver.quit()



In [30]:
import datetime

# Create a list of datetime objects for every minute in a week
start_datetime = datetime.datetime(2023, 1, 1, 0, 0)  # Start of the week
end_datetime = start_datetime + datetime.timedelta(weeks=1)  # End of the week

current_datetime = start_datetime
datetime_list = []

while current_datetime < end_datetime:
    datetime_list.append(current_datetime)
    current_datetime += datetime.timedelta(minutes=1)

# Split the list into 1000-minute chunks
chunk_size = 1000
datetime_chunks = [datetime_list[i:i + chunk_size] for i in range(0, len(datetime_list), chunk_size)]

# Show the first chunk as an example
datetime_chunks[0][:5], len(datetime_chunks), datetime_chunks[-1][-5:]  # Show the first 5 minutes of the first chunk, total chunks, and last 5 minutes of the last chunk


([datetime.datetime(2023, 1, 1, 0, 0),
  datetime.datetime(2023, 1, 1, 0, 1),
  datetime.datetime(2023, 1, 1, 0, 2),
  datetime.datetime(2023, 1, 1, 0, 3),
  datetime.datetime(2023, 1, 1, 0, 4)],
 11,
 [datetime.datetime(2023, 1, 7, 23, 55),
  datetime.datetime(2023, 1, 7, 23, 56),
  datetime.datetime(2023, 1, 7, 23, 57),
  datetime.datetime(2023, 1, 7, 23, 58),
  datetime.datetime(2023, 1, 7, 23, 59)])