# Web Scraping with Selenium

---


Selenium is a powerful open-source tool for automating web browsers. Originally built for test automation, it is popular with developers and data scientists alike and is widely used for web scraping.

---


This notebook explores Selenium's web scraping capabilities through an example
in which we scrape news from the [bloomberght](https://www.bloomberght.com/) website to create a dataset. Once the news has been scraped, we pull in additional market data with the Yahoo Finance (`yfinance`) library and merge everything into a single dataset for analysis.



# 1. Import Libraries

Import necessary libraries.

In [None]:
!pip install selenium
!pip install yfinance
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
import os
from datetime import datetime
from dateutil import parser
import yfinance as yf

# Show the full text of every cell (no truncation of long titles/descriptions)
pd.set_option('display.max_colwidth', None)
# Show every row when a DataFrame is displayed; combine with .head() on large frames
pd.set_option('display.max_rows', None)

Collecting selenium
  Downloading selenium-4.23.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.0-py3-none-any.whl.metadata (8.8 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.23.1-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.0-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.7/475.7 kB[0m [31m11.

# 2. Necessary functions

In [None]:

def _first_text(element, class_name):
    """Return the stripped text of the first descendant with ``class_name``, or None if there is none."""
    # find_elements returns an empty list when nothing matches, whereas
    # find_element raises NoSuchElementException.
    matches = element.find_elements(By.CLASS_NAME, class_name)
    return matches[0].text.strip() if matches else None


def scrape_news(page_link, page_number, wait_seconds=12):
    """Scrape article listings from bloomberght.com into a DataFrame.

    Parameters
    ----------
    page_link : str
        Listing path on bloomberght.com, e.g. ``"tum-ekonomi-haberleri"``.
    page_number : int
        Number of listing pages to read, starting from page 1.
    wait_seconds : float, default 12
        Fixed pause after each page load so the listing can finish rendering.

    Returns
    -------
    pandas.DataFrame
        One row per ``<li>`` in the news list, with the columns ``Page_link``,
        ``Link``, ``Title``, ``Description`` and ``Date1`` (the date text as
        shown on the page). Fields missing from an item are ``None``; a missing
        title falls back to the item's full text.
    """
    # Driver and chrome options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=chrome_options)
    data = []

    try:
        for i in range(1, page_number + 1):
            # Listing pages are addressed as /<page_link>/<page index>
            url = f'https://www.bloomberght.com/{page_link}/{i}'
            driver.get(url)

            # Fixed pause; an explicit WebDriverWait on the list container
            # would be a more dynamic alternative.
            time.sleep(wait_seconds)

            # Every <li> inside the news list widget is one article
            new_elements = driver.find_elements(By.XPATH, "//div[@class='widget-news-list type1']//li")

            for element in new_elements:
                # Article link (None when the item has no anchor)
                anchors = element.find_elements(By.XPATH, './/a')
                href = anchors[0].get_attribute('href') if anchors else None

                # Article title, falling back to the whole item text
                text = _first_text(element, 'title')
                if text is None:
                    text = element.text.strip()

                # Article description and date text
                desc = _first_text(element, 'description')
                date = _first_text(element, 'date')

                data.append({"Page_link": page_link, "Link": href, "Title": text, "Description": desc, "Date1": date})
    finally:
        # Always release the browser process, even if a page fails to load or parse
        driver.quit()

    # Explicit columns keep the schema stable when nothing was scraped
    return pd.DataFrame(data, columns=["Page_link", "Link", "Title", "Description", "Date1"])

def add_new_data(file_path, data):
    """Append newly scraped articles to the CSV at ``file_path``.

    Articles are identified by their ``Link``: rows whose link is already
    stored in the file, and repeated links within ``data`` itself, are
    skipped. The link is used instead of the date text because two different
    articles can share a timestamp, and because the stored ``Date1`` values
    may have been reformatted since they were scraped.

    Parameters
    ----------
    file_path : str
        CSV file to update; it is created if it does not exist. An existing
        file is expected to have a ``Link`` column.
    data : pandas.DataFrame or None
        Scraped rows with at least a ``Link`` column.

    Returns
    -------
    None
    """
    # Nothing to do for a missing or empty batch
    if data is None or data.empty:
        print("No data to add.")
        return

    # Retrieve what has been stored so far
    if os.path.exists(file_path):
        existing_data = pd.read_csv(file_path)
    else:
        existing_data = pd.DataFrame(columns=["Page_link", "Link", "Title", "Description", "Date1"])

    # Keep only articles that are not stored yet (one row per link)
    new_data = data.drop_duplicates(subset='Link')
    new_data = new_data[~new_data['Link'].isin(existing_data['Link'])]

    # Merge with the existing dataset; empty frames are left out of the concat
    # so they cannot influence the resulting dtypes
    frames = [frame for frame in (existing_data, new_data) if not frame.empty]
    combined_data = pd.concat(frames, ignore_index=True)

    combined_data.to_csv(file_path, index=False)
    print("Data updated.")

# Turkish month names mapped to two-digit month numbers
months = {
    'Ocak': '01', 'Şubat': '02', 'Mart': '03', 'Nisan': '04', 'Mayıs': '05', 'Haziran': '06',
    'Temmuz': '07', 'Ağustos': '08', 'Eylül': '09', 'Ekim': '10', 'Kasım': '11', 'Aralık': '12'
}

def parse_turkish_date(date_str):
    """Convert a Turkish date string to ``'<year>-<month>-<day> <time>'``.

    The date is located through its month name, so an optional day-of-week
    word before or after the date and stray extra spaces are tolerated, e.g.
    ``'02 Ağustos 2024, 14:08'``, ``'Cuma 02 Ağustos 2024, 14:08'`` or
    ``'02 Ağustos 2024 Cuma'``.

    Parameters
    ----------
    date_str : str
        Date text, optionally followed by ``', '`` and a time.

    Returns
    -------
    str
        Year, month number and day joined with ``-``, followed by the time
        part, or ``00:00`` when the input has no time part.

    Raises
    ------
    ValueError
        If no known Turkish month name with a day before it and a year after
        it can be found.
    """
    # Split the date string into date and time parts
    parts = date_str.split(', ')

    # split() without an argument drops the empty tokens that repeated or
    # trailing spaces would otherwise produce
    date_parts = parts[0].split()

    # The month name anchors the date: the day precedes it, the year follows it
    month_idx = next((i for i, token in enumerate(date_parts) if token in months), None)
    if month_idx is None or month_idx == 0 or month_idx + 1 >= len(date_parts):
        raise ValueError(f'Unrecognized Turkish date: {date_str!r}')

    day = date_parts[month_idx - 1]
    month = months[date_parts[month_idx]]
    year = date_parts[month_idx + 1]

    # Rearrange the date components to Year-Month-Day format
    formatted_date = f'{year}-{month}-{day}'

    # Append the time when present, otherwise default to midnight
    if len(parts) > 1:
        time_part = parts[1].strip()
        return f'{formatted_date} {time_part}'
    return f'{formatted_date} 00:00'

def add_date_column(file_path, file_path_last):
    """Parse the ``Date1`` strings of a news CSV and add a day-level ``Date`` column.

    Reads ``file_path``, converts every ``Date1`` value with
    ``parse_turkish_date`` and then to a pandas datetime, derives ``Date``
    (the same timestamp truncated to the calendar day) and writes the result
    to ``file_path_last`` without the index.

    Returns whatever ``DataFrame.to_csv`` returns for the given target.
    """
    frame = pd.read_csv(file_path)

    # Turkish text -> 'Y-m-d H:M' strings -> datetimes
    iso_strings = frame['Date1'].apply(parse_turkish_date)
    frame['Date1'] = pd.to_datetime(iso_strings, format='%Y-%m-%d %H:%M')

    # Start from a clean 0..n-1 index
    frame = frame.reset_index(drop=True)

    # Day-level column: drop the time of day, keep a datetime dtype
    frame['Date'] = pd.to_datetime(frame['Date1'].dt.date)

    return frame.to_csv(file_path_last, index=False)

# 3. Scrape Data

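Scrape the first `page_number` pages of two bloomberght news listings: "[tum-piyasa-haberleri](https://www.bloomberght.com/tum-piyasa-haberleri)" (all market news) and "[tum-ekonomi-haberleri](https://www.bloomberght.com/tum-ekonomi-haberleri)" (all economy news).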

In [None]:
# Listing sections to scrape from bloomberght (one scrape_news call per entry)
page_link = ["tum-piyasa-haberleri", "tum-ekonomi-haberleri"]
# Number of listing pages read from each section
page_number = 5
# CSV file that accumulates the scraped articles; add_new_data creates it if it does not exist
news_path = "/content/news.csv"

# Scrape each section in turn and append its articles to the CSV
for link in page_link:
    data = scrape_news(link, page_number)
    add_new_data(news_path, data)



Data updated.
Data updated.


In [None]:
# Parse Date1 and add the Date column; input and output path are the same, so the scraped CSV is overwritten in place.
# NOTE(review): after this runs, Date1 in the file no longer holds the raw Turkish date text, so re-running this cell
# on the same file is not expected to work — confirm whether a separate output path is wanted.
add_date_column(news_path, news_path)

In [None]:
# Reload the processed news from disk; no date parsing is requested here, so Date1 and Date come back as strings
df = pd.read_csv(news_path)
# Column overview: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Page_link    99 non-null     object
 1   Link         99 non-null     object
 2   Title        99 non-null     object
 3   Description  99 non-null     object
 4   Date1        99 non-null     object
 5   Date         99 non-null     object
dtypes: object(6)
memory usage: 4.8+ KB


In [None]:
# Preview the first rows of the processed news table
df.head()

Unnamed: 0,Page_link,Link,Title,Description,Date1,Date
0,tum-piyasa-haberleri,https://www.bloomberght.com/tcmb-den-ilk-ek-tl-depo-alim-ihalesi-2357713,TCMB'den ilk ek TL depo alım ihalesi,Türkiye Cumhuriyet Merkez Bankasının (TCMB) TL depo alım ihalesinde teklif tutarı 21 milyar 64 milyon lira oldu.,2024-08-02 14:08:00,2024-08-02
1,tum-piyasa-haberleri,https://www.bloomberght.com/gram-altinda-yeni-rekor-2357679,Gram altında yeni rekor,"Altın fiyatları, Fed’in faiz indirimine gideceği yönündeki tahminlerin ve Orta Doğu’daki siyasi gerilimlerin artmasıyla haftalık 2,7’lik artışa yöneldi. Gram altın ise haftanın son işlem gününde tüm zamanların en yüksek seviyesini gördü.",2024-08-02 10:03:00,2024-08-02
2,tum-piyasa-haberleri,https://www.bloomberght.com/kuresel-piyasalarda-abd-tarim-disi-odagi-2357666,Küresel piyasalarda ABD tarım dışı odağı,Fed Başkanı Jerome Powell'ın işgücü risklerine işaret etmesinin ardından gözler ABD istihdam verilerinde olacak.,2024-08-02 09:12:00,2024-08-02
3,tum-piyasa-haberleri,https://www.bloomberght.com/piyasa-ozeti-2-agustos-2024-borsa-doviz-altin-ve-kripto-piyasalarindaki-son-durum-pkh-2357752,"Piyasa özeti: 2 Ağustos 2024 Borsa, Döviz, Altın ve Kripto piyasalarındaki son durum","Borsa İstanbul günü (2 Ağustos 2024 Cuma) düşüşle tamamladı. BIST 100 endeksi, yüzde -3.01 değer kaybederek 10.473.47 puanla günü kapattı. Dolar 33.23 TL’yi gördü. Kapalı Çarşı’da gram altın ise düne göre yüzde -0,94 düşüşle 2.578,51 TL oldu.",2024-08-02 18:30:00,2024-08-02
4,tum-piyasa-haberleri,https://www.bloomberght.com/gram-altinda-son-durum-ne-2-agustos-2024-cuma-altin-fiyatlari-pkh1-2357678,Gram altında son durum ne? 2 Ağustos 2024 Cuma altın fiyatları...,"Gram altın serbest piyasada bugün (2 Ağustos 2024 / saat: 10:00) 2.627,29 TL’den alınıp 2.627,76 TL’den satılıyor. Kapalıçarşı’da altın fiyatlarındaki en son durumu; gram altın ne kadar oldu, gram altın kaç TL, çeyre kaltın ne kadar? gibi merak ettiğiniz tüm detayları sayfamızdan güncel olarak takip edebilirsiniz.",2024-08-02 10:00:00,2024-08-02


# 4. Yahoo Finance Data

In [None]:
def _download_price_series(ticker, column_name, start, end):
    """Download daily prices for ``ticker`` and return a two-column frame: ``Date`` and ``column_name``.

    The adjusted close is used when yfinance provides it, otherwise the plain
    close. ``auto_adjust=False`` is passed explicitly so the result does not
    depend on the installed yfinance version's default.
    """
    prices = yf.download(ticker, start=start, end=end, auto_adjust=False)

    # Some yfinance releases return (field, ticker) MultiIndex columns even for
    # a single ticker; keep only the field level.
    if isinstance(prices.columns, pd.MultiIndex):
        prices.columns = prices.columns.get_level_values(0)

    source_column = "Adj Close" if "Adj Close" in prices.columns else "Close"
    return (
        prices[[source_column]]
        .rename(columns={source_column: column_name})
        .rename_axis(columns=None)
        .reset_index()  # turn the date index into a regular 'Date' column
    )


# Price window: from the earliest news date up to today
# (yfinance treats `end` as exclusive, so today's bar itself is not included)
today = datetime.today().strftime('%Y-%m-%d')
df['Date'] = pd.to_datetime(df['Date'])
min_date = df['Date'].dt.date.min()

# Get data of BIST100, gold and USD
bist100_data = _download_price_series("XU100.IS", "BIST100", min_date, today)
gold_data = _download_price_series("GC=F", "Gold", min_date, today)
usd_try_data = _download_price_series("TRY=X", "USD/TRY", min_date, today)

# Merge all data; outer joins keep every date on which any instrument has a quote
merged_data = pd.merge(bist100_data, gold_data, on="Date", how="outer")
merged_data = pd.merge(merged_data, usd_try_data, on="Date", how="outer")

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [None]:
# Right join on Date: every news row is kept, and the price columns stay empty (NaN)
# for dates that have no matching row in merged_data
final_data = pd.merge(merged_data, df, on="Date", how="right")
# Column overview of the merged dataset
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         99 non-null     datetime64[ns]
 1   BIST100      88 non-null     float64       
 2   Gold         92 non-null     float64       
 3   USD/TRY      88 non-null     float64       
 4   Page_link    99 non-null     object        
 5   Link         99 non-null     object        
 6   Title        99 non-null     object        
 7   Description  99 non-null     object        
 8   Date1        99 non-null     object        
dtypes: datetime64[ns](1), float64(3), object(5)
memory usage: 7.1+ KB


In [None]:
# Write the merged news + price dataset to disk without the index.
# NOTE(review): this reuses news_path, so the scraped news CSV is replaced by the merged dataset —
# confirm that overwriting is intended rather than writing to a separate file.
final_data.to_csv(news_path, index=False)

# 5. Bonus Part
To filter the scraped news by date, use the cell below.

In [None]:
# Choose the calendar day to inspect and keep only the news published on it
filtered_tarih = '2024-08-04'
target_day = pd.Timestamp(filtered_tarih).date()
filtered_news = df[df['Date'].dt.date.eq(target_day)]

# Report the matching titles and descriptions, or say that the day has none
if filtered_news.empty:
    print(f'No news of {filtered_tarih} date found.')
else:
    shown_columns = ["Title", "Description"]
    filtered_list = filtered_news[shown_columns].values.tolist()
    filtered_data = pd.DataFrame(filtered_list, columns=shown_columns)
    news_number = len(filtered_data)
    print(f'There are {news_number} news of {filtered_tarih} date.')
    print(filtered_data)

There are 4 news of 2024-08-04 date.
                                                         Title  \
0                Spot piyasada elektrik fiyatları (04.08.2024)   
1  Maliyeden kayıt dışı hasılatı belirlenen firmaya rekor ceza   
2                              Kotil'in yeni görevi belli oldu   
3                           Resmi Gazete'de bugün (04.08.2024)   

                                                                                                                                                                                                                                       Description  
0                                                                                                                 Spot piyasada bir megavatsaat elektriğin fiyatı, yarın için en yüksek 3 bin lira, en düşük 1749 lira 99 kuruş olarak belirlendi.  
1  Hazine ve Maliye Bakanlığı müfettişleri, Bakan Mehmet Şimşek'in "çok kazanandan çok vergi alınması" prensibi doğrultusunda denetimlerini sü