# Scraping Overview

In this notebook, we will discuss our journey in web scraping. Although we developed numerous scrapers tailored for different websites, our experience with scraping about half of them led us to create a generalized scraper using Selenium. This versatile scraper enables us to efficiently gather data from various sites, streamlining our scraping process and enhancing productivity.

--- 

**Imports**

In [1]:
from selenium import webdriver
from time import sleep, time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from deep_translator import GoogleTranslator
from tqdm import tqdm
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from PIL import Image
import urllib.parse
import re
import shutil
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
from json import dumps

In [7]:
# Shared `data` directory located two levels above the current working
# directory. os.path.dirname is used instead of splitting on '/' so the
# result is also correct on platforms whose path separator is not '/'.
data_dir = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'data')

In [8]:
def initialize_driver(website):
    """Create a Firefox WebDriver configured to save CSV downloads silently.

    Downloads are directed to the `website` sub-folder of `data_dir`, and
    files served as ``text/csv`` are saved without a confirmation prompt.
    """
    download_dir = os.path.join(data_dir, website)
    firefox_prefs = (
        ("browser.download.folderList", 2),  # Use custom download directory
        ("browser.download.dir", download_dir),
        ("browser.helperApps.neverAsk.saveToDisk", "text/csv"),
    )

    firefox_options = webdriver.FirefoxOptions()
    for pref_name, pref_value in firefox_prefs:
        firefox_options.set_preference(pref_name, pref_value)

    return webdriver.Firefox(options=firefox_options)

In [None]:
def translate(text, source):
    """Translate `text` from the `source` language into English.

    The text is returned unchanged when `source` is already ``'en'`` or when
    the translation fails for any reason (best effort). On success, commas
    and newlines are removed from the translated string.
    """
    if source == 'en':
        return text
    try:
        translated = GoogleTranslator(source=source, target='en').translate(text=text)
        return translated.replace(',', '').replace('\n', '')
    except Exception:
        # Best effort: fall back to the untranslated text. `Exception` is
        # caught instead of using a bare `except` so that KeyboardInterrupt
        # and SystemExit can still stop a long-running batch.
        return text

def create_directories(base_url, categories):
    """Create the output folders and empty link files for one website.

    Layout created (``<site>`` is the network location of `base_url`):

    * ``<data_dir>/<site>/<category>/`` for every category
    * ``../scrapping/links/<site>/<category>.txt`` for every category

    Args:
        base_url: URL of the website; only its network location is used.
        categories: iterable of ``(category_name, url)`` pairs; only the
            name is used here.

    Returns:
        The path ``<data_dir>/<site>``.

    Note:
        Each link file is opened in ``'w'`` mode, so an existing file with
        the same name is emptied.
    """
    site = urlparse(base_url).netloc
    base_dir = os.path.join(data_dir, site)
    base_links_dir = os.path.join('../scrapping/links', site)

    os.makedirs(base_links_dir, exist_ok=True)
    os.makedirs(base_dir, exist_ok=True)

    for category, _ in categories:
        os.makedirs(os.path.join(base_dir, category), exist_ok=True)
        # The file name belongs inside os.path.join; passing it as open()'s
        # second argument would make it the file mode.
        with open(os.path.join(base_links_dir, f'{category}.txt'), 'w') as f:
            f.write('')

    return base_dir

Utility functions to capture image data when a user clicks on the images they want to select in each article.

In [None]:
def load_urls(website, category):
    """Return the article links stored for `category` of `website`.

    Reads ``../scrapping/links/<website>/<category>.txt`` and returns one
    entry per non-blank line, with surrounding whitespace removed. Blank
    lines (including the one produced by a trailing newline) are skipped so
    that callers never receive an empty URL.
    """
    with open(f'../scrapping/links/{website}/{category}.txt', 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [url for url in stripped_lines if url]

def get_java_script():
    """Return the full contents of ``scripts/script.js`` as a string."""
    script_path = 'scripts/script.js'
    with open(script_path, 'r') as script_file:
        script_source = script_file.read()
    return script_source

def article_scrapper(website, category, driver):
    """Open every saved article link so the user can select images.

    For each link stored for `category`, the page is loaded in `driver`, the
    script returned by ``get_java_script()`` is executed in it, and the
    function blocks on ``input()`` until the user presses Enter.

    After the last article, the script is executed once more with every
    occurrence of ``file_name`` replaced by `category`, followed by a call
    to the script's ``endSelection()`` function.
    # NOTE(review): endSelection() presumably triggers the CSV download of
    # the collected selections — confirm against scripts/script.js.

    Args:
        website: site folder name under the links directory.
        category: category whose links file is read.
        driver: Selenium WebDriver used to load the pages.
    """
    # Skip blank entries (e.g. from a trailing newline in the links file);
    # passing an empty URL to driver.get() raises.
    article_links = [
        link.strip() for link in load_urls(website, category) if link.strip()
    ]
    js = get_java_script()

    for article_no, link in enumerate(article_links, start=1):
        driver.get(link)
        driver.execute_script(js)
        input(f'You may begin selecting images for article {article_no}')

    driver.execute_script(f'{js.replace("file_name", category)}\n\nendSelection()')

def format_df(file_path):
    """Load the selection CSV and number its articles and images.

    The CSV must contain the columns ``article_url``, ``article_heading``,
    ``img_src`` and ``altText``. Rows that repeat the same
    ``(article_heading, img_src)`` pair are dropped.

    Returns:
        A DataFrame with the columns ``article_number``, ``image number``,
        ``img_src``, ``alt``, ``article_heading`` and ``article_url``.
        Articles are numbered from 1 in order of first appearance; images
        are numbered from 1 within each article.
    """
    df = pd.read_csv(file_path, quotechar='"')
    df = df.drop_duplicates(subset=['article_heading', 'img_src']).reset_index(drop=True)

    article_mapping = {
        url: number
        for number, url in enumerate(df['article_url'].unique(), start=1)
    }
    df['article_number'] = df['article_url'].map(article_mapping)
    df['image_number'] = df.groupby('article_number').cumcount() + 1

    # Select using the names that exist at this point; 'image_number' and
    # 'altText' are only given their public names by the rename below.
    selected = [
        'article_number',
        'image_number',
        'img_src',
        'altText',
        'article_heading',
        'article_url',
    ]
    return df[selected].rename(columns={'altText': 'alt', 'image_number': 'image number'})


Downloading images

In [None]:
def download_image(img_url, save_dir, img_name):
    """Download one image and save it if it is at least 100x100 pixels.

    This is a best-effort helper: any failure (network error, HTTP error
    status, undecodable image, write error) is logged as a warning and the
    image is skipped; nothing is raised to the caller.

    Args:
        img_url: URL of the image. Inline ``data:`` URIs and non-string
            values are skipped.
        save_dir: directory the file is written to.
        img_name: file name inside `save_dir`; converted with ``str()`` so
            numeric names are accepted.

    Returns:
        None.
    """
    import logging

    if not isinstance(img_url, str) or img_url.startswith('data:'):
        return

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # A timeout keeps one stalled server from blocking a worker forever.
        response = requests.get(img_url, headers=headers, timeout=30)
        response.raise_for_status()
        img_data = response.content
        width, height = Image.open(BytesIO(img_data)).size

        # Only save images of at least 100x100 pixels
        if width >= 100 and height >= 100:
            # os.path.join rejects non-string components, so coerce the name.
            with open(os.path.join(save_dir, str(img_name)), 'wb') as img_file:
                img_file.write(img_data)
    except Exception as exc:
        # Worker boundary: report the failure instead of hiding it, but do
        # not let a single bad image abort the whole batch.
        logging.getLogger(__name__).warning('Skipping image %s: %s', img_url, exc)

def download_images(df, save_dir):
    """Download every image listed in `df` into `save_dir` in parallel.

    Args:
        df: DataFrame with an ``img_src`` column (image URL) and an
            ``image number`` column (used as the file name).
        save_dir: directory the images are written to.

    Blocks until every download task has finished.

    # NOTE(review): when `df` comes from format_df, 'image number' restarts
    # at 1 for every article, so images of different articles map to the
    # same file name inside `save_dir` — confirm the intended naming.
    """
    # Parallelise the downloads with a pool of worker threads.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            # File names must be strings; the column holds numbers.
            executor.submit(download_image, img_url, save_dir, str(img_name))
            for img_url, img_name in zip(df['img_src'], df['image number'])
        ]

        # Wait for every task and surface any exception a worker raised.
        for future in as_completed(futures):
            future.result()

### Example Usage of the Scraper

In [None]:
# (category name, category URL) pairs. Only the category name is used by the
# cells below; the URL records where the article links are taken from.
categories = [
    # NOTE(review): this URL is on a different site than base_url — confirm
    # the intended Health page.
    ('Health', 'https://muzyka.interia.pl/pop,nPack,2'),
    ('Tech', 'https://tech.163.com/'),
    ('Military', 'https://war.163.com/'),
    ('International Football', 'https://sports.163.com/world'),
    ('World News', 'https://news.163.com/world'),
]

base_url = 'https://www.163.com/'
base_dir = create_directories(base_url, categories)
# Source language passed to GoogleTranslator. Simplified Chinese is 'zh-CN';
# 'cn' is not an accepted code, so translation would fail and the text would
# be left untranslated.
base_lang = 'zh-CN'

**Using the Scraper**

After running the cell above, an empty text file is created at `links/{website}/{category}.txt` for each category. This structure keeps the links organized per category. Add the latest 10 article links for each category to the appropriate file, one link per line, and then run the next cell to scrape the data.

In [None]:
# The site name is the same for every category, so compute it once.
website = urlparse(base_url).netloc

for category, _ in tqdm(categories, desc='Downloading images for every category'):
    category_dir = os.path.join(base_dir, category)

    driver = initialize_driver(website)
    try:
        # Interactive step: the user selects images in the browser.
        article_scrapper(website, category, driver)

        # Give the browser time to finish writing the downloaded CSV.
        sleep(5)

        df = format_df(f'{data_dir}/{website}/{category}.csv')
        download_images(df, category_dir)

        df.drop(columns=['img_src'], inplace=True)
        # translate() takes its language argument as `source`.
        df['article_heading'] = df['article_heading'].apply(translate, source=base_lang)
        df['alt'] = df['alt'].apply(translate, source=base_lang)
        df.to_csv(f'{category_dir}/image_data.csv', index=False)
    finally:
        # Always close the browser, even when a step above fails.
        driver.quit()

Remove the temporary CSV files that were created to keep track of image sources during scraping.

In [None]:
# Delete the per-category CSV stored directly under base_dir.
for category, _ in categories:
    try:
        os.remove(os.path.join(base_dir, f'{category}.csv'))
    except FileNotFoundError:
        # Already removed or never created: keep cleaning the remaining
        # categories instead of stopping at the first missing file.
        pass