In [3]:
from collections import defaultdict

# Calendar years whose posts are eligible for download (2017 through 2021).
VALID_YEARS = set(range(2017, 2022))

# Per-year tally of saved images. Running this cell rebinds the name to a
# fresh, empty counter; years that have not been touched read as 0.
images_per_year = defaultdict(int)

# Bare expression so the notebook echoes the counter's current contents.
images_per_year

defaultdict(int, {})

In [1]:
import os
import csv
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from urllib.parse import urlparse
import aiohttp
import asyncio
import time
import nest_asyncio
from collections import defaultdict
import pandas as pd
from PIL import Image
import io
# Patch asyncio so that asyncio.run() (used at the bottom of this cell) can be
# called even though the notebook kernel already has an event loop running.
nest_asyncio.apply()

# CSV export that lists the posts; it is read row by row in extract_image_urls.
csv_file_path = r"C:\Users\hmane\Desktop\CLEAN_CT\FB_IG_RACE_LGBTQ\fb_ig_race_lgbtq_01012016_06132024_sentiment_raceterm_lgbtqterm_with_gender.csv"

# Root output directory; images are written to <root>/<year>/batch_<n>/.
base_image_folder_path = r"C:\Users\hmane\Desktop\label_studio_images\images_yr2"

# Connection cap handed to aiohttp.TCPConnector for each download session.
MAX_THREADS = 10
# Default number of attempts made by navigate_to_url.
MAX_RETRIES = 3
# Number of candidates yielded (and downloaded together) per round; also the
# divisor used to pick the batch_<n> subfolder for a saved image.
BATCH_SIZE = 10
# Rows for a year are no longer collected once this many images are saved.
IMAGE_LIMIT_PER_YEAR = 30
# Post years that are eligible for download (2017 through 2021).
VALID_YEARS = set(range(2017, 2022))

# Running count of saved images keyed by post year; missing years read as 0.
images_per_year = defaultdict(int)

def navigate_to_url(driver, url, retries=MAX_RETRIES, retry_delay=5):
    """Load ``url`` in ``driver``, retrying when the load raises.

    Args:
        driver: Object exposing a ``get(url)`` method (the Selenium WebDriver
            created in this notebook).
        url: Address to load.
        retries: Maximum number of attempts. Defaults to ``MAX_RETRIES``.
        retry_delay: Seconds to sleep between failed attempts. Defaults to 5,
            the delay that was previously hard-coded.

    Returns:
        bool: ``True`` as soon as one ``driver.get`` call returns without
        raising; ``False`` if every attempt raised or ``retries`` is not
        positive.
    """
    for attempt in range(1, retries + 1):
        try:
            driver.get(url)
            return True  # Successfully navigated to the URL
        except Exception as e:
            if attempt == retries:
                # Last attempt: report the cause and give up without sleeping.
                print(f"Attempt {attempt} failed: {e}")
                print(f"Failed to navigate to {url} after {retries} attempts.")
                return False
            print(f"Attempt {attempt} failed: {e}. Retrying...")
        # Blocking sleep: this helper is synchronous, so the caller is paused
        # for the whole delay.
        time.sleep(retry_delay)

    # Only reached when retries <= 0 and the loop body never ran.
    return False

def is_valid_url(url):
    """Report whether ``url`` parses into both a scheme and a network location.

    Args:
        url: Candidate URL.

    Returns:
        bool: ``True`` when ``urlparse`` yields a non-empty ``scheme`` and a
        non-empty ``netloc``; ``False`` otherwise, including when ``urlparse``
        raises ``ValueError``.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        return False

    # Guard clauses instead of all([...]); the truth table is the same.
    if not parts.scheme:
        return False
    if not parts.netloc:
        return False
    return True

async def image_exists(url):
    """Check with an HTTP HEAD request whether ``url`` answers with status 200.

    Redirects are followed before the status is inspected. aiohttp's ``head``
    does not follow redirects by default, which would make every redirecting
    image URL look missing.

    Args:
        url: Address of the image to probe.

    Returns:
        bool: ``True`` when the final response status is 200; ``False`` for any
        other status or when the request raises (the error is printed).
    """
    # Explicit ClientTimeout instead of a bare number: 10 seconds in total.
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        try:
            async with session.head(url, allow_redirects=True) as response:
                return response.status == 200
        except Exception as e:
            print(f"Error checking existence of {url}: {e}")
            return False

async def download_image_async(row_number, url, file_name, year):
    """Download one image and store it under ``<base>/<year>/batch_<n>/``.

    The per-year limit and the batch subfolder are decided *after* the download
    has finished, immediately before the file is written. Several of these
    coroutines run together under ``asyncio.gather``; deciding before the
    awaited download would let them all see the same stale counter, overfill a
    batch folder and overshoot ``IMAGE_LIMIT_PER_YEAR``.

    Args:
        row_number: CSV row number, used only in log messages.
        url: Address of the image to fetch.
        file_name: File name to use inside the batch folder.
        year: Post year; selects the output folder and the counter to update.

    Returns:
        None. On success the file is written and ``images_per_year[year]`` is
        incremented; every failure is printed and otherwise ignored.
    """
    # Cheap early exit so no request is made for a year that is already full.
    if images_per_year[year] >= IMAGE_LIMIT_PER_YEAR:
        return

    timeout = aiohttp.ClientTimeout(total=30)
    connector = aiohttp.TCPConnector(limit=MAX_THREADS)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        try:
            async with session.get(url) as response:
                if response.status != 200:
                    print(f"Row {row_number}: Unexpected response status: {response.status}")
                    return
                file_content = await response.read()

            if not file_content or len(file_content) < 100:  # Check for very small or empty files
                print(f"Row {row_number}: Image content is invalid or too small.")
                return

            # Validate image content using PIL
            try:
                Image.open(io.BytesIO(file_content)).verify()
            except Exception as e:
                print(f"Row {row_number}: Image content is invalid: {e}")
                return

            # There is no await between this point and the counter increment,
            # so no other coroutine can run in between: the limit check, the
            # folder choice and the increment are consistent with each other.
            saved_so_far = images_per_year[year]
            if saved_so_far >= IMAGE_LIMIT_PER_YEAR:
                return  # Another download filled the year while this one was in flight

            batch_number = saved_so_far // BATCH_SIZE + 1
            batch_folder_path = os.path.join(base_image_folder_path, str(year), f'batch_{batch_number}')
            # Created only now, so failed downloads do not leave empty folders behind.
            os.makedirs(batch_folder_path, exist_ok=True)

            file_path = os.path.join(batch_folder_path, file_name)
            with open(file_path, 'wb') as f:
                f.write(file_content)
            images_per_year[year] += 1  # Increment the count for the year
        except Exception as e:
            print(f"Row {row_number}: Error downloading image: {e}")

async def extract_image_urls(start_row=23480):
    """Scan the CSV for photo posts and yield chunks of images to download.

    For every row at or after ``start_row`` whose ``Type`` is ``'Photo'``, whose
    ``Post Created Date`` parses to a year in ``VALID_YEARS`` and whose year has
    not reached ``IMAGE_LIMIT_PER_YEAR``, the row's ``Link`` is opened in the
    module-level Selenium ``driver``. The ``src`` of the first ``<img>`` element
    is then read and probed with ``image_exists``.

    Args:
        start_row: 1-based data-row number to start from; earlier rows are
            skipped. Defaults to 23480, the value previously hard-coded here.

    Yields:
        list: Up to ``BATCH_SIZE`` tuples of
        ``(row_number, img_url, file_name, post_year)``; the final list may be
        shorter.
    """
    # Imported here so the function does not rely on the cell's import list:
    # WebDriverWait.until raises TimeoutException (not NoSuchElementException)
    # when the element never appears.
    from selenium.common.exceptions import TimeoutException

    # newline='' is what the csv module asks for so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(csv_file_path, 'r', encoding='utf-8', newline='') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        chunk = []
        for row_number, row in enumerate(csv_reader, 1):
            if row_number < start_row:
                continue  # Skip rows before the starting point

            # Once every valid year is full, no later row can be accepted, so
            # stop reading instead of parsing the rest of the file.
            if all(images_per_year.get(y, 0) >= IMAGE_LIMIT_PER_YEAR for y in VALID_YEARS):
                break

            if row['Type'] != 'Photo':
                continue

            # errors='coerce' turns an unparseable date into NaT instead of
            # raising; getattr covers to_datetime returning None for a missing
            # field. Both cases yield a year that is not in VALID_YEARS.
            parsed_date = pd.to_datetime(row['Post Created Date'], errors='coerce')
            post_year = getattr(parsed_date, 'year', None)

            if post_year not in VALID_YEARS or images_per_year[post_year] >= IMAGE_LIMIT_PER_YEAR:
                continue  # Skip years not in the valid list or if limit reached

            image_url = row['Link']
            if not is_valid_url(image_url):
                print(f"Row {row_number}: Invalid URL: {image_url}")
                continue

            if not navigate_to_url(driver, image_url):
                continue

            # Last path segment of the post URL names the file. A trailing '/'
            # or a missing URL would otherwise give an empty name (".png"), so
            # fall back to the row number in that case.
            post_url = row['URL'] or ''
            post_id = post_url.rstrip('/').split('/')[-1] or f"row_{row_number}"

            if row_number % 1000 == 0:
                print(f"Row {row_number}: {post_id}")
            print(f"Attempting to access URL: {image_url}")

            try:
                # wait for the image element to be present
                wait = WebDriverWait(driver, 10)
                img_element = wait.until(EC.presence_of_element_located((By.XPATH, '//img')))
                img_url = img_element.get_attribute("src")

                if await image_exists(img_url):
                    file_name = post_id + ".png"

                    chunk.append((row_number, img_url, file_name, post_year))

                    if len(chunk) == BATCH_SIZE:
                        yield chunk
                        chunk = []
                else:
                    print(f"Row {row_number}: Image does not exist: {img_url}")

            except (TimeoutException, NoSuchElementException):
                print(f"Row {row_number}: No image found on page: {image_url}")
            except Exception as e:
                print(f"Row {row_number}: Unknown error downloading image: {image_url}\n{str(e)}")

        # Yield any remaining tasks in the chunk
        if chunk:
            yield chunk

# Headless Chrome instance shared by the scraping functions in this cell.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(
    service=Service(r"C:\Users\hmane\Downloads\chromedriver-win32\chromedriver-win32\chromedriver.exe"),
    options=options,
)

async def main():
    """Consume chunks from ``extract_image_urls`` and download each chunk concurrently.

    Each chunk item is a ``(row_number, img_url, file_name, year)`` tuple that is
    passed positionally to ``download_image_async``; the next chunk is not
    requested until every download of the current one has finished.
    """
    async for batch in extract_image_urls():
        pending_downloads = [download_image_async(*item) for item in batch]
        await asyncio.gather(*pending_downloads)

# Run the pipeline, then always shut the browser down.
try:
    asyncio.run(main())
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    # In a finally block so the headless Chrome process is also released when
    # something that is not an Exception subclass (e.g. KeyboardInterrupt from
    # interrupting the kernel) escapes the run above.
    try:
        driver.quit()
    except Exception as e:
        print(f"Error while quitting the driver: {e}")
