In [1]:
# -*- coding: utf-8 -*-

import os
import urllib.request
import pandas as pd
import threading
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from concurrent.futures import ThreadPoolExecutor

# CSV file path
csv_file_path = r"C:\Users\hmane\Desktop\CLEAN_CT\FB_IG_RACE_LGBTQ\fb_ig_race_lgbtq_01012016_06132024_sentiment_raceterm_lgbtqterm_with_gender.csv"

# Load the CSV data with dtype handling to prevent DtypeWarning
dtype = {
    'Link': str,
    'URL': str,
    'Post Created Date': str,
    'Type': str,
    # Add other relevant columns here
}

df = pd.read_csv(csv_file_path, dtype=dtype, low_memory=False)

# Filter for the year 2017 and type 'Photo' with error handling for date conversion
def safe_date_conversion(date_str):
    try:
        return pd.to_datetime(date_str)
    except Exception:
        return pd.NaT

df['Post Created Date'] = df['Post Created Date'].apply(safe_date_conversion)
df = df[(df['Post Created Date'].dt.year == 2017) & (df['Type'] == 'Photo')]

# Drop rows with invalid dates
df = df.dropna(subset=['Post Created Date'])

In [2]:
from queue import Queue

print(f"Filtered dataframe contains {len(df)} rows")

lock = threading.Lock()

def download_image(URL, postID, idx, driver):
    try:
        print(f"Accessing URL: {URL}")
        driver.get(URL)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "img[data-visualcompletion*='media-vc-image']")))

        image_elem = ""
        try_count = 5
        while try_count > 0:
            try:
                image_elem = driver.find_element(By.CSS_SELECTOR, "img[data-visualcompletion*='media-vc-image']")
                print(f"Image element found for postID {postID}")
                break
            except Exception:
                try_count -= 1
                time.sleep(5)
                continue

        image_url = image_elem.get_attribute("src")
        print(f"Image URL: {image_url}")
        image_title = str(idx) + "-" + str(postID.split("/")[-1])
        image = {
            "url": image_url,
            "title": image_title
        }

        out_path = os.path.join("img3/" + image["title"] + ".png")
        urllib.request.urlretrieve(image["url"], out_path)
        print(f"Image saved at {out_path}")

    except Exception as error:
        lock.acquire()
        with open("problem1016.txt", 'a', encoding="utf-8") as f:
            f.write(str(URL) + "\t" + str(postID) + "\t" + str(idx) + "\n")
        lock.release()
        print(f"Exception: {error}")


Filtered dataframe contains 21260 rows


In [6]:
def worker(queue):
    options = Options()
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(executable_path=r"C:\Users\hmane\Downloads\chromedriver-win32\chromedriver-win32\chromedriver.exe", options=options)

    while not queue.empty():
        idx, URL, postID = queue.get()
        download_image(URL, postID, idx, driver)
        queue.task_done()

    driver.quit()

In [5]:
start = time.perf_counter()


In [7]:

URLS = df['Link'].tolist()


In [8]:
postIDs = df['URL'].tolist()

In [9]:
queue = Queue()


In [10]:
for idx, (url, post_id) in enumerate(zip(URLS, postIDs)):
    queue.put((idx, url, post_id))

In [1]:
import os
import csv
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from urllib.parse import urlparse
import aiohttp
import asyncio
import time
import random
import nest_asyncio
from collections import defaultdict
import pandas as pd
from PIL import Image
import io

nest_asyncio.apply()

csv_file_path = r"C:\Users\hmane\Desktop\CLEAN_CT\FB_IG_RACE_LGBTQ\fb_ig_race_lgbtq_01012016_06132024_sentiment_raceterm_lgbtqterm_with_gender.csv"
base_image_folder_path = r"C:\Users\hmane\Desktop\label_studio_images\images_yr4"

MAX_THREADS = 10
MAX_RETRIES = 3
IMAGE_LIMIT_PER_YEAR = 300
VALID_YEARS = {2017, 2018, 2019, 2020, 2021}

images_per_year = defaultdict(int)

def sanitize_filename(filename):
    return "".join(c if c.isalnum() or c in "._-" else "_" for c in filename)

def navigate_to_url(driver, url, retries=MAX_RETRIES):
    for i in range(retries):
        try:
            driver.get(url)
            return True
        except Exception as e:
            print(f"Attempt {i + 1} failed. Retrying... Error: {str(e)}")
            if i == retries - 1:
                print(f"Failed to navigate to {url} after {retries} attempts.")
                return False
        time.sleep(5)

def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

async def download_image_async(session, row_number, url, file_name, year):
    year_folder_path = os.path.join(base_image_folder_path, str(year))
    os.makedirs(year_folder_path, exist_ok=True)

    for attempt in range(MAX_RETRIES):
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    file_content = await response.read()
                    
                    if len(file_content) < 100:  # Arbitrary threshold to check for empty content
                        print(f"Row {row_number}: Downloaded content seems too small: {len(file_content)} bytes")
                        debug_path = os.path.join(year_folder_path, f"debug_{sanitize_filename(file_name)}")
                        with open(debug_path, 'wb') as debug_file:
                            debug_file.write(file_content)
                        return False
                    
                    try:
                        img = Image.open(io.BytesIO(file_content))
                        img.verify()
                    except Exception as e:
                        print(f"Row {row_number}: Image content is invalid: {e}")
                        debug_path = os.path.join(year_folder_path, f"invalid_{sanitize_filename(file_name)}")
                        with open(debug_path, 'wb') as debug_file:
                            debug_file.write(file_content)
                        return False

                    file_path = os.path.join(year_folder_path, sanitize_filename(file_name))
                    with open(file_path, 'wb') as f:
                        f.write(file_content)
                    images_per_year[year] += 1
                    return True
                else:
                    print(f"Row {row_number}: Unexpected response status: {response.status}")
                    return False
        except aiohttp.ClientError as e:
            print(f"Row {row_number}: Error downloading image (attempt {attempt + 1}): {e}")
            if attempt == MAX_RETRIES - 1:
                return False

async def extract_image_urls(session, driver):
    with open(csv_file_path, 'r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row_number, row in enumerate(csv_reader, 1):
            post_created_date = row['Post Created Date']
            post_year = pd.to_datetime(post_created_date).year

            if post_year not in VALID_YEARS or images_per_year[post_year] >= IMAGE_LIMIT_PER_YEAR:
                continue

            if row['Type'] == 'Photo':
                image_url = row['Link']
                if not is_valid_url(image_url):
                    print(f"Row {row_number}: Invalid URL: {image_url}")
                    continue

                if not navigate_to_url(driver, image_url):
                    continue

                post_url = row['URL']
                post_id = post_url.split("/")[-1]

                if row_number % 1000 == 0:
                    print(f"Row {row_number}: {post_id}")
                print(f"Attempting to access URL: {image_url}")

                try:
                    wait = WebDriverWait(driver, 20)
                    img_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "img[data-visualcompletion*='media-vc-image']")))
                    img_url = img_element.get_attribute("src")
                    print(f'Accessing img_url: {img_url}')

                    file_name = post_id + ".png"
                    yield (row_number, img_url, file_name, post_year)
                except NoSuchElementException:
                    print(f"Row {row_number}: No image found on page: {image_url}")
                except StaleElementReferenceException:
                    print(f"Row {row_number}: Stale element reference: {image_url}")
                except TimeoutException:
                    print(f"Row {row_number}: Timeout waiting for image element: {image_url}")
                except Exception as e:
                    print(f"Row {row_number}: Unknown error downloading image: {image_url}\n{str(e)}")

from selenium.webdriver.chrome.options import Options

chrome_options = Options()
# chrome_options.add_argument("--disable-search-engine-choice-screen")
# Remove headless to observe browser actions
chrome_options.add_argument('--headless=new')

driver = webdriver.Chrome(service=Service(r"C:\Users\hmane\Downloads\chromedriver-win64\chromedriver.exe"), options=chrome_options)

async def main():
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=MAX_THREADS)) as session:
        tasks = []
        async for row_number, img_url, file_name, year in extract_image_urls(session, driver):
            tasks.append(download_image_async(session, row_number, img_url, file_name, year))
            if len(tasks) >= MAX_THREADS:
                await asyncio.gather(*tasks)
                tasks = []
                # Introduce a random delay to avoid hitting rate limits
                await asyncio.sleep(random.uniform(1, 5))
        if tasks:
            await asyncio.gather(*tasks)

try:
    asyncio.run(main())
except Exception as e:
    print(f"An error occurred: {e}")

try:
    driver.quit()
except Exception as e:
    print(f"Error while quitting the driver: {e}")


Attempting to access URL: https://www.facebook.com/DivineDarkSkin/photos/a.176940432445956/847655705374422/?type=3
Row 452475: Timeout waiting for image element: https://www.facebook.com/DivineDarkSkin/photos/a.176940432445956/847655705374422/?type=3
Attempting to access URL: https://www.facebook.com/wearemitu/photos/a.1405639273069844/1695666590733776/?type=3
Row 452476: Timeout waiting for image element: https://www.facebook.com/wearemitu/photos/a.1405639273069844/1695666590733776/?type=3
Attempting to access URL: https://www.facebook.com/OneLegSleeper/photos/a.10150871717684071/10154245263794071/?type=3
Accessing img_url: https://scontent-iad3-1.xx.fbcdn.net/v/t1.18169-9/15822530_10154245263794071_798478118537106209_n.jpg?_nc_cat=108&ccb=1-7&_nc_sid=0327a3&_nc_ohc=Skex-dHuRmYQ7kNvgHljzpK&_nc_ht=scontent-iad3-1.xx&oh=00_AYDsqhM5-6Lr1IE_8EPWcWf6a2AAt5It0OHcUwMWxd4Hdw&oe=66CB7804
Attempting to access URL: https://www.facebook.com/didyouknowblog/photos/a.257558077593647/1632495783433196

KeyboardInterrupt: 

In [1]:
import os
import csv
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from urllib.parse import urlparse
import aiohttp
import asyncio
import time
import random
import nest_asyncio
from collections import defaultdict
import pandas as pd
from PIL import Image
import io

nest_asyncio.apply()

csv_file_path = r"C:\Users\hmane\Desktop\CLEAN_CT\FB_IG_RACE_LGBTQ\fb_ig_race_lgbtq_01012016_06132024_sentiment_raceterm_lgbtqterm_with_gender.csv"
base_image_folder_path = r"C:\Users\hmane\Desktop\label_studio_images\images_yr4"

MAX_THREADS = 5
MAX_RETRIES = 3
IMAGE_LIMIT_PER_YEAR = 15
VALID_YEARS = {2021}

START_DATE = pd.Timestamp('2021-06-15')
END_DATE = pd.Timestamp('2021-07-31')

images_per_year = defaultdict(int)

def sanitize_filename(filename):
    return "".join(c if c.isalnum() or c in "._-" else "_" for c in filename)

def navigate_to_url(driver, url, retries=MAX_RETRIES):
    for i in range(retries):
        try:
            driver.get(url)
            print(f'working on {url}')
            return True
        except WebDriverException as e:
            print(f"Attempt {i + 1} failed. Retrying... Error: {str(e)}")
            if i == retries - 1:
                print(f"Failed to navigate to {url} after {retries} attempts.")
                return False
        time.sleep(5)

def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

async def download_image_async(session, row_number, url, file_name, year):
    year_folder_path = os.path.join(base_image_folder_path, str(year))
    os.makedirs(year_folder_path, exist_ok=True)

    for attempt in range(MAX_RETRIES):
        try:
            async with session.get(url, timeout=30) as response:
                if response.status == 200:
                    file_content = await response.read()
                    
                    if len(file_content) < 100:  # Arbitrary threshold to check for empty content
                        print(f"Row {row_number}: Downloaded content seems too small: {len(file_content)} bytes")
                        return False
                    
                    try:
                        img = Image.open(io.BytesIO(file_content))
                        img.verify()
                        img = Image.open(io.BytesIO(file_content))  # Reopen image to check its properties
                        if img.format not in ["JPEG", "PNG"] or img.size == (1, 1):
                            print(f"Row {row_number}: Invalid image format or size")
                            return False
                    except Exception as e:
                        print(f"Row {row_number}: Image content is invalid: {e}")
                        return False

                    file_path = os.path.join(year_folder_path, sanitize_filename(file_name))
                    with open(file_path, 'wb') as f:
                        f.write(file_content)
                    images_per_year[year] += 1
                    return True
                else:
                    print(f"Row {row_number}: Unexpected response status: {response.status}")
                    return False
        except aiohttp.ClientError as e:
            print(f"Row {row_number}: Error downloading image (attempt {attempt + 1}): {e}")
            if attempt == MAX_RETRIES - 1:
                return False
        except asyncio.TimeoutError:
            print(f"Row {row_number}: Download timed out.")
            return False

async def extract_image_urls(session, driver):
    with open(csv_file_path, 'r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        for row_number, row in enumerate(csv_reader, 1):
            try:
                post_created_date = pd.to_datetime(row['Post Created Date'])
                post_year = post_created_date.year
            except Exception as e:
                print(f"Row {row_number}: Error parsing date '{row['Post Created Date']}': {e}")
                continue

            if not (START_DATE <= post_created_date <= END_DATE):
                continue

            if images_per_year[post_year] >= IMAGE_LIMIT_PER_YEAR:
                continue

            if row['Type'] == 'Photo':
                image_url = row['Link']
                if not is_valid_url(image_url):
                    continue

                if not navigate_to_url(driver, image_url):
                    continue

                post_url = row['URL']
                post_id = post_url.split("/")[-1]

                file_name = post_id + ".png"
                year_folder_path = os.path.join(base_image_folder_path, str(post_year))
                file_path = os.path.join(year_folder_path, sanitize_filename(file_name))

                if os.path.exists(file_path):
                    images_per_year[post_year] += 1
                    continue

                try:
                    wait = WebDriverWait(driver, 5)  # Increase timeout for WebDriver wait
                    img_element = wait.until(
                        EC.presence_of_element_located((By.XPATH, 
                            "//img[@data-visualcompletion='media-vc-image' or @referrerpolicy='origin-when-cross-origin' or @crossorigin='anonymous']"
                        )))
                    
                    time.sleep(5)
                    
                    img_url = img_element.get_attribute("src")
                    print(f'Accessing img_url: {img_url}')

                    yield (row_number, img_url, file_name, post_year)
                except NoSuchElementException:
                    print(f"Row {row_number}: No image found on page: {image_url}")
                except StaleElementReferenceException:
                    print(f"Row {row_number}: Stale element reference: {image_url}")
                except TimeoutException:
                    print(f"Row {row_number}: Timeout waiting for image element: {image_url}")
                except WebDriverException as e:
                    print(f"Row {row_number}: WebDriverException occurred: {str(e)}")
                except Exception as e:
                    print(f"Row {row_number}: Unknown error downloading image: {image_url}\n{str(e)}")


from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless=new')

async def main():
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=MAX_THREADS)) as session:
        tasks = []
        drivers = [webdriver.Chrome(service=Service(r"C:\Users\hmane\Downloads\chromedriver-win64\chromedriver.exe"), options=chrome_options) for _ in range(3)]
        for driver in drivers:
            async for row_number, img_url, file_name, year in extract_image_urls(session, driver):
                tasks.append(download_image_async(session, row_number, img_url, file_name, year))
                if len(tasks) >= MAX_THREADS:
                    await asyncio.gather(*tasks)
                    tasks = []
                    await asyncio.sleep(random.uniform(1, 5))
            if tasks:
                await asyncio.gather(*tasks)
            driver.quit()

try:
    asyncio.run(main())
except Exception as e:
    print(f"An error occurred: {e}")


working on https://www.facebook.com/HIASrefugees/photos/a.252823915267/10165664685470268/?type=3
working on https://www.facebook.com/the.buzz.nba/photos/a.433893786718070/4227069827400428/?type=3
Row 639509: Timeout waiting for image element: https://www.facebook.com/the.buzz.nba/photos/a.433893786718070/4227069827400428/?type=3
working on https://www.facebook.com/OldFriendsSeniorDogSanctuary/photos/a.272358712841350/4133769753366874/?type=3
Row 639510: Timeout waiting for image element: https://www.facebook.com/OldFriendsSeniorDogSanctuary/photos/a.272358712841350/4133769753366874/?type=3
working on https://www.facebook.com/OldFriendsSeniorDogSanctuary/photos/a.272358712841350/4134505039960012/?type=3
Row 639511: Timeout waiting for image element: https://www.facebook.com/OldFriendsSeniorDogSanctuary/photos/a.272358712841350/4134505039960012/?type=3
working on https://www.facebook.com/OldFriendsSeniorDogSanctuary/photos/a.272358712841350/4134111636666019/?type=3
Row 639512: Timeout wa