In [None]:
import pandas as pd
import httpx
from httpx import ConnectTimeout, ReadTimeout, ReadError, ConnectError
import time
import asyncio
from PIL import Image, UnidentifiedImageError
from io import BytesIO
import numpy as np


# Export data from google sheet document and convert to pandas DataFrame
def get_data_from_google_sheet(sheet_id: str, sheet_name: str) -> pd.DataFrame:
    """Download one worksheet of a Google Sheets document as a DataFrame.

    The worksheet is requested through the ``gviz/tq`` CSV export URL and
    parsed with ``pandas.read_csv``.

    Args:
        sheet_id: Identifier of the spreadsheet; inserted into the URL path.
        sheet_name: Name of the worksheet to export. It is percent-encoded
            before being placed in the query string, so pass the plain
            (unencoded) name.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` for the export URL.
    """
    # Local import keeps this block self-contained (no new file-level import).
    from urllib.parse import quote

    # Spaces, '&', '#', '+' ... in a worksheet name would otherwise corrupt
    # the query string or select the wrong parameter boundaries.
    encoded_sheet_name = quote(sheet_name, safe="")
    url = (
        f"https://docs.google.com/spreadsheets/d/{sheet_id}"
        f"/gviz/tq?tqx=out:csv&sheet={encoded_sheet_name}"
    )
    return pd.read_csv(url)


# Fetch one URL with a short-lived httpx async client, retrying once on connection/read errors
async def get_async_client(url: str):
    """Fetch ``url`` with a fresh ``httpx.AsyncClient`` and return the response.

    The first GET uses a 10 second timeout. If it fails with a connect/read
    timeout or connect/read error, the URL is printed and the request is
    repeated once on the same client with the timeout disabled.

    NOTE(review): the retry has no timeout, so a stalled server can block
    this coroutine indefinitely, and any exception raised by the retry
    propagates to the caller.
    """
    retryable_errors = (ConnectTimeout, ConnectError, ReadTimeout, ReadError)
    async with httpx.AsyncClient() as session:
        try:
            response = await session.get(url, timeout=10)
        except retryable_errors:
            # Log the failing URL, then try once more without a time limit.
            print(url)
            response = await session.get(url, timeout=None)
    return response
           
            
# Split the data into fixed-size packages (batches)
def division_into_packages(data: list, len_of_package: int = 60):
    """Split ``data`` into consecutive slices of ``len_of_package`` items.

    The final slice holds whatever remains and may be shorter. An empty
    ``data`` yields an empty list.
    """
    packages = []
    for start in range(0, len(data), len_of_package):
        stop = start + len_of_package
        packages.append(data[start:stop])
    return packages


# Check whether an image is corrupted; return its size if it can be read
def verify_image(image) -> str:
    """Return the pixel dimensions of a downloaded image as ``"WxH"``.

    Args:
        image: Response-like object whose ``content`` attribute holds the
            raw image bytes.

    Returns:
        ``"<width>x<height>"`` when Pillow can identify the image, otherwise
        the string ``"Corrupted Image"``.
    """
    try:
        # The context manager closes the image as soon as its size is read,
        # instead of leaving the handle to the garbage collector.
        with Image.open(BytesIO(image.content)) as picture:
            width, height = picture.size
    except UnidentifiedImageError:
        return "Corrupted Image"
    return f"{width}x{height}"
        

# Async function that downloads each package of images and collects their sizes
async def collect_image_size(package: list):
    """Download every URL and return the size string for each one.

    ``package`` is iterated as a sequence of batches, each batch being an
    iterable of URLs. Batches are processed one after another; the URLs
    inside a batch are fetched concurrently with ``asyncio.gather``. Each
    response is passed to ``verify_image``.

    Returns:
        A flat list of ``verify_image`` results, in the same order as the
        URLs appear across the batches.
    """
    sizes = []
    for batch in package:
        # One coroutine per URL; gather returns results in submission order.
        responses = await asyncio.gather(
            *(get_async_client(url) for url in batch)
        )
        sizes += [verify_image(response) for response in responses]
    return sizes


# Method to create excel file and write data
def create_and_write_data_to_excel(file_name: str, data: list, columns_to_convert: list):
    """Write column-wise data to ``<file_name>.xlsx`` and print ``Created``.

    Args:
        file_name: Output path without the ``.xlsx`` extension.
        data: One sequence of values per column.
        columns_to_convert: Column headers, one for each entry in ``data``.

    Raises:
        ValueError: If ``data`` and ``columns_to_convert`` differ in length.
    """
    # Explicit check rather than ``assert`` so the validation still runs
    # when Python is started with ``-O``.
    if len(data) != len(columns_to_convert):
        raise ValueError(
            f"Got {len(data)} data columns but "
            f"{len(columns_to_convert)} column names"
        )

    # column_stack places each input sequence side by side as one column.
    frame = pd.DataFrame(np.column_stack(data), columns=columns_to_convert)
    frame.to_excel(file_name + ".xlsx")
    print("Created")


# Entry point: read the image URLs from the Google Sheet, measure every image
# and export the URL/size pairs to an Excel file, reporting the elapsed time.
# NOTE: the bare `await` below is only accepted where top-level await is
# supported (e.g. an IPython/Jupyter cell); a plain `python` run of this file
# rejects it with a SyntaxError.
if __name__ == "__main__":
    # Start of the wall-clock measurement for the whole run.
    tm1 = time.perf_counter()
    df = get_data_from_google_sheet("1QX2IhFyYmGDFMvovw2WFz3wAT4piAZ_8hi5Lzp7LjV0", "Parser_ImageSize")
    # Rows without a URL are dropped before downloading.
    list_of_urls = list(df["image_url"].dropna())
    # Default package size (60) is used, so up to 60 URLs are fetched at once.
    split_data = division_into_packages(list_of_urls)
    image_size_lst = await collect_image_size(split_data)
    # Both lists are written as columns, so they must have the same length
    # (one size entry per URL).
    create_and_write_data_to_excel("test3", data=[list_of_urls, image_size_lst], columns_to_convert=["image_url", "Size"])
    tm2 = time.perf_counter()
    print(f'Total time elapsed: {tm2-tm1:0.2f} seconds')