<a href="https://colab.research.google.com/github/jmillanm/ColabScripts/blob/master/UrlExtractorImagesAttributes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash
pip install cairosvg > /dev/null

In [None]:
import csv
import re
import time
import urllib.parse
import urllib.request
from io import BytesIO
from typing import Dict, List, Optional, Text, Tuple, Union

import cairosvg
import pandas as pd
import requests
from bs4 import BeautifulSoup, element
from PIL import Image

In [None]:
def _get_urls_from_csv(path: Text) -> List[List[Text]]:
  """Read every row of the CSV file at ``path``.

  Rows are returned exactly as ``csv.reader`` produces them: each row is a
  list of its string fields, so for a file with one URL per line the URL is
  the first element of the row.

  Args:
    path: Location of the CSV file to read.

  Returns:
    A list with one list of fields per row of the file.
  """
  # newline='' leaves line-ending handling to the csv module, as its
  # documentation asks; otherwise newlines embedded in quoted fields are
  # rewritten by the text layer before the parser sees them.
  with open(path, 'r', newline='') as csvf:
    return list(csv.reader(csvf))

In [None]:
def _base_url(url: Text,
              with_path: bool = False) -> Text:
    """Reduce ``url`` to its scheme and host, optionally keeping the directory.

    Params, query string and fragment are always removed.

    Args:
        url: Address to reduce.
        with_path: When true, keep the path up to (but excluding) its last
            segment; when false, drop the path entirely.

    Returns:
        The reduced URL as a string.
    """
    parts = urllib.parse.urlparse(url)

    # Everything before the final '/' of the path, or nothing at all.
    directory = parts.path.rpartition('/')[0] if with_path else ''

    stripped = parts._replace(path=directory, params='', query='', fragment='')
    return stripped.geturl()

In [None]:
def _request_html_from_url(
        url: Text,
        headers: Dict) -> Union["requests.models.Response", Text]:
    """Fetch ``url`` with a GET request.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.

    Returns:
        The ``requests`` response when the server answers with status 200.
        For any other status, the message string
        ``"request_html_from_url status_code <code>"`` is returned instead,
        so callers have to check the type of the result.
    """
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        return response

    return "request_html_from_url status_code %s" % response.status_code

In [None]:
def _find_img_tags(response: requests.models.Response,
                   parser: Text) -> element.ResultSet:
  """Parse the body of ``response`` and collect its ``<img>`` tags.

  Args:
    response: HTTP response whose ``text`` holds the markup to parse.
    parser: Name of the parser handed to BeautifulSoup, for example
      'html.parser', 'lxml', 'lxml-xml' or 'html5lib'.

  Returns:
    The result set with every ``<img>`` tag found in the document.
  """
  soup = BeautifulSoup(response.text, parser)
  return soup.find_all('img')

In [None]:
# Names of the values collected for every <img> tag.
columns = [
    'path',
    'class',
    'alt',
    'title',
    'html_tag',
    'file_extension',
]

In [None]:
def _value_extractor_from_attr(html_tag: "element.Tag",
                               attrs: List) -> List:
  """Extract the requested pieces of information from one HTML tag.

  Every value is converted with ``str``, so an attribute that is missing
  from the tag yields the text 'None'.

  Args:
    html_tag: Tag to inspect.
    attrs: Names of the wanted values. Recognised names are 'path' (the
      ``src`` attribute), 'class', 'alt', 'title', 'html_tag' (the whole
      tag rendered as text) and 'file_extension' (the text after the last
      dot of ``src``). Unrecognised names are skipped.

  Returns:
    The values of the recognised names, in the order given by ``attrs``,
    so the result can be paired positionally with ``attrs``.
  """
  src = str(html_tag.get('src'))

  attributes = {'path': src,
                'class': str(html_tag.get('class')),
                'alt': str(html_tag.get('alt')),
                'title': str(html_tag.get('title')),
                'html_tag': str(html_tag),
                'file_extension': src.split(".")[-1]}

  # Follow the order of ``attrs`` rather than the dict's own order: callers
  # zip the result with ``attrs`` and would otherwise mislabel the values.
  return [attributes[name] for name in attrs if name in attributes]

In [None]:
def _extract_data_images(script_tags: element.ResultSet, 
                         columns: List) -> pd.core.frame.DataFrame:
    """Build a table with one row per tag in ``script_tags``.

    Args:
        script_tags: Tags to describe.
        columns: Names of the values to extract from every tag; they are
            also the columns of the returned table.

    Returns:
        A DataFrame with the given columns and a fresh 0..n-1 index. It has
        no rows when ``script_tags`` is empty.
    """
    records = [
        dict(zip(columns, _value_extractor_from_attr(tag, columns)))
        for tag in script_tags
    ]

    # Built in a single step: DataFrame.append is not available in pandas 2.x.
    # Passing ``columns`` keeps the column layout even without any record.
    return pd.DataFrame(records, columns=columns)

In [None]:
def _image_size(url_img: Text) -> Tuple[int, int]:
  """Download the image at ``url_img`` and return the size Pillow reports.

  Tested on jpg, jpeg and png. SVG files need the dedicated SVG routine.

  Args:
    url_img: Address of the image.

  Returns:
    The ``size`` attribute of the opened image.
  """
  payload = requests.get(url_img).content

  # The context manager closes the image once its size has been read.
  with Image.open(BytesIO(payload)) as picture:
    return picture.size

In [None]:
 def _get_content_length(url_img: Text, 
                         headers: Dict) -> Optional[Text]:
   """Ask for the Content-Length of ``url_img`` with a HEAD request.

   Args:
     url_img: Address of the resource.
     headers: HTTP headers sent with the request.

   Returns:
     The value of the Content-Length response header as text, or None when
     the response carries no such header.
   """
   req = urllib.request.Request(url_img, method='HEAD', headers=headers)

   # Close the response as soon as the header has been read so the
   # connection is not left open.
   with urllib.request.urlopen(req) as response:
     return response.headers['Content-Length']

In [None]:
def _svg_size(url_img: Text) -> Tuple[int, int]:
  """Return the size of the SVG at ``url_img`` once rendered as PNG.

  Args:
    url_img: Address of the SVG document.

  Returns:
    The ``size`` attribute Pillow reports for the rendered PNG.
  """
  out = BytesIO()

  # cairosvg fetches the document from ``url`` on its own, so the file is
  # downloaded only once.
  cairosvg.svg2png(url=url_img, write_to=out)

  # Rewind the buffer so the PNG is read from its first byte.
  out.seek(0)
  with Image.open(out) as img:
    return img.size

In [None]:
def _get_img_size(url_img: Text, 
                  img_type: Text) -> Union[Tuple[int, int], int]:
  """Measure an image, falling back to a sentinel when that fails.

  Args:
    url_img: Address of the image.
    img_type: File extension of the image; 'svg' selects the SVG routine,
      anything else the raster one.

  Returns:
    The size returned by ``_svg_size`` or ``_image_size``. If measuring
    raises an exception, the integer 999999999999999 is returned instead,
    an outlier that makes failed rows easy to spot.
  """
  measure = _svg_size if img_type == 'svg' else _image_size

  try:
    return measure(url_img)
  except Exception:
    # Only ordinary errors are turned into the sentinel; KeyboardInterrupt
    # and SystemExit still propagate so a long run can be stopped.
    return 999999999999999

In [None]:
def image_scanner_from_url(url: Text, 
                           headers: Dict) -> pd.core.frame.DataFrame:
  """Download a page and describe every ``<img>`` tag found in it.

  Args:
    url: Address of the page to scan.
    headers: HTTP headers sent with the request.

  Returns:
    A DataFrame with one row per image and the columns listed in the
    module-level ``columns``.

  Raises:
    Exception: When the download helper reports a failure; the exception
      message is the text returned by that helper.
  """
  html = _request_html_from_url(url, headers)

  # The helper signals a failed download by returning a message string
  # instead of a response object.
  if isinstance(html, str):
    raise Exception(html)

  img_tags = _find_img_tags(html, 'html.parser')
  return _extract_data_images(img_tags, columns)

In [None]:
def image_scanner_add_url(url: Text, 
                          data_images: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
  """Add a ``url_img`` column holding the absolute address of every image.

  Each value of the ``path`` column is resolved against ``url`` the way a
  browser resolves a link found on that page: absolute addresses are kept,
  while root-relative ('/img/a.png'), protocol-relative ('//cdn/a.png') and
  page-relative ('a.png') ones are completed from ``url``.

  Args:
    url: Address of the page the images were found on.
    data_images: Table with a ``path`` column of strings. It is modified
      in place.

  Returns:
    The same ``data_images`` object, now with the ``url_img`` column.
  """
  # A plain list keeps this working for a table without rows, where a
  # row-wise ``apply`` would hand back a DataFrame instead of a column.
  data_images['url_img'] = [
      urllib.parse.urljoin(url, path) for path in data_images['path']
  ]

  return data_images

In [None]:
def image_scanner_add_size(data_images: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
  """Add a ``size`` column with the measured size of every image.

  Args:
    data_images: Table with ``url_img`` and ``file_extension`` columns. It
      is modified in place.

  Returns:
    The same ``data_images`` object, now with the ``size`` column.
  """
  def _size_of(row):
    # One measurement per row, chosen by the row's file extension.
    return _get_img_size(row.url_img, row.file_extension)

  sizes = data_images.apply(_size_of, axis=1)
  data_images['size'] = sizes

  return data_images

In [None]:
def image_scanner_add_content_length(data_images: pd.core.frame.DataFrame, 
                                     headers: Dict) -> pd.core.frame.DataFrame:
  """Add a ``content_length`` column with the reported size of every image.

  Args:
    data_images: Table with a ``url_img`` column. It is modified in place.
    headers: HTTP headers sent with each request.

  Returns:
    The same ``data_images`` object, now with a float ``content_length``
    column. A row is NaN when no Content-Length value was reported for it.
  """
  lengths = []
  for url_img in data_images['url_img']:
    raw = _get_content_length(url_img, headers)
    # A missing header comes back as None; record NaN instead of failing
    # on float(None).
    lengths.append(float('nan') if raw is None else float(raw))

  # An explicit float Series keeps the column numeric even without rows.
  data_images['content_length'] = pd.Series(
      lengths, index=data_images.index, dtype=float)

  return data_images

In [None]:
def parse_all_info_img(url: Text, 
                       headers: Dict) -> pd.core.frame.DataFrame:
  """Collect every available piece of information about a page's images.

  Args:
    url: Address of the page to scan.
    headers: HTTP headers sent with the requests.

  Returns:
    The table of images enriched with the ``url_img``, ``size`` and
    ``content_length`` columns, plus ``content_length_kB`` (the content
    length divided by 1000) and ``url`` (the scanned page).
  """
  table = image_scanner_from_url(url, headers)

  # Each step adds one column to the table.
  table = image_scanner_add_url(url, table)
  table = image_scanner_add_size(table)
  table = image_scanner_add_content_length(table, headers)

  table['content_length_kB'] = table.content_length / 1000
  table['url'] = url

  return table

In [None]:
def pipe_all_info_img(all_info: pd.core.frame.DataFrame, 
                      columns_to_select: List):
  """Keep only the requested columns of ``all_info``.

  Args:
    all_info: Table to select from.
    columns_to_select: Columns to keep, in the wanted order.

  Returns:
    The result of indexing ``all_info`` with ``columns_to_select``.
  """
  selection = all_info[columns_to_select]
  return selection

In [None]:
def make_extraction_process(url: Text, 
                            headers: Dict, columns_to_select: List) -> pd.core.frame.DataFrame:
    """Scan one page and return the selected columns about its images.

    Args:
        url: Address of the page to scan.
        headers: HTTP headers sent with the requests.
        columns_to_select: Columns to keep in the result.

    Returns:
        The image table of the page restricted to ``columns_to_select``.
    """
    return pipe_all_info_img(parse_all_info_img(url, headers),
                             columns_to_select)

In [None]:
if __name__ == "__main__":
  # you only have to change the csv_path and assign a name for the new file
  # csv with the list of urls without column title
  csv_path = 'path/to/your.csv'
  data_extration_name = 'results.csv'

  urls = _get_urls_from_csv(csv_path)
  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
  columns_to_select = ['url','path', 'alt', 'title', 'size', 'content_length_kB']

  # One table per page. Rows without any field (blank lines in the csv)
  # carry no url and are skipped.
  frames = [
      make_extraction_process(row[0], headers, columns_to_select)
      for row in urls
      if row
  ]

  # pd.concat refuses an empty list, so a csv without urls still produces
  # a results file that holds just the header.
  if frames:
    data = pd.concat(frames, ignore_index=True)
  else:
    data = pd.DataFrame(columns=columns_to_select)

  data.to_csv(data_extration_name)
  