In [7]:
from PIL import Image
import requests

In [8]:
import cv2 as cv
import numpy as np

# Mean grayscale intensity that the background pixels must exceed for
# has_white_background() to return True.
THRESHOLD_INTENSITY = 230

def has_white_background(img, object_threshold=200):
    """Decide whether the area surrounding the main object of an image is white.

    The image is loaded as grayscale, every pixel at or below
    ``object_threshold`` is treated as part of an object, and the bounding
    rectangle of the largest such region is excluded. The mean intensity of
    the remaining (background) pixels is then compared with
    ``THRESHOLD_INTENSITY``.

    Args:
        img: Path of the image file to analyse.
        object_threshold: Grayscale value separating object pixels
            (<= threshold) from light pixels. Defaults to 200, the value the
            original implementation hard-coded.

    Returns:
        bool: True when the mean background intensity is greater than
        ``THRESHOLD_INTENSITY``; False otherwise, including when the detected
        object's bounding rectangle covers the entire image.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    org_img = cv.imread(img, cv.IMREAD_GRAYSCALE)
    if org_img is None:
        # cv.imread does not raise on failure, it returns None; fail here with
        # a clear message instead of an obscure error further down.
        raise ValueError(f'could not read image: {img!r}')

    # Inverted binary threshold: dark/coloured pixels become 255 (object),
    # light pixels become 0.
    _, thres_img = cv.threshold(org_img, object_threshold, 255, cv.THRESH_BINARY_INV)

    # Only the outer contours are needed to locate the object.
    contours, _ = cv.findContours(thres_img, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_NONE)

    # Boolean mask of the pixels that count as background.
    background = np.ones(org_img.shape, dtype=bool)

    if contours:
        # Use the largest contour as "the" object and cut its bounding box
        # out of the background.
        x, y, w, h = cv.boundingRect(max(contours, key=cv.contourArea))

        # If the object spans the whole image there is no background left,
        # so the background cannot be considered white.
        if h == org_img.shape[0] and w == org_img.shape[1]:
            return False

        # Exact bounding box: rows y..y+h-1, columns x..x+w-1.
        background[y:y + h, x:x + w] = False
    # With no contours every pixel is lighter than the threshold, so the whole
    # image is background.

    # Select by mask rather than by "non-zero value" so that genuinely black
    # background pixels still count towards the average.
    ave_intensity = org_img[background].mean()

    # bool() keeps the return value a plain Python bool, which callers rely on
    # when they compare the result with ``is False``.
    return bool(ave_intensity > THRESHOLD_INTENSITY)

In [9]:
import os
import requests

try:
    import urlparse
except ImportError:
    from urllib.parse import urlparse
def download_img(url, timeout=30):
    """Download an image and save it as ``test<ext>`` in the working directory.

    ``<ext>`` is the file extension found in the path component of ``url``
    (it is empty when the URL path has no extension).

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: Name of the file that was written.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    path = urlparse(url).path
    ext = os.path.splitext(path)[1]

    # Browser-like headers; some image hosts reject the default client headers.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
           "Accept-Language": "en-US,en;q=0.9"
           }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Do not write an error page to disk as if it were an image.
    response.raise_for_status()

    filename = f'test{ext}'
    with open(filename, 'wb') as handler:
        handler.write(response.content)
    return filename

In [27]:
# Try the pipeline on a single sample image. download_img() saves the file as
# 'test' + the extension from the URL path, which is 'test.jpg' for this URL.
url = 'https://www.tangolighting.com/wp-content/uploads/2019/04/21391.jpg'
download_img(url)
has_white_background('test.jpg')

True

In [3]:
import pandas as pd
from pathlib import Path

# Location of the product spreadsheet, resolved relative to the current user's
# home directory instead of a hard-coded user-specific absolute path.
DATA_PATH = Path.home() / 'Downloads' / 'carpyen.xlsx'

df = pd.read_excel(DATA_PATH)
df

Unnamed: 0,name,sku,old_sku,GTIN,option(size),option(shape),option(color),option(finish),color,finish,...,voltage,bulb_type,bulb_text,canopy dimensions,UL Listed,certifications,warranty & care,position,notes,url
0,Aitana Floor Lamp,,AITANA-FLRLMP-PCHRMWHT-19.6,,"19.6"" Dia.",,,Polished Chrome,White,Polished Chrome,...,,,2 x 100W Max Inc E26 Med Base,,,,,,,
1,Aitana Floor Lamp,1541200.0,AITANA-FLRLMP-BLKWHT-19.6,,"19.6"" Dia.",,,Black,White,Black,...,,,2 x 100W Max Inc E26 Med Base,,,,,,,
2,Aitana Floor Lamp,,AITANA-FLRLMP-PCHRMWHT-23.6,,"23.6"" Dia.",,,Polished Chrome,White,Polished Chrome,...,,,2 x 100W Max Inc E26 Med Base,,,,,,,
3,Aitana Floor Lamp,1531200.0,AITANA-FLRLMP-BLKWHT-23.6,,"23.6"" Dia.",,,Black,White,Black,...,,,2 x 100W Max Inc E26 Med Base,,,,,,,
4,Aitana Suspension Lamp,,AITANA-PNDTLMP-PCHRMWHT-23,,"23"" Dia.",,,Polished Chrome,White,Polished Chrome,...,,,2 x 100W Max Inc E26 Med Base,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,Woody Suspension Lamp,2171100.0,WOODY-PNDTLMP-PRL,,,,Pearl,,Pearl,,...,,,1x23w Max E26 Med Base Cfl,,,,,,,https://www.tangolighting.com/product/woody/
393,Woody Suspension Lamp,2171800.0,WOODY-PNDTLMP-WHT,,,,White,,White,,...,,,1x23w Max E26 Med Base Cfl,,,,,,,
394,Woody Suspension Lamp,2171700.0,WOODY-PNDTLMP-OGRN,,,,Olive Green,,Olive Green,,...,,,1x23w Max E26 Med Base Cfl,,,,,,,
395,Yoko Wall Lamp,2081000.0,YOKO-WLLMP-GLD,,,,Gold Leaf,,Gold Leaf,,...,,,2x 6.6W 1080 Lumens (total) 2700K CRI90 LED,,,,,,,https://www.carpyen.com/product.php?id=p166&cc...


In [39]:
# Distinct hero image URLs in order of first appearance. dropna() removes the
# missing values by value; an identity test against np.nan only removes NaN
# objects that happen to be the np.nan singleton.
hero_images = df['hero_image'].dropna().unique().tolist()
hero_images

['https://www.carpyen.com/fotos/1379060172.jpg',
 'https://www.insmatcaldes.com/6821-large_default/aitana-colgante-de-carpyen.jpg',
 'https://www.carpyen.com/fotos/1547810630.jpg',
 'https://www.carpyen.com/fotos/1548156524.jpg',
 'https://www.carpyen.com/fotos/1648908172.jpg',
 'https://www.carpyen.com/fotos/1512042482.jpg',
 'https://www.tangolighting.com/wp-content/uploads/2018/06/carla-floor-main.jpg',
 'https://www.tangolighting.com/wp-content/uploads/2019/02/codol_apagat.jpg',
 'https://image.architonic.com/pro2-3/20063975/colette-40-pend-mint-green-3291700-pro-b-arcit18.jpg',
 'https://th.bing.com/th/id/OIP.fJcOqGfYrcCIWy8i_leJ6gHaHa?pid=ImgDet&rs=1',
 'https://www.carpyen.com/fotos/1548062458.jpg',
 'https://www.carpyen.com/fotos/1379327046.jpg',
 'https://www.carpyen.com/fotos/1548063240.jpg',
 'https://www.carpyen.com/fotos/1538558306.jpg',
 'https://www.carpyen.com/fotos/1538651485.jpg',
 'https://www.carpyen.com/fotos/1559901220.jpg',
 'https://www.tangolighting.com/wp-cont

In [1]:
# Download every hero image and report the ones without a white background.
for url in hero_images:
    img_path = None
    try:
        # download_img() writes to 'test' + the extension of the URL path, so
        # derive the same name here instead of assuming 'test.jpg'.
        img_path = f'test{os.path.splitext(urlparse(url).path)[1]}'
        download_img(url)
        white_background = has_white_background(img_path)
        if not white_background:
            print(f'{url} has white background {white_background}')
    except Exception as exc:
        # Keep going on failure, but say why the image could not be checked.
        print(f'not able to check: {url} ({type(exc).__name__}: {exc})')
    finally:
        # Always remove the temporary file so a failed check cannot leave a
        # stale image behind for a later iteration to read.
        if img_path and os.path.exists(img_path):
            os.remove(img_path)

NameError: name 'hero_images' is not defined

In [5]:
def check_data(df):
    """Run sanity checks on the hero images of a product DataFrame.

    Two checks are performed and reported on standard output:

    1. The number of distinct, non-missing ``hero_image`` values is compared
       with the number of distinct ``name`` values; a mismatch prints a
       warning.
    2. Every distinct hero image is downloaded and tested with
       ``has_white_background``; images that fail the test, and images that
       could not be checked at all, are printed.

    Args:
        df: DataFrame with at least the columns ``hero_image`` and ``name``.

    Returns:
        None. All findings are printed.
    """
    # dropna() filters missing values by value; an identity test against
    # np.nan would let NaN objects other than the np.nan singleton through.
    hero_images = df['hero_image'].dropna().unique().tolist()
    names = df['name'].unique()
    if len(hero_images) != len(names):
        print('check for hero_images quantity')

    print('*'*10)
    print('\n'*2)

    for url in hero_images:
        img_path = None
        try:
            # download_img() writes to 'test' + the extension of the URL path,
            # so derive the same name here instead of assuming 'test.jpg'.
            img_path = f'test{os.path.splitext(urlparse(url).path)[1]}'
            download_img(url)
            white_background = has_white_background(img_path)
            if not white_background:
                print(f'{url} has white background {white_background}')
        except Exception as exc:
            # Keep going on failure, but say why the image could not be checked.
            print(f'not able to check: {url} ({type(exc).__name__}: {exc})')
        finally:
            # Always remove the temporary file so a failed check cannot leave
            # a stale image behind for a later iteration to read.
            if img_path and os.path.exists(img_path):
                os.remove(img_path)

In [10]:
# Run the hero image checks on the DataFrame loaded from the Excel file.
check_data(df)

check for hero_images quantity
**********



not able to check: https://th.bing.com/th/id/OIP.fJcOqGfYrcCIWy8i_leJ6gHaHa?pid=ImgDet&rs=1
not able to check: https://www.insmatcaldes.com/13475-home_default/harry-ceiling-light-by-carpyen.jpg
https://www.carpyen.com/fotos/1548150189.jpg has white background False
not able to check: https://cdn-s3.touchofmodern.com/products/000/004/988/ea5db967125ef4a6ead852e3846bafdf_medium.jpg?1392330011
not able to check: https://images.lumens.com/is/image/Lumens/uu459387?$Lumens.com-600$
not able to check: https://th.bing.com/th/id/R.bd6dbfd45b2d70988a2604a89f17d781?rik=PLuXuOT9Z1L0xg&riu=http%3a%2f%2fimage.architonic.com%2fimg_pro1-6%2f130%2f4148%2fnura-apli-oro-1-sq.jpg&ehk=WlVqa1eBNDxfpQysnQWA6tgujirEU4RPInyoH2axCGg%3d&risl=&pid=ImgRaw&r=0
not able to check: https://th.bing.com/th/id/OIP._TGz8JXDUlrbAHnBDsbzfAAAAA?pid=ImgDet&rs=1
