Average green levels in pixels

In [20]:
from PIL import Image
import numpy as np

def calculate_green_average(image_path):
    # Open the image file
    img = Image.open(image_path)
    # Convert the image to RGB (if not already in that format)
    img = img.convert('RGB')
    # Convert the image to a numpy array
    img_array = np.array(img)
    # Extract the green channel
    green_channel = img_array[:, :, 1]
    # Calculate the average green value
    avg_green = np.mean(green_channel)
    return avg_green


In [27]:
calculate_green_average(r"E:\Brandbank\brandbank\8710522806326\9263182.jpg")

239.770752

In [26]:
calculate_green_average(r"E:\Brandbank\brandbank\8710522806326\10763101.jpg")


237.060592

In [28]:
from PIL import Image
import numpy as np

def calculate_green_proportion(image_path):
    # Open the image file
    img = Image.open(image_path)
    # Convert the image to RGB (if not already in that format)
    img = img.convert('RGB')
    # Convert the image to a numpy array
    img_array = np.array(img)

    # Set a threshold for what you consider as 'green'
    # This is a simplistic threshold that considers a pixel green
    # if the green channel value is greater than both red and blue by a certain margin
    green_threshold = 50

    # Calculate the green proportion
    green_pixels = np.where((img_array[:, :, 1] > img_array[:, :, 0] + green_threshold) &
                            (img_array[:, :, 1] > img_array[:, :, 2] + green_threshold))
    green_proportion = green_pixels[0].size / img_array[:, :, 1].size

    return green_proportion



In [42]:
from PIL import Image
import numpy as np

def calculate_green_score(image_path):
    """
    Calculate the "greenness" score of an image by averaging the green channel values.
    The score is normalized by the total number of pixels and the maximum color value (255).
    
    Parameters:
    image_path (str): The path to the image file.
    
    Returns:
    float: The greenness score of the image.
    """
    # Open the image file
    with Image.open(image_path) as img:
        # Convert the image to RGB (if not already in that format)
        rgb_image = img.convert('RGB')
        
        # Convert the image to a numpy array
        image_array = np.array(rgb_image)
        
        # Extract the green channel
        green_channel = image_array[:, :, 1]
        
        # Calculate the mean of the green channel
        mean_green = np.mean(green_channel)
        
        # Normalize the score by the total number of pixels and the maximum color value
        green_score = mean_green / (255 * image_array.size / 3)
        
        return green_score


In [45]:
calculate_green_score(r"E:\Brandbank\brandbank\8710908764844\4300561.jpg")

1.4619980298039214e-05

In [47]:
calculate_green_score(r"E:\Brandbank\brandbank\8710908764844\9263684.jpg")

1.4715600815686274e-05

In [46]:
calculate_green_score(r"E:\Brandbank\brandbank\8710908764844\12490451.jpg")

1.4840006776470588e-05

In [50]:
from skimage import color

def calculate_advanced_green_score(image_path):
    """
    Calculate an advanced "greenness" score of an image by considering not only the green channel
    but also the saturation and value of the green in the HSV color space. This might give a better
    approximation of how green an image appears to the human eye.
    
    Parameters:
    image_path (str): The path to the image file.
    
    Returns:
    float: The advanced greenness score of the image.
    """
    # Open the image file
    with Image.open(image_path) as img:
        # Convert the image to RGB (if not already in that format)
        rgb_image = img.convert('RGB')
        
        # Convert the image to a numpy array
        image_array = np.array(rgb_image)
        
        # Convert the RGB image to HSV
        hsv_image = color.rgb2hsv(image_array)
        
        # Extract the hue, saturation, and value channels
        hue_channel = hsv_image[:, :, 0]
        saturation_channel = hsv_image[:, :, 1]
        value_channel = hsv_image[:, :, 2]
        
        # Define a range for "green" in the hue channel (roughly from 90° to 150° in the HSV color wheel)
        green_hue_min = 90 / 360  # Normalize to [0, 1]
        green_hue_max = 150 / 360  # Normalize to [0, 1]
        
        # Create a mask for green regions in the image
        green_mask = np.logical_and(hue_channel >= green_hue_min, hue_channel <= green_hue_max)
        
        # Calculate the score as the product of saturation and value (brightness) for green regions
        green_score = np.mean(saturation_channel[green_mask] * value_channel[green_mask])
        
        return green_score

# Calculate the advanced greenness score for both images
adv_green_score_1 = calculate_advanced_green_score(r"E:\Brandbank\brandbank\8710908764844\9263684.jpg")
adv_green_score_2 = calculate_advanced_green_score(r"E:\Brandbank\brandbank\8710908764844\12490451.jpg")

adv_green_score_1, adv_green_score_2


(0.1056853115254413, 0.11996656829461319)

In [52]:
def calculate_weighted_green_area_score(image_path):
    """
    Calculate a weighted greenness score that puts more emphasis on the green areas rather than the intensity.
    This is done by counting the number of pixels that fall within the green hue range in the HSV color space.
    
    Parameters:
    image_path (str): The path to the image file.
    
    Returns:
    float: The weighted green area score of the image.
    """
    # Open the image file
    with Image.open(image_path) as img:
        # Convert the image to RGB (if not already in that format)
        rgb_image = img.convert('RGB')
        
        # Convert the image to a numpy array
        image_array = np.array(rgb_image)
        
        # Convert the RGB image to HSV
        hsv_image = color.rgb2hsv(image_array)
        
        # Extract the hue, saturation, and value channels
        hue_channel = hsv_image[:, :, 0]
        
        # Define a range for "green" in the hue channel (roughly from 90° to 150° in the HSV color wheel)
        green_hue_min = 90 / 360  # Normalize to [0, 1]
        green_hue_max = 150 / 360  # Normalize to [0, 1]
        
        # Create a mask for green regions in the image
        green_mask = np.logical_and(hue_channel >= green_hue_min, hue_channel <= green_hue_max)
        
        # Calculate the score as the number of green pixels normalized by the total number of pixels
        green_area_score = np.sum(green_mask) / green_mask.size
        
        return green_area_score

# Calculate the weighted green area score for both images
weighted_green_area_score_2 = calculate_weighted_green_area_score(r"E:\Brandbank\brandbank\8710908764844\9263684.jpg")

weighted_green_area_score_1 = calculate_weighted_green_area_score(r"E:\Brandbank\brandbank\8710908764844\12490451.jpg")

weighted_green_area_score_1, weighted_green_area_score_2

(0.042416, 0.042352)

In [54]:
import os
import csv
from tqdm import tqdm

def calculate_green_scores_for_directory(directory):
    """
    Go through all subfolders in the provided directory, find all jpg files, and calculate their green scores.
    Store the results in a CSV file.

    Parameters:
    directory (str): The path to the main folder containing subfolders with jpg images.
    """
    # List to hold all the results
    results = []
    print('getting subdirectories')
    # Get all the subdirectories in the provided directory
    subdirectories = [os.path.join(directory, d) for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]
    print('initializing progress bar')
    # Initialize a progress bar
    pbar = tqdm(total=len(subdirectories))
    print('begging score extraction')
    # Go through each subdirectory
    for subdir in subdirectories:
        # Get all jpg files in this subdirectory
        for file in os.listdir(subdir):
            if file.lower().endswith('.jpg'):
                file_path = os.path.join(subdir, file)
                
                # Calculate the advanced green score and weighted green area score
                # Functions 'calculate_advanced_green_score' and 'calculate_weighted_green_area_score' should be defined
                advanced_green_score = calculate_advanced_green_score(file_path)
                weighted_green_area_score = calculate_weighted_green_area_score(file_path)

                # Append the results to the list
                results.append([file, advanced_green_score, weighted_green_area_score])

        # Update the progress bar
        pbar.update(1)

    # Close the progress bar
    pbar.close()

    # Write results to a CSV file
    csv_filename = os.path.join(directory, 'green_scores.csv')
    with open(csv_filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['File Name', 'Advanced Green Score', 'Weighted Green Area Score'])
        writer.writerows(results)

# Call the function with a sample directory path (replace with your actual directory path)
calculate_green_scores_for_directory(r'E:\Brandbank\brandbank')

getting subdirectories
initializing progress bar


  0%|          | 4/464455 [00:00<4:46:26, 27.02it/s]

begging score extraction


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|█████████▉| 464448/464455 [6:27:48<00:00, 31.22it/s]   

PermissionError: [WinError 5] Access is denied: 'E:\\Brandbank\\brandbank\\lost+found'

100%|█████████▉| 464451/464455 [6:28:03<00:00, 31.22it/s]

In [56]:
import os
import csv
from tqdm import tqdm

def calculate_green_scores_for_directory(directory):
    """
    Go through all subfolders in the provided directory, find all jpg files, and calculate their green scores.
    The results are written to a CSV file as they are calculated to prevent data loss in case of a crash.

    Parameters:
    directory (str): The path to the main folder containing subfolders with jpg images.
    """
    # CSV file for writing the results
    csv_filename = os.path.join(directory, r'G:\$00-Work\green_scores.csv')

    # Open the CSV file for writing
    with open(csv_filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Write the header row
        writer.writerow(['File Name', 'Advanced Green Score', 'Weighted Green Area Score'])
        print('getting subdirectories')
        # Get all the subdirectories in the provided directory
        subdirectories = [os.path.join(directory, d) for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]
        print('initializing progress bar')
        
        # Initialize a progress bar
        pbar = tqdm(total=len(subdirectories))
        print('begging score extraction')
        # Go through each subdirectory
        for subdir in subdirectories:
            # Get all jpg files in this subdirectory
            for file in os.listdir(subdir):
                if file.lower().endswith('.jpg'):
                    file_path = os.path.join(subdir, file)
                    
                    # Calculate the advanced green score and weighted green area score
                    # Functions 'calculate_advanced_green_score' and 'calculate_weighted_green_area_score' should be defined
                    advanced_green_score = calculate_advanced_green_score(file_path)
                    weighted_green_area_score = calculate_weighted_green_area_score(file_path)

                    # Write the results to the CSV file immediately
                    writer.writerow([file, advanced_green_score, weighted_green_area_score])

            # Update the progress bar
            pbar.update(1)

        # Close the progress bar
        pbar.close()

# This function should be called with the directory path in your local environment.
calculate_green_scores_for_directory(r'E:\Brandbank\brandbank')


getting subdirectories
initializing progress bar




begging score extraction




PermissionError: [WinError 5] Access is denied: 'E:\\Brandbank\\brandbank\\lost+found'



In [2]:
import pandas as pd
dataset = pd.DataFrame({
     'product_id': [1, 1, 2, 2, 3],
     'date': ['2021-01-01', '2021-01-01', '2021-01-02', '2021-01-02', '2021-01-03'],
     'promotion': [None, 'Discount', None, 'Special Offer', None]
})

duplicate_columns = [col for col in dataset.columns if col != 'promotion']

# Mark rows as duplicates based on specified columns, keeping the first occurrence.
dataset['is_duplicate'] = dataset.duplicated(subset=duplicate_columns, keep=False)

# Within duplicates, mark rows with a non-empty 'promotion'.
dataset['has_promotion'] = dataset['promotion'].notna()

# Conditions to remove a row:
# 1. The row is marked as a duplicate.
# 2. The row does not have a promotion.
# Rows to keep:
# 1. Not a duplicate.
# 2. Is a duplicate but has a promotion.
# 3. Is a duplicate, does not have a promotion, but there's no other row with the same identifiers that has a promotion.
conditions_to_remove = (dataset['is_duplicate'] & ~dataset['has_promotion'] & 
                        dataset.duplicated(subset=duplicate_columns, keep=False))

# Filter out rows that do not meet the removal conditions.
dataset = dataset[~conditions_to_remove].drop(columns=['is_duplicate', 'has_promotion'])
dataset

Unnamed: 0,product_id,date,promotion
1,1,2021-01-01,Discount
3,2,2021-01-02,Special Offer
4,3,2021-01-03,
