In [1]:
import pandas as pd
import numpy as np
import requests
from PIL import Image
from io import BytesIO

# Download dataset

In [2]:
# Toggle between the reduced ("small") and the full ladybird CSV files.
small: bool = True

In [3]:
# Pick the CSV pair matching the `small` flag. The flag is tested for
# truthiness (not `== True`) so that this cell always agrees with the cell
# that later chooses the output pickle names via `... if small else ...`.
if small:
    train_csv_path = 'ladybird_train_small.csv'
    test_csv_path = 'ladybird_test_small.csv'
else:
    train_csv_path = 'ladybird_train.csv'
    test_csv_path = 'ladybird_test.csv'

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [4]:
# Report how many rows each split contains.
n_train_rows = len(train_df)
n_test_rows = len(test_df)
print(f'Size of train set: {n_train_rows}')
print(f'Size of test set: {n_test_rows}')

Size of train set: 6871
Size of test set: 1718


In [6]:
# Show the URL stored in the row labelled 0 of the training set.
train_df.loc[0, 'url']

'https://inaturalist-open-data.s3.amazonaws.com/photos/148104173/original.jpg'

In [9]:
# Settings shared by every image: the size handed to the resize step and the
# factor each pixel value is multiplied by afterwards.
prepocessing_options = dict(
    image_size=(224, 224),
    scaling=1 / 255,
)


In [7]:
def download_and_preprocess_image(url, prepocessing_options, timeout=10):
    """Download one image and return it as a scaled RGB numpy array.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    prepocessing_options : dict
        Must provide 'image_size' (passed unchanged to ``Image.resize``) and
        'scaling' (the factor every pixel value is multiplied by; e.g. 1/255
        maps 8-bit channel values into [0, 1]).
    timeout : float, optional
        Value forwarded to ``requests.get(..., timeout=...)``. Defaults to 10,
        the value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray or None
        ``np.array(image) * scaling`` for the resized RGB image, or None when
        any step fails. Failures are printed, never raised.
    """
    try:
        # Read the options before touching the network so that a broken
        # configuration is reported without paying for a download first.
        target_size = prepocessing_options['image_size']
        scale_factor = prepocessing_options['scaling']

        # Download the image; raise_for_status turns HTTP error codes into
        # exceptions so they end up in the handler below.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))

        # Force three RGB channels, then resize.
        image = image.convert("RGB")
        image = image.resize(target_size)

        # Convert to a numpy array and apply the scaling factor.
        return np.array(image) * scale_factor
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort a long batch,
        # so the problem is reported and signalled to the caller with None.
        print(f"Error downloading or processing image from {url}: {e}")
        return None

In [7]:
# Helper function to process images for a given DataFrame
def process_images(df, n=5, prepocessing_options=None):
    """Return a copy of the first ``n`` rows of ``df`` with an 'image_array' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'url' column.
    n : int, optional
        Upper bound of the row slice ``df[:n]`` that is processed (default 5).
    prepocessing_options : dict or None, optional
        Forwarded to ``download_and_preprocess_image``. When None, falls back
        to ``{'image_size': (224, 224), 'scaling': 1 / 255}`` instead of
        forwarding None, which made every row fail after its download.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df[:n]`` plus 'image_array', holding whatever
        ``download_and_preprocess_image`` returned for each row's URL
        (None for rows that could not be processed). ``df`` is not modified.
    """
    if prepocessing_options is None:
        prepocessing_options = {'image_size': (224, 224), 'scaling': 1 / 255}

    # Work on a copy so the caller's frame never gains the new column.
    df_with_images = df[:n].copy()

    # Only the URL is needed per row, so iterate that column directly rather
    # than materialising a Series for every row with iterrows().
    df_with_images['image_array'] = [
        download_and_preprocess_image(url, prepocessing_options)
        for url in df_with_images['url']
    ]
    return df_with_images

In [8]:
# Number of rows to process per split: all of them.
train_n, test_n = train_df.shape[0], test_df.shape[0]

In [9]:
# Download and preprocess the images of both splits (train first, then test)
train_df_with_image_arrays = process_images(
    train_df,
    n=train_n,
    prepocessing_options=prepocessing_options,
)
test_df_with_image_arrays = process_images(
    test_df,
    n=test_n,
    prepocessing_options=prepocessing_options,
)

Error downloading or processing image from https://inaturalist-open-data.s3.amazonaws.com/photos/194699622/original.jpg: HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out.
Error downloading or processing image from https://inaturalist-open-data.s3.amazonaws.com/photos/399101118/original.jpeg: HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)
Error downloading or processing image from https://inaturalist-open-data.s3.amazonaws.com/photos/104028326/original.jpg: HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out. (read timeout=10)
Error downloading or processing image from https://inaturalist-open-data.s3.amazonaws.com/photos/429986620/original.jpeg: HTTPSConnectionPool(host='inaturalist-open-data.s3.amazonaws.com', port=443): Read timed out.
Error downloading or processing image from https://inaturalist-open-data.s3.amazonaws.com/photos/18873

In [10]:
# Preview the first five processed training rows.
train_df_with_image_arrays.head(n=5)

Unnamed: 0,scientificName,species,url,label,image_array
0,"Coccinella septempunctata Linnaeus, 1758",Coccinella septempunctata,https://inaturalist-open-data.s3.amazonaws.com...,14,"[[[0.807843137254902, 0.8117647058823529, 0.81..."
1,"Coccinella septempunctata Linnaeus, 1758",Coccinella septempunctata,https://inaturalist-open-data.s3.amazonaws.com...,14,"[[[0.4392156862745098, 0.5764705882352941, 0.1..."
2,"Halyzia sedecimguttata (Linnaeus, 1758)",Halyzia sedecimguttata,https://inaturalist-open-data.s3.amazonaws.com...,17,"[[[0.28627450980392155, 0.23529411764705882, 0..."
3,"Calvia quatuordecimguttata (Linnaeus, 1758)",Calvia quatuordecimguttata,https://inaturalist-open-data.s3.amazonaws.com...,6,"[[[0.3568627450980392, 0.38823529411764707, 0...."
4,"Propylea quatuordecimpunctata (Linnaeus, 1758)",Propylaea quatuordecimpunctata,https://inaturalist-open-data.s3.amazonaws.com...,26,"[[[0.6980392156862745, 0.7411764705882353, 0.3..."


In [11]:
# Choose the pickle file names that match the dataset variant in use
if small:
    train_output_path = 'ladybird_train_small_preprocessed.pkl'
    test_output_path = 'ladybird_test_small_preprocessed.pkl'
else:
    train_output_path = 'ladybird_train_preprocessed.pkl'
    test_output_path = 'ladybird_test_preprocessed.pkl'

# Persist both frames, image arrays included, as pickle files
for frame, output_path in (
    (train_df_with_image_arrays, train_output_path),
    (test_df_with_image_arrays, test_output_path),
):
    frame.to_pickle(output_path)