In [2]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; every path used below lives under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd


# Flatten the per-batch filename arrays into one list, in batch order.
file_names = [
    fname
    for i in range(1, 15)  # Adjust the range based on your files
    for fname in np.load(f'/content/drive/MyDrive/MSc Thesis Data Science/influencer_brands/Data/images/filenames_batch{i}.npy')
]

# Read the metadata table; '\n' is forced as the only line terminator.
metadata_csv = '/content/drive/MyDrive/MSc Thesis Data Science/influencer_brands/Data/Metadata/all_metadata_5.csv'
data = pd.read_csv(metadata_csv, lineterminator='\n')

Step 1: group and split the data by post_id

In [None]:
from sklearn.model_selection import train_test_split

# All rows of a post must land in the same split, so the split is done on
# post ids rather than on individual rows.
grouped = data.groupby('post_id')

# One label per post (the first one seen), used only for stratification.
sponsorship_labels = grouped['Sponsorship label'].first()

# Unique post ids, in the same order as `sponsorship_labels`.
group_sizes = grouped.size()
post_ids = pd.Series(group_sizes.index)

# 80 % train, 20 % held out, stratified on the sponsorship label.
train_ids, temp_ids = train_test_split(
    post_ids,
    test_size=0.2,
    random_state=42,
    stratify=sponsorship_labels,
)

# Halve the held-out part into validation and test, again stratified.
temp_labels = sponsorship_labels.loc[temp_ids]
val_ids, test_ids = train_test_split(
    temp_ids,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels,
)

Step 2: unnest the grouped data

In [None]:
# Map the post-level split back onto rows: every row follows its post_id.
post_id_column = data['post_id']
train_data = data.loc[post_id_column.isin(train_ids)]
val_data = data.loc[post_id_column.isin(val_ids)]
test_data = data.loc[post_id_column.isin(test_ids)]

# Report (rows, columns) of each split.
print(f'Train data shape{train_data.shape}')
print(f'Validation data shape{val_data.shape}')
print(f'Test data shape{test_data.shape}')

Train data shape(157038, 33)
Validation data shape(19584, 33)
Test data shape(19557, 33)


Step 3: collect the image filenames of the training and validation splits so they can be matched with the image data

In [None]:
# Extract filenames from training and validation datasets
train_filenames = train_data['Image files'].tolist()
val_filenames = val_data['Image files'].tolist()

print(len(train_filenames))
print(len(val_filenames))

157038
19584


Step 3 (continued): match the images to each dataset

In [None]:
from tensorflow.keras.applications.resnet50 import preprocess_input

def filter_images_by_filenames(image_batch, filenames_batch, relevant_filenames):
    """Return the images whose filename appears in ``relevant_filenames``.

    Args:
        image_batch: Sequence/array of images, aligned element-by-element
            with ``filenames_batch``.
        filenames_batch: Sequence/array holding the filename of each image.
        relevant_filenames: Collection of filenames to keep.

    Returns:
        ``np.ndarray`` with the selected images stacked along axis 0, in their
        original order. When nothing matches, an empty array that keeps the
        per-image dimensions of ``image_batch`` (e.g. ``(0, H, W, C)``).
    """
    # Build the lookup once: a list membership test per image would scan the
    # whole filename list every time.
    try:
        wanted = set(relevant_filenames)
    except TypeError:
        # Unhashable entries: fall back to the plain container lookup.
        wanted = relevant_filenames

    kept = [img for img, fname in zip(image_batch, filenames_batch) if fname in wanted]

    if not kept:
        # np.array([]) would collapse to shape (0,) and lose the image
        # dimensions that downstream code expects.
        batch = np.asarray(image_batch)
        return np.empty((0,) + batch.shape[1:], dtype=batch.dtype)
    return np.array(kept)

# Worked example on batch 1: the image array and its filename array are stored
# as separate .npy files and are loaded side by side.
image_batch1_path = '/content/drive/MyDrive/MSc Thesis Data Science/influencer_brands/Data/images/data120_batch1.npy'
filenames_batch1_path = '/content/drive/MyDrive/MSc Thesis Data Science/influencer_brands/Data/images/filenames_batch1.npy'
image_batch1 = np.load(image_batch1_path)
filenames_batch1 = np.load(filenames_batch1_path)

# Keep only the images that belong to the training / validation filename lists.
train_images_batch1, val_images_batch1 = (
    filter_images_by_filenames(image_batch1, filenames_batch1, wanted_names)
    for wanted_names in (train_filenames, val_filenames)
)

In [None]:
# Shape of the filtered training images without the leading (sample) axis.
image_shape = train_images_batch1.shape[1:]
print(image_shape)

(224, 224, 3)


Step 4: implementing ResNet50 model

In [None]:
import time

from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.models import Model

# ImageNet-pretrained ResNet-50 without its classification head; pooling='avg'
# makes the model output one pooled feature vector per image.
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# NOTE(review): the images go into the network as loaded; `preprocess_input`
# is imported earlier in the notebook but never applied. Presumably the stored
# arrays are already preprocessed — confirm.
start_time = time.time()
train_features_batch1, val_features_batch1 = (
    base_model.predict(image_subset, verbose=1)
    for image_subset in (train_images_batch1, val_images_batch1)
)
end_time = time.time()

# Wall-clock time of both predict calls together.
print(f"Time taken for batch 1: {end_time - start_time} seconds")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Time taken for batch 1: 55.98302626609802 seconds


In [3]:
import gc  # Import the garbage collector
import os

import numpy as np
from tensorflow.keras.applications.resnet50 import ResNet50

# Where the raw image batches are read from and where the features are written.
IMAGE_BATCH_TEMPLATE = '/content/drive/MyDrive/MSc Thesis Data Science/influencer_brands/Data/images/data120_batch{}.npy'
FEATURES_DIR = '/content/drive/MyDrive/Colab Notebooks/MSc thesis/processed_data/images'
NUM_BATCHES = 14  # batches are numbered 1..NUM_BATCHES

# np.save does not create missing folders; make sure the target exists before
# any expensive forward pass is run.
os.makedirs(FEATURES_DIR, exist_ok=True)

# Load ResNet-50 pre-trained on ImageNet and remove the top layer
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# NOTE(review): images are passed to the model exactly as stored, without
# `preprocess_input`; presumably the .npy batches are already preprocessed — confirm.
for i in range(1, NUM_BATCHES + 1):
    # Load images for the current batch
    images = np.load(IMAGE_BATCH_TEMPLATE.format(i))

    # Process the images using the ResNet-50 model
    results = base_model.predict(images, verbose=1)

    # Save the processed features
    np.save(os.path.join(FEATURES_DIR, f'processed_images_batch{i}.npy'), results)

    # Only one batch should be resident at a time: drop the references and
    # run the garbage collector before loading the next batch.
    del images, results
    gc.collect()



In [4]:
# Reload the saved batch-1 features.
# NOTE: this rebinds `data`, which earlier in the notebook held the metadata DataFrame.
processed_batch1_path = '/content/drive/MyDrive/Colab Notebooks/MSc thesis/processed_data/images/processed_images_batch1.npy'
data = np.load(processed_batch1_path)

In [13]:
# Spot-check: show the entry at index 69 of the reloaded feature array.
sample_features = data[69]
print(sample_features)

[0.66626585 2.1659558  0.4566452  ... 0.04400967 0.4170214  0.        ]
