## Import Data from Kaggle

In [1]:
!pip install kaggle



In [2]:
!mkdir -p ~/.kaggle

In [3]:
import os
import json

# Define your Kaggle credentials (replace with your username and API key)
kaggle_credentials = {
    "username": "jingle77",
    "key": "kaggle_api_key"
}

# Path to save the kaggle.json file
kaggle_path = os.path.expanduser("~/.kaggle/kaggle.json")

# Write the credentials to the .kaggle directory in home
with open(kaggle_path, 'w') as f:
    json.dump(kaggle_credentials, f)

In [4]:
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle datasets download -d praneshmukhopadhyay/youtube-thumbnail-dataset

Dataset URL: https://www.kaggle.com/datasets/praneshmukhopadhyay/youtube-thumbnail-dataset
License(s): CC0-1.0
Downloading youtube-thumbnail-dataset.zip to /home/sagemaker-user
 99%|███████████████████████████████████████▋| 270M/272M [00:04<00:00, 85.8MB/s]
100%|████████████████████████████████████████| 272M/272M [00:04<00:00, 69.4MB/s]


#### Unzip Data

In [6]:
!unzip youtube-thumbnail-dataset.zip -d ./data

Archive:  youtube-thumbnail-dataset.zip
  inflating: ./data/thumbnails/images/3Blue1Brown/3d6DsjIBzJ4.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/CfW845LNObM.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/EK32jo7i5LQ.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/GNcFjFmqEc8.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/HEfHFsfGXjs.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/IHZwWFHWa-w.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/Ilg3gGewQ5U.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/Kas0tIxDvrg.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/M64HUIJFTZM.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/OkmNXy7er84.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/PFDu9oVAE-g.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/QJYmyhnaaek.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/WUvTyaaNkzM.jpg  
  inflating: ./data/thumbnails/images/3Blue1Brown/XFDM1ip5HdU.jpg  
  inflat

## Preprocess metadata file

In [7]:
import pandas as pd

meta_df = pd.read_csv('data/thumbnails/metadata.csv')

In [8]:
meta_df['Category'].unique()

array(['Science', 'News', 'Food', 'Blog', 'Tech', 'Informative',
       'Comedy,Entertainment', 'Entertainment', 'Automobile',
       'Tech,Informative', 'Automobile,Comedy', 'VideoGames',
       'Food,Entertainment', 'Blog,Comedy', 'Comedy,Informative',
       'Tech,Comedy', 'Comedy', 'Blog,Science', 'Blog,Entertainment',
       'Entertainment,Comedy', 'Tech,News', 'Entertainment,Blog'],
      dtype=object)

In [9]:
def standardize_category(category):
    # Split by comma, sort, join back, and convert to lowercase
    return ','.join(sorted(part.strip().lower() for part in category.split(',')))

# Apply the function to the 'category' column
meta_df['Category'] = meta_df['Category'].apply(standardize_category)

# Remove duplicates (if needed)
meta_df = meta_df.drop_duplicates()

In [10]:
meta_df['Category'].unique()

array(['science', 'news', 'food', 'blog', 'tech', 'informative',
       'comedy,entertainment', 'entertainment', 'automobile',
       'informative,tech', 'automobile,comedy', 'videogames',
       'entertainment,food', 'blog,comedy', 'comedy,informative',
       'comedy,tech', 'comedy', 'blog,science', 'blog,entertainment',
       'news,tech'], dtype=object)

In [11]:
meta_df.head()

Unnamed: 0,Id,Channel,Category,Title
0,OkmNXy7er84,3Blue1Brown,science,The hardest problem on the hardest test
1,r6sGWTCMz2k,3Blue1Brown,science,But what is a Fourier series? From heat flow t...
2,bBC-nXj3Ng4,3Blue1Brown,science,But how does bitcoin actually work?
3,aircAruvnKk,3Blue1Brown,science,"But what is a neural network? | Chapter 1, Dee..."
4,HEfHFsfGXjs,3Blue1Brown,science,The most unexpected answer to a counting puzzle


----

In [12]:
import os
import random
import pandas as pd
import numpy as np
import shutil
from PIL import Image
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris

# Initialize SageMaker session and role
sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket = 'your-s3-bucket-name'

# Load metadata
categories = meta_df['Category'].unique()

# Directories for train/test/validation
data_dir = 'data/thumbnails/images'
train_dir = 'train'
test_dir = 'test'
val_dir = 'validation'

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

# Resize and process images, performing a stratified split
for category in categories:
    category_data = meta_df[meta_df['Category'] == category]
    category_ids = category_data['Id'].tolist()

    # Find available images
    available_images = [img_id for img_id in category_ids if os.path.exists(f'{data_dir}/{img_id}.jpg')]

    # Pick one image at random for validation
    val_image = random.choice(available_images)
    val_path = f'{data_dir}/{val_image}.jpg'
    img = Image.open(val_path).resize((224, 224))
    img.save(f'{val_dir}/{val_image}.jpg')

    # Split remaining images for training and testing (stratified by category)
    remaining_images = [img for img in available_images if img != val_image]
    np.random.shuffle(remaining_images)
    split_idx = int(0.7 * len(remaining_images))

    # Train and test paths
    for img_id in remaining_images[:split_idx]:
        img = Image.open(f'{data_dir}/{img_id}.jpg').resize((224, 224))
        img.save(f'{train_dir}/{img_id}.jpg')
    for img_id in remaining_images[split_idx:]:
        img = Image.open(f'{data_dir}/{img_id}.jpg').resize((224, 224))
        img.save(f'{test_dir}/{img_id}.jpg')

# Create .lst files for MXNet
def create_lst_file(data_dir, lst_file):
    with open(lst_file, 'w') as file:
        for idx, filename in enumerate(os.listdir(data_dir)):
            if filename.endswith('.jpg'):
                img_id = filename.split('.')[0]
                label = meta_df[meta_df['Id'] == img_id]['Category'].values[0]
                file.write(f"{idx}\t{label}\t{data_dir}/{filename}\n")

create_lst_file(train_dir, 'train.lst')
create_lst_file(test_dir, 'test.lst')

# Upload to S3
s3_train_path = f's3://{bucket}/{train_dir}'
s3_test_path = f's3://{bucket}/{test_dir}'
s3_val_path = f's3://{bucket}/{val_dir}'

for local_dir, s3_path in [(train_dir, s3_train_path), (test_dir, s3_test_path)]:
    sagemaker_session.upload_data(path=local_dir, bucket=bucket, key_prefix=local_dir)

# Verify .lst files in S3
s3_client = boto3.client('s3')
lst_files = ['train.lst', 'test.lst']

for lst_file in lst_files:
    response = s3_client.list_objects_v2(Bucket=bucket, Prefix=f'{lst_file}')
    if 'Contents' not in response:
        raise FileNotFoundError(f"{lst_file} not found in S3 bucket")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


IndexError: Cannot choose from an empty sequence

In [None]:
import os
import random
import pandas as pd
import numpy as np
import shutil
from PIL import Image
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker import image_uris

# Initialize SageMaker session and role
sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket = 'your-s3-bucket-name'

# Load metadata
meta_df = pd.read_csv('meta.csv')  # Assuming CSV format
meta_df['Id'] = meta_df['Id'].astype(str)
categories = meta_df['Category'].unique()

# Directories for train/test/validation
data_dir = 'data/thumbnails/images'
train_dir = 'train'
test_dir = 'test'
val_dir = 'validation'

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)


# Set up and train model
image_uri = image_uris.retrieve('image-classification', sagemaker_session.boto_region_name)
output_path = f's3://{bucket}/output'

# Define model parameters
estimator = sagemaker.estimator.Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type='ml.p2.xlarge',
    volume_size=50,
    max_run=3600,
    input_mode='File',
    output_path=output_path,
    sagemaker_session=sagemaker_session
)

estimator.set_hyperparameters(
    num_layers=18,
    use_pretrained_model=1,
    num_classes=len(categories),
    mini_batch_size=32,
    epochs=10,
    learning_rate=0.001,
    resize=224,
    image_shape='3,224,224',
    num_training_samples=len(os.listdir(train_dir)),
)

# Specify input channels
train_data = sagemaker.inputs.TrainingInput(s3_train_path, content_type='application/x-image')
test_data = sagemaker.inputs.TrainingInput(s3_test_path, content_type='application/x-image')
data_channels = {'train': train_data, 'validation': test_data}

# Train the model
estimator.fit(data_channels)

# Deploy model
predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m5.large')

# Prediction on validation set
val_images = os.listdir(val_dir)
predictions = []

for image in val_images:
    img = Image.open(f'{val_dir}/{image}').resize((224, 224))
    img_array = np.array(img).transpose(2, 0, 1)
    result = predictor.predict(img_array)
    predictions.append((image, result))

# Print predictions
for img, pred in predictions:
    print(f'Image: {img}, Prediction: {pred}')

# Clean up
predictor.delete_endpoint()


## Step 1

In [19]:
import os
import pandas as pd

# Check if images in meta_df exist in the folder and update meta_df
image_dir = 'data/thumbnails/images'
existing_images = []
for folder in os.listdir(image_dir):
    folder_path = os.path.join(image_dir, folder)
    if os.path.isdir(folder_path):
        existing_images += [f.split('.')[0] for f in os.listdir(folder_path)]

# Filter meta_df to only include rows with existing images
meta_df = meta_df[meta_df['Id'].isin(existing_images)].reset_index(drop=True)
print("Filtered meta_df shape:", meta_df.shape)

Filtered meta_df shape: (2262, 4)


## Step 2

In [20]:
from PIL import Image

def resize_image(file_path):
    try:
        with Image.open(file_path) as img:
            img = img.resize((224, 224))
            img.save(file_path)
    except Exception as e:
        print(f"Error resizing {file_path}: {e}")

# Resize all images in the folders
for folder in os.listdir(image_dir):
    folder_path = os.path.join(image_dir, folder)
    if os.path.isdir(folder_path):
        for img_file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, img_file)
            resize_image(file_path)

## Step 3

In [21]:
import shutil

# Create directories for validation, train, and test
for subdir in ['validation', 'train', 'test']:
    os.makedirs(subdir, exist_ok=True)

## Step 4

In [22]:
import random

# Create a validation set with one image from each category
validation_ids = []
for category in meta_df['Category'].unique():
    sample = meta_df[meta_df['Category'] == category].sample(1)
    validation_ids.append(sample['Id'].values[0])

# Move validation images to validation folder
validation_df = meta_df[meta_df['Id'].isin(validation_ids)]
for _, row in validation_df.iterrows():
    image_path = f"{image_dir}/{row['Channel']}/{row['Id']}.jpg"
    shutil.move(image_path, 'validation/')
    
# Update meta_df to exclude validation images
meta_df = meta_df[~meta_df['Id'].isin(validation_ids)]

## Step 5

In [23]:
import shutil
import os

# Define counts for tracking files moved
train_count = 0
test_count = 0

# Updated loop to move images to train and test folders, checking for file existence
for dataset, folder in zip([train_df, test_df], ['train', 'test']):
    for _, row in dataset.iterrows():
        image_path = f"{image_dir}/{row['Channel']}/{row['Id']}.jpg"
        
        # Check if the image exists before moving
        if os.path.isfile(image_path):
            shutil.move(image_path, folder + '/')
            if folder == 'train':
                train_count += 1
            else:
                test_count += 1
        else:
            print(f"Image not found, skipping: {image_path}")

# Total count of images
total_images = train_count + test_count

# Print summary statistics
print(f"Total images: {total_images}")
print(f"Training images: {train_count} ({(train_count / total_images) * 100:.2f}%)")
print(f"Testing images: {test_count} ({(test_count / total_images) * 100:.2f}%)")


Image not found, skipping: data/thumbnails/images/Throttle House/CFUu71dMpqA.jpg
Image not found, skipping: data/thumbnails/images/jacksepticeye/Uil9bh8kJgg.jpg
Image not found, skipping: data/thumbnails/images/Top Gear/jV51BGIzkwU.jpg
Image not found, skipping: data/thumbnails/images/CDawgVA/MCMY2kMVPa4.jpg
Image not found, skipping: data/thumbnails/images/Donut Media/z9rfPuCuDmc.jpg
Image not found, skipping: data/thumbnails/images/Gordon Ramsay/x5nEb-7UKZI.jpg
Image not found, skipping: data/thumbnails/images/Key & Peele/vNIe_UwHPU0.jpg
Image not found, skipping: data/thumbnails/images/Internet Historian/Qh9KBwqGxTI.jpg
Image not found, skipping: data/thumbnails/images/ElectroBOOM/im-PLK7ePhQ.jpg
Image not found, skipping: data/thumbnails/images/The Graham Norton Show/oFF134_iokI.jpg
Image not found, skipping: data/thumbnails/images/The Office/ST-653teD1U.jpg
Image not found, skipping: data/thumbnails/images/SmarterEveryDay/7pOXunRYJIw.jpg
Image not found, skipping: data/thumbnails/

## Step 6

In [24]:
# Function to create .lst files
def create_lst(df, output_file):
    with open(output_file, 'w') as f:
        for index, row in df.iterrows():
            f.write(f"{index}\t{row['Category']}\t{row['Id']}.jpg\n")

create_lst(train_df, 'train/train.lst')
create_lst(test_df, 'test/test.lst')

## Step 7

In [25]:
assert os.path.isfile('train/train.lst'), "train.lst not found in train folder!"
assert os.path.isfile('test/test.lst'), "test.lst not found in test folder!"
print("Both .lst files found successfully.")

Both .lst files found successfully.


## Step 8

In [27]:
import sagemaker
from sagemaker.s3 import S3Uploader
import os

# Set the S3 bucket
s3_bucket = 'youtube-category-project'  # Without the 's3://' prefix for compatibility with SageMaker's uploader

# Check if train and test folders exist locally
train_folder = 'train'
test_folder = 'test'

if not os.path.isdir(train_folder) or not os.path.isdir(test_folder):
    raise FileNotFoundError("Train or test folder does not exist. Ensure they were created correctly.")

# Upload train and test folders to S3, with separate S3 paths
train_s3_path = S3Uploader.upload(local_path=train_folder, desired_s3_uri=f's3://{s3_bucket}/train')
test_s3_path = S3Uploader.upload(local_path=test_folder, desired_s3_uri=f's3://{s3_bucket}/test')

# Print out the paths to confirm they were uploaded correctly
print(f"Train folder uploaded to S3 path: {train_s3_path}")
print(f"Test folder uploaded to S3 path: {test_s3_path}")

Train folder uploaded to S3 path: s3://youtube-category-project/train
Test folder uploaded to S3 path: s3://youtube-category-project/test


## Step 9

In [28]:
s3_client = sagemaker.Session().boto_session.client('s3')

# Define a function to check .lst files in S3
def check_lst_in_s3(s3_bucket, prefix):
    response = s3_client.list_objects_v2(Bucket=s3_bucket, Prefix=prefix)
    lst_files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.lst')]
    return bool(lst_files)

assert check_lst_in_s3(s3_bucket, 'train/train.lst'), "train.lst not found in S3 train folder!"
assert check_lst_in_s3(s3_bucket, 'test/test.lst'), "test.lst not found in S3 test folder!"
print("Both .lst files are in S3 successfully.")

Both .lst files are in S3 successfully.


## Step 10

In [34]:
from sagemaker.estimator import Estimator
from sagemaker import image_uris
import sagemaker
import os

# Define SageMaker role and instance type
role = sagemaker.get_execution_role()
instance_type = 'ml.p2.xlarge'

# Retrieve the prebuilt image classification container image URI for MXNet in your region
image_uri = image_uris.retrieve(framework='image-classification', region=sagemaker.Session().boto_region_name)

# Count the number of training samples
train_sample_count = sum([len(files) for _, _, files in os.walk('train')])

# Configure the Estimator with required hyperparameters
estimator = Estimator(
    image_uri=image_uri,               # Use the prebuilt SageMaker image
    role=role,
    instance_count=1,
    instance_type=instance_type,
    hyperparameters={
        'epochs': 10,
        'num_classes': 20,
        'use_pretrained_model': 1,
        'num_training_samples': train_sample_count  # Add the total number of training samples
    }
)

# Start training job with train and validation data paths, specifying ContentType
estimator.fit({
    'train': sagemaker.inputs.TrainingInput(train_s3_path, content_type='application/x-image'),
    'validation': sagemaker.inputs.TrainingInput(test_s3_path, content_type='application/x-image')
})


INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: image-classification-2024-10-29-01-59-27-965


2024-10-29 01:59:29 Starting - Starting the training job...
2024-10-29 01:59:54 Starting - Preparing the instances for training......
2024-10-29 02:00:39 Downloading - Downloading input data...
2024-10-29 02:01:09 Downloading - Downloading the training image........................
2024-10-29 02:05:22 Training - Training image download completed. Training in progress.....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mNvidia gpu devices, drivers and cuda toolkit versions (only available on hosts with GPU):[0m
[34mTue Oct 29 02:06:03 2024       [0m
[34m+-----------------------------------------------------------------------------+[0m
[34m| NVIDIA-SMI 470.256.02   Driver Version: 470.256.02   CUDA Version: 11.4     |[0m
[34m|-------------------------------+----------------------+----------------------+[0m
[34m| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |[0m
[34m| Fan  Te

UnexpectedStatusException: Error for Training job image-classification-2024-10-29-01-59-27-965: Failed. Reason: ClientError: .lst file missing in the train_lst channel., exit code: 2

## Step 11

In [None]:
predictor = mxnet_estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

## Step 12

In [None]:
from PIL import Image
import numpy as np

def predict_image(image_path):
    with open(image_path, 'rb') as img:
        result = predictor.predict(img)
    return result

# Predict on sample images from the validation folder
for img_file in os.listdir('validation/'):
    img_path = os.path.join('validation', img_file)
    prediction = predict_image(img_path)
    print(f"Prediction for {img_file}: {prediction}")


## Step 13

In [None]:
predictor.delete_endpoint()