In [1]:
import torch
torch.cuda.empty_cache()


In [2]:
import pandas as pd
import numpy as np
from PIL import Image
import os

# Assuming the annotations are in a text file
annotations_file = 'annotation.txt'
image_folder = './'

# Read the annotations
annotations = pd.read_csv(annotations_file, sep=' ', header=None)

annotations


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,537,350,551,343,568,344,582,353,567,357,...,0.028289,0.556645,-0.910367,497.835052,-32.616417,-1.906307,498.891266,33.729710,0.085573,496.778839
1,529,342,543,336,559,337,573,345,558,349,...,0.025278,-3.828883,-4.842280,508.932648,-36.961273,-5.369772,511.060089,29.303507,-4.314788,506.805176
2,564,342,579,335,597,337,612,346,596,349,...,0.039401,14.374902,-4.112084,503.034241,-18.804157,-5.365746,503.405701,47.553963,-2.858423,502.662750
3,544,334,558,327,575,328,590,337,574,341,...,0.036817,4.041533,-8.495733,506.759644,-29.090580,-9.368177,508.774963,37.173645,-7.623289,504.744324
4,531,336,545,330,561,330,575,339,560,343,...,0.027466,-3.035088,-7.765797,510.304565,-36.161705,-8.315956,512.514587,30.091526,-7.215638,508.094513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,625,402,638,397,653,399,665,408,652,411,...,0.071705,42.613365,30.039637,572.214172,9.628985,27.512640,569.349792,75.597740,32.566635,575.078552
506,616,415,629,410,644,412,656,421,643,423,...,0.060802,37.677147,36.256992,564.379089,4.536333,34.250275,564.847107,70.817963,38.263710,563.911072
507,556,399,570,392,586,393,599,401,584,405,...,0.049988,7.426083,23.939354,532.911804,-25.725267,22.326677,531.938782,40.577435,25.552031,533.884827
508,555,410,569,404,585,404,599,413,584,416,...,0.038618,7.657980,29.278347,525.193726,-25.518261,28.025335,524.622192,40.834221,30.531359,525.765259


In [2]:
# import os
# from PIL import Image
# from torch.utils.data import Dataset, DataLoader
# import torchvision.transforms as transforms
# import torch

# class GazeDataset(Dataset):
#     def __init__(self, annotations, image_folder, transform=None):
#         self.annotations = annotations
#         self.image_folder = image_folder
#         self.transform = transform

#     def __len__(self):
#         return len(self.annotations)

#     def __getitem__(self, idx):
#         # Format the image filename as '0001.jpg', '0002.jpg', ...
#         image_filename = f'{idx + 1:04d}.jpg'
#         image_path = os.path.join(self.image_folder, image_filename)
        
#         # Open the image
#         image = Image.open(image_path).convert('RGB')
        
#         # Apply transformations (if any)
#         if self.transform:
#             image = self.transform(image)
        
#         # Extract gaze vector (last three values)
#         gaze_vector = torch.tensor(self.annotations.iloc[idx, -3:].values, dtype=torch.float32)
        
#         return image, gaze_vector

# # Define any image transformations (e.g., normalization)
# data_transforms = transforms.Compose([
#     transforms.ToTensor(),  # Converts PIL image to torch.Tensor with values in [0, 1]
#     transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Normalizes the data
# ])

# # Create dataset and DataLoader
# gaze_dataset = GazeDataset(annotations, image_folder, transform=data_transforms)
# train_loader = DataLoader(gaze_dataset, batch_size=8, shuffle=True)

# print(f'Dataset created with {len(gaze_dataset)} samples.')

# # Sample usage of the DataLoader
# for images, gaze_vectors in train_loader:
#     print(f'Batch size: {images.size(0)}')
#     break  # Remove this in training loop


In [3]:
import torchvision.transforms as transforms

# Define the transformations: ToTensor converts image to float32, and Normalize scales the values
data_transforms = transforms.Compose([
    transforms.ToTensor(),  # Converts PIL image to torch.FloatTensor and scales pixel values to [0, 1]
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # Optional normalization
])

In [4]:
images = []
gaze_vectors = []

# Load images and gaze vectors
for idx, row in annotations.iterrows():
    # Format the image filename as '0001.jpg', '0002.jpg', ...
    image_filename = f'{idx + 1:04d}.jpg'  # +1 to start from 0001 instead of 0000
    image_path = os.path.join(image_folder, image_filename)
    
    # Open the image
    image = Image.open(image_path).convert('RGB')
    
    # Optionally apply transformations
    image = data_transforms(image)  # Uncomment if using transformations
    
    images.append(image)
    gaze_vectors.append(row[-3:].values)  # Gaze vector (last three values)

# # Convert lists to numpy arrays or tensors as needed
# images = np.array(images)  # Note: PIL images cannot be directly converted; consider using a tensor or a list of arrays
# gaze_vectors = np.array(gaze_vectors)

# print(f'Loaded {len(images)} images and corresponding gaze vectors.')

In [5]:
np.set_printoptions(threshold=np.inf)
print(images[0].shape)
print(gaze_vectors[0])

torch.Size([3, 720, 1280])
[3.37297100e+01 8.55730000e-02 4.96778839e+02]


In [6]:
import torch
from torch.utils.data import Dataset

class GazeDataset(Dataset):
    def __init__(self, images, gaze_vectors):
        self.images = images
        self.gaze_vectors = gaze_vectors

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        return self.images[idx], torch.tensor(self.gaze_vectors[idx], dtype=torch.float32)

gaze_dataset = GazeDataset(images, gaze_vectors)


In [7]:
print(gaze_dataset.images[0])

tensor([[[-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         ...,
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.]],

        [[-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         ...,
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.]],

        [[-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         ...,
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.]]])


In [8]:
from torch.utils.data import DataLoader, random_split

train_size = int(0.8 * len(gaze_dataset))
val_size = len(gaze_dataset) - train_size
train_dataset, val_dataset = random_split(gaze_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [9]:
print(f'Dataset created with {len(gaze_dataset)} samples.')

# Sample usage of the DataLoader
for images, gaze_vectors in train_loader:
    print(f'Batch size: {images.size(0)}')
    break  # Remove this in training loop

Dataset created with 510 samples.
Batch size: 8


In [10]:
# import torch.nn as nn
# import torch.nn.functional as F

# class GazeCNN(nn.Module):
#     def __init__(self):
#         super(GazeCNN, self).__init__()
#         self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
#         self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
#         self.fc1 = nn.Linear(64 * 64 * 64, 128)  # Adjust based on input size
#         self.fc2 = nn.Linear(128, 3)  # Output: gaze vector

#     def forward(self, x):
#         x = F.relu(self.conv1(x))
#         x = F.max_pool2d(x, kernel_size=2, stride=2)
#         x = F.relu(self.conv2(x))
#         x = F.max_pool2d(x, kernel_size=2, stride=2)
#         x = x.view(x.size(0), -1)  # Flatten
#         x = F.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x

# model = GazeCNN()


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel, self).__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        
        # Fully connected layers
        self.fc1 = nn.Linear(128 * 56 * 56, 128)  # Adjust based on the output size of conv layers
        self.fc2 = nn.Linear(128, 3)  # Output layer (e.g., for 3 gaze vector components)
    
    def forward(self, x):
        # Forward through conv layers
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)  # Max pooling
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)  # Max pooling
        
        # Check the output shape of the conv layers
        print(f"Shape after conv layers: {x.shape}")
        
        # Flatten the output from conv layers
        x = x.view(x.size(0), -1)  # Flatten
        print(f"Shape after flattening: {x.shape}")
        
        # Forward through fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Instantiate the model
model = CNNModel()

# Create a dummy input with batch size 8 and image size 224x224
sample_input = torch.randn(8, 3, 224, 224)

# Forward pass
outputs = model(sample_input)


Shape after conv layers: torch.Size([8, 128, 56, 56])
Shape after flattening: torch.Size([8, 401408])


In [12]:
import torch.optim as optim

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [13]:
import torch
print("Is CUDA available: ", torch.cuda.is_available())
print("CUDA version: ", torch.version.cuda)
print("PyTorch version: ", torch.__version__)
print("Number of GPUs: ", torch.cuda.device_count())
print("CUDA device: ", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


Is CUDA available:  True
CUDA version:  12.1
PyTorch version:  2.4.1+cu121
Number of GPUs:  1
CUDA device:  NVIDIA GeForce RTX 3050 Laptop GPU


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

CNNModel(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=401408, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=3, bias=True)
)

In [16]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    
    # Training loop
    for images, gaze_vectors in train_loader:
        # Move data to GPU if using CUDA
        images = images.to(device)
        gaze_vectors = gaze_vectors.to(device)
        
        # Zero gradients from the previous step
        optimizer.zero_grad()
        
        # Forward pass: compute the output
        outputs = model(images)
        
        # Compute the loss
        loss = criterion(outputs, gaze_vectors)
        
        # Backward pass: compute gradients
        loss.backward()
        
        # Update the model parameters
        optimizer.step()
        
        # Accumulate the running loss
        running_loss += loss.item()
    
    # Calculate average loss over the epoch
    epoch_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    # Optional: Validation loop (use a validation DataLoader)
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():  # No need to compute gradients for validation
        for images, gaze_vectors in val_loader:
            images = images.to(device)
            gaze_vectors = gaze_vectors.to(device)
            
            outputs = model(images)
            loss = criterion(outputs, gaze_vectors)
            val_loss += loss.item()
    
    val_loss = val_loss / len(val_loader)
    print(f'Validation Loss: {val_loss:.4f}')


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.76 GiB. GPU 0 has a total capacity of 3.80 GiB of which 1.59 GiB is free. Including non-PyTorch memory, this process has 2.20 GiB memory in use. Of the allocated memory 2.11 GiB is allocated by PyTorch, and 4.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 

In [14]:
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    for images, gaze_vectors in train_loader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, gaze_vectors)
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

In [2]:
from PIL import Image

# Open an image
image_path = '0011.jpg'
image = Image.open(image_path)

# Convert image to RGB if not already
image = image.convert('RGB')

# Get pixel values
pixels = list(image.getdata())  # Get all pixel values

# Print the first 10 pixel values
print(pixels[:10])

# To see the size of the image
print(f'Image size: {image.size}')  # Outputs (width, height)


[(0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0)]
Image size: (1280, 720)
