# 1. Set up packages and server

In [12]:
# Configure the process environment before torch is imported.
import os

os.environ.update({
    'TORCH_HOME': 'your/own/cache/directory',  # placeholder path: point this at a real cache directory
    'CUDA_VISIBLE_DEVICES': '2',               # only the GPU with index 2 is exposed to this process
})

In [13]:
# Data visualization
import plotly.express as px
from ipywidgets import interact

# Data processing
import pandas as pd
import numpy as np
import geopandas as gpd

# Model training
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import copy
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

import timm
from sklearn.model_selection import train_test_split
# import lightning as L
# from lightning import Trainer
# from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint

# Others
import os
import sys
import gc
from pathlib import Path
import os.path


from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image

# project lib
PROJECT_SRC_PATH = os.path.join( '/workspace/workspace/ufo-prediction', 'src-RCA-UFO')
sys.path.append(PROJECT_SRC_PATH)
import dataset

In [14]:
# Release unused cached GPU memory held by PyTorch's caching allocator before starting.
torch.cuda.empty_cache()

In [15]:
# Check if a GPU is available; fall back to the CPU otherwise.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device: {}".format(device))

# Report the specific GPU model when running on CUDA.
if device.type == "cuda":
  print("GPU: {}".format(torch.cuda.get_device_name(0)))

Device: cuda:0
GPU: NVIDIA A100-PCIE-40GB


# 2. Load and preprocess the training and validation data

In [25]:
# Root folders of the training/validation images, one per country.
directories = [
    Path('/workspace/workspace/ufo-prediction/image_data_NLD'),
    Path('/workspace/workspace/ufo-prediction/image_data_ESP'),
    Path('/workspace/workspace/ufo-prediction/image_data_FRA'),
]


def _iter_jpgs(roots):
    """Yield every ``.jpg`` found recursively under each root, in root order."""
    for root in roots:
        yield from root.glob('**/*.jpg')


# One string path per image, across all root folders.
filepaths = pd.Series(list(_iter_jpgs(directories)), name='Filepath').astype(str)

def get_age_bracket(age):
    """Return the construction-period label that contains ``age``.

    ``age`` is a construction year. Bounds are inclusive on both ends.
    Any value that falls in none of the brackets yields ``'Unknown'``.
    """
    brackets = (
        (1900, 1924, '1900-1924'),
        (1925, 1945, '1925-1945'),
        (1946, 1959, '1946-1959'),
        (1960, 1969, '1960-1969'),
        (1970, 1984, '1970-1984'),
        (1985, 1999, '1985-1999'),
        (2000, 2050, '2000-2050'),
    )
    for lower, upper, label in brackets:
        if lower <= age <= upper:
            return label
    # Outside every bracket (or between two integer bounds).
    return 'Unknown'


def extract_age_and_name(filepath):
    """Derive the age-bracket label of an image from its file name.

    The part of the file name before the first dot is scanned for digits and
    the first four of them are read as the construction year. Returns the
    bracket label from ``get_age_bracket``, or ``'Unknown'`` when the name
    contains no digits at all.
    """
    stem = os.path.basename(filepath).split('.')[0]
    year_digits = ''.join(ch for ch in stem if ch.isdigit())[:4]
    if not year_digits:
        # No digits in the name: the year cannot be determined.
        return 'Unknown'
    return get_age_bracket(int(year_digits))

# Integer class index for each age-bracket label.
age_bracket_to_int = {
    '1900-1924': 0,
    '1925-1945': 1,
    '1946-1959': 2,
    '1960-1969': 3,
    '1970-1984': 4,
    '1985-1999': 5,
    '2000-2050': 6
}

# Label every image, then keep only those whose bracket could be determined.
age_brackets = pd.Series(filepaths.apply(extract_age_and_name), name='Age Brackets')
has_known_bracket = age_brackets != 'Unknown'
filtered_age_brackets = age_brackets[has_known_bracket]
filtered_filepaths = filepaths[has_known_bracket]

# Columns: Filepath, Age Brackets, Target. Rows are shuffled with a fixed seed.
trainval_df = pd.concat([filtered_filepaths, filtered_age_brackets], axis=1)
trainval_df = trainval_df.sample(frac=1.0, random_state=1).reset_index(drop=True)
trainval_df['Target'] = trainval_df['Age Brackets'].map(age_bracket_to_int)

In [17]:
# 70/30 train/validation split over image rows, shuffled with a fixed seed.
# NOTE(review): rows are individual images; if several images show the same building they can end
# up on both sides of the split. Confirm whether a building-level (grouped) split is needed.
train_df, val_df = train_test_split(trainval_df, train_size=0.7, shuffle=True, random_state=1)

In [18]:
class AgeBracketDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs described by a DataFrame.

    Columns are read by position: column 0 is the image path and column 2 is
    the integer class label. ``transform``, when given, is applied to the RGB
    PIL image before it is returned.
    """

    def __init__(self, dataframe, transform=None):
        self.transform = transform
        self.dataframe = dataframe

    def __len__(self):
        """Number of rows (images) in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the image at row ``idx`` and return it with its integer label."""
        frame = self.dataframe
        image = Image.open(frame.iloc[idx, 0]).convert("RGB")
        label = int(frame.iloc[idx, 2])  # third column holds the target

        if not self.transform:
            return image, label
        return self.transform(image), label


In [19]:
# Training-time preprocessing with light augmentation.
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224),  # ResNet-18 expects 224x224 inputs
    transforms.RandomHorizontalFlip(),  # A common form of augmentation
    #transforms.RandomRotation(15),  # Rotates the image by up to 15 degrees
    #transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),  # Randomly jitters color
    #transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # Per-channel mean/std normalization
])

# Deterministic preprocessing for validation and test: resize, centre crop, normalize.
val_test_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

train_dataset = AgeBracketDataset(train_df, transform=train_transforms)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

val_dataset = AgeBracketDataset(val_df, transform=val_test_transforms)
# Validation needs no shuffling; a fixed order keeps evaluation deterministic and matches the
# validation loader that the training cell builds.
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# 3. Train model

In [20]:
def train_and_evaluate_val_loss(model, train_loader, val_loader, optimizer, criterion, scheduler, num_epochs=50, patience=7):
    """Train ``model`` with early stopping on the validation loss.

    Each epoch runs one pass over ``train_loader`` followed by one evaluation
    pass over ``val_loader``. The mean per-batch validation loss drives both the
    learning-rate scheduler and the early-stopping counter. When the function
    returns, ``model`` holds the weights of the epoch with the lowest validation
    loss.

    Args:
        model: Network to optimise (modified in place).
        train_loader: Iterable of ``(inputs, labels)`` training batches; must support ``len``.
        val_loader: Iterable of ``(inputs, labels)`` validation batches; must support ``len``.
        optimizer: Optimizer bound to the trainable parameters of ``model``.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        scheduler: Scheduler whose ``step`` accepts the validation loss
            (e.g. ``ReduceLROnPlateau``).
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without improvement after which
            training stops.

    Returns:
        The lowest mean validation loss observed.

    Raises:
        ValueError: If either loader yields no batches.
    """
    # Fail early with a clear message instead of a ZeroDivisionError after a full pass.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError('train_loader and val_loader must each yield at least one batch.')

    # Move batches to wherever the model already lives, so a model deliberately kept on the CPU
    # (or on a non-default GPU) does not hit a device mismatch. Fall back to the previous
    # "CUDA if available" rule only for parameter-less models.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    best_val_loss = float('inf')  # Initialize the best validation loss as infinity
    best_model_wts = copy.deepcopy(model.state_dict())
    epochs_no_improve = 0
    early_stop = False

    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        running_loss = 0.0

        # Training phase
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        # Mean of the per-batch losses (not weighted by batch size).
        avg_val_loss = val_loss / len(val_loader)

        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {running_loss / len(train_loader):.4f}, Validation Loss: {avg_val_loss:.4f}')

        scheduler.step(avg_val_loss)  # Learning rate scheduler step based on validation loss

        # Check for improvement based on validation loss
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # Snapshot a copy: state_dict() returns references that later steps would overwrite.
            best_model_wts = copy.deepcopy(model.state_dict())
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1

        # Early stopping check
        if epochs_no_improve >= patience:
            print(f'Early stopping triggered after {epoch + 1} epochs.')
            early_stop = True
            break

    if not early_stop:
        print('Reached maximum epoch limit.')

    # Load best model weights based on lowest validation loss
    model.load_state_dict(best_model_wts)
    return best_val_loss  # Return the best validation loss achieved


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Best Parameters
batch_size = 32
learning_rate = 0.0005
unfreeze_option = 'last_plus_one'
num_classes = len(age_bracket_to_int)  # one output logit per age bracket

# Initialize the model
# NOTE(review): newer torchvision releases prefer the `weights=` argument over `pretrained=True`;
# confirm against the installed version before switching.
model = models.resnet18(pretrained=True)

# Freeze all layers first
for param in model.parameters():
    param.requires_grad = False

# Replace the pretrained classification head with one sized for this task. Without this the
# network keeps the pretrained head's output size and can predict class indices that are not age
# brackets. The new layer is created after the freeze, so it is trainable by default.
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Unfreeze the selected layers
if unfreeze_option == 'last':
    for param in model.fc.parameters():
        param.requires_grad = True
elif unfreeze_option == 'last_plus_one':
    for param in model.layer4.parameters():
        param.requires_grad = True
    for param in model.fc.parameters():
        param.requires_grad = True

model.to(device)

# Setup DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Setup optimizer, criterion, and scheduler
# Only parameters that are still trainable are handed to the optimizer.
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=5, verbose=True)

# Train the model
train_and_evaluate_val_loss(model, train_loader, val_loader, optimizer, criterion, scheduler, num_epochs=50, patience=7)

KeyboardInterrupt: 

# 4. Prepare the test DataLoader

In [41]:
directory = Path('/workspace/workspace/ufo-prediction/image_data_ID')

# Label the test images from their file names, exactly as done for the training images.
filepaths = pd.Series([*directory.glob(r'**/*.jpg')], name='Filepath').astype(str)
age_brackets = pd.Series(filepaths.apply(extract_age_and_name), name='Age Brackets')

# Keep only images whose bracket could be determined.
has_known_bracket = age_brackets != 'Unknown'
filtered_age_brackets = age_brackets[has_known_bracket]
filtered_filepaths = filepaths[has_known_bracket]

# Columns: Filepath, Age Brackets, Target. Rows are shuffled with a fixed seed.
test_df = pd.concat([filtered_filepaths, filtered_age_brackets], axis=1)
test_df = test_df.sample(frac=1.0, random_state=1).reset_index(drop=True)
test_df['Target'] = test_df['Age Brackets'].map(age_bracket_to_int)

In [42]:
# Count the distinct buildings among the test images.
# The building id is every underscore-separated token of the file name except the first (the age)
# and the last (the image subscript). It is computed inline because extract_building_id is only
# defined in a later cell, so calling it here fails when the notebook runs top to bottom.
building_ids = ['_'.join(path.split('/')[-1].split('_')[1:-1]) for path in filepaths]
len(np.unique(building_ids))

331

In [62]:
# Labelled test set: batches are (images, integer targets), preprocessed like the validation data.
test_dataset = AgeBracketDataset(test_df, transform=val_test_transforms)
# Evaluation gains nothing from shuffling; a fixed order keeps runs reproducible and matches
# the other evaluation loaders in this notebook.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [67]:
class CustomDataset(Dataset):
    """Dataset yielding ``(image, image_path)`` pairs for inference.

    The image path is read from column 0 of the DataFrame. ``transform``, when
    given, is applied to the RGB PIL image before it is returned.
    """

    def __init__(self, dataframe, transform=None):
        self.transform = transform
        self.dataframe = dataframe

    def __len__(self):
        """Number of rows (images) in the backing DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the image at row ``idx`` and return it together with its path."""
        img_path = self.dataframe.iloc[idx, 0]
        image = Image.open(img_path).convert('RGB')
        if not self.transform:
            return image, img_path
        return self.transform(image), img_path

# Deterministic preprocessing for inference: resize straight to 224x224 (no crop), convert to a
# tensor and normalize each channel with the given mean/std.
# NOTE(review): this differs from val_test_transforms (Resize(256) + CenterCrop(224)). Confirm
# the model should be applied with a different resize than the one it was validated with.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Rebinds test_dataset/test_loader to the path-yielding dataset: batches are now
# (images, file paths) instead of (images, labels). shuffle=False keeps test_df row order.
test_dataset = CustomDataset(dataframe=test_df, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [60]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt

# Build a dedicated labelled loader for this cell. `test_loader` is rebound elsewhere in the
# notebook to a dataset that yields (image, path) pairs, which cannot be scored against labels.
eval_loader = DataLoader(
    AgeBracketDataset(test_df, transform=val_test_transforms),
    batch_size=16,
    shuffle=False,
)

# Class indices and their display names, taken from the label mapping used to build the targets
# so the confusion-matrix axes always match the trained classes.
class_names = list(age_bracket_to_int.keys())
class_ids = list(age_bracket_to_int.values())

# Evaluate the model
model.eval()
true_classes = []
predicted_classes = []

with torch.no_grad():
    for inputs, labels in eval_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)  # index of the largest logit per sample
        true_classes.extend(labels.cpu().numpy())
        predicted_classes.extend(predicted.cpu().numpy())

# Convert lists to numpy arrays for evaluation
true_classes = np.array(true_classes)
predicted_classes = np.array(predicted_classes)

# Compute metrics
accuracy = accuracy_score(true_classes, predicted_classes)
precision, recall, f1, _ = precision_recall_fscore_support(true_classes, predicted_classes, average='weighted')

print(f"Test Accuracy: {accuracy:.5f}")
print(f"Test Precision: {precision:.5f}")
print(f"Test Recall: {recall:.5f}")
print(f"Test F1-Score: {f1:.5f}")

# Plot confusion matrix
# `labels=class_ids` fixes the matrix to one row/column per bracket, in mapping order, even when
# a class is absent from the test set, so the tick labels below line up with the cells.
cm = confusion_matrix(true_classes, predicted_classes, labels=class_ids)
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

KeyboardInterrupt: 

# 5. Infer age brackets

In [68]:
model.eval()  # Inference mode
predictions, file_paths = [], []

# Collect the predicted class index and the source path of every image, batch by batch.
with torch.no_grad():
    for batch_images, batch_paths in test_loader:
        logits = model(batch_images.to(device))
        predicted = torch.max(logits, 1).indices  # index of the largest logit per image
        predictions.extend(predicted.cpu().numpy())
        file_paths.extend(batch_paths)


KeyboardInterrupt: 

# 6. Extract building IDs and merge the predictions into the rca-ufo dataset

In [49]:
# Load the merged rca-ufo table from the project's data directory (file is read as Latin-1).
path_data_RCA = os.path.join(dataset.DATA_DIR, 'rca-ufo-merge_ALL.csv')
df = pd.read_csv(path_data_RCA, encoding='latin1')

In [50]:
def extract_building_id(filepath):
    """
    Extracts the building ID from the given filepath.

    File names follow the format '[age_right]_[building_id]_[subscript].jpg'. The building ID
    may itself contain underscores, so it is taken as everything between the first and the last
    underscore-separated token.

    Args:
        filepath: Path of the image, as a string or a path-like object.

    Returns:
        The building ID, or an empty string when the file name has fewer than three
        underscore-separated tokens.
    """
    # os.path.basename matches extract_age_and_name, accepts path-like objects and honours the
    # platform's path separator instead of assuming '/'.
    filename = os.path.basename(filepath)
    parts = filename.split('_')
    # Drop the leading age token and the trailing '<subscript>.jpg' token; re-join the rest.
    building_id = '_'.join(parts[1:-1])
    return building_id

# Resolve the building ID behind every image path seen during inference.
building_ids = list(map(extract_building_id, file_paths))

# One row per image: the building it shows and the class index predicted for it
# (map the index to a bracket label if needed).
predictions_df = pd.DataFrame({'id': building_ids, 'CNN_age_bracket': predictions})

# Now, predictions_df contains the correct Building IDs and their corresponding predicted age brackets


In [53]:
# Display the per-image predictions; a building id repeats when several of its images were scored.
predictions_df

Unnamed: 0,id,CNN_age_bracket
0,v0.1-NLD.5.7_1-23475,0
1,v0.1-NLD.10.11_1-33501,4
2,v0.1-NLD.10.11_1-33501,4
3,v0.1-NLD.7.44_1-21006,3
4,v0.1-NLD.9.4_1-112828,0
...,...,...
487,v0.1-NLD.11.27_1-47300,5
488,v0.1-NLD.11.27_1-33074,4
489,v0.1-NLD.9.4_1-121153,1
490,v0.1-NLD.14.2_1-112216,2


In [46]:
# predictions_df holds one row per image, so a building with several images appears several
# times. Merging it directly would duplicate that building's rows in `df`. Collapse to a single
# prediction per building first: the most frequent predicted class, with ties resolved to the
# lowest class index (mode() returns its values sorted).
building_predictions = (
    predictions_df
    .groupby('id')['CNN_age_bracket']
    .agg(lambda votes: votes.mode().iloc[0])
    .reset_index()
)

# Left merge keeps every row of `df`; buildings without a prediction get NaN.
# validate='many_to_one' raises if the prediction table still has duplicate ids.
main_df = pd.merge(df, building_predictions, on='id', how='left', validate='many_to_one')

In [47]:
# Persist the merged table (without the DataFrame index) for downstream use.
main_df.to_csv('/workspace/workspace/ufo-prediction/demo/rca-ufo-CNN.csv', index=False)

In [48]:
# Force a garbage-collection pass; the displayed value is the number of unreachable objects found.
gc.collect()

10842