#### Step 1: explain the data
- what is included
- how it was obtained
- all important details about how it was sampled from the student's own digital archive.

**Data Description**

"if you want to know what someone fears losing watch what they photograph."

One of my hobbies is taking photos with film and CCD cameras.
I love to record how I see the world through a camera, especially film and CCD, so I have a collection of 1000+ pictures on my computer.
-> some are shared, but some are only downloaded onto the phone --> so they differ in resolution (remove if not necessary)

Something special about taking photos with antique cameras: there is more grain in the images --> so they may require extra processing to increase contrast for better processing????

I removed the film pictures that are overexposed, as well as those from the disposable camera that I used when I first started taking pictures.
I try to use only pictures from one camera (Nikon Lite Touch Zoom 110W),
but since I like to experiment with different brands and types of film --> the colour gradient is diverse


**Overall Objective**


**Objective for draft 1**

Build a simple image classification model to differentiate human portraits from non-portrait images in film photography.

A human portrait is an image where a human is the central and dominant subject, with a clear emphasis on their face, expression, or body (e.g., a headshot, half-body, or full-body portrait).
An image is not a portrait if the human is small in the background or if other elements (landscape, architecture, etc.) dominate the frame.

#### Step 2: Data Ingestion
- converting this data to a Python-readable format (and scikit-learn)
- loading this data into an appropriate data structure (np.array, pandas dataframe, glob etc.).


In [None]:
# Import necessary libraries
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.utils import shuffle

# Define the paths to your image directories
# Both are relative paths; each folder is listed in full by load_images_from_folder below,
# portraits with label=1 and non-portraits with label=0.
portrait_dir = "data/portraits/"  # Folder containing portrait images
non_portrait_dir = "data/non_portraits/"  # Folder containing non-portrait images

# Function to load images from a given folder
def load_images_from_folder(folder, label, target_size=(224, 224)):
    """
    Loads images from a specified folder, converts them to RGB, resizes them
    to a fixed size, and converts them to NumPy arrays.

    Files that cannot be opened or decoded as images are reported with a
    printed message and skipped, so the two returned lists always have the
    same length.

    Parameters:
    - folder (str): Path to the folder containing images.
    - label (int): Label assigned to images (1 for portraits, 0 for non-portraits).
    - target_size (tuple): Desired image dimensions (default is 224x224 pixels).

    Returns:
    - images (list): List of image arrays, one per successfully loaded file.
    - labels (list): List of corresponding labels.
    """
    images = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the dataset
    # order (and therefore the seeded shuffle applied later) reproducible.
    for filename in sorted(os.listdir(folder)):
        img_path = os.path.join(folder, filename)  # Get the full file path

        try:
            with Image.open(img_path) as img:  # Open the image (closed on exit)
                # Force a common 3-channel mode: greyscale, palette or RGBA
                # files would otherwise yield arrays with a different number
                # of channels and could not be stacked into one array.
                rgb_img = img.convert("RGB")
                img_array = np.array(rgb_img.resize(target_size))
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error loading {filename}: {e}")
            continue

        images.append(img_array)  # Append processed image to list
        labels.append(label)  # Append corresponding label (1 or 0)

    return images, labels

# Load images and labels from both folders
portrait_images, portrait_labels = load_images_from_folder(portrait_dir, label=1)  # Load portraits
non_portrait_images, non_portrait_labels = load_images_from_folder(non_portrait_dir, label=0)  # Load non-portraits

# Combine portrait and non-portrait data
# The lists are concatenated portraits-first, so X[i] and y[i] refer to the same image.
# NOTE(review): np.array only yields one regular N-D array if every image array has the
# same shape — confirm all files decode to the same number of channels.
X = np.array(portrait_images + non_portrait_images)  # Stack all image arrays
y = np.array(portrait_labels + non_portrait_labels)  # Stack all labels

# Shuffle dataset to randomize image order
# X and y are passed together so images and labels stay aligned; random_state=42 fixes the seed.
X, y = shuffle(X, y, random_state=42)

# Function to display sample images
def show_sample_images(X, y, num_samples=5):
    """
    Displays a few sample images from the dataset with labels.

    The first images of X are drawn side by side in a single row. If the
    dataset holds fewer than num_samples images, only the available ones are
    shown instead of raising an IndexError.

    Parameters:
    - X (numpy array): Array of image data.
    - y (numpy array): Array of labels (1 = portrait, 0 = non-portrait).
    - num_samples (int): Number of sample images to display (default is 5).
    """
    # Never index past the end of the dataset.
    count = min(num_samples, len(X))

    plt.figure(figsize=(10, 5))  # Set figure size
    for i in range(count):
        plt.subplot(1, count, i + 1)  # One row, one column per image
        plt.imshow(X[i])  # Show the image
        plt.title("Portrait" if y[i] == 1 else "Non-Portrait")  # Assign title based on label
        plt.axis("off")  # Hide axes
    plt.show()

# Display 5 sample images to verify the dataset
# (X was shuffled above, so these are the first five images in shuffled order.)
show_sample_images(X, y, num_samples=5)

# Print dataset statistics
print(f"Total images loaded: {X.shape[0]}")  # Number of images loaded
print(f"Image dimensions: {X.shape[1:]}")  # Shape of each image (Height, Width, Channels)
# np.unique(..., return_counts=True) returns a (unique labels, counts) pair, printed as a tuple.
print(f"Label distribution: {np.unique(y, return_counts=True)}")  # Count of each category

#### Step 3: Data Cleaning 
explaining any necessary cleaning, pre-processing, and feature engineering the data requires

+ [code block] completing these steps. 
perform some basic exploratory data analysis at this point reporting and visualizing the samples and computing appropriate descriptive statistics.

In [None]:
# Import required libraries
import os
import numpy as np
import cv2  # For image processing
import random
import matplotlib.pyplot as plt
from PIL import Image, ImageEnhance

# Set image directories
# NOTE(review): these placeholder paths rebind portrait_dir / non_portrait_dir and differ from
# the "data/portraits/" and "data/non_portraits/" paths used in the Step 2 cell — confirm which is intended.
portrait_dir = "path_to_your_images/portraits/"
non_portrait_dir = "path_to_your_images/non_portraits/"

# Parameters
# These three values are used as the default arguments of preprocess_image, so they are
# captured when that function is defined; re-run its definition after changing them.
IMG_SIZE = (224, 224)  # Standard CNN input size
USE_GRAYSCALE = False  # Set to True if you want grayscale images
APPLY_AUGMENTATION = True  # Apply data augmentation

def preprocess_image(img_path, target_size=IMG_SIZE, grayscale=USE_GRAYSCALE, augment=APPLY_AUGMENTATION):
    """
    Loads an image, converts it to a consistent colour mode, resizes it,
    optionally augments it, and normalizes pixel values.

    Parameters:
    - img_path (str): Path to the image file.
    - target_size (tuple): The desired image size (width, height).
    - grayscale (bool): Convert to single-channel grayscale if True,
      otherwise convert to 3-channel RGB.
    - augment (bool): Apply data augmentation if True. The augmented image
      replaces the original; it is not returned in addition to it.

    Returns:
    - Processed image as a float NumPy array with values in [0, 1],
      or None if the file could not be processed (an error is printed).
    """
    try:
        # The context manager closes the underlying file handle; convert()
        # returns an independent in-memory copy that stays valid afterwards.
        with Image.open(img_path) as source:
            # Always normalise the mode so every array has the same number
            # of channels (RGBA / palette / greyscale inputs would otherwise
            # make the stacked dataset ragged).
            img = source.convert("L" if grayscale else "RGB")

        # Resize image
        img = img.resize(target_size)

        # Apply data augmentation
        if augment:
            img = apply_augmentation(img)

        # Convert to NumPy array and normalize pixel values to range [0,1]
        return np.array(img) / 255.0

    except Exception as e:
        # Best effort: report the failure and let the caller skip this file.
        print(f"Error processing {img_path}: {e}")
        return None

def apply_augmentation(img):
    """
    Returns a randomly perturbed copy of the given image.

    Four steps are applied in this fixed order:
    1. rotation by a uniform random angle in [-15, 15] degrees,
    2. brightness scaling by a uniform random factor in [0.8, 1.2],
    3. horizontal flip with probability 0.5,
    4. additive Gaussian noise (to simulate film grain), clipped to [0, 255].

    Steps 1-3 draw from the `random` module and step 4 from `np.random`.

    Parameters:
    - img (PIL.Image): The input image.

    Returns:
    - Augmented image (PIL.Image).
    """
    # 1. Rotation
    rotation_deg = random.uniform(-15, 15)
    rotated = img.rotate(rotation_deg)

    # 2. Brightness
    brightness_scale = random.uniform(0.8, 1.2)
    adjusted = ImageEnhance.Brightness(rotated).enhance(brightness_scale)

    # 3. Horizontal flip
    should_flip = random.random() > 0.5
    if should_flip:
        adjusted = adjusted.transpose(Image.FLIP_LEFT_RIGHT)

    # 4. Film-grain style noise: standard deviation is 0.02 of the 0-255 range
    pixels = np.array(adjusted)
    grain = np.random.normal(loc=0, scale=0.02, size=pixels.shape)
    noisy = np.clip(pixels + grain * 255, 0, 255).astype(np.uint8)

    return Image.fromarray(noisy)  # Back to PIL format

# Load and preprocess all images
def load_and_preprocess_images(folder, label):
    """
    Loads all images from a folder, preprocesses them, and stores them in lists.

    Every entry of the folder is passed to preprocess_image; entries for which
    it returns None (files that could not be processed) are skipped.

    Parameters:
    - folder (str): Path to the folder containing images.
    - label (int): Label for classification (1 = portrait, 0 = non-portrait).

    Returns:
    - images (list): List of preprocessed images as NumPy arrays.
    - labels (list): Corresponding labels (same length as images).
    """
    images, labels = [], []

    # os.listdir returns entries in arbitrary order; sorting keeps the dataset
    # order stable so a seeded shuffle afterwards is reproducible.
    for filename in sorted(os.listdir(folder)):
        processed_img = preprocess_image(os.path.join(folder, filename))

        if processed_img is None:
            continue  # Unreadable file: already reported by preprocess_image

        images.append(processed_img)
        labels.append(label)

    return images, labels

# Load and preprocess portrait and non-portrait images
portrait_images, portrait_labels = load_and_preprocess_images(portrait_dir, label=1)
non_portrait_images, non_portrait_labels = load_and_preprocess_images(non_portrait_dir, label=0)

# Combine dataset
# Lists are concatenated portraits-first, so X[i] and y[i] describe the same image.
X = np.array(portrait_images + non_portrait_images)  # Image data
y = np.array(portrait_labels + non_portrait_labels)  # Labels

# Shuffle dataset
# NOTE: `shuffle` is not imported in this cell; it relies on
# `from sklearn.utils import shuffle` having been run in the Step 2 cell.
X, y = shuffle(X, y, random_state=42)

# Display some preprocessed images
def show_preprocessed_samples(X, y, num_samples=5):
    """
    Displays a few sample images from the preprocessed dataset.

    The first images of X are drawn side by side in a single row. If the
    dataset holds fewer than num_samples images, only the available ones are
    shown instead of raising an IndexError. The module-level USE_GRAYSCALE
    flag (read at call time) selects the grayscale colormap.

    Parameters:
    - X (numpy array): Preprocessed image data.
    - y (numpy array): Labels (1 = portrait, 0 = non-portrait).
    - num_samples (int): Number of images to display.
    """
    # Never index past the end of the dataset.
    count = min(num_samples, len(X))

    plt.figure(figsize=(10, 5))
    for i in range(count):
        plt.subplot(1, count, i + 1)  # One row, one column per image
        if USE_GRAYSCALE:
            # Drop any singleton axes so imshow receives a 2-D array.
            plt.imshow(X[i].squeeze(), cmap="gray")
        else:
            plt.imshow(X[i], cmap=None)
        plt.title("Portrait" if y[i] == 1 else "Non-Portrait")
        plt.axis("off")
    plt.show()

# Show 5 sample preprocessed images
# (X was shuffled above, so these are the first five images in shuffled order.)
show_preprocessed_samples(X, y, num_samples=5)

# Print dataset details
print(f"Total images processed: {X.shape[0]}")
print(f"Image shape: {X.shape[1:]}")  # Expected: (224, 224, 3) or (224, 224) for grayscale
# np.unique(..., return_counts=True) returns a (unique labels, counts) pair, printed as a tuple.
print(f"Label distribution: {np.unique(y, return_counts=True)}")  # Check class balance


#### Step 4: Analysis
- discussing the analysis (classification, regression, or clustering) that will be conducted on the data
- code performs any necessary data splits (such as creating training and test sets)

In [1]:
# [well commented code block] performs any necessary data splits (such as creating training and test sets)

#### Step 5: Model Selection

- discuss **model selection** in a markdown section
- [code] model initialization and construction in a well-commented code block. 
- This section should include a clear discussion of the model's mathematical underpinnings -> include typeset equations and/or algorithms as pseudocode.


[draft]
To establish a benchmark for performance, we first consider a baseline model using Logistic Regression or Support Vector Machines (SVM). These models are widely used for classification tasks but are not typically applied to raw image data due to the high dimensionality. However, using them as a starting point allows us to quantify the challenge of the classification task before introducing deep learning models.

For this approach, images are flattened into one-dimensional vectors, where each pixel becomes a feature in a traditional machine learning model. Despite its limitations, this method provides insights into whether raw pixel values alone carry enough information for classification. If the baseline performs poorly, it indicates that a more advanced feature extraction approach, such as convolutional neural networks (CNNs), is necessary. The key advantage of this baseline approach is its simplicity—training a logistic regression or SVM model is computationally inexpensive and provides an interpretable reference point.

#### Step 6: train the model
- code and explanations for necessary cross validation or hyperparameter tuning.


#### Step 7: 
- code to generate predictions for out of sample data, and compute appropriate **performance metrics**.

#### Step 8: visualize the results and discuss your conclusions.

#### Step 9: executive summary (of the prior eight sections)
- clearly explaining your steps
- diagramming your pipeline
- visualizing any key results
- explaining any key insights or shortcomings of your approach. 
- You may wish to include a discussion of how the model might be improved.


#### Step 10: references 
- documents, guides, or code repos you accessed for the project.

##### Feature Extraction Using BERT Embeddings

We generate embeddings for each song's lyrics using a BERT model. These embeddings will be used for similarity comparisons.

##### Feature Extraction Using BERT Embeddings

We generate embeddings for each song's lyrics using a BERT model. These embeddings will be used for similarity comparisons.

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

# We generate embeddings for each song's lyrics using a BERT model. These embeddings will be used for similarity comparisons.

# Initialize the tokenizer and model (using the base uncased model)
# Both are module-level globals that get_bert_embedding reads on every call.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(text):
    """
    Return a single mean-pooled BERT vector for a piece of text.

    Parameters:
    - text (str or list of str): Raw text, or a list of tokens that is joined
      with single spaces before tokenization.

    Returns:
    - NumPy array holding the mean of the model's last hidden state over
      dim=1 (the sequence dimension). Every position returned by the model is
      averaged; no attention-mask weighting is applied.

    Uses the module-level `tokenizer` and `model`. Input is truncated to
    max_length=512 by the tokenizer call.
    """
    # Token lists are flattened into one space-separated string
    sentence = " ".join(text) if isinstance(text, list) else text

    encoded = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Inference only: no gradients needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Average over the sequence dimension, drop singleton axes, bring to CPU
    pooled = hidden_states.mean(dim=1).squeeze().detach().cpu()

    # Go through a Python list so the result is a plain NumPy array
    return np.array(pooled.tolist())

# Example: Generate BERT embeddings for each song in the DataFrame
# Assumes 'df' already has a 'cleaned_tokens' column (the token lists created by the
# lyrics-preprocessing cell further down in this notebook, which must be run first).
df['bert_embedding'] = df['cleaned_tokens'].apply(get_bert_embedding)

# Convert embeddings into a feature matrix (stacking arrays)
# One row per song: each per-song embedding array becomes a row of X_bert.
X_bert = np.vstack(df['bert_embedding'].values)
print("BERT Embedding shape:", X_bert.shape)


In [None]:
# %% [markdown]
# ## Step 5: Query Representation, Similarity Calculation, and Ranking
# In this section we restrict the input to one imagery-related word (e.g. "sunset"), generate its embedding, compute cosine similarity with each song's embedding, and rank the songs.

import sklearn
from sklearn.metrics.pairwise import cosine_similarity

def get_query_embedding(query_word):
    """
    Build the BERT embedding for a query word or short phrase.

    The query is split on whitespace and the resulting token list is handed
    to get_bert_embedding, which joins the tokens back with single spaces
    before encoding them.
    """
    # Works for a single word as well as a simple multi-word phrase
    return get_bert_embedding(query_word.split())

# Set your imagery-related query word here
query = "apple"  # You can change this as needed

# Generate the embedding for the query word
query_embedding = get_query_embedding(query)

# Reshape the query embedding for cosine similarity computation
# (1, -1) turns the 1-D vector into a single-row 2-D array.
query_embedding = query_embedding.reshape(1, -1)

# Compute cosine similarities between the query and all song embeddings
# flatten() collapses the result to 1-D so it can be stored as a DataFrame column;
# this relies on X_bert having one row per row of df.
similarities = cosine_similarity(query_embedding, X_bert).flatten()

# Add the similarity scores to the DataFrame
df['similarity'] = similarities

# Rank the songs by similarity (descending order)
df_sorted = df.sort_values(by='similarity', ascending=False)

# head() with no argument shows the top rows only (pandas default of five).
print(f"\nTop ranked songs for query '{query}':")
print(df_sorted[['filename', 'similarity']].head())


In [None]:
# Optionally, visualize the distribution of similarity scores
# NOTE: `plt` is not imported in the BERT cells; this relies on
# `import matplotlib.pyplot as plt` having been run in another cell.
import seaborn as sns
# Histogram of the per-song similarity scores with a kernel density estimate overlaid.
sns.histplot(similarities, kde=True)
plt.title(f"Cosine Similarity Distribution for Query: '{query}'")
plt.xlabel("Cosine Similarity")
plt.ylabel("Frequency")
plt.show()

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

# We generate embeddings for each song's lyrics using a BERT model. These embeddings will be used for similarity comparisons.

# Initialize the tokenizer and model (using the base uncased model)
# Both are module-level globals that get_bert_embedding reads on every call.
# NOTE(review): this cell repeats an earlier, identical BERT cell in this notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(text):
    """
    Return a single mean-pooled BERT vector for a piece of text.

    Parameters:
    - text (str or list of str): Raw text, or a list of tokens that is joined
      with single spaces before tokenization.

    Returns:
    - NumPy array holding the mean of the model's last hidden state over
      dim=1 (the sequence dimension). Every position returned by the model is
      averaged; no attention-mask weighting is applied.

    Uses the module-level `tokenizer` and `model`. Input is truncated to
    max_length=512 by the tokenizer call.
    """
    # Token lists are flattened into one space-separated string
    sentence = " ".join(text) if isinstance(text, list) else text

    encoded = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Inference only: no gradients needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Average over the sequence dimension, drop singleton axes, bring to CPU
    pooled = hidden_states.mean(dim=1).squeeze().detach().cpu()

    # Go through a Python list so the result is a plain NumPy array
    return np.array(pooled.tolist())

# Example: Generate BERT embeddings for each song in the DataFrame
# Assumes 'df' already has a 'cleaned_tokens' column (the token lists created by the
# lyrics-preprocessing cell further down in this notebook, which must be run first).
df['bert_embedding'] = df['cleaned_tokens'].apply(get_bert_embedding)

# Convert embeddings into a feature matrix (stacking arrays)
# One row per song: each per-song embedding array becomes a row of X_bert.
X_bert = np.vstack(df['bert_embedding'].values)
print("BERT Embedding shape:", X_bert.shape)


In [None]:
# %% [markdown]
# ## Step 5: Query Representation, Similarity Calculation, and Ranking
# In this section we restrict the input to one imagery-related word (e.g. "sunset"), generate its embedding, compute cosine similarity with each song's embedding, and rank the songs.

import sklearn
from sklearn.metrics.pairwise import cosine_similarity

def get_query_embedding(query_word):
    """
    Build the BERT embedding for a query word or short phrase.

    The query is split on whitespace and the resulting token list is handed
    to get_bert_embedding, which joins the tokens back with single spaces
    before encoding them.
    """
    # Works for a single word as well as a simple multi-word phrase
    return get_bert_embedding(query_word.split())

# Set your imagery-related query word here
query = "apple"  # You can change this as needed

# Generate the embedding for the query word
query_embedding = get_query_embedding(query)

# Reshape the query embedding for cosine similarity computation
# (1, -1) turns the 1-D vector into a single-row 2-D array.
query_embedding = query_embedding.reshape(1, -1)

# Compute cosine similarities between the query and all song embeddings
# flatten() collapses the result to 1-D so it can be stored as a DataFrame column;
# this relies on X_bert having one row per row of df.
similarities = cosine_similarity(query_embedding, X_bert).flatten()

# Add the similarity scores to the DataFrame
df['similarity'] = similarities

# Rank the songs by similarity (descending order)
df_sorted = df.sort_values(by='similarity', ascending=False)

# head() with no argument shows the top rows only (pandas default of five).
print(f"\nTop ranked songs for query '{query}':")
print(df_sorted[['filename', 'similarity']].head())


In [None]:
# Optionally, visualize the distribution of similarity scores
# NOTE: `plt` is not imported in the BERT cells; this relies on
# `import matplotlib.pyplot as plt` having been run in another cell.
import seaborn as sns
# Histogram of the per-song similarity scores with a kernel density estimate overlaid.
sns.histplot(similarities, kde=True)
plt.title(f"Cosine Similarity Distribution for Query: '{query}'")
plt.xlabel("Cosine Similarity")
plt.ylabel("Frequency")
plt.show()

In [None]:
import re
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Download required NLTK resources (if not already downloaded)
# preprocess_lyrics below calls nltk.word_tokenize, stopwords.words('english')
# and WordNetLemmatizer, which need these three resources.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_lyrics(text):
    """
    Clean one song's raw lyrics and return a list of lemmatized tokens.

    Steps:
    1. Split into lines and unconditionally discard the first line.
    2. Drop every line containing a square-bracket tag (e.g. [Chorus], [Verse 1]).
    3. Join the remaining lines with spaces, lowercase, and strip punctuation
       (only word characters and whitespace are kept).
    4. Tokenize with nltk.word_tokenize.
    5. Remove English stopwords.
    6. Lemmatize each token with WordNetLemmatizer.

    Parameters:
    - text (str): Raw lyrics. Non-string values (e.g. None or NaN for a song
      with missing lyrics) yield an empty token list instead of raising.

    Returns:
    - tokens (list of str): Cleaned tokens; empty if nothing remains.
    """
    # Missing lyrics arrive as None/NaN when applied over a DataFrame column.
    if not isinstance(text, str):
        return []

    # 1-2. Drop the first line, then any line carrying a [section] tag
    body_lines = text.split('\n')[1:]
    kept_lines = [line for line in body_lines if not re.search(r'\[.*?\]', line)]

    # 3. Single lowercase string without punctuation
    cleaned_text = ' '.join(kept_lines).lower()
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)

    # 4. Tokenize the text
    tokens = nltk.word_tokenize(cleaned_text)

    # 5. Remove stopwords. The list is loaded once into a set; fetching it
    #    inside the comprehension would re-read it for every token.
    # NOTE(review): punctuation is stripped before this step, so stopwords
    # that contain an apostrophe can no longer match — confirm this is acceptable.
    english_stopwords = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in english_stopwords]

    # 6. Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

# Apply preprocessing to the lyrics column and create a new column for cleaned tokens
# 'cleaned_tokens' is the column the BERT embedding cells read, so this cell has to
# run before them even though it appears later in the notebook.
df['cleaned_tokens'] = df['lyrics'].apply(preprocess_lyrics)

# Add a column for the length of the lyrics (number of tokens)
df['num_tokens'] = df['cleaned_tokens'].apply(len)

print("\nData Cleaning: DataFrame after preprocessing")
print(df[['filename', 'num_tokens']].head())