# Q1

In [2]:
import pandas as pd
from collections import deque, defaultdict
import time
from typing import List, Dict, Optional, Tuple

In [3]:
# Module-level lookup tables shared by every Q1 helper below.
# load_data() fills them; the search/display functions only read them.
actors = {}  # actor id -> {'id', 'name', 'birth'} record
movies = {}  # movie id -> {'id', 'title', 'year'} record
actor_movies = defaultdict(list)  # actor_id -> [movie_ids]
movie_actors = defaultdict(list)  # movie_id -> [actor_ids]


In [4]:
def load_data(people_csv=r'Dataset_FinalExam\Q1_dataset\small\people.csv', movies_csv=r'Dataset_FinalExam\Q1_dataset\small\movies.csv', stars_csv=r'Dataset_FinalExam\Q1_dataset\small\stars.csv'):
    """
    Load the three Q1 CSV files into the module-level lookup tables.

    Args:
        people_csv: CSV with columns ``id``, ``name``, ``birth``.
        movies_csv: CSV with columns ``id``, ``title``, ``year``.
        stars_csv: CSV with columns ``person_id``, ``movie_id``.

    Side effects:
        Replaces the contents of ``actors``, ``movies``, ``actor_movies`` and
        ``movie_actors`` (in place, so the objects keep their identity).
        Calling the function again therefore reloads the data instead of
        appending duplicate links.

    Errors are reported with a printed message rather than raised; when that
    happens the previously loaded tables are left untouched.
    """
    # The default paths are raw strings: sequences such as "\Q" and "\s" are
    # invalid escapes in a normal literal and trigger a SyntaxWarning on
    # recent Python versions. The resulting path text is unchanged.
    try:
        print("🔄 Loading data...")

        people_df = pd.read_csv(people_csv)
        movies_df = pd.read_csv(movies_csv)
        stars_df = pd.read_csv(stars_csv)

        # Stage everything in local containers first so that a failure half-way
        # through cannot leave the shared tables partially filled.
        loaded_actors = {
            actor_id: {'id': actor_id, 'name': name, 'birth': birth}
            for actor_id, name, birth in zip(people_df['id'], people_df['name'], people_df['birth'])
        }
        loaded_movies = {
            movie_id: {'id': movie_id, 'title': title, 'year': year}
            for movie_id, title, year in zip(movies_df['id'], movies_df['title'], movies_df['year'])
        }

        films_by_actor = {}
        cast_by_film = {}
        for person_id, movie_id in zip(stars_df['person_id'], stars_df['movie_id']):
            films_by_actor.setdefault(person_id, []).append(movie_id)
            cast_by_film.setdefault(movie_id, []).append(person_id)

        # Commit: clear + update keeps the existing objects (and the
        # defaultdict behaviour of the two link tables) while dropping any
        # data left over from an earlier call.
        for table, fresh in (
            (actors, loaded_actors),
            (movies, loaded_movies),
            (actor_movies, films_by_actor),
            (movie_actors, cast_by_film),
        ):
            table.clear()
            table.update(fresh)

        print(f"✅ Data loaded successfully!")
        print(f"   📊 Actors: {len(actors)}")
        print(f"   🎬 Movies: {len(movies)}")
        print(f"   🔗 Connections: {len(stars_df)}")

        # Some cool stats
        total_connections = sum(len(movies_list) for movies_list in actor_movies.values())
        avg_movies = total_connections / len(actors) if actors else 0
        print(f"   📈 Average movies per actor: {avg_movies:.1f}")

    except Exception as e:
        print(f"❌ Error loading data: {e}")


In [5]:
def find_shortest_path(start_actor_id, end_actor_id):
    """
    Breadth-first search for the shortest co-star chain between two actors.

    Args:
        start_actor_id: id of the actor to start from.
        end_actor_id: id of the actor to reach.

    Returns:
        ``[actor1, movie1, actor2, movie2, ..., target_actor]`` (ids), the
        single-element list ``[start_actor_id]`` when both ids are the same
        known actor, or ``None`` when either id is unknown or no chain exists.
    """
    # Validate first so that an unknown id never yields a "path", even when
    # the same unknown id is passed twice.
    if start_actor_id not in actors or end_actor_id not in actors:
        return None

    if start_actor_id == end_actor_id:
        return [start_actor_id]

    # came_from[actor] = (previous actor, movie they share). Storing one link
    # per discovered actor replaces copying the whole path into every queue
    # entry; it also doubles as the "visited" set.
    came_from = {start_actor_id: None}
    frontier = deque([start_actor_id])

    while frontier:
        current_actor = frontier.popleft()

        # .get() instead of indexing: the link tables are defaultdicts, and
        # indexing would insert an empty list for every actor without films.
        for movie_id in actor_movies.get(current_actor, ()):
            for co_actor_id in movie_actors.get(movie_id, ()):
                if co_actor_id in came_from:
                    continue  # already reached (this also skips current_actor)

                came_from[co_actor_id] = (current_actor, movie_id)

                if co_actor_id == end_actor_id:
                    # Walk the links back to the start, then flip the result.
                    path = [end_actor_id]
                    link = came_from[end_actor_id]
                    while link is not None:
                        previous_actor, shared_movie = link
                        path.append(shared_movie)
                        path.append(previous_actor)
                        link = came_from[previous_actor]
                    path.reverse()
                    return path

                frontier.append(co_actor_id)

    return None  # No connection found


In [6]:
def search_actors(query):
    """
    Return every actor record whose name contains ``query``.

    The comparison ignores case, and the result list is ordered
    alphabetically by the actor's name.
    """
    needle = query.lower()
    hits = [record for record in actors.values() if needle in record['name'].lower()]
    hits.sort(key=lambda record: record['name'])
    return hits

In [7]:
def display_path(path, show_timing=True):
    """
    Pretty-print a connection path produced by find_shortest_path().

    Args:
        path: alternating actor / movie ids starting and ending with an
            actor, or a falsy value when no connection exists.
        show_timing: accepted for compatibility; not used by this function.
    """
    if not path:
        print("❌ No connection found!")
        return

    if len(path) == 1:
        only_actor = actors[path[0]]
        print(f"🎭 Same actor: {only_actor['name']}")
        return

    # Every hop contributes one movie and one actor to the list.
    hop_count = len(path) // 2

    print("\n🎯 CONNECTION FOUND!")
    print("=" * 50)
    print(f"🎯 Degrees of Separation: {hop_count}")
    print(f"📏 Path Length: {len(path)} steps")

    print("\n🛤️  CONNECTION PATH:")
    print("-" * 30)

    # Actors sit at even positions, the movie linking to the next actor at odd ones.
    actor_ids = path[0::2]
    movie_ids = path[1::2]

    for position, actor_id in enumerate(actor_ids):
        person = actors[actor_id]

        if position:
            print("    ⬇️")
        print(f"🎭 {person['name']} ({person['birth']})")

        # The final actor has no following movie.
        if position < len(movie_ids):
            film = movies[movie_ids[position]]
            print(f'    🎬 appeared together in: "{film["title"]}" ({film["year"]})')


In [8]:
def _prompt_for_actor(ordinal):
    """Keep asking for the ``ordinal`` ("first"/"second") actor until exactly one record is chosen."""
    while True:
        query = input(f"Enter {ordinal} actor name (or part of it): ").strip()
        if not query:
            continue

        matches = search_actors(query)
        if not matches:
            print(f"❌ No actors found matching '{query}'")
            continue
        if len(matches) == 1:
            return matches[0]

        # Several candidates: let the user pick one by its 1-based number.
        print(f"\n📋 Found {len(matches)} matches:")
        for i, actor in enumerate(matches, 1):
            print(f"{i}. {actor['name']} ({actor['birth']})")

        try:
            choice = int(input(f"Select actor (1-{len(matches)}): ")) - 1
        except ValueError:
            print("❌ Invalid input")
            continue

        if 0 <= choice < len(matches):
            return matches[choice]
        print("❌ Invalid choice")


def find_connection_interactive():
    """
    Prompt for two actors on stdin and print the shortest connection between them.

    Each name prompt accepts a partial, case-insensitive name. When several
    actors match, a numbered list is shown and the user selects one; invalid
    input restarts the prompt for that actor. Choosing the same actor twice
    prints a message and returns without searching.
    """
    print("\n🔍 FIND CONNECTION BETWEEN ACTORS")
    print("-" * 35)

    actor1 = _prompt_for_actor("first")
    actor2 = _prompt_for_actor("second")

    if actor1['id'] == actor2['id']:
        print("❌ Please select two different actors!")
        return

    # Find and display connection
    print(f"\n🔍 Searching for connection between {actor1['name']} and {actor2['name']}...")
    # perf_counter() is the high-resolution clock meant for measuring short
    # intervals; time.time() can be too coarse to register a fast search.
    start_time = time.perf_counter()
    path = find_shortest_path(actor1['id'], actor2['id'])
    search_time = (time.perf_counter() - start_time) * 1000

    display_path(path)
    if path:
        print(f"⚡ Search completed in {search_time:.2f}ms")


In [9]:
def list_all_actors():
    """Print a numbered, name-ordered roster of every loaded actor with their movie count."""
    print(f"\n👥 ALL ACTORS ({len(actors)} total):")
    print("=" * 40)

    by_name = sorted(actors.values(), key=lambda person: person['name'])
    for position, person in enumerate(by_name, start=1):
        film_total = len(actor_movies[person['id']])
        print(f"{position:2d}. {person['name']} ({person['birth']}) - {film_total} movies")


In [10]:
def get_actor_filmography(actor_name):
    """
    Print the films of the single actor matching ``actor_name``.

    Films are listed by ascending year, each followed by up to three
    co-star names (plus a "+N more" suffix). When the name matches no
    actor, or more than one, a message / candidate list is printed instead.
    """
    candidates = search_actors(actor_name)

    if not candidates:
        print(f"❌ No actors found matching '{actor_name}'")
        return

    if len(candidates) > 1:
        print(f"📋 Found {len(candidates)} matches:")
        for number, candidate in enumerate(candidates, 1):
            print(f"{number}. {candidate['name']} ({candidate['birth']})")
        return

    star = candidates[0]
    print(f"\n🎭 {star['name']} - Filmography:")
    print("=" * 40)

    # Pair each film record with the names of everyone else in its cast.
    entries = []
    for movie_id in actor_movies[star['id']]:
        film = movies[movie_id]
        others = []
        for cast_id in movie_actors[movie_id]:
            if cast_id != star['id']:
                others.append(actors[cast_id]['name'])
        entries.append((film, others))

    # Stable sort, so films from the same year keep their stored order.
    entries.sort(key=lambda entry: entry[0]['year'])

    for film, others in entries:
        shown = ", ".join(others[:3])
        hidden = len(others) - 3
        if hidden > 0:
            shown += f" (+{hidden} more)"

        print(f"🎬 {film['title']} ({film['year']})")
        if shown:
            print(f"   👥 Co-stars: {shown}")
        print()


In [11]:
def quick_connection(actor1_name, actor2_name):
    """
    Find and print the shortest connection between two actors given by name.

    Each name is resolved with search_actors() (partial, case-insensitive);
    when several actors match, the alphabetically first one is used. If
    either name matches nobody, a message is printed and nothing is searched.
    """
    # Find actors
    matches1 = search_actors(actor1_name)
    matches2 = search_actors(actor2_name)

    if not matches1:
        print(f"❌ No actors found matching '{actor1_name}'")
        return
    if not matches2:
        print(f"❌ No actors found matching '{actor2_name}'")
        return

    # Use first match for each
    actor1 = matches1[0]
    actor2 = matches2[0]

    print(f"🔍 Finding connection: {actor1['name']} ↔ {actor2['name']}")

    # perf_counter() is the high-resolution clock intended for timing short
    # intervals; time.time() can be too coarse to register a fast search and
    # then reports 0.00ms.
    start_time = time.perf_counter()
    path = find_shortest_path(actor1['id'], actor2['id'])
    search_time = (time.perf_counter() - start_time) * 1000

    display_path(path)
    if path:
        print(f"⚡ Search completed in {search_time:.2f}ms")


In [12]:
def analyze_actor_connections(actor_name):
    """
    Print how many actors are reachable from one actor, grouped by degrees of separation.

    ``actor_name`` is resolved with search_actors(); the first match is used.
    For degrees 1 and 2 up to ten names are listed alphabetically.
    """
    matches = search_actors(actor_name)
    if not matches:
        print(f"❌ No actors found matching '{actor_name}'")
        return

    actor = matches[0]
    print(f"\n📊 CONNECTION ANALYSIS for {actor['name']}")
    print("=" * 50)

    start_time = time.perf_counter()

    # One breadth-first sweep from the chosen actor yields the degree of
    # separation of every reachable actor at once, instead of running a
    # separate shortest-path search per target.
    degree_of = {actor['id']: 0}
    frontier = deque([actor['id']])
    while frontier:
        current = frontier.popleft()
        # .get() avoids inserting empty entries into the defaultdict tables.
        for movie_id in actor_movies.get(current, ()):
            for co_actor_id in movie_actors.get(movie_id, ()):
                if co_actor_id not in degree_of:
                    degree_of[co_actor_id] = degree_of[current] + 1
                    frontier.append(co_actor_id)

    # Group the names of known actors by their degree; ids that only appear
    # in the cast lists but not in `actors` are not counted.
    connections = {}
    for target_id, target_actor in actors.items():
        if target_id == actor['id']:
            continue
        degrees = degree_of.get(target_id)
        if degrees is not None:
            connections.setdefault(degrees, []).append(target_actor['name'])

    analysis_time = time.perf_counter() - start_time

    connected_total = sum(len(names) for names in connections.values())

    print(f"⚡ Analysis completed in {analysis_time:.2f}s")
    print(f"🔗 Connected to {connected_total} actors")
    print(f"❌ No connection to {len(actors) - 1 - connected_total} actors")

    for degree in sorted(connections.keys()):
        print(f"\n{degree} degree{'s' if degree > 1 else ''} of separation: {len(connections[degree])} actors")
        if degree <= 2:  # Show details for close connections
            for name in sorted(connections[degree])[:10]:  # Show first 10
                print(f"   • {name}")
            if len(connections[degree]) > 10:
                print(f"   ... and {len(connections[degree]) - 10} more")


In [13]:
# Fill the Q1 lookup tables from the default CSV paths (the "small" dataset).
load_data()
# Interactive alternative (prompts on stdin); left disabled so the cell runs unattended.
# find_connection_interactive()
# Non-interactive demo: resolve both names and print the shortest connection.
quick_connection("Tom Hanks", "Tom Cruise")

🔄 Loading data...
✅ Data loaded successfully!
   📊 Actors: 16
   🎬 Movies: 5
   🔗 Connections: 20
   📈 Average movies per actor: 1.2
🔍 Finding connection: Tom Hanks ↔ Tom Cruise

🎯 CONNECTION FOUND!
🎯 Degrees of Separation: 2
📏 Path Length: 5 steps

🛤️  CONNECTION PATH:
------------------------------
🎭 Tom Hanks (1956)
    🎬 appeared together in: "Apollo 13" (1995)
    ⬇️
🎭 Kevin Bacon (1958)
    🎬 appeared together in: "A Few Good Men" (1992)
    ⬇️
🎭 Tom Cruise (1962)
⚡ Search completed in 0.00ms


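Why BFS is a good fit here:
1. It is guaranteed to find a SHORTEST path, because it explores the graph level by level (all actors one step away, then two steps away, and so on).
2. The graph is unweighted - every shared movie counts as exactly one step - so no weighted algorithm such as Dijkstra is needed.
3. It is fast: O(V + E) time complexity, where V is the number of actors and E the number of actor-movie links.
4. `deque` gives O(1) appends and pops at both ends, so the queue operations add no extra cost.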

# Q2

In [14]:
import os
import cv2
import numpy as np

In [15]:
def load_data(data_dir="Dataset_FinalExam\\Q2_dataset", img_width=32, img_height=32, num_categories=43):
    """
    Load the Q2 images from one sub-folder per category.

    NOTE: this definition rebinds the name ``load_data`` that the Q1 CSV
    loader earlier in the notebook also uses; re-run the Q1 definition cell
    before calling the Q1 loader again.

    Args:
        data_dir: directory containing sub-folders named ``0``, ``1``, ...
        img_width: width every image is resized to.
        img_height: height every image is resized to.
        num_categories: number of category folders to scan, i.e. folders
            ``0`` .. ``num_categories - 1`` (default 43, folders 0 to 42).

    Returns:
        ``(images, labels)``: a list of resized images as returned by
        ``cv2.resize`` and a parallel list of integer labels taken from the
        folder names. Missing folders, files that are not ``.ppm`` and files
        OpenCV cannot read are skipped.
    """
    images = []
    labels = []

    for label in range(num_categories):
        print(f"Loading images for label: {label}")
        folder_path = os.path.join(data_dir, str(label))

        if not os.path.isdir(folder_path):
            continue  # skip if not a directory

        # os.listdir() returns names in arbitrary order; sorting makes the
        # dataset order (and therefore any seeded split of it) reproducible.
        for filename in sorted(os.listdir(folder_path)):
            # Compare case-insensitively so ".PPM" files are not dropped.
            if not filename.lower().endswith('.ppm'):
                continue

            img = cv2.imread(os.path.join(folder_path, filename))
            if img is None:
                continue  # skip unreadable files

            # cv2.resize takes the target size as (width, height).
            images.append(cv2.resize(img, (img_width, img_height)))
            labels.append(label)

    return images, labels

In [None]:
images, labels = load_data()

# load_data() skips missing folders and unreadable files without complaint,
# so a wrong dataset path produces empty lists. Stop here with a clear
# message instead of an IndexError on images[0] below.
if not images:
    raise RuntimeError("No images were loaded - check that the Q2 dataset directory exists and contains .ppm files.")

print(f"Loaded {len(images)} images.")
print(f"Sample shape: {images[0].shape}")
print(f"Sample label: {labels[0]}")

Loading images for label: 0
Loading images for label: 1
Loading images for label: 2
Loading images for label: 3
Loading images for label: 4
Loading images for label: 5
Loading images for label: 6
Loading images for label: 7
Loading images for label: 8
Loading images for label: 9
Loading images for label: 10
Loading images for label: 11
Loading images for label: 12
Loading images for label: 13
Loading images for label: 14
Loading images for label: 15
Loading images for label: 16
Loading images for label: 17
Loading images for label: 18
Loading images for label: 19
Loading images for label: 20
Loading images for label: 21
Loading images for label: 22
Loading images for label: 23
Loading images for label: 24
Loading images for label: 25
Loading images for label: 26
Loading images for label: 27
Loading images for label: 28
Loading images for label: 29
Loading images for label: 30
Loading images for label: 31
Loading images for label: 32
Loading images for label: 33
Loading images for label

In [17]:
import numpy as np
from tensorflow.keras.utils import to_categorical, set_random_seed
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split

# Assuming `images` and `labels` are already loaded and contain all data
IMG_WIDTH = 32
IMG_HEIGHT = 32
NUM_CLASSES = 43
SEED = 42

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable across "Restart & Run All".
set_random_seed(SEED)

# Convert list of images to numpy array
X = np.array(images)
y = np.array(labels)

# Normalize pixel values to [0, 1]
X = X / 255.0

# One-hot encode labels
y = to_categorical(y, NUM_CLASSES)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

# Build the CNN model
model = Sequential([
    # Image arrays are laid out (height, width, channels). Declaring the
    # shape through an explicit Input layer also avoids the Keras warning
    # about passing `input_shape` to the first layer.
    Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),

    Dense(NUM_CLASSES, activation='softmax')  # Output layer
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=15, batch_size=64,
                    validation_data=(X_val, y_val))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.1374 - loss: 3.2344 - val_accuracy: 0.5617 - val_loss: 1.4751
Epoch 2/15
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.5368 - loss: 1.4358 - val_accuracy: 0.8735 - val_loss: 0.4964
Epoch 3/15
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.7673 - loss: 0.7184 - val_accuracy: 0.9450 - val_loss: 0.2242
Epoch 4/15
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.8620 - loss: 0.4237 - val_accuracy: 0.9681 - val_loss: 0.1348
Epoch 5/15
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.9052 - loss: 0.2962 - val_accuracy: 0.9799 - val_loss: 0.0935
Epoch 6/15
[1m333/333[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9305 - loss: 0.2256 - val_accuracy: 0.9886 - val_loss: 0.0635
Epoch 7/15
[1m333/333[0m

In [18]:
# Score the trained model on the held-out validation split. The model was
# compiled with metrics=['accuracy'], so evaluate() yields loss then accuracy.
val_loss, val_acc = model.evaluate(X_val, y_val)
print(f"Validation accuracy: {val_acc:.4f}")

[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9942 - loss: 0.0306
Validation accuracy: 0.9938


In [27]:

def predict_single_image(model, image_path, img_width=32, img_height=32):
    """
    Classify one image file and print the ten most likely classes.

    Args:
        model: Trained Keras model
        image_path: Path to the image file
        img_width: Width to resize image to
        img_height: Height to resize image to

    Returns:
        predicted_class: Index of the highest-scoring class
        confidence: That class's score as output by the model (a fraction,
            whereas the printed table shows percentages)

    Raises:
        ValueError: if the image cannot be read from ``image_path``.
    """
    raw = cv2.imread(image_path)
    if raw is None:
        raise ValueError(f"Could not load image from {image_path}")

    # Same preprocessing as training: resize, then scale pixels into [0, 1].
    scaled = cv2.resize(raw, (img_width, img_height)) / 255.0

    # The model expects a batch, so prepend a length-1 batch axis.
    batch = scaled[np.newaxis, ...]

    predictions = model.predict(batch, verbose=0)
    scores = predictions[0]

    # Tabulate every class with its score in percent, best first.
    ranking = pd.DataFrame({
        'Class': list(range(predictions.shape[1])),
        'Confidence': scores * 100
    })
    ranking = ranking.sort_values(by='Confidence', ascending=False).reset_index(drop=True)

    print(ranking.head(10))  # Show top 10 predictions

    return np.argmax(scores), np.max(scores)

In [31]:
# Run the trained model on one sample image; the call itself prints the top-10 table.
print("Predicting single image...")
cl,conf = (predict_single_image(model, "Dataset_FinalExam\\Q2_dataset\\0\\00000_00000.ppm"))  # change this path according to your needs

print(f"\n\n\nPredicted class: {cl}, Confidence: {conf:.2f}")

Predicting single image...
   Class  Confidence
0      0   97.234001
1      1    2.321922
2      8    0.258520
3      4    0.133995
4      2    0.049881
5      7    0.000932
6      5    0.000355
7     40    0.000124
8     15    0.000062
9     24    0.000048



Predicted class: 0, Confidence: 0.97


Revenge is a Fool's Game (c) Arthur Morgan