In [None]:
# NOTE(review): GITHUBTOKEN appears to be a placeholder for a personal access token.
# Never commit a real token inside this URL; read it from an environment variable or a secret store instead.
!git clone https://GITHUBTOKEN@github.com/Nekromant-cpu/fds_project.git

In [None]:
from google.colab import drive
# Mount Google Drive; later cells read/write embeddings and models under /content/drive/MyDrive.
drive.mount('/content/drive')

# Data Loading and Preprocessing

In [None]:
# Shared settings read as globals by the data pipeline and the model cells below.
embedding_dim = 512  # Universal Sentence Encoder produces 512-dimensional embeddings
batch_size = 16  # batch size passed to every data_generator call
num_classes = 29  # width of the one-hot labels and of the softmax output layer

In [None]:
import csv
import os
import nltk
from PIL import Image
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# NLTK resources needed by clean_text_column (stopword list, tokenizer data).
nltk.download('stopwords')
nltk.download('punkt_tab')


# File paths
# Relative to the working directory; they point into the repository cloned in the first cell.
csv_file_path_train = './fds_project/data/full_data/book_descriptions_train_balanced.csv'
csv_file_path_test = './fds_project/data/full_data/book_descriptions_test_balanced.csv'
images_folder = './fds_project/data/images/'  # trailing slash required: image names are appended without a separator


def _english_stop_words() -> frozenset:
    """Return the NLTK English stopword list as a frozenset, reading the corpus only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stop_words._cache = cached
    return cached


# Built once instead of on every call: maps each ASCII digit and punctuation character to a space.
_NON_LETTER_TO_SPACE = str.maketrans(
    string.digits + string.punctuation,
    " " * (len(string.digits) + len(string.punctuation)),
)


def clean_text_column(text: str) -> str:
    """Normalise one text cell for embedding.

    Steps: replace ASCII digits and punctuation with spaces, collapse runs of
    whitespace, lowercase, tokenize with NLTK and drop English stopwords.

    Args:
        text: Raw title or description string.

    Returns:
        The remaining tokens joined by single spaces (may be an empty string).
    """
    # Replace digits and punctuation with spaces (single pass over the text)
    text = text.translate(_NON_LETTER_TO_SPACE)
    # Collapse multiple spaces
    text = " ".join(text.split())
    # Tokenize and remove stopwords; the stopword set is cached because this
    # function is applied to every row of two columns.
    stop_words = _english_stop_words()
    words = word_tokenize(text.lower())
    return " ".join(word for word in words if word not in stop_words)


# Read and process training data
def load_data(csv_file_path, images_folder, max_tokens=500):
    """Load one CSV split into parallel arrays of image names, texts and labels.

    Rows missing a title or description are dropped. Title and description are
    cleaned with ``clean_text_column`` and concatenated (title first), then
    truncated to ``max_tokens`` tokens. Rows whose ``category_id`` cannot be
    converted to ``int`` are reported and skipped.

    Args:
        csv_file_path: CSV with columns ``img_name``, ``title``,
            ``description`` and ``category_id``.
        images_folder: Not used by this function; kept so existing call sites
            keep working.
        max_tokens: Maximum number of tokens kept per text (default 500).

    Returns:
        Tuple ``(images, descriptions, category_ids)`` of NumPy arrays with one
        entry per kept row; ``descriptions`` has ``dtype=object``.
    """
    # One dropna over both columns, without mutating a frame in place
    df = pd.read_csv(csv_file_path).dropna(subset=["description", "title"])

    combined_text = (
        df["title"].apply(clean_text_column)
        + " "
        + df["description"].apply(clean_text_column)
    )

    image_names = []
    descriptions = []
    category_ids = []

    # Iterate the three columns directly; iterrows() builds a Series per row and is much slower
    for img_name, text, raw_category in zip(df["img_name"], combined_text, df["category_id"]):
        # Convert category_id to int; a missing/None value raises TypeError, a bad string or NaN ValueError
        try:
            category_id = int(raw_category)
        except (TypeError, ValueError):
            print(f"Invalid category ID {raw_category}. Skipping entry.")
            continue

        tokens = word_tokenize(text)

        image_names.append(img_name)
        descriptions.append(" ".join(tokens[:max_tokens]))
        category_ids.append(category_id)

    # Convert lists to NumPy arrays
    images = np.array(image_names)
    descriptions = np.array(descriptions, dtype=object)  # Keep descriptions as strings
    category_ids = np.array(category_ids)

    return images, descriptions, category_ids

# Load both splits with the same loader
train_split = load_data(csv_file_path_train, images_folder)
test_split = load_data(csv_file_path_test, images_folder)
image_names_train, descriptions_train, category_ids_train = train_split
image_names_test, descriptions_test, category_ids_test = test_split

# Print the same sanity report (shapes and label dtype) for each split
for heading, (names, texts, labels) in (
    ("Training Data:", train_split),
    ("\nTesting Data:", test_split),
):
    print(heading)
    print("Images shape:", names.shape)
    print("Descriptions shape:", texts.shape)
    print("Category IDs shape:", labels.shape)
    print("Category IDs dtype:", labels.dtype)

### execute the following block only if embeddings are not already precomputed

In [None]:
import tensorflow_hub as hub
from tqdm import tqdm
import time

# Load USE model
# Needed by text_to_use_embedding, i.e. whenever embeddings have to be (re)computed.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")


def text_to_use_embedding(sentences, use_model):
    """Embed each sentence with ``use_model`` and stack the vectors.

    Empty or whitespace-only entries are not sent to the model; they get an
    all-zero vector of length ``embedding_dim`` (module-level setting).

    Args:
        sentences: Iterable of strings.
        use_model: Callable that takes a list of strings and returns a tensor
            exposing ``.numpy()``.

    Returns:
        ``np.ndarray`` with one row per input sentence.
    """
    vectors = []
    progress = tqdm(sentences, desc="Processing sentences", unit="sentence")
    for text in progress:
        if text and len(text.strip()) != 0:
            # The model is called with a batch of one; squeeze drops that batch axis
            vector = use_model([text]).numpy().squeeze(0)
        else:
            # Empty description placeholder
            vector = np.zeros((embedding_dim,))
        vectors.append(vector)

    return np.array(vectors)



### load embeddings

In [None]:
#use only for evaluation
embedding = "USE/USE_cleaned"

# One directory for BOTH loading and saving. Previously the fallback saved to
# MyDrive/USE/ while loading looked in MyDrive/USE/USE_cleaned/, so freshly
# computed embeddings were never found again on the next run.
embedding_dir = f"/content/drive/MyDrive/{embedding}"
train_embeddings_path = f"{embedding_dir}/text_embeddings_train.npy"
test_embeddings_path = f"{embedding_dir}/text_embeddings_test.npy"

try:
    text_embeddings_train = np.load(train_embeddings_path)
    text_embeddings_test = np.load(test_embeddings_path)
except OSError:
    # Cache miss: precompute and save USE embeddings (requires use_model from the cell above)
    os.makedirs(embedding_dir, exist_ok=True)  # np.save does not create missing directories

    text_embeddings_train = text_to_use_embedding(descriptions_train, use_model)
    np.save(train_embeddings_path, text_embeddings_train)

    text_embeddings_test = text_to_use_embedding(descriptions_test, use_model)
    np.save(test_embeddings_path, text_embeddings_test)


### create training, validation and testing dataset

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

def data_generator(img_names, descriptions_embeddings, category_ids, images_folder, batch_size=16, shuffle=True):
    """Build a batched ``tf.data`` pipeline yielding ``((image, embedding), one_hot_label)``.

    Args:
        img_names: Image file names; each is appended to ``images_folder``.
        descriptions_embeddings: Precomputed text embeddings, one row per image.
        category_ids: Integer class ids, one per image.
        images_folder: Folder prefix (must end with a path separator, since the
            file name is joined without one).
        batch_size: Number of samples per batch.
        shuffle: If True, the samples are shuffled on every iteration.

    Returns:
        A ``tf.data.Dataset`` of batches. Images are resized to 224x224 and
        scaled to [0, 1]; labels are one-hot with ``num_classes`` entries
        (module-level setting).
    """
    def parse_function(img_name, embedding, category_id):
        # Load and preprocess the image
        image_path = tf.strings.join([images_folder, img_name])
        image = tf.io.read_file(image_path)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, [224, 224])  # Resize
        image = tf.cast(image, tf.float32) / 255.0  # Normalize

        embedding = tf.cast(embedding, tf.float32) # float64 to float32

        return (image, embedding), tf.one_hot(category_id, num_classes)

    # Create dataset
    dataset = tf.data.Dataset.from_tensor_slices((img_names, descriptions_embeddings, category_ids))

    if shuffle:
        # Shuffle the lightweight (name, embedding, id) records BEFORE decoding images.
        # Shuffling after map() kept 1000 decoded 224x224x3 float images in the buffer
        # (~600 MB) and only mixed samples within a 1000-element window; a buffer as
        # large as the dataset gives a uniform shuffle at a fraction of the memory.
        dataset = dataset.shuffle(buffer_size=max(len(img_names), 1))

    return (
        dataset.map(parse_function, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

# Hold out 10% of the training split for validation (fixed seed for a reproducible split)
train_img_names, val_img_names, train_embeddings, val_embeddings, train_category_ids, val_category_ids = train_test_split(
    image_names_train, text_embeddings_train, category_ids_train, test_size=0.1, random_state=42
)

# Only the training pipeline needs shuffling
train_dataset = data_generator(
    train_img_names, train_embeddings, train_category_ids, images_folder, batch_size
)
# Evaluation pipelines are not shuffled: metrics do not depend on sample order, and a
# fixed order makes runs deterministic and avoids filling a shuffle buffer on every pass.
validation_dataset = data_generator(
    val_img_names, val_embeddings, val_category_ids, images_folder, batch_size, shuffle=False
)
test_dataset = data_generator(
    image_names_test, text_embeddings_test, category_ids_test, images_folder, batch_size, shuffle=False
)

# only for plotting after training
test_dataset_without_shuffle = data_generator(
    image_names_test, text_embeddings_test, category_ids_test, images_folder, batch_size, shuffle=False
)

# Training

In [None]:
from tensorflow.keras.layers import Input, Dense, Flatten, GlobalAveragePooling2D, concatenate
from tensorflow.keras.applications import ResNet50, MobileNetV2
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop after 3 epochs without improvement (restoring the best weights) and
# keep the best full model on Drive.
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
best_checkpoint = ModelCheckpoint(
    '/content/drive/MyDrive/fds/models/best_model.keras',
    save_best_only=True,
    save_weights_only=False,
)
callbacks = [early_stopping, best_checkpoint]


# Image feature extractor on top of a frozen, ImageNet-pretrained MobileNetV2
def build_image_model():
    """Create the image branch.

    Returns:
        Tuple ``(input_tensor, features)``: the backbone's input tensor and a
        128-unit ReLU feature tensor computed from its globally average-pooled
        output.
    """
    # Alternative backbone: ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    backbone = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3)) # (128, 128, 3)

    # Freeze every backbone layer; only the layers added below are trained
    for backbone_layer in backbone.layers:
        backbone_layer.trainable = False

    pooled = GlobalAveragePooling2D()(backbone.output)
    image_features = Dense(128, activation='relu')(pooled)
    return backbone.input, image_features

# Joint Model
def build_joint_model(embedding_dim, num_classes):
    """Assemble the two-input (image + text embedding) classifier.

    Args:
        embedding_dim: Length of the precomputed text embedding vector.
        num_classes: Number of output classes (softmax width).

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[image, text_embedding]``.
    """
    # Image branch
    image_input, image_features = build_image_model()

    # Text branch: precomputed embeddings projected to 128 features
    text_input = Input(shape=(embedding_dim,), name="text_input")
    text_features = Dense(128, activation='relu')(text_input)

    # Fuse both branches and pass them through two dense layers (256 then 128 units)
    fused = concatenate([image_features, text_features])
    for units in (256, 128):
        fused = Dense(units, activation='relu')(fused)

    # Class probabilities
    output = Dense(num_classes, activation='softmax', name="output")(fused)

    return tf.keras.Model(inputs=[image_input, text_input], outputs=output)


# Build the joint model
model = build_joint_model(embedding_dim, num_classes)
# categorical_crossentropy matches the one-hot labels produced by data_generator
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


# Train the model
# Up to 10 epochs; the callbacks may stop earlier and write the best model to Drive.
model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=10,
    callbacks=callbacks
)

# Evaluate the freshly trained model on the held-out test split
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")



In [None]:
# Save the model after training
# Written next to best_model.keras (the checkpoint callback's file) under a separate name.
model.save('/content/drive/MyDrive/fds/models/model1.keras')

Continue training if needed

In [None]:
# Load the previously saved model
# (best_model.keras is the path written by the ModelCheckpoint callback in the training cell)
model = tf.keras.models.load_model('/content/drive/MyDrive/fds/models/best_model.keras')

# NOTE(review): hard-coded resume point; presumably the number of epochs already completed -- confirm before running
continue_epoch = 3

# Continue training from the 3rd epoch
model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=10,
    callbacks=callbacks,  # same callback list as the first training run
    initial_epoch=continue_epoch
)


# Model evaluation

load some model

In [None]:
# All evaluation cells below use `loaded_model` (not `model` from the training cells).
loaded_model = tf.keras.models.load_model('/content/drive/MyDrive/fds/models_cleaned/ResNet+USE_16_10val.keras')

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

test_loss, test_accuracy = loaded_model.evaluate(test_dataset)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Ground truth (one-hot) and predicted probabilities, both read from the unshuffled
# pipeline so that row i of each refers to the same sample
y_true = np.concatenate([labels for _, labels in test_dataset_without_shuffle], axis=0)
y_pred_probs = loaded_model.predict(test_dataset_without_shuffle)

# Integer class ids, computed once and reused for every metric below
y_true_classes = np.argmax(y_true, axis=1)
y_pred = np.argmax(y_pred_probs, axis=1)

# Support-weighted precision, recall and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_true_classes,
    y_pred=y_pred,
    average='weighted',
)

print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_true_classes, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Class id -> human-readable category name; used when printing per-class results
# and when captioning predictions. Ids run from 0 to 28 (29 classes).
category_mapping = {
    0: "Arts & Photography",
    1: "Biographies & Memoirs",
    2: "Business & Money",
    3: "Children's Books",
    4: "Comics & Graphic Novels",
    5: "Computers & Technology",
    6: "Cookbooks, Food & Wine",
    7: "Crafts, Hobbies & Home",
    8: "Christian Books & Bibles",
    9: "Engineering & Transportation",
    10: "Health, Fitness & Dieting",
    11: "History",
    12: "Humor & Entertainment",
    13: "Law",
    14: "Literature & Fiction",
    15: "Medical Books",
    16: "Mystery, Thriller & Suspense",
    17: "Parenting & Relationships",
    18: "Politics & Social Sciences",
    19: "Reference",
    20: "Religion & Spirituality",
    21: "Romance",
    22: "Science & Math",
    23: "Science Fiction & Fantasy",
    24: "Self-Help",
    25: "Sports & Outdoors",
    26: "Teen & Young Adult",
    27: "Test Preparation",
    28: "Travel",
}

## Top-k (5) accuracy

In [None]:
import numpy as np
from tensorflow.keras.metrics import top_k_categorical_accuracy

def calculate_top_k_accuracy(model, dataset, k=5):
    """Fraction of samples whose true class is among the model's k highest-scoring classes.

    Args:
        model: Object exposing ``predict_on_batch(x)`` that returns an array of
            per-class scores with shape ``(batch, num_classes)``.
        dataset: Iterable of ``(x, y_true)`` batches with one-hot ``y_true``.
        k: Number of top predictions to consider; must be at least 1.

    Returns:
        Top-k accuracy as a float in [0, 1]; ``0.0`` if the dataset is empty.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # With k == 0 the slice [:, -0:] would select EVERY class and report 100% accuracy
        raise ValueError(f"k must be at least 1, got {k}")

    total_samples = 0
    correct_predictions = 0

    for x, y_true in dataset:
        # predict_on_batch avoids the per-call setup and progress output of predict(),
        # which is intended for whole datasets rather than single batches
        y_pred = np.asarray(model.predict_on_batch(x))

        # Get the indices of the top k predictions and the true class per sample
        top_k_preds = np.argsort(y_pred, axis=-1)[:, -k:]
        true_labels = np.argmax(np.asarray(y_true), axis=-1)

        # A sample is a hit when its true class appears anywhere in its top-k row
        hits = (top_k_preds == true_labels[:, None]).any(axis=1)

        correct_predictions += int(hits.sum())
        total_samples += int(true_labels.shape[0])

    if total_samples == 0:
        return 0.0
    return correct_predictions / total_samples

# Example usage:
# Overall top-5 accuracy of the loaded model on the test split.
k=5
top_k_acc = calculate_top_k_accuracy(loaded_model, test_dataset, k=k)
print(f"Top-k Accuracy with k={k}: {top_k_acc:.2f}")


top-k accuracy by class

In [None]:
import numpy as np

def calculate_top_k_accuracy_by_class(model, dataset, k=5, num_classes=29, class_names=None):
    """Compute and print top-k accuracy separately for every class.

    Args:
        model: Object exposing ``predict_on_batch(x)`` that returns an array of
            per-class scores with shape ``(batch, num_classes)``.
        dataset: Iterable of ``(x, y_true)`` batches with one-hot ``y_true``.
        k: Number of top predictions to consider; must be at least 1.
        num_classes: Total number of classes.
        class_names: Mapping (or sequence) from class id to display name.
            Defaults to the module-level ``category_mapping``.

    Returns:
        ``np.ndarray`` of length ``num_classes`` with the top-k accuracy per
        class; classes without any sample in ``dataset`` are ``NaN``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # With k == 0 the slice [:, -0:] would select EVERY class
        raise ValueError(f"k must be at least 1, got {k}")
    if class_names is None:
        class_names = category_mapping

    class_correct = np.zeros(num_classes)
    class_total = np.zeros(num_classes)

    for x, y_true in dataset:
        # predict_on_batch avoids the per-call setup and progress output of predict()
        y_pred = np.asarray(model.predict_on_batch(x))

        # Get the indices of the top k predictions and the true class per sample
        top_k_preds = np.argsort(y_pred, axis=-1)[:, -k:]
        true_labels = np.argmax(np.asarray(y_true), axis=-1)
        hits = (top_k_preds == true_labels[:, None]).any(axis=1)

        # np.add.at accumulates correctly even when a class occurs several times in the batch
        np.add.at(class_total, true_labels, 1)
        np.add.at(class_correct, true_labels[hits], 1)

    # Divide only where samples exist: avoids the 0/0 runtime warning; unseen classes stay NaN
    seen = class_total > 0
    class_accuracy = np.full(num_classes, np.nan)
    class_accuracy[seen] = class_correct[seen] / class_total[seen]

    # Sort by accuracy, descending; unseen classes are pushed to the end
    # (a plain argsort on NaN would list them first after reversing)
    sorted_indices = np.argsort(np.where(seen, class_accuracy, -np.inf))[::-1]

    # Print sorted top-k accuracy with category names
    print(f"Top-K Accuracy by Class with k={k} (sorted):\n")
    for class_id in sorted_indices:
        category_name = class_names[int(class_id)]
        if seen[class_id]:
            print(f"{category_name}: {class_accuracy[class_id]:.2f}")
        else:
            print(f"{category_name}: n/a (no samples)")

    return class_accuracy

# Example usage:
# Prints the per-class top-5 accuracy table for the loaded model on the test split.
top_k_acc_by_class = calculate_top_k_accuracy_by_class(loaded_model, test_dataset, k=5, num_classes=29)


top-1 accuracy sorted by class

In [None]:
import numpy as np

# calculate_top_k_accuracy_by_class is already defined in the "top-k accuracy by class"
# cell above; this cell used to redefine it verbatim. Top-1 accuracy is the same
# computation with k=1, so the existing function is reused instead of duplicated.
top_k_acc_by_class = calculate_top_k_accuracy_by_class(loaded_model, test_dataset, k=1, num_classes=29)


## Example prediction

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import random
import textwrap

def wrap_text(text, width=50):
    """Break ``text`` into lines of at most ``width`` characters, joined by newlines."""
    wrapped_lines = textwrap.wrap(text, width=width)
    return "\n".join(wrapped_lines)

# Randomly pick n elements of the dataset together with their positions
def random_sample_from_dataset(dataset, n=8):
    """Draw ``n`` distinct elements from ``dataset`` without replacement.

    The dataset is fully materialised into a list first. For a batched
    dataset each element is a whole batch.

    Args:
        dataset: Any iterable.
        n: Number of elements to draw; must not exceed the dataset length.

    Returns:
        Tuple ``(elements, positions)`` where ``positions[i]`` is the index of
        ``elements[i]`` in the materialised dataset.
    """
    elements = list(dataset)
    positions = random.sample(range(len(elements)), n)
    picked = []
    for position in positions:
        picked.append(elements[position])
    return picked, positions

def get_original_description(sample_index):
    """Return a short preview of the description belonging to a batch.

    ``sample_index`` is treated as a batch index: the text returned is that of
    the first row of the batch (``sample_index * batch_size``) in
    ``descriptions_test``, cut to 80 characters and always suffixed with "...".
    """
    first_row_of_batch = sample_index * batch_size
    preview = descriptions_test[first_row_of_batch][:80]
    return preview + "..."

def visualize_predictions(samples, sample_idx, model, category_mapping):
    """Plot up to 8 samples with their true class, top-5 predictions and a text preview.

    Args:
        samples: List of ``((image_batch, embedding_batch), label_batch)``
            elements; only the first item of each batch is shown.
        sample_idx: Batch index of each element in ``samples`` (used to look up
            the matching description text).
        model: Model exposing ``predict`` on an ``(image, embedding)`` pair.
        category_mapping: Mapping from class id to display name.
    """
    fig, axes = plt.subplots(2, 4, figsize=(20, 12))
    axes = axes.flatten()

    used_axes = 0
    # Zipping with the axes caps the plot at the 8 available panels instead of raising IndexError
    for ax, sample, sample_index in zip(axes, samples, sample_idx):
        used_axes += 1
        data, label = sample
        image_batch, description_batch = data[0], data[1]

        # Extract a single sample (the first one) from the batch
        image = image_batch[0]
        description = description_batch[0]
        # Take the label of that same first sample explicitly. argmax over the whole
        # (batch, num_classes) label array only gave the right class by accident
        # (the first 1.0 in the flattened array happens to lie in row 0).
        true_label = np.argmax(label[0])

        # Make predictions
        predictions = model.predict((image[None, ...], description[None, ...]))
        top_k = np.argsort(predictions[0])[-5:][::-1]
        top_k_probs = predictions[0][top_k]
        top_k_labels = [category_mapping[idx] for idx in top_k]

        description_text = get_original_description(sample_index)

        # Display the image
        ax.imshow(image.numpy())
        ax.axis('off')

        # Create a readable caption
        true_category = f"True: {category_mapping[true_label]}"
        predicted_category = "Pred: " + ", ".join(
            [f"{label} ({prob:.2f})" for label, prob in zip(top_k_labels, top_k_probs)]
        )

        caption = f"{true_category}\n{wrap_text(predicted_category, width=50)}\nDesc: {wrap_text(description_text, width=50)}"

        # Add caption
        ax.set_title(caption, fontsize=10, loc='center', wrap=True)

    # Hide panels that received no sample
    for unused_ax in axes[used_axes:]:
        unused_ax.axis('off')

    # Lay out AFTER the multi-line titles exist so their height is taken into account
    fig.tight_layout(pad=5.0)
    plt.show()

# Example Usage
# Uses the unshuffled test pipeline so that batch indices line up with descriptions_test.
samples, sample_idx = random_sample_from_dataset(test_dataset_without_shuffle)
visualize_predictions(samples, sample_idx, loaded_model, category_mapping)


Own book

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

description = """


Your faithful companion throughout the grades with all relevant topics in mathematics.

    128 pages
    All topics from 5th to 10th grade
    for grammar schools, comprehensive schools, secondary schools and intermediate schools
    101 learning videos by Daniel Jung
    21 tasks at exam level incl. solutions


In this learning booklet for lower secondary level, you will find explanations and sample exercises on all relevant topics from 5th to 10th grade for grammar schools, comprehensive schools, lower secondary schools and intermediate secondary schools.

Regardless of whether you simply want to consolidate the current material or systematically prepare for the upcoming final exam (ZAP or MSA), you will find what you are looking for here! Seemingly complex contexts are presented in a language that you can fully understand. You will also find all the formulas, sketches and anything else you need to understand the subject matter. As a special bonus, this booklet contains a short introduction to spreadsheets (Excel).

The 5th-10th grade booklet will be your faithful companion throughout the grades, helping you to brush up on important topics and prepare you optimally for the mathematical content.

All topics are divided into clear chapters that will guide you in your preparation:

    Basics
    Fractions
    Negative numbers
    Multiplication/factorization (factoring out)
    Terms and equations/fractional equations/inequalities
    Assignments and dr"""

# Apply the same text preprocessing as load_data (clean_text_column, then truncation to
# 500 tokens) before embedding: the evaluated model comes from the "cleaned" pipeline,
# so feeding it an embedding of raw text would not match its training inputs.
# NOTE(review): load_data also prepends the cleaned title; no title is supplied here.
cleaned_description = " ".join(word_tokenize(clean_text_column(description))[:500])
description_embedding = text_to_use_embedding([cleaned_description], use_model)

# Same image preprocessing as data_generator: decode, resize to 224x224, scale to [0, 1]
image = tf.io.read_file("71mrv+fvwwL._SL1500_.jpg")
image = tf.image.decode_jpeg(image, channels=3)
image = tf.image.resize(image, [224, 224])  # Resize
image = tf.cast(image, tf.float32) / 255.0  # Normalize
image = tf.expand_dims(image, axis=0)  # Add batch dimension

# Make predictions
predictions = loaded_model.predict((image, description_embedding))
top_k = np.argsort(predictions[0])[-5:][::-1]  # five highest-scoring classes, best first
top_k_probs = predictions[0][top_k]
top_k_labels = [category_mapping[idx] for idx in top_k]

# Create a readable caption
predicted_category = "Pred: " + ", ".join(
    [f"{label} ({prob:.2f})" for label, prob in zip(top_k_labels, top_k_probs)]
)

print(predicted_category)