# Deep Learning Foundations Assignment
**Name:**
**Date:**


## Part 1 — Neural Networks (Tabular FFNN)
### 1.1 Single Neuron Forward Pass

In [None]:
import numpy as np

def neuron_forward(x, w, b):
    """Compute one neuron's pre-activation and its ReLU / sigmoid activations.

    Args:
        x: Input feature vector (array-like).
        w: Weight vector, combined with ``x`` through ``np.dot``.
        b: Bias added to the dot product.

    Returns:
        Tuple ``(z, relu, sigmoid)`` where ``z = np.dot(w, x) + b``,
        ``relu = max(0, z)`` and ``sigmoid = 1 / (1 + exp(-z))``.
    """
    z = np.dot(w, x) + b
    relu = np.maximum(0, z)
    # Evaluate the sigmoid through exp(-|z|), which is always <= 1, so a
    # strongly negative z cannot overflow the way exp(-z) does.
    decay = np.exp(-np.abs(z))
    # The trailing [()] turns the 0-d array np.where returns for scalar z
    # back into a NumPy scalar; it leaves n-d results unchanged.
    sigmoid = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))[()]
    return z, relu, sigmoid

# Worked example: a single neuron with three inputs.
x, w, b = np.array([2.0, -1.0, 3.0]), np.array([0.5, -0.25, 1.0]), 0.1

z, relu_z, sigmoid_z = neuron_forward(x, w, b)

# One print call, one value per line, each rounded to three decimals.
print(
    f"z = {z:.3f}",
    f"ReLU(z) = {relu_z:.3f}",
    f"Sigmoid(z) = {sigmoid_z:.3f}",
    sep="\n",
)


z = 4.350
ReLU(z) = 4.350
Sigmoid(z) = 0.987


**Explanation:**  
Activation functions are essential in neural networks because they introduce non-linearity, enabling the network to learn complex patterns. Without them, the network would behave like a linear model, regardless of its depth.  
ReLU is typically used in hidden layers for its simplicity and effectiveness in mitigating vanishing gradients.  
Sigmoid is used in output layers for binary classification, as it maps outputs to (0, 1), representing probabilities.


### 1.2 Preprocessing + Split

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the raw table and report its structure before any transformation.
df = pd.read_csv('tabular.csv')
print("Shape:", df.shape)
print("Columns:", df.columns)
print("Dtypes:\n", df.dtypes)
print("Missing values:\n", df.isnull().sum())

# Rows without a label cannot be used for supervised training or evaluation,
# so they are dropped rather than given an imputed target.
df = df.dropna(subset=['target'])

# Separate features from the label BEFORE listing column types, so that
# neither 'target' nor 'customer_id' ends up in numeric_cols/categorical_cols
# (selecting them from X later would raise KeyError).
X = df.drop(['target', 'customer_id'], axis=1)
y = df['target']
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# 70 / 15 / 15 split: hold out 30 %, then halve the hold-out into val and test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Imputation values are learned from the training split only and then reused
# for validation/test, so no held-out statistics leak into training.
fill_values = {col: X_train[col].median() for col in numeric_cols}
for col in categorical_cols:
    train_modes = X_train[col].mode()
    if not train_modes.empty:  # an all-missing column has no mode
        fill_values[col] = train_modes.iloc[0]
X_train = X_train.fillna(fill_values)
X_val = X_val.fillna(fill_values)
X_test = X_test.fillna(fill_values)

# One-hot encode each split. dtype=float keeps the whole frame numeric so it
# converts to a single float array when handed to a model.
X_train = pd.get_dummies(X_train, columns=categorical_cols, dtype=float)
X_val = pd.get_dummies(X_val, columns=categorical_cols, dtype=float)
X_test = pd.get_dummies(X_test, columns=categorical_cols, dtype=float)

# Align val/test to the training columns: categories unseen in training are
# dropped, categories missing from a split become all-zero columns.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Standardise numeric features with statistics fitted on the training split.
scaler = StandardScaler()
if numeric_cols:  # nothing to scale when every feature is categorical
    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("Number of features after encoding:", X_train.shape[1])


FileNotFoundError: [Errno 2] No such file or directory: 'tabular.csv'

**Explanation:**  
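Data leakage happens when information from outside the training set is used to create the model, leading to over-optimistic results. Fitting the scaler on the full dataset would leak information from validation/test into training (their means and standard deviations would shape the training features), invalidating the evaluation. Always fit the scaler — and any other learned preprocessing statistic, such as imputation values — only on the training data, then apply it unchanged to the validation and test sets.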


### 1.3 FFNN Training + Evaluation

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Feed-forward regressor: two ReLU hidden layers feeding one linear output unit.
model = models.Sequential()
model.add(layers.Input(shape=(X_train.shape[1],)))
for hidden_units in (64, 32):
    model.add(layers.Dense(hidden_units, activation='relu'))
model.add(layers.Dense(1, activation='linear'))
model.compile(optimizer='adam', loss='mse')

# Stop once validation loss has not improved for 3 epochs and roll back to
# the best weights seen.
es = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[es],
    verbose=1,
)

model.summary()

# Learning curves: training vs validation loss per epoch.
for curve_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    plt.plot(history.history[curve_key], label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.title('Training vs Validation Loss')
plt.legend()
plt.show()

# Held-out test metrics.
raw_pred = model.predict(X_test)
y_pred = raw_pred.flatten()
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Test MAE: {mae:.2f}")
print(f"Test RMSE: {rmse:.2f}")

# Parity plot; the dashed red diagonal marks perfect predictions.
target_lo, target_hi = y_test.min(), y_test.max()
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot([target_lo, target_hi], [target_lo, target_hi], 'r--')
plt.xlabel('Actual Target')
plt.ylabel('Predicted Target')
plt.title('Parity Plot: Actual vs Predicted')
plt.show()


### 1.4 Overfit/Underfit Diagnosis

*Write your diagnosis and next steps here in markdown.*

## Part 2 — NLP (Embeddings + RNN)
### 2.1 Tokenization + Padding

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the labelled text data and show the class balance.
df_text = pd.read_csv('text.csv')
print(df_text['label'].value_counts())

# Normalise text: lower-case, trim, and collapse whitespace runs to one space.
# The pattern is a raw string so that "\s" is not an invalid string escape.
df_text['text'] = df_text['text'].str.lower().str.strip().str.replace(r'\s+', ' ', regex=True)

# Stratified 70 / 15 / 15 split so each subset keeps the label proportions.
X_text = df_text['text'].values
y_text = df_text['label'].values
X_train, X_temp, y_train, y_temp = train_test_split(X_text, y_text, test_size=0.3, random_state=42, stratify=y_text)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# The vocabulary is fitted on the training texts only; words that appear
# solely in val/test map to the "<OOV>" token.
max_vocab = 10000
tokenizer = Tokenizer(num_words=max_vocab, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
# +1 because word_index starts at 1; index 0 is left for padding.
vocab_size = min(max_vocab, len(tokenizer.word_index) + 1)
print("Final vocabulary size:", vocab_size)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Fix every sequence to max_len tokens, padding at the end ('post').
max_len = 40
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Show one example at each stage of the pipeline.
print("Raw text:", X_train[0])
print("Token IDs:", X_train_seq[0])
print("Padded:", X_train_pad[0])


**Justification:**  
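A max_len of 40 is expected to cover most short feedback sentences, keeping truncation minimal while keeping computation efficient. This should be confirmed against the token-length distribution of the training set (for example, by checking that the 95th percentile of sequence lengths is at most 40).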


### 2.2 Baseline Embedding Model

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Derive the output head and loss from the number of distinct training labels.
num_classes = len(np.unique(y_train))
if num_classes == 2:
    output_activation, loss_fn = 'sigmoid', 'binary_crossentropy'
else:
    output_activation, loss_fn = 'softmax', 'sparse_categorical_crossentropy'

# Baseline: average the token embeddings, then classify with a small MLP.
model_nlp_base = Sequential()
model_nlp_base.add(Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len))
model_nlp_base.add(GlobalAveragePooling1D())
model_nlp_base.add(Dense(64, activation='relu'))
model_nlp_base.add(Dense(1 if num_classes == 2 else num_classes, activation=output_activation))
model_nlp_base.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
model_nlp_base.summary()

# Stop after 3 epochs without validation improvement; keep the best weights.
es = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
history = model_nlp_base.fit(
    X_train_pad,
    y_train,
    validation_data=(X_val_pad, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[es],
    verbose=1,
)

# Turn model outputs into hard labels: 0.5 threshold for the single sigmoid
# unit, arg-max over the class axis for softmax.
y_pred = model_nlp_base.predict(X_test_pad)
y_pred_label = (
    (y_pred > 0.5).astype(int).flatten()
    if num_classes == 2
    else np.argmax(y_pred, axis=1)
)

acc = accuracy_score(y_test, y_pred_label)
f1 = f1_score(y_test, y_pred_label, average='weighted')
cm = confusion_matrix(y_test, y_pred_label)
print(f"Test accuracy: {acc:.3f}")
print(f"Test F1-score: {f1:.3f}")
print("Confusion matrix:\n", cm)


### 2.3 RNN Model (SimpleRNN or LSTM)

In [None]:
from tensorflow.keras.layers import LSTM, SimpleRNN

# LSTM classifier over the padded token sequences.
# mask_zero=True marks index 0 (the padding value) as masked, so the LSTM
# skips the trailing pad steps instead of updating its state on them; without
# it the final state is computed after a run of padding tokens.
# NOTE(review): a row that is entirely padding would be fully masked —
# confirm the dataset has no empty texts.
model_nlp_rnn = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len, mask_zero=True),
    LSTM(32),
    Dense(64, activation='relu'),
    Dense(1 if num_classes == 2 else num_classes, activation=output_activation)
])
model_nlp_rnn.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
model_nlp_rnn.summary()

# Stop after 3 epochs without validation improvement; keep the best weights.
es = callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model_nlp_rnn.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[es],
    verbose=1
)

# Hard labels: 0.5 threshold for the sigmoid head, arg-max for softmax.
y_pred = model_nlp_rnn.predict(X_test_pad)
if num_classes == 2:
    y_pred_label = (y_pred > 0.5).astype(int).flatten()
else:
    y_pred_label = np.argmax(y_pred, axis=1)

acc = accuracy_score(y_test, y_pred_label)
f1 = f1_score(y_test, y_pred_label, average='weighted')
cm = confusion_matrix(y_test, y_pred_label)
print(f"Test accuracy: {acc:.3f}")
print(f"Test F1-score: {f1:.3f}")
print("Confusion matrix:\n", cm)


**Explanation:**  
LSTM is chosen over SimpleRNN because it can capture longer dependencies and mitigate vanishing gradients, which is important for text data with context spread over several words.


### 2.4 Comparison + Transformer Note

In [None]:
import time

def _test_metrics(model):
    """Return ``(accuracy, weighted F1)`` of ``model`` on the padded test split."""
    scores = model.predict(X_test_pad, verbose=0)
    if num_classes == 2:
        labels = (scores > 0.5).astype(int).flatten()
    else:
        labels = np.argmax(scores, axis=1)
    return accuracy_score(y_test, labels), f1_score(y_test, labels, average='weighted')


def _time_one_epoch(model):
    """Return the seconds one training epoch takes, leaving the weights unchanged."""
    trained_weights = model.get_weights()
    start = time.perf_counter()
    model.fit(X_train_pad, y_train, epochs=1, batch_size=32, verbose=0)
    elapsed = time.perf_counter() - start
    # Undo the timing epoch so the model stays the one that was evaluated.
    model.set_weights(trained_weights)
    return elapsed


# Score each model separately; reusing the shared `acc`/`f1` variables would
# print the most recently evaluated model's numbers in both rows.
base_acc, base_f1 = _test_metrics(model_nlp_base)
rnn_acc, rnn_f1 = _test_metrics(model_nlp_rnn)

base_time = _time_one_epoch(model_nlp_base)
rnn_time = _time_one_epoch(model_nlp_rnn)

print("| Model      | Accuracy | F1-score | Training Time (1 epoch, s) |")
print(f"| Baseline   | {base_acc:.3f}   | {base_f1:.3f}   | {base_time:.2f} |")
print(f"| RNN/LSTM   | {rnn_acc:.3f}   | {rnn_f1:.3f}   | {rnn_time:.2f} |")


**Markdown Explanation:**  
RNNs struggle with long sequences due to vanishing gradients and limited memory, making it hard to capture distant dependencies. Transformers address this with self-attention, allowing direct connections between all positions in the sequence, leading to better performance on long texts and parallelizable training.


## Part 3 — Computer Vision (CNN)
### 3.1 Data Loading + Visual Checks

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt

img_size = (128, 128)
batch_size = 32

# Both subsets are built from one shared argument set so the split fraction,
# seed, image size and batch size cannot drift apart between the two calls.
split_kwargs = dict(
    validation_split=0.3,
    seed=42,
    image_size=img_size,
    batch_size=batch_size,
)
train_ds = tf.keras.utils.image_dataset_from_directory(
    "images", subset="training", **split_kwargs
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    "images", subset="validation", **split_kwargs
)

# Read class_names before calling .map(): the mapped dataset no longer
# carries that attribute.
class_names = train_ds.class_names
num_classes = len(class_names)
print("Class names:", class_names)
for images, labels in train_ds.take(1):
    print("Batch shape:", images.shape, labels.shape)

# Rescale pixel values from [0, 255] to [0, 1].
normalization_layer = tf.keras.layers.Rescaling(1./255)
train_ds = train_ds.map(lambda x, y: (normalization_layer(x), y))
val_ds = val_ds.map(lambda x, y: (normalization_layer(x), y))

# Preview up to nine training images with their class names.
plt.figure(figsize=(8, 8))
for images, labels in train_ds.take(1):
    # A small dataset can yield a first batch with fewer than nine images.
    for i in range(min(9, int(images.shape[0]))):
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(images[i].numpy())
        plt.title(class_names[int(labels[i])])
        plt.axis("off")
plt.show()


### 3.2 CNN Training

In [None]:
from tensorflow.keras import layers, models

# Multi-class: one softmax unit per class with sparse categorical
# cross-entropy. Otherwise a single sigmoid unit with binary cross-entropy,
# because that loss compares one probability per sample against the integer
# label; a two-unit sigmoid head would not match the label shape.
is_multiclass = num_classes > 2
output_units = num_classes if is_multiclass else 1

model_cnn = models.Sequential([
    layers.Input(shape=img_size + (3,)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(output_units, activation='softmax' if is_multiclass else 'sigmoid')
])
model_cnn.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy' if is_multiclass else 'binary_crossentropy',
    metrics=['accuracy']
)
model_cnn.summary()

# Referenced through `tf` so this cell does not depend on the `callbacks`
# name imported by an earlier part of the notebook.
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model_cnn.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=[es]
)

# Learning curves: training vs validation loss per epoch.
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('CNN Training vs Validation Loss')
plt.legend()
plt.show()


### 3.3 Evaluation + Misclassifications

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Gather the WHOLE evaluation set in one pass over val_ds. len(val_ds) is the
# number of batches, not of images, so it cannot be used as a batch size to
# pull everything at once. Images and labels come from the same pass, so they
# stay aligned even if the dataset reshuffles between iterations.
# NOTE: val_ds is also the set used for early stopping, so these figures are
# validation metrics rather than metrics on an untouched test set.
image_batches, label_batches = [], []
for batch_images, batch_labels in val_ds:
    image_batches.append(batch_images.numpy())
    label_batches.append(batch_labels.numpy())
test_images = np.concatenate(image_batches)
test_labels = np.concatenate(label_batches)

y_pred = model_cnn.predict(test_images)
# Decode by output width: arg-max when there is one column per class,
# 0.5 threshold when the model has a single sigmoid output.
if y_pred.shape[-1] > 1:
    y_pred_label = np.argmax(y_pred, axis=1)
else:
    y_pred_label = (y_pred > 0.5).astype(int).flatten()

acc = accuracy_score(test_labels, y_pred_label)
print(f"Test accuracy: {acc:.3f}")

cm = confusion_matrix(test_labels, y_pred_label)
print("Confusion matrix:\n", cm)
print("Classification report:\n", classification_report(test_labels, y_pred_label))

# Show up to five misclassified images with true and predicted class names.
mis_idx = np.where(test_labels != y_pred_label)[0][:5]
if len(mis_idx) == 0:
    print("No misclassified images to display.")
else:
    plt.figure(figsize=(15, 3))
    for i, idx in enumerate(mis_idx):
        ax = plt.subplot(1, 5, i + 1)
        plt.imshow(test_images[idx])
        plt.title(f"True: {class_names[test_labels[idx]]}\nPred: {class_names[y_pred_label[idx]]}")
        plt.axis("off")
    plt.show()


**Markdown Write-up:**  
Review the confusion matrix and misclassified images. Are errors concentrated in certain classes? Are there systematic confusions?  
To improve, try data augmentation, a deeper CNN, or transfer learning with a pretrained model.


## Final Summary

*Write 5–8 sentences summarizing your findings, model performance, and next steps.*