In [1]:
import numpy as np
import pandas as pd

In [2]:
train_clean = pd.read_csv("data/clean/train_clean.csv")

# Features only (drop ID + label)
feature_cols = [c for c in train_clean.columns 
                if c not in ["PassengerId", "Transported"]]

X = train_clean[feature_cols].to_numpy().astype(np.float32)  # shape (n, d)

n, d = X.shape
print(n, d)

8693 30


In [3]:
rng = np.random.default_rng(0)

k = 16

W_enc = rng.normal(0, 0.01, size=(d, k))  # (d, k)
b_enc = np.zeros(k, dtype=np.float32)     # (k,)

W_dec = rng.normal(0, 0.01, size=(k, d))  # (k, d)
b_dec = np.zeros(d, dtype=np.float32)     # (d,)

In [4]:
def encoder_forward(X, W_enc, b_enc):
    Z_lin = X @ W_enc + b_enc          # (n, k)
    Z = np.tanh(Z_lin)                 # nonlinearity
    return Z, Z_lin

def decoder_forward(Z, W_dec, b_dec):
    X_hat = Z @ W_dec + b_dec          # (n, d) linear output
    return X_hat

def forward_pass(X, W_enc, b_enc, W_dec, b_dec):
    Z, Z_lin = encoder_forward(X, W_enc, b_enc)
    X_hat = decoder_forward(Z, W_dec, b_dec)
    return Z, Z_lin, X_hat


In [5]:
def mse_loss(X, X_hat):
    diff = X_hat - X
    return np.mean(np.sum(diff**2, axis=1))  # average squared L2 per sample

In [6]:
def backward_pass(X, Z, Z_lin, X_hat, W_enc, W_dec):
    n = X.shape[0]
    
    # dL/dX_hat
    dX_hat = (2.0 / n) * (X_hat - X)         # (n, d)
    
    # Grad decoder
    dW_dec = Z.T @ dX_hat                    # (k, d)
    db_dec = np.sum(dX_hat, axis=0)          # (d,)
    
    # Backprop to Z
    dZ = dX_hat @ W_dec.T                    # (n, k)
    dZ_lin = dZ * (1.0 - np.tanh(Z_lin)**2)  # tanh' = 1 - tanh^2
    
    # Grad encoder
    dW_enc = X.T @ dZ_lin                    # (d, k)
    db_enc = np.sum(dZ_lin, axis=0)          # (k,)
    
    return dW_enc, db_enc, dW_dec, db_dec


In [7]:
lr = 1e-3
num_epochs = 4000

for epoch in range(num_epochs):
    # Forward
    Z, Z_lin, X_hat = forward_pass(X, W_enc, b_enc, W_dec, b_dec)
    loss = mse_loss(X, X_hat)
    
    # Backward
    dW_enc, db_enc, dW_dec, db_dec = backward_pass(
        X, Z, Z_lin, X_hat, W_enc, W_dec
    )
    
    # Gradient step
    W_enc -= lr * dW_enc
    b_enc -= lr * db_enc
    W_dec -= lr * dW_dec
    b_dec -= lr * db_dec
    
    if epoch % 200 == 0:
        print(f"Epoch {epoch}: loss = {loss:.4f}")

print(f'Final Loss (Epoch 4000): {loss}')


Epoch 0: loss = 11.3968
Epoch 200: loss = 10.4157
Epoch 400: loss = 9.9243
Epoch 600: loss = 9.5550
Epoch 800: loss = 9.0600
Epoch 1000: loss = 8.4070
Epoch 1200: loss = 7.8035
Epoch 1400: loss = 7.3207
Epoch 1600: loss = 6.8750
Epoch 1800: loss = 6.4209
Epoch 2000: loss = 5.9699
Epoch 2200: loss = 5.5476
Epoch 2400: loss = 5.1731
Epoch 2600: loss = 4.8530
Epoch 2800: loss = 4.5830
Epoch 3000: loss = 4.3521
Epoch 3200: loss = 4.1483
Epoch 3400: loss = 3.9617
Epoch 3600: loss = 3.7857
Epoch 3800: loss = 3.6169
Final Loss (Epoch 4000): 3.4548792581615704


In [8]:
np.savez(
    "autoencoder_weights_k16.npz", 
    W_enc=W_enc,
    b_enc=b_enc,
    W_dec=W_dec,
    b_dec=b_dec,
)

In [9]:
weights = np.load("autoencoder_weights_k16.npz")
W_enc = weights["W_enc"]
b_enc = weights["b_enc"]
W_dec = weights["W_dec"]
b_dec = weights["b_dec"]

In [10]:
def encode(X, W_enc, b_enc):
    Z_lin = X @ W_enc + b_enc
    Z = np.tanh(Z_lin)
    return Z

In [11]:
# X is full train feature matrix (no PassengerId/Transported)
Z_train = encode(X, W_enc, b_enc)
print(Z_train.shape)  # (n, k)


(8693, 16)


In [12]:
test_clean = pd.read_csv("data/clean/test_clean.csv")
X_test = test_clean[feature_cols].to_numpy().astype(np.float32)

Z_test = encode(X_test, W_enc, b_enc)
print(Z_test.shape)   # (n_test, k)


(4277, 16)


In [13]:
from sklearn.model_selection import train_test_split

y = train_clean["Transported"].values.astype(int)
X = train_clean[feature_cols].to_numpy().astype(np.float32)

# Encode
Z = encode(X, W_enc, b_enc)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

Z_train, Z_val, _, _ = train_test_split(
    Z, y, test_size=0.2, random_state=0, stratify=y
)


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline on original features
clf_raw = LogisticRegression(max_iter=1000)
clf_raw.fit(X_train, y_train)
y_val_pred_raw = clf_raw.predict(X_val)
acc_raw = accuracy_score(y_val, y_val_pred_raw)

# On AE features
clf_ae = LogisticRegression(max_iter=1000)
clf_ae.fit(Z_train, y_train)
y_val_pred_ae = clf_ae.predict(Z_val)
acc_ae = accuracy_score(y_val, y_val_pred_ae)

print(f"Baseline (original X) val accuracy: {acc_raw:.4f}")
print(f"AE features (Z) val accuracy:      {acc_ae:.4f}")


Baseline (original X) val accuracy: 0.7878
AE features (Z) val accuracy:      0.7826


In [15]:
train_clean = pd.read_csv("data/clean/train_clean.csv")
test_clean  = pd.read_csv("data/clean/test_clean.csv")

feature_cols = [c for c in train_clean.columns 
                if c not in ["PassengerId", "Transported"]]

X_train = train_clean[feature_cols].to_numpy().astype(np.float32)
X_test  = test_clean[feature_cols].to_numpy().astype(np.float32)
y_train = train_clean["Transported"].values.astype(int)

# Encode with k=16 model
Z_train = encode(X_train, W_enc, b_enc)   # (n_train, 16)
Z_test  = encode(X_test,  W_enc, b_enc)   # (n_test, 16)

k = Z_train.shape[1]
z_cols = [f"z{i+1}" for i in range(k)]

train_ae_df = pd.DataFrame(Z_train, columns=z_cols)
train_ae_df.insert(0, "PassengerId", train_clean["PassengerId"].values)
train_ae_df["Transported"] = y_train

test_ae_df = pd.DataFrame(Z_test, columns=z_cols)
test_ae_df.insert(0, "PassengerId", test_clean["PassengerId"].values)


train_ae_df.to_csv(f"data/ae/train_ae_k{k}.csv", index=False)
test_ae_df.to_csv(f"data/ae/test_ae_k{k}.csv", index=False)

In [16]:
train_ae_df.shape

(8693, 18)

In [17]:
test_ae_df.shape

(4277, 17)

In [18]:
train_ae_df.head()

Unnamed: 0,PassengerId,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,Transported
0,0001_01,-0.158205,0.132704,-0.430807,-0.320589,-0.410417,-0.624105,0.276699,0.4539,-0.490857,-0.162396,0.337717,0.015406,-0.22923,0.350633,-0.614734,-0.468702,0
1,0002_01,-0.434602,-0.19717,-0.18541,-0.199295,0.21646,-0.364508,0.191649,0.0349,-0.275214,0.009058,0.399423,-0.22413,0.25464,-0.056157,0.080958,-0.222974,1
2,0003_01,-0.105637,-0.946713,-0.866595,-0.815469,0.616284,0.464645,-0.668298,0.909662,-0.641514,0.906753,-0.910717,0.509792,-0.798082,-0.948456,-0.954355,-0.166209,0
3,0003_02,-0.513136,-0.653231,-0.621006,-0.626368,0.640578,-0.08055,-0.419558,0.690669,-0.351341,0.676015,-0.162317,0.053313,-0.299055,-0.498215,-0.634049,-0.048354,0
4,0004_01,-0.543692,-0.181811,-0.117058,-0.175164,0.369839,-0.201059,0.175983,0.066464,-0.270667,0.157981,0.604665,-0.244855,0.446801,0.002046,0.383281,0.037364,1


In [19]:
test_ae_df.head()

Unnamed: 0,PassengerId,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16
0,0013_01,-0.305838,-0.109026,-0.385449,-0.24749,0.115889,-0.613962,0.449924,0.093813,-0.214199,-0.365258,0.492248,-0.520311,0.354259,-0.010057,-0.111742,-0.554061
1,0018_01,-0.643633,-0.654931,-0.560464,-0.447726,0.565963,0.123656,-0.162871,0.189621,-0.221318,0.657746,0.135491,-0.034298,0.514649,-0.40098,-0.002305,-0.281538
2,0019_01,-0.237433,-0.248537,-0.344327,-0.47751,0.307793,-0.665555,0.441962,0.251513,-0.35502,-0.246297,0.411254,-0.559257,-0.029451,0.321478,-0.659127,-0.42322
3,0021_01,0.772947,0.395442,-0.395713,-0.762571,0.608881,0.132064,-0.565351,0.755982,-0.759984,-0.708483,0.073982,-0.71151,-0.696957,-0.745797,-0.701284,0.670113
4,0023_01,-0.488656,0.167971,0.130204,-0.054924,0.410369,-0.603872,-0.101239,0.16634,0.100143,0.039978,0.651015,-0.323748,0.123732,0.23122,0.306087,0.050709
