In [9]:
import numpy as np
import pandas as pd

# Load the data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission_y = sample_submission.drop(columns=["UID"])

# Extract features and target
X_train = train_data.iloc[500:, 1:-1]
y_train = train_data.iloc[500:, -1]
X_val = train_data.iloc[:500, 1:-1]
y_val = train_data.iloc[:500, -1]
X_test = test_data.iloc[:, 1:]

# Categorical columns
categorical_cols = ['col_0', 'col_1', 'col_2', 'col_3', 'col_5', 'col_6']

# # Fill NaN values in categorical columns with 'Unknown'
# def fill_na(data, categorical_cols):
#     for col in categorical_cols:
#         data[col].fillna('Unknown', inplace=True)
#     return data

# Fill NaN values in categorical columns with the most frequent value in the respective columns
def fill_na(data, categorical_cols):
    for col in categorical_cols:
        most_frequent_value = data[col].mode().iloc[0]  # Get the mode (most frequent value)
        data[col].fillna(most_frequent_value, inplace=True)
    return data

X_train = fill_na(X_train, categorical_cols)
X_val = fill_na(X_val, categorical_cols)
X_test = fill_na(X_test, categorical_cols)

print(X_train)
print(X_train.isnull().sum())

# label encoding
def label_encoding(data, categorical_cols):
    label_encoders = {}
    for col in categorical_cols:
        unique_categories = data[col].unique()
        encoding_map = {category: index + 1 for index, category in enumerate(unique_categories)}
        data[col] = data[col].map(encoding_map)
        label_encoders[col] = encoding_map
    return data

X_train = label_encoding(X_train, categorical_cols)
X_val = label_encoding(X_val, categorical_cols)
X_test = label_encoding(X_test, categorical_cols)

# normalize
def normalize(df):
    return (df - df.min()) / (df.max() - df.min())

X_train = normalize(X_train)
X_val = normalize(X_val)
X_test = normalize(X_test)

# convert df into numpy
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_val = X_val.to_numpy()
y_val = y_val.to_numpy()
X_test = X_test.to_numpy()
sample_submission_y = sample_submission_y.to_numpy()

     col_0   col_1 col_2 col_3  col_4 col_5 col_6
500     A1      B0    C2   D58    100    E1    F1
501     A0      B0    C2    D0      0    E0    F2
502     A0      B0   C11    D1    100    E1    F2
503     A0      B0    C4   D71    100    E1    F2
504     A2      B0    C9    D1      0    E1    F2
...    ...     ...   ...   ...    ...   ...   ...
2623    A1      B0    C2    D1      0    E1    F2
2624    A0   B0       C8    D1      0    E1    F2
2625    A0      BO    C7    D1    100    E1    F2
2626    A0      B0   C11    D1    100    E1    F2
2627    A0      B0    C2    D1    100    E1    F2

[2128 rows x 7 columns]
col_0    0
col_1    0
col_2    0
col_3    0
col_4    0
col_5    0
col_6    0
dtype: int64


In [10]:
# Neural network architecture
input_size = X_train.shape[1]
hidden_size = 32
output_size = 1

# Hyperparameters
learning_rate = 0.01   
epochs = 60000
lambda_reg = 0.01  # L2 regularization strength

# Initialize weights and biases

w1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / (input_size + hidden_size))
w2 = np.random.randn(hidden_size, output_size) * np.sqrt(2.0 / (hidden_size + output_size))
# w1 = np.random.randn(input_size, hidden_size) - 0.5
b1 = np.zeros((1, hidden_size))
# w2 = np.random.randn(hidden_size, output_size) - 0.5
b2 = np.zeros((1, output_size))

# ReLU activation function
def relu(x):
    return np.maximum(0, x)

# Prediction function
def predict(X, w1, b1, w2, b2):
    # Forward pass
    z1 = np.dot(X, w1) + b1
    a1 = relu(z1)
    z2 = np.dot(a1, w2) + b2
    predicted_output = z2
    return predicted_output

# RMSE function
def calculate_rmse(y_true, y_pred):
    squared_errors = (y_true - y_pred) ** 2
    mean_squared_error = np.mean(squared_errors)
    rmse = np.sqrt(mean_squared_error)
    return rmse

# Early stopping parameters
patience = 60
best_val_rmse = float("inf")
best_weights = None
best_epoch = 0
# Training loop
for epoch in range(epochs):
    # Forward pass
    z1 = np.dot(X_train, w1) + b1
    a1 = relu(z1)
    z2 = np.dot(a1, w2) + b2
    
    # loss(MSE with L2 regularization)
    loss = (0.5 / X_train.shape[0]) * np.mean((z2 - y_train.reshape(-1, 1)) ** 2)
    loss += (lambda_reg / (2 * X_train.shape[0])) * (np.sum(w1 ** 2) + np.sum(w2 ** 2))

    # Backpropagation
    output_error = z2 - y_train.reshape(-1, 1)
    d_z2 = output_error / X_train.shape[0]

    hidden_layer_error = np.dot(d_z2, w2.T)
    d_hidden_layer = hidden_layer_error * (z1 > 0)

    # Update weights and biases
    w2 -= learning_rate * np.dot(a1.T, d_z2) + lambda_reg * w2
    b2 -= learning_rate * np.sum(d_z2, axis=0, keepdims=True)
    w1 -= learning_rate * np.dot(X_train.T, d_hidden_layer) + lambda_reg * w1
    b1 -= learning_rate * np.sum(d_hidden_layer, axis=0, keepdims=True)

    # Print loss for monitoring
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

    # Check validation RMSE for early stopping
    if (epoch + 1) % 10 == 0:
        val_predictions = predict(X_val, w1, b1, w2, b2)
        val_rmse = calculate_rmse(y_val, val_predictions)

        # If validation RMSE is the best so far, update the best RMSE and weights
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_weights = (w1.copy(), b1.copy(), w2.copy(), b2.copy())
            best_epoch = epoch + 1

        # If there's no improvement for 'patience' epochs, stop training
        if epoch + 1 - best_epoch >= patience:
            print(f"Early stopping at epoch {epoch + 1} with best validation RMSE: {best_val_rmse:.4f}")
            break


Epoch 999, Loss: 929598.7805
Epoch 1999, Loss: 929598.7592
Epoch 2999, Loss: 929598.7592
Early stopping at epoch 3260 with best validation RMSE: 62781.8045


In [11]:

# Use the best weights for prediction
best_w1, best_b1, best_w2, best_b2 = best_weights

#prediction and rmse on train set
train_predict=predict(X_train,best_w1,best_b1,best_w2,best_b2)
train_rmse=calculate_rmse(train_predict,y_train)

#prediction and rmse on train set
val_predict=predict(X_val,best_w1,best_b1,best_w2,best_b2)
val_rmse=calculate_rmse(val_predict,y_val)

#prediction and rmse on test set
test_predict=predict(X_test,best_w1,best_b1,best_w2,best_b2)
test_rmse=calculate_rmse(test_predict,sample_submission_y)
print("train rmse",train_rmse)
print("val rmse",val_rmse)
print("test rmse",test_rmse)


train rmse 62899.700468136485
val rmse 62781.80448634513
test rmse 144949.47073465976


In [12]:
#predicitions on test set using best weights and biases
#best_predictions = predict(X_test, best_w1, best_b1, best_w2, best_b2)
result_df = pd.DataFrame({'UID': test_data['UID'], 'y': test_predict.flatten()})
# Save predictions
result_df.to_csv('nn_submission.csv', index=False)