## Import Dataset

In [1]:
import pandas as pd
housing_train = pd.read_csv('../datasets/house-prices-advanced-regression-techniques/train.csv')
housing_test = pd.read_csv('../datasets/house-prices-advanced-regression-techniques/test.csv')

In [2]:
housing_train_target = housing_train['SalePrice']
housing_train_features = housing_train.drop('SalePrice', axis=1)

In [3]:
housing_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Preprocess data

In [31]:
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

def set_index(X, index_column):
    X = X.copy()  # Avoid SettingWithCopyWarning
    X.set_index(index_column, inplace=True)
    return X

def drop_columns(X, columns):
    return X.drop(columns=columns)

def impute_numerical_columns(X, columns):
    imputer = SimpleImputer(strategy="median")
    X = imputer.fit_transform(X[columns])
    return X

def scale_numerical_columns(X, columns):
    scaler = StandardScaler
    X = scaler.fit_transform(X[columns])
    return X

    

columns_to_drop = ['Alley','PoolQC','MiscFeature','Fence']
categorical_columns = ['MSSubClass','MSZoning','Street','LotShape','LandContour','Utilities', 'BedroomAbvGr',
                       'LotConfig','Foundation','LandSlope','Neighborhood','Condition1','Condition2','BldgType',
                       'HouseStyle','OverallQual','OverallCond','YearBuilt','YearRemodAdd','RoofStyle',
                       'RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','BsmtQual',
                       'BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir',
                       'Electrical','KitchenQual','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','Functional',
                       'FireplaceQu','GarageCars','GarageType','GarageYrBlt','GarageFinish','GarageQual','GarageCond',
                       'PavedDrive','MoSold','YrSold','SaleType','SaleCondition','KitchenAbvGr','Fireplaces','TotRmsAbvGrd']
numerical_columns = [col for col in housing_train_features.columns if col != 'Id' and col not in  categorical_columns and col not in columns_to_drop]

# Column transformer for numerical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())  # Normalize (standardize) the values
])

# Categorical transformer
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical variables
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)

# Function to convert data type to float32
def to_float32(X):
    return X.astype(np.float32)

# Transformer to convert data type
dtype_transformer = FunctionTransformer(to_float32)

pipeline = Pipeline(
    steps=[
        ('drop_columns', FunctionTransformer(drop_columns, kw_args={'columns': columns_to_drop})),
        ('set_index', FunctionTransformer(set_index, kw_args={'index_column': 'Id'})),
        ('preprocessor', preprocessor),
        ('to_float32', dtype_transformer)  # Convert to float32
    ]
)

pipeline.fit(housing_train_features)

train_transformed = pipeline.transform(housing_train_features)
test_transformed = pipeline.transform(housing_test)

# Retrieve column names for the transformed DataFrame
transformed_columns = numerical_columns + list(pipeline.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_columns))
housing_train_transformed = pd.DataFrame(train_transformed.toarray(), columns=transformed_columns)
housing_test_transformed = pd.DataFrame(test_transformed.toarray(), columns=transformed_columns)

In [32]:
housing_train_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 638 entries, LotFrontage to TotRmsAbvGrd_14
dtypes: float32(638)
memory usage: 3.6 MB


## Training

#### Standardise the target value

In [33]:
target = housing_train_target.astype(np.float32).to_numpy()
mean_val = np.mean(target)
std_dev = np.std(target)
standardized_target = (target - mean_val) / std_dev

print("Original Target Values:", target)
print("Standardized Target Values:", standardized_target)

Original Target Values: [208500. 181500. 223500. ... 266500. 142125. 147500.]
Standardized Target Values: [ 0.34727335  0.00728843  0.53615385 ...  1.0776113  -0.48852292
 -0.4208407 ]


#### Transform dataset for training a basic neural network

In [34]:
from sklearn.model_selection import train_test_split
housing_clean_x_np = housing_train_transformed.to_numpy()
housing_clean_y_np = standardized_target
X_train, X_test, y_train, y_test = train_test_split(housing_clean_x_np, housing_clean_y_np, test_size=0.2, random_state=42)
y_train = np.expand_dims(y_train, axis=0)
y_test = np.expand_dims(y_test, axis=0)

In [71]:
import torch
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)
X_train.shape[1]

  X_train = torch.tensor(X_train, dtype=torch.float32)
  X_test = torch.tensor(X_test, dtype=torch.float32)
  y_train = torch.tensor(y_train, dtype=torch.float32)
  y_test = torch.tensor(y_test, dtype=torch.float32)


638

#### Define and train the neural network

In [77]:
import torch.nn as nn
import torch.optim as optim

# Step 2: Define the Model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.layer1 = nn.Linear(X_train.shape[1], 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, 32)
        self.layer4 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        x = torch.relu(self.layer3(x))
        x = self.sigmoid(self.layer4(x))
        return x

model = NeuralNetwork()

# Step 3: Define the Loss Function and Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00012)

# Step 4: Train the Model
num_epochs = 100
batch_size = 10

for epoch in range(num_epochs):
    model.train()
    
    # Shuffle the data at the beginning of each epoch
    perm = torch.randperm(X_train.size(0))
    X_train = X_train[perm]
    y_train = y_train.flatten()[perm]
    
    for i in range(0, X_train.size(0), batch_size):
        X_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        
        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Step 5: Evaluate the Model
model.eval()
with torch.no_grad():
    predictions = model(X_test)
    test_loss = criterion(predictions, y_test)
    print(f'Test Loss: {test_loss.item():.4f}')

# Step 6: Make Predictions
predictions = model(X_test)
print(predictions)

Epoch [10/100], Loss: 0.7633
Epoch [20/100], Loss: 0.3529
Epoch [30/100], Loss: 1.2771
Epoch [40/100], Loss: 0.1318
Epoch [50/100], Loss: 0.9222
Epoch [60/100], Loss: 0.1946
Epoch [70/100], Loss: 0.4047
Epoch [80/100], Loss: 0.4217
Epoch [90/100], Loss: 0.2668
Epoch [100/100], Loss: 1.0500
Test Loss: 1.2269
tensor([[3.1108e-04],
        [1.6204e-01],
        [1.6431e-04],
        [1.9083e-03],
        [2.6239e-01],
        [2.1176e-05],
        [1.7777e-02],
        [1.1080e-03],
        [1.4325e-05],
        [7.2267e-04],
        [4.6483e-04],
        [2.1886e-04],
        [5.7652e-04],
        [2.9737e-02],
        [1.2054e-02],
        [5.1666e-04],
        [2.2132e-02],
        [5.2160e-04],
        [1.6034e-04],
        [4.8142e-02],
        [4.6451e-03],
        [2.6497e-02],
        [1.3097e-02],
        [4.0179e-04],
        [4.0997e-02],
        [1.0472e-02],
        [2.4967e-02],
        [4.9854e-04],
        [1.2597e-02],
        [5.6578e-02],
        [6.6205e-04],
        [

#### Get Predictions

In [79]:
test_data = housing_test_transformed.to_numpy()
test_sub = torch.tensor(test_data, dtype=torch.float32)
predictions_test = model(test_sub)
denormalized_predictions_test = predictions_test * std_dev + mean_val
housing_test['SalePrice'] = denormalized_predictions_test.flatten().detach().numpy()
housing_test[['Id','SalePrice']].reset_index().to_csv('../datasets/house-prices-advanced-regression-techniques/preds.csv')