In [34]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, r2_score
import torch
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import optim
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from pandas.api.types import is_numeric_dtype
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize

# Preprocessing data with no normalizing

In [16]:
#load data
data = pd.read_csv('train.csv')

#drop id nad misc features
data = data.drop(columns = ['Id', 'MiscFeature', 'MiscVal'])

#separate into x and y
x_columns = [col for col in data.columns if col != 'SalePrice']
x = data[x_columns]
y = data['SalePrice']

#preprocess categorical variables by lable encoding
x = x.fillna(0)
categorical_var =['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities','LotConfig','LandSlope','Neighborhood','Condition1',
'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl','Exterior1st', 'Exterior2nd','MasVnrType','ExterQual',
'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure','BsmtFinType1','BsmtFinType2', 'Heating', 'HeatingQC', 
'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 
'GarageCond', 'PavedDrive','PoolQC','Fence','SaleType','SaleCondition']
for i in categorical_var:
    x[i] = x[i].astype('category')
    x[i] = x[i].cat.codes

#dropping all features with correlation < 0.1
col_drop = []
for (columnName, columnData) in x.items():
    correlation = abs(x[columnName].corr(y))
    if correlation < 0.1:
        col_drop.append(columnName)
x = x.drop(columns = col_drop)


#split data
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(x, y, test_size=0.2)

In [17]:
train_x = df_train_x.to_numpy(dtype='float32')
train_y = df_train_y.to_numpy(dtype='float32')
test_x = df_test_x.to_numpy(dtype='float32')
test_y = df_test_y.to_numpy(dtype='float32')

## Linear Regression

Build and predict on a basic linear model

In [18]:
from sklearn.linear_model import LinearRegression

# Create model
linear_regr_model = LinearRegression()

# Train model
linear_regr_model.fit(train_x, train_y)

# Predict with model
linear_regr_pred = linear_regr_model.predict(test_x)

# Calculate and print mean absolute error
linear_regr_error = mean_absolute_error(test_y, linear_regr_pred)
linear_regr_accuracy = r2_score(test_y, linear_regr_pred)
print(f"Mean absolute error = {linear_regr_error}")
print(f"Accuracy: {linear_regr_accuracy}")

Mean absolute error = 19855.310546875
Accuracy: 0.8701127982394099


## Ridge Regression

Build and predict using a ridge regression model

In [19]:
from sklearn.linear_model import Ridge
#Suppress warning that arised due to lack of normalization
import warnings
warnings.filterwarnings('ignore')

def train_ridge_model(alphas, train_x, train_y, test_x, test_y) -> [Ridge, float]:
    '''Train a ridge model that performs best given an array of alphas'''
    import sys
    model: Ridge
    lowest_error = sys.float_info.max
    max_accuracy = 0
    for alpha in alphas:
        test_model = Ridge(alpha=alpha)
        test_model.fit(train_x, train_y)
        predictions = test_model.predict(test_x)
        accuracy = r2_score(test_y, predictions)
        error = mean_absolute_error(test_y, predictions)
        if (accuracy > max_accuracy):
            max_accuracy = accuracy
            model = test_model
            lowest_error = error
    return model, max_accuracy, lowest_error


alphas = [0.5, 1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
ridge_model, ridge_accuracy, ridge_error = train_ridge_model(alphas, train_x, train_y, test_x, test_y)

print(f"Alpha chosen: {ridge_model.alpha}")
print(f"Mean absolute error: {ridge_error}")
print(f"Accuracy: {ridge_accuracy}")

Alpha chosen: 50
Mean absolute error: 19540.888671875
Accuracy: 0.8715720368258242


## Lasso Regression

Build and predict using a lasso model

In [20]:
from sklearn.linear_model import Lasso
#Suppress warning that arised due to lack of normalization
import warnings
warnings.filterwarnings('ignore')

def train_lasso_model(alphas, train_x, train_y, test_x, test_y) -> [Lasso, float]:
    '''Train a lasso model that performs best given an array of alphas'''
    import sys
    model: Lasso
    lowest_error = sys.float_info.max
    max_accuracy = 0
    for alpha in alphas:
        test_model = Lasso(alpha=alpha)
        test_model.fit(train_x, train_y)
        predictions = test_model.predict(test_x)
        accuracy = r2_score(test_y, predictions)
        error = mean_absolute_error(test_y, predictions)
        if (accuracy > max_accuracy):
            max_accuracy = accuracy
            model = test_model
            lowest_error = error
    return model, max_accuracy, lowest_error


alphas.extend(range(55, 100, 5))
lasso_model, lasso_accuracy, lasso_error = train_lasso_model(alphas, train_x, train_y, test_x, test_y)

print(f"Alpha chosen: {lasso_model.alpha}")
print(f"Mean absolute error: {lasso_error}")
print(f"Accuracy: {lasso_accuracy}")

Alpha chosen: 95
Mean absolute error: 19712.19921875
Accuracy: 0.8713694396570028


# Preprocessing data with Normalization

In [45]:
#find numerical features
numerical_features = []
for (colName, colData) in df_train_x.items():
    if is_numeric_dtype(colData):
        numerical_features.append(colName)
        
#normalizing only numerical features
scaler = StandardScaler()              #RobustScaler, MinMaxScaler
    
df_train_x[numerical_features] = scaler.fit_transform(df_train_x[numerical_features])
df_test_x[numerical_features] = scaler.transform(df_test_x[numerical_features])  

In [46]:
norm_train_x = df_train_x.to_numpy(dtype='float32')
norm_train_y = df_train_y.to_numpy(dtype='float32')
norm_test_x = df_test_x.to_numpy(dtype='float32')
norm_test_y = df_test_y.to_numpy(dtype='float32')

In [50]:
# Create model
norm_linear_regr_model = LinearRegression()

# Train model
norm_linear_regr_model.fit(norm_train_x, train_y)

# Predict with model
norm_linear_regr_pred = norm_linear_regr_model.predict(norm_test_x)

# Calculate and print mean absolute error
norm_linear_regr_error = mean_absolute_error(test_y, norm_linear_regr_pred)
norm_linear_regr_accuracy = r2_score(test_y, norm_linear_regr_pred)
print(f"Mean absolute error = {norm_linear_regr_error}")
print(f"Accuracy: {norm_linear_regr_accuracy}")

Mean absolute error = 19836.626953125
Accuracy: 0.8702047528878749


## Ridge Regression

Build and predict using a ridge regression model

In [51]:
norm_ridge_model, norm_ridge_accuracy, norm_ridge_error = train_ridge_model(alphas, norm_train_x, train_y, norm_test_x, test_y)

print(f"Alpha chosen: {norm_ridge_model.alpha}")
print(f"Mean absolute error: {norm_ridge_error}")
print(f"Accuracy: {norm_ridge_accuracy}")

Alpha chosen: 90
Mean absolute error: 19188.08203125
Accuracy: 0.8717353570976965


## Lasso Regression

Build and predict using a lasso model

In [49]:
norm_lasso_model, norm_lasso_accuracy, norm_lasso_error = train_lasso_model(alphas, norm_train_x, train_y, norm_test_x, test_y)

print(f"Alpha chosen: {norm_lasso_model.alpha}")
print(f"Mean absolute error: {norm_lasso_error}")
print(f"Accuracy: {norm_lasso_accuracy}")

Alpha chosen: 95
Mean absolute error: 19675.1171875
Accuracy: 0.8711388633573841


# Convolutional Neural Network

In [26]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        
        self.layer1 = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3, padding=1),   # 54, 16
            nn.ReLU(inplace=True), 
            nn.Conv1d(16, 16, kernel_size=3, padding=1),  
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=2, stride=2)         # 27, 16
        )

        self.layer2 = nn.Sequential(
            nn.Conv1d(16, 32, kernel_size=3, padding=1),           
            nn.ReLU(inplace=True),
            nn.Conv1d(32, 32, kernel_size=3, padding=1),           
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=2, stride=2),          #13, 32       
        )

        self.layer3 = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=3, padding=1),           
            nn.ReLU(inplace=True),
            nn.Conv1d(64, 64, kernel_size=3, padding=1),           
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=2, stride=2),                #6, 64
        )
        
        self.layer4 = nn.Sequential(
            nn.Conv1d(64, 128, kernel_size=3, padding=1),        
            nn.ReLU(inplace=True),
            nn.Conv1d(128, 128, kernel_size=3, padding=1),       
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=2, stride=2)              #3 128  
        )
        
        
        self.fc = nn.Sequential(
            nn.Linear(384, 128),                                 
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(128, 1),

        )

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
 

In [27]:
class LoadDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, index):
        dataX = torch.tensor(self.X[index]).type(torch.float)
        dataY = torch.tensor(self.Y[index]).type(torch.float)

        return (dataX, dataY)

In [31]:
train_data = LoadDataset(train_x, train_y)
test_data = LoadDataset(test_x, test_y)
train_loader = DataLoader(dataset=train_data, batch_size=100, shuffle=True, drop_last=True)
test_loader = DataLoader(dataset=test_data, batch_size=100)
model = Model()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
for epoch in range(30):
    for batch_idx, (data, targets) in enumerate(train_loader):
        data.requires_grad_()
        scores = model(data)
        scores = scores.view(targets.size())
        loss = criterion(scores, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [32]:
running_accuracy = 0
running_MAE = 0
num_batches = 0
model.eval()
criterion = nn.L1Loss()
with torch.no_grad():
    for batch_idx, (data, targets) in enumerate(test_loader):
        # forward
        scores = model(data)
        scores = scores.view(targets.size()) 
        accuracy = r2_score(targets, scores)
        running_accuracy += accuracy.item()
        MAE = criterion(scores, targets)
        running_MAE += MAE.item()
        num_batches+=1
accuracy_final = running_accuracy/num_batches
MAE_final = running_MAE/num_batches
print(accuracy_final)
print(MAE_final)

0.3666068317661771
41537.875
