<a href="https://colab.research.google.com/github/juliohsu/house-prediction-reg/blob/main/house_prediction_reg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'house-prices-advanced-regression-techniques:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F5407%2F868283%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240924%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240924T232312Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dae42fbda1ae94a97aebcf325d4a91153f54b971e06c8b3590b49f82d68f85768b1fa617f823c4b2adb48d6c5bcf3af6907cdb8c97345e0c5cde8fe853322ae2978880d2a4c0c96f59f1dd1bdc81cf15aef28e1e0a8d9a06e972f36d5bac21f6efd756ea6d16f85895e26fa74a9fd713b82de1bab9f3f3896d4a09cf8cb7a60afea85a3425cc0e4dbb809512a542dc67dbd433e4dc1e96e80c11a6822c0a05960f4f31d641f4f1cd6e7069c498c66fccf9dc9e0de43df27ff71e4f744ffcb0d43344f043fc7ed68e549349160f29be09bb710ce1940c57918171ed0ce0ceed4b065ad56f93429be67d6d8be0cf3b5875e79941acc0975197eea09e133100dd181'

# Directories that mimic Kaggle's layout; both are created further down.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere else in the visible source.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory, then make sure both Kaggle-style
# directories exist.
shutil.rmtree('/kaggle/input', ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Expose the directories as ../input and ../working relative to the current
# working directory; a path that already exists is left as it is.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<percent-encoded URL>" entry listed in
# DATA_SOURCE_MAPPING and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped so the remaining entries still run.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only: everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it the progress bar cannot
            # be scaled, so only the running byte count is meaningful.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk before the archive is read back:
            # tarfile reopens the file by name and would otherwise see a
            # truncated archive.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would
                # replace the module and break every later tar download.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Visualization & Exploration

In [None]:
import pandas as pd

# Read train.csv into X and test.csv into X_test.
X, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Summarise the training frame: its columns, non-null counts and dtypes.
X.info()

In [None]:
# Lift the column display limit so every column is rendered, then preview
# the first five training rows.
pd.options.display.max_columns = None
X.head()

In [None]:
# Preview the first five rows of the test data.
X_test.head(5)

# Data Preprocessing (Handling Missing Values)

In [None]:
# Fraction (0-1) of missing entries per training column, largest first.
null_mask = X.isnull()
cat_missing = null_mask.sum()
cat_total = null_mask.count()
cat_miss_percent = cat_missing / cat_total
df_missing = pd.DataFrame({'Missing Value': cat_miss_percent})
df_missing = df_missing.sort_values(by='Missing Value', ascending=False)

df_missing.head(20)

In [None]:
# Collect the columns whose missing fraction exceeds 0.06. The drops below are
# commented out, so no column is actually removed in this cell.
high_missing_mask = df_missing['Missing Value'] > 0.06
most_miss_col = df_missing.loc[high_missing_mask].index
#X = X.drop(columns=most_miss_col)
#X_test = X_test.drop(columns=most_miss_col)

X_test.info()

In [None]:
# Keep only the non-object columns of the training data.
num_col = X.select_dtypes(exclude=['object'])

# The twenty of those columns with the most missing entries.
num_null_counts = num_col.isnull().sum()
num_null_counts.sort_values(ascending=False).head(20)

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing numeric values with each column's training-set median.
# The imputer is fitted on the training data only and then applied to BOTH
# frames, so the test rows keep their own values instead of being overwritten
# with (index-aligned) training rows.
# Only numeric columns that also exist in X_test are imputed; a train-only
# column (presumably just the SalePrice target - confirm) is left untouched
# and is no longer copied into X_test.
num_features = [col for col in num_col.columns if col in X_test.columns]
num_imputer = SimpleImputer(strategy='median')
num_imputer.fit(X[num_features])
num_impute_train = num_imputer.transform(X[num_features])
X[num_features] = pd.DataFrame(num_impute_train, columns=num_features, index=X.index)
X_test[num_features] = pd.DataFrame(
    num_imputer.transform(X_test[num_features]),
    columns=num_features,
    index=X_test.index,
)
# Refresh the numeric view so the check below reflects the imputed data.
num_col = X[num_col.columns]

num_col.isnull().sum().sort_values(ascending=False).head(20)

In [None]:
# Keep only the object-dtype (categorical) columns of the training data.
cat_col = X.select_dtypes(include=['object'])

# The twenty of those columns with the most missing entries.
cat_null_counts = cat_col.isnull().sum()
cat_null_counts.sort_values(ascending=False).head(20)

In [None]:
# Fill missing categorical values with each column's most frequent training
# value. The imputer is fitted on the training data only and then applied to
# BOTH frames, so the test rows keep their own categories instead of being
# overwritten with (index-aligned) training rows.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(cat_col)
cat_impute_train = cat_imputer.transform(cat_col)
cat_col = pd.DataFrame(cat_impute_train, columns=cat_col.columns, index=X.index)
X[cat_col.columns] = cat_col
X_test[cat_col.columns] = pd.DataFrame(
    cat_imputer.transform(X_test[cat_col.columns]),
    columns=cat_col.columns,
    index=X_test.index,
)

cat_col.isnull().sum().sort_values(ascending=False).head(20)

In [None]:
# Re-inspect the test frame (columns, non-null counts, dtypes) after imputation.
X_test.info()


# Data Preprocessing (Feature Engineering)

In [None]:
# Display the training frame again to look for features that can be removed.
X

In [None]:
# Derive aggregate features from existing columns.
def combine_features(df):
    """Add three derived columns to ``df`` in place and return the same frame.

    Columns added:
        TotalLivArea: GrLivArea + TotalBsmtSF + 1stFlrSF + 2ndFlrSF
        TotalBath:    FullBath + BsmtFullBath, plus 0.5 for each HalfBath
                      and BsmtHalfBath
        GarageScore:  GarageCars + GarageArea / 100

    NOTE(review): GrLivArea may already contain the per-floor areas, which
    would count them twice in TotalLivArea - confirm against the data
    description.
    """
    living_area = df['GrLivArea'] + df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df['TotalLivArea'] = living_area

    bath_count = (
        df['FullBath']
        + (0.5 * df['HalfBath'])
        + df['BsmtFullBath']
        + (0.5 * df['BsmtHalfBath'])
    )
    df['TotalBath'] = bath_count

    garage_score = df['GarageCars'] + df['GarageArea'] / 100
    df['GarageScore'] = garage_score
    return df

# Add the derived columns to both the training and the test frame.
X = combine_features(X)
X_test = combine_features(X_test)

# Display the training frame with the new columns.
X

In [None]:
# Hand-picked features to discard; they are removed from both frames.
unnecessary_features = [
    'Street',
    'Utilities',
    'LandSlope',
    'Condition1',
    'Condition2',
    'RoofMatl',
    'Exterior2nd',
    'ExterCond',
    'Heating',
    'Electrical',
    'MoSold',
    'SaleCondition',
]

X = X.drop(unnecessary_features, axis='columns')
X_test = X_test.drop(unnecessary_features, axis='columns')
X

In [None]:
# Re-select the object-dtype (categorical) columns of the training frame.
# NOTE(review): no correlation with the target is computed in this cell.
cat_col = X.select_dtypes(include=['object'])

In [None]:
# Correlation matrix of the numeric columns, and its SalePrice column, i.e.
# every numeric feature's correlation with the target.
corr_X = X.corr(numeric_only=True)
corr_Xtarget = corr_X.loc[:, 'SalePrice']

corr_Xtarget

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the SalePrice column of the correlation matrix.
plt.figure(figsize=(12, 8))
target_corr_frame = corr_X[['SalePrice']]
sns.heatmap(target_corr_frame, annot=True, cmap='coolwarm')
plt.title('SalePrice Correlation')
plt.show()

In [None]:
# Remove, from both frames, the numeric columns whose absolute correlation
# with SalePrice is below 0.1.
weak_corr_mask = corr_Xtarget.abs() < 0.1
unnecessary_col = corr_Xtarget.loc[weak_corr_mask].index.tolist()
X = X.drop(unnecessary_col, axis='columns')
X_test = X_test.drop(unnecessary_col, axis='columns')
X

# Data Preprocessing (Data Transformation)

In [None]:
import numpy as np

# Split SalePrice off as the target and model log(1 + price) to reduce skew.
sale_price = X['SalePrice']
X = X.drop(columns=['SalePrice'])
y = np.log1p(sale_price)
y

# Data Preprocessing (Data Scaling)

In [None]:
# Check the remaining feature columns and their dtypes before building the
# preprocessing step.
X.info()

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Column names split by dtype: int64/float64 versus everything else.
numFT_X = X.select_dtypes(include=['int64', 'float64']).columns
catFT_X = X.select_dtypes(exclude=['int64', 'float64']).columns

# Preprocessing step: standardise the numeric columns and one-hot encode the
# others (categories unseen during fit are ignored at transform time).
numeric_step = ('num', StandardScaler(), numFT_X)
categorical_step = (
    'cat',
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
    catFT_X,
)
scaling = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Split into 80% training and 20% validation rows with a fixed seed.
# NOTE(review): `bins` is computed but never passed to train_test_split, so
# the split is not stratified.
bins = pd.cut(y, bins=5, labels=False)
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.80, random_state=42)

# Scatter the first feature column against the target for the full data and
# for each side of the split.
plt.figure(figsize=(12, 8))
scatter_layers = (
    (X, y, 'gray', 0.2, 'Original Data'),
    (X_train, y_train, 'blue', 0.6, 'Training Data'),
    (X_val, y_val, 'red', 0.6, 'Validation Data'),
)
for features, target_values, colour, opacity, legend_label in scatter_layers:
    plt.scatter(
        features.iloc[:, 0],
        target_values,
        color=colour,
        alpha=opacity,
        label=legend_label,
        s=10,
    )

plt.title('Scatter Plot of Splitted Data Distribution')
plt.xlabel('Features')
plt.ylabel('Sale Price')
plt.show()

# Model Training

In [None]:
# List the feature columns that go into the model.
print(X.columns)

In [None]:
from sklearn.linear_model import LinearRegression

# Chain the preprocessing step with an ordinary least-squares regressor.
linear_steps = [
    ('preprocessor', scaling),
    ('linear', LinearRegression()),
]
linear_pipeline = Pipeline(steps=linear_steps)

# Fit preprocessing and model on the training split.
linear_pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict the validation split with the fitted pipeline.
y_pred = linear_pipeline.predict(X_val)

# Root of the mean squared error between validation targets and predictions.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)

np.sqrt(mse)

# Submission

In [None]:
# Re-read test.csv to recover the original integer Ids for the submission.
# The absolute path matches the one used when the data was first loaded, so it
# does not depend on the current working directory.
test_file_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
test_data = pd.read_csv(test_file_path)
ids = test_data.pop('Id').astype(int)

test_pred = linear_pipeline.predict(X_test)

# The model was trained on np.log1p(SalePrice), so predictions are mapped back
# with the exact inverse, np.expm1; np.exp would overstate every price by 1.
pd.DataFrame({
    'Id': ids,
    'SalePrice': np.expm1(test_pred)
}).to_csv('/kaggle/working/submission.csv', index=False)