In [1]:
import pandas as pd
import numpy as np
from os.path import join
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

<h2>Data loading</h2>

In [2]:
def load_data(filepath: str) -> pd.DataFrame:
    return pd.read_csv(filepath)

In [10]:
train_data = load_data(join("..", "Datasets", "House_prices", "raw", "train.csv"))
test_data = load_data(join("..", "Datasets", "House_prices", "raw", "test.csv"))

In [11]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [12]:
prices = train_data.pop("SalePrice")

In [13]:
prices

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [17]:
features_to_select = ["MSSubClass", "LotArea", "TotalBsmtSF", "KitchenAbvGr", "GarageCars"]

In [18]:
train_features = train_data[features_to_select]
test_features = test_data[features_to_select]

In [19]:
train_features.head()

Unnamed: 0,MSSubClass,LotArea,TotalBsmtSF,KitchenAbvGr,GarageCars
0,60,8450,856,1,2
1,20,9600,1262,1,2
2,60,11250,920,1,2
3,70,9550,756,1,3
4,60,14260,1145,1,3


In [20]:
test_features.head()

Unnamed: 0,MSSubClass,LotArea,TotalBsmtSF,KitchenAbvGr,GarageCars
0,20,11622,882.0,1,1.0
1,20,14267,1329.0,1,1.0
2,60,13830,928.0,1,2.0
3,60,9978,926.0,1,2.0
4,120,5005,1280.0,1,2.0


<h2>Data preprocessing</h2>

In [21]:
train_features.isna().mean()

MSSubClass      0.0
LotArea         0.0
TotalBsmtSF     0.0
KitchenAbvGr    0.0
GarageCars      0.0
dtype: float64

In [22]:
test_features.isna().mean()

MSSubClass      0.000000
LotArea         0.000000
TotalBsmtSF     0.000685
KitchenAbvGr    0.000000
GarageCars      0.000685
dtype: float64

In [26]:
test_features["TotalBsmtSF"] = test_features["TotalBsmtSF"].fillna(method="bfill")
test_features["GarageCars"] = test_features["GarageCars"].fillna(method="ffill")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features["TotalBsmtSF"] = test_features["TotalBsmtSF"].fillna(method="bfill")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_features["GarageCars"] = test_features["GarageCars"].fillna(method="ffill")


In [27]:
train_features.isna().mean()

MSSubClass      0.0
LotArea         0.0
TotalBsmtSF     0.0
KitchenAbvGr    0.0
GarageCars      0.0
dtype: float64

In [28]:
test_features.isna().mean()

MSSubClass      0.0
LotArea         0.0
TotalBsmtSF     0.0
KitchenAbvGr    0.0
GarageCars      0.0
dtype: float64

In [29]:
train_features.head()

Unnamed: 0,MSSubClass,LotArea,TotalBsmtSF,KitchenAbvGr,GarageCars
0,60,8450,856,1,2
1,20,9600,1262,1,2
2,60,11250,920,1,2
3,70,9550,756,1,3
4,60,14260,1145,1,3


In [30]:
test_features.head()

Unnamed: 0,MSSubClass,LotArea,TotalBsmtSF,KitchenAbvGr,GarageCars
0,20,11622,882.0,1,1.0
1,20,14267,1329.0,1,1.0
2,60,13830,928.0,1,2.0
3,60,9978,926.0,1,2.0
4,120,5005,1280.0,1,2.0


<h2>Data scaling</h2>

In [31]:
scaler = MinMaxScaler(feature_range=(0,1))

In [33]:
scaled_train_features = scaler.fit_transform(train_features)
scaled_train_dataset = pd.DataFrame(scaled_train_features, columns=train_features.columns)

In [34]:
scaled_train_dataset.head()

Unnamed: 0,MSSubClass,LotArea,TotalBsmtSF,KitchenAbvGr,GarageCars
0,0.235294,0.03342,0.140098,0.333333,0.5
1,0.0,0.038795,0.206547,0.333333,0.5
2,0.235294,0.046507,0.150573,0.333333,0.5
3,0.294118,0.038561,0.123732,0.333333,0.75
4,0.235294,0.060576,0.187398,0.333333,0.75


In [35]:
scaled_test_features = scaler.transform(test_features)
scaled_test_dataset = pd.DataFrame(scaled_test_features, columns=test_features.columns)

In [36]:
scaled_test_dataset.head()

Unnamed: 0,MSSubClass,LotArea,TotalBsmtSF,KitchenAbvGr,GarageCars
0,0.0,0.048246,0.144354,0.333333,0.25
1,0.0,0.060609,0.217512,0.333333,0.25
2,0.235294,0.058566,0.151882,0.333333,0.5
3,0.235294,0.040562,0.151555,0.333333,0.5
4,0.588235,0.017318,0.209493,0.333333,0.5


<h2>Persist data in files</h2>

In [37]:
scaled_train_dataset.to_csv(join("..", "Datasets", "House_prices", "proceed", "train_experiment_features.csv"), index=False)
scaled_test_dataset.to_csv(join("..", "Datasets", "House_prices", "proceed", "test_experiment_features.csv"), index=False)
prices.to_csv(join("..", "Datasets", "House_prices", "proceed", "prices.csv"), index=False)