In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, Lasso

# Load real_estate_train

In [2]:
# Read the training split from disk; the trailing expression previews its first rows.
df_train = pd.read_csv(filepath_or_buffer='real_estate_train.csv')
df_train.head(n=5)

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2012.917,32.0,84.87882,,24.98298,121.54024,37.9
1,2012.917,19.5,306.5947,9.0,24.98034,121.53951,42.2
2,2013.583,13.3,561.9845,5.0,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5.0,24.98746,121.54391,54.8
4,2012.833,5.0,390.5684,5.0,24.97937,121.54245,43.1


In [3]:
# Names of the predictor columns used to build the design matrix.
feature_columns = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]
# Name of the regression target column (a single string, not a list).
label_columns = "Y house price of unit area"

# Split the raw training frame into predictors and target.
label = df_train.loc[:, label_columns]
features = df_train.loc[:, feature_columns]

In [4]:
# Pull the underlying NumPy arrays out of the pandas objects for the estimators below.
X_train, y_train = features.values, label.values

# Missing values

In [5]:
# Report the number of missing entries per feature column, then for the label.
print(features.isna().sum())
print('Label: ', label.isna().sum())

X1 transaction date                       1
X2 house age                              3
X3 distance to the nearest MRT station    0
X4 number of convenience stores           4
X5 latitude                               2
X6 longitude                              1
dtype: int64
Label:  2


In [6]:
# Fill gaps in the first two feature columns (X1 transaction date, X2 house age)
# with the mean of each column.
# `missing_values` is passed by keyword: SimpleImputer's constructor arguments are
# keyword-only in current scikit-learn, so the positional form raises TypeError there.
X1_X2_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Written back in place so the remaining columns of X_train are left untouched.
X_train[:, :2] = X1_X2_imputer.fit_transform(X_train[:, :2])

In [7]:
# Fill gaps in column 3 (X4 number of convenience stores) with that column's
# most frequent value.
# `missing_values` is passed by keyword: SimpleImputer's constructor arguments are
# keyword-only in current scikit-learn, so the positional form raises TypeError there.
X4_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# The 3:4 slice keeps the column two-dimensional, which is the shape the imputer expects.
X_train[:, 3:4] = X4_imputer.fit_transform(X_train[:, 3:4])

In [8]:
# Fill gaps in columns 4 and 5 (X5 latitude, X6 longitude) with the mean of each column.
# `missing_values` is passed by keyword: SimpleImputer's constructor arguments are
# keyword-only in current scikit-learn, so the positional form raises TypeError there.
X5_X6_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Written back in place so the remaining columns of X_train are left untouched.
X_train[:, 4:6] = X5_X6_imputer.fit_transform(X_train[:, 4:6])

In [9]:
# Rebuild the feature frame from the imputed array, check that no nulls remain,
# and preview the first rows.
features = pd.DataFrame.from_records(data=X_train, columns=feature_columns)
print(features.isna().sum())
features.head(n=5)

X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
dtype: int64


Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
0,2012.917,32.0,84.87882,0.0,24.98298,121.54024
1,2012.917,19.5,306.5947,9.0,24.98034,121.53951
2,2013.583,13.3,561.9845,5.0,24.98746,121.54391
3,2013.5,13.3,561.9845,5.0,24.98746,121.54391
4,2012.833,5.0,390.5684,5.0,24.97937,121.54245


In [10]:
# Replace missing target values with the mean of the observed targets.
# NOTE(review): mean-imputing the label creates synthetic training targets; dropping
# those rows instead may be preferable — confirm this is intended.
# `missing_values` is passed by keyword: SimpleImputer's constructor arguments are
# keyword-only in current scikit-learn, so the positional form raises TypeError there.
y_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# The imputer expects a 2-D input, so reshape to one column and flatten afterwards.
y_train = y_imputer.fit_transform(y_train.reshape(-1, 1)).ravel()

# Normalize train

In [11]:
# Not effective, so min-max scaling of training columns 0, 2 and 5 is left disabled.
# minmax_scaler = MinMaxScaler()
# X_train[:, 0] = minmax_scaler.fit_transform(X_train[:, 0].reshape(-1, 1)).ravel()
# X_train[:, 2] = minmax_scaler.fit_transform(X_train[:, 2].reshape(-1, 1)).ravel()
# X_train[:, 5] = minmax_scaler.fit_transform(X_train[:, 5].reshape(-1, 1)).ravel()


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

# Load real_estate_test

In [13]:
# Read the held-out test split from disk; the trailing expression previews its first rows.
df_test = pd.read_csv(filepath_or_buffer='real_estate_test.csv')
df_test.head(n=5)

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2013.333,25.6,4519.69,0,24.94826,121.49587,15.6
1,2013.167,39.8,617.7134,2,24.97577,121.53475,39.6
2,2012.75,7.8,104.8101,5,24.96674,121.54067,38.4
3,2012.917,30.0,1013.341,5,24.99006,121.5346,22.8
4,2013.583,27.3,337.6016,6,24.96431,121.54063,36.5


In [14]:
# Split the test frame into predictors and target using the same column names as training.
label_test = df_test.loc[:, label_columns]
features_test = df_test.loc[:, feature_columns]

In [15]:
# Pull the underlying NumPy arrays out of the test pandas objects.
X_test, y_test = features_test.values, label_test.values

In [16]:
# Report the number of missing entries per test feature column, then for the test label.
print(features_test.isna().sum())
print('Label: ', label_test.isna().sum())

X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    0
X4 number of convenience stores           0
X5 latitude                               0
X6 longitude                              0
dtype: int64
Label:  0


In [17]:
# Min-max scaling of test columns 0, 2 and 5, left disabled.
# NOTE(review): as written this would refit the scaler on the test data; if it is ever
# re-enabled, presumably the scaler fitted on the training data should be reused via
# transform() instead — confirm.
# minmax_scaler = MinMaxScaler()
# X_test[:, 0] = minmax_scaler.fit_transform(X_test[:, 0].reshape(-1, 1)).ravel()
# X_test[:, 2] = minmax_scaler.fit_transform(X_test[:, 2].reshape(-1, 1)).ravel()
# X_test[:, 5] = minmax_scaler.fit_transform(X_test[:, 5].reshape(-1, 1)).ravel()

# Train model

# Linear

In [18]:
# Linear regression baseline fitted on the imputed training data.
linear = LinearRegression().fit(X_train, y_train)

# Predict on both splits first, then report the root-mean-square error of each.
predict_train = linear.predict(X_train)
predict_test = linear.predict(X_test)

linear_rmse_train = np.sqrt(mean_squared_error(y_train, predict_train))
linear_rmse_test = np.sqrt(mean_squared_error(y_test, predict_test))
print("Root Mean Square Error Train: {}".format(linear_rmse_train))
print("Root Mean Square Error Test: {}".format(linear_rmse_test))

Root Mean Square Error Train: 9.401471996659001
Root Mean Square Error Test: 7.7076386878085135


# Ridge

In [28]:
# Ridge regression with regularisation strength alpha = 0.01.
ridge = Ridge(alpha=0.01).fit(X_train, y_train)

# Predict on both splits first, then report the root-mean-square error of each.
predict_train = ridge.predict(X_train)
predict_test = ridge.predict(X_test)

ridge_rmse_train = np.sqrt(mean_squared_error(y_train, predict_train))
ridge_rmse_test = np.sqrt(mean_squared_error(y_test, predict_test))
print("Root Mean Square Error Train: {}".format(ridge_rmse_train))
print("Root Mean Square Error Test: {}".format(ridge_rmse_test))

Root Mean Square Error Train: 9.401472017514857
Root Mean Square Error Test: 7.70762925604203


# Lasso

In [26]:
# Lasso regression with the estimator's default settings.
lasso = Lasso().fit(X_train, y_train)

predict_train = lasso.predict(X_train)
predict_test = lasso.predict(X_test)

# Root-mean-square error on the training split, then on the test split.
lasso_rmse_train = np.sqrt(mean_squared_error(y_train, predict_train))
lasso_rmse_test = np.sqrt(mean_squared_error(y_test, predict_test))
print("Root Mean Square Error: {}".format(lasso_rmse_train))
print("Root Mean Square Error: {}".format(lasso_rmse_test))

Root Mean Square Error: 9.40156121134243
Root Mean Square Error: 7.706770543028014
