In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [4]:
df = pd.read_csv('all_perth_310121.csv')
print(df.head)

<bound method NDFrame.head of                   ADDRESS           SUBURB    PRICE  BEDROOMS  BATHROOMS  \
0           1 Acorn Place       South Lake   565000         4          2   
1             1 Addis Way            Wandi   365000         3          2   
2         1 Ainsley Court          Camillo   287000         3          1   
3         1 Albert Street         Bellevue   255000         2          1   
4            1 Aman Place        Lockridge   325000         4          1   
...                   ...              ...      ...       ...        ...   
33651      9C Gold Street  South Fremantle  1040000         4          3   
33652      9C Pycombe Way      Westminster   410000         3          2   
33653      9D Pycombe Way      Westminster   427000         3          2   
33654     9D Shalford Way       Girrawheen   295000         3          1   
33655  9E Margaret Street          Midland   295000         3          1   

       GARAGE  LAND_AREA  FLOOR_AREA  BUILD_YEAR  CBD_DIS

In [5]:
print(df.columns)

Index(['ADDRESS', 'SUBURB', 'PRICE', 'BEDROOMS', 'BATHROOMS', 'GARAGE',
       'LAND_AREA', 'FLOOR_AREA', 'BUILD_YEAR', 'CBD_DIST', 'NEAREST_STN',
       'NEAREST_STN_DIST', 'DATE_SOLD', 'POSTCODE', 'LATITUDE', 'LONGITUDE',
       'NEAREST_SCH', 'NEAREST_SCH_DIST', 'NEAREST_SCH_RANK'],
      dtype='object')


In [6]:
df.fillna({'BUILD_YEAR': df['BUILD_YEAR'].median()}, inplace=True)
df.fillna({'GARAGE': df['GARAGE'].median()}, inplace=True)
df.fillna({'NEAREST_SCH_RANK': df['NEAREST_SCH_RANK'].median()}, inplace=True)

# Check for missing values again to confirm
print(df.isna().sum())


ADDRESS             0
SUBURB              0
PRICE               0
BEDROOMS            0
BATHROOMS           0
GARAGE              0
LAND_AREA           0
FLOOR_AREA          0
BUILD_YEAR          0
CBD_DIST            0
NEAREST_STN         0
NEAREST_STN_DIST    0
DATE_SOLD           0
POSTCODE            0
LATITUDE            0
LONGITUDE           0
NEAREST_SCH         0
NEAREST_SCH_DIST    0
NEAREST_SCH_RANK    0
dtype: int64


In [7]:
features = [ 'CBD_DIST', 'NEAREST_STN_DIST',
            'NEAREST_SCH_DIST', 'BEDROOMS', 'BATHROOMS', 'GARAGE', 'LAND_AREA', 'FLOOR_AREA', 'BUILD_YEAR']
target = 'PRICE'
x = df[features]
y = df[target]

In [8]:
scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)
print(x_scaler)

[[-0.13000197 -0.60586722 -0.56525999 ... -0.12823399 -0.32594899
   0.63933097]
 [ 0.62675739  0.08378837  2.12434833 ... -0.14315019 -0.61720341
   1.13891243]
 [ 0.24837771 -0.58362027 -0.09512716 ... -0.12110537 -1.35227407
  -0.55966453]
 ...
 [-0.89556086  0.01704751 -0.0836833  ... -0.14854158 -0.99167337
   0.2396658 ]
 [-0.63157504 -0.0274464  -0.83436142 ... -0.13680032 -1.36614333
  -0.80945526]
 [-0.27079441 -0.62811418 -0.43511764 ... -0.14644493 -1.22745075
   0.2396658 ]]


In [9]:
print(x)
print(y)

       CBD_DIST  NEAREST_STN_DIST  NEAREST_SCH_DIST  BEDROOMS  BATHROOMS  \
0         18300              1800          0.828339         4          2   
1         26900              4900          5.524324         3          2   
2         22600              1900          1.649178         3          1   
3         17900              3600          1.571401         2          1   
4         11200              2000          1.514922         4          1   
...         ...               ...               ...       ...        ...   
33651     16100              1500          1.430350         4          3   
33652      9600              4600          1.679644         3          2   
33653      9600              4600          1.669159         3          2   
33654     12600              4400          0.358494         3          1   
33655     16700              1700          1.055564         3          1   

       GARAGE  LAND_AREA  FLOOR_AREA  BUILD_YEAR  
0         2.0        600         160

In [10]:
print(x.isna().sum())
print(y.isna().sum())

CBD_DIST            0
NEAREST_STN_DIST    0
NEAREST_SCH_DIST    0
BEDROOMS            0
BATHROOMS           0
GARAGE              0
LAND_AREA           0
FLOOR_AREA          0
BUILD_YEAR          0
dtype: int64
0


In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))

26924
6732
26924
6732


In [12]:
poly = PolynomialFeatures(degree=4)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)
x_train_poly.shape
x_test_poly.shape

(6732, 715)

In [None]:
regressor = LinearRegression()
regressor.fit(x_train_poly, y_train)
y_pred = regressor.predict(x_test_poly)
print(y_pred)

In [None]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error : {mse}")
print(f"r2 Score : {r2}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot for Price vs. CBD Distance
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['CBD_DIST'], y=df['PRICE'])
plt.title('Price vs. CBD Distance')
plt.xlabel('CBD Distance (km)')
plt.ylabel('Price (AUD)')
plt.show()

# Scatter plot for Price vs. Nearest School Distance
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['NEAREST_SCH_DIST'], y=df['PRICE'])
plt.title('Price vs. Nearest School Distance')
plt.xlabel('Nearest School Distance (km)')
plt.ylabel('Price (AUD)')
plt.show()

# Scatter plot for Price vs. Floor Area
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df['FLOOR_AREA'], y=df['PRICE'])
plt.title('Price vs. Floor Area')
plt.xlabel('Floor Area (sqm)')
plt.ylabel('Price (AUD)')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Predictions vs Actuals plot
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
plt.title('Predicted Prices vs. Actual Prices')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Pilih satu fitur untuk visualisasi, misalnya 'CBD_DIST'
X = df[['CBD_DIST']].values
y = df['PRICE'].values

# Urutkan data untuk plotting yang lebih baik
sort_idx = X.flatten().argsort()
X = X[sort_idx]
y = y[sort_idx]

# Fit model linear
model_linear = LinearRegression()
model_linear.fit(X, y)
y_linear_pred = model_linear.predict(X)

# Fit model polynomial
poly_features = PolynomialFeatures(degree=4, include_bias=False)
X_poly = poly_features.fit_transform(X)

model_poly = LinearRegression()
model_poly.fit(X_poly, y)
y_poly_pred = model_poly.predict(X_poly)

# Plot hasil
plt.figure(figsize=(10, 6))
plt.scatter(X, y, color='blue', label='Data')
plt.plot(X, y_linear_pred, color='red', label='Linear Model')
plt.plot(X, y_poly_pred, color='green', label='Polynomial Model')
plt.xlabel('CBD Distance (km)')
plt.ylabel('Price (AUD)')
plt.title('Linear vs Polynomial Regression')
plt.legend()
plt.show()

# Print R-squared scores
print(f"R-squared score (Linear): {r2_score(y, y_linear_pred):.4f}")
print(f"R-squared score (Polynomial): {r2_score(y, y_poly_pred):.4f}")


In [None]:
corr = df[features + [target]].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

sns.pairplot(df[features + [target]], height=2)
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
param_grid = {'polynomialfeatures__degree' : [2, 3, 4, 5],
              'linearregression__fit_intercept' : [True, False]}

model = make_pipeline(PolynomialFeatures(), LinearRegression())
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')
grid_search.fit(x_train, y_train)

print(f"Best Parameters : {grid_search.best_params_}")
print(f"Best R2 score : {grid_search.best_score_:.4f}")

In [None]:
best_model = make_pipeline(
    PolynomialFeatures(degree=2),
    LinearRegression(fit_intercept=False)
)
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

poly_features = best_model.named_steps['polynomialfeatures']
linear_regression = best_model.named_steps['linearregression']

# Gunakan get_feature_names_out() jika tersedia, jika tidak gunakan get_feature_names()
if hasattr(poly_features, 'get_feature_names_out'):
    feature_names = poly_features.get_feature_names_out(input_features=features)
else:
    feature_names = poly_features.get_feature_names(input_features=features)

coefficients = linear_regression.coef_

for name, coef in zip(feature_names, coefficients):
    print(f"{name}: {coef}")

# Plot koefisien
plt.figure(figsize=(12, 6))
plt.bar(range(len(coefficients)), coefficients)
plt.xticks(range(len(coefficients)), feature_names, rotation=90)
plt.title('Feature Coefficients')
plt.tight_layout()
plt.show()

In [None]:
y_pred = best_model.predict(x_test)

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Prices')
plt.tight_layout()
plt.show()


In [None]:
residuals = y_test - y_pred

plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.hlines(y=0, xmin=y_pred.min(), xmax=y_pred.max(), colors='r', linestyles='--')
plt.xlabel('Predicted Price')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.tight_layout()
plt.show()

# Q-Q plot
from scipy import stats
fig, ax = plt.subplots(figsize=(10, 6))
stats.probplot(residuals, dist="norm", plot=ax)
ax.set_title("Q-Q plot")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.inspection import permutation_importance

perm_importance = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=42)

feature_importance = pd.DataFrame({
    'feature': features,
    'importance': perm_importance.importances_mean
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance')
plt.tight_layout()
plt.show()