This notebook shows what you can do with simple linear regression techniques. The data was obtained from a website called koopwoningen using a basic parser.

# Exploring the data and finding linear relationships

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
from scipy import stats

Read the data from the file; the first row of the file contains the headers.

*Note to self:* If the first line contains a space after a comma, the header name also contains that space.

In [None]:
# Read the housing data into a DataFrame and, in the same chain, keep only the
# rows that carry a positive price
data = (
    pd.read_csv('2_all_houses.txt')
    .loc[lambda frame: frame["Price"] > 0]
)

In [None]:
# Preview the first rows to check that the columns were parsed as expected
data.head()

In [None]:
# Display a summary of the dataset (descriptive statistics per column, as
# computed by DataFrame.describe)
print(data.describe())

## Exploring the data
We have to get to know the data and maybe do some data cleaning: remove very expensive houses and houses with too many rooms. Also remove the houses without a price, as they do not contribute anything. Before we do that, we plot the data to learn from it.

In [None]:
def plot_histograms(_data):
    """Show the distributions of the three house features side by side.

    One figure with three panels is drawn: a 20-bin histogram with a KDE
    overlay for each of the 'NumberOfRooms', 'SquareFootage' and 'YearBuilt'
    columns of ``_data``. The figure is displayed immediately; nothing is
    returned.
    """
    # (column to plot, panel title), in left-to-right order
    panels = (
        ('NumberOfRooms', 'Distribution of Number Of Rooms'),
        ('SquareFootage', 'Distribution of Square Footage'),
        ('YearBuilt', 'Distribution of Year Built'),
    )

    plt.figure(figsize=(16, 5))
    for position, (column, heading) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        sns.histplot(_data[column], kde=True, bins=20)
        plt.title(heading)

    plt.tight_layout()
    plt.show()


plot_histograms(data)

### Observations
- The distributions of the 'NumberOfRooms' and 'SquareFootage' features are skewed to the right.
- The 'YearBuilt' feature is more uniformly distributed.
- There are some outliers in the 'NumberOfRooms' and 'SquareFootage' features.

### Cleaning steps
- Remove the outliers from the 'NumberOfRooms' and 'SquareFootage' features.
- Create ranges for the 'YearBuilt' feature to group the years into categories.

In [None]:
# A good rule of thumb is to use 3 standard deviations from the mean:
# keep only the rows whose SquareFootage z-score is below 3 in absolute value
clean_data = data.copy()
clean_data = clean_data[(np.abs(stats.zscore(clean_data["SquareFootage"])) < 3)]

# Houses with more than 8 rooms are considered outliers
clean_data = clean_data[clean_data["NumberOfRooms"] < 9]

# Change the YearBuilt feature to a categorical feature with four periods:
# pre-1950, 1950-1980, 1980-2000 and post-2000.
# pd.cut uses right-closed intervals, so a boundary year belongs to the older
# period (e.g. 1950 -> 'pre-1950'), and years <= 0 fall outside every bin and
# become NaN. The last bin is open-ended (np.inf) so that houses built after
# 2024 are labelled 'post-2000' instead of silently turning into NaN.
clean_data['YearBuilt'] = pd.cut(clean_data['YearBuilt'], bins=[0, 1950, 1980, 2000, np.inf], labels=['pre-1950', '1950-1980', '1980-2000', 'post-2000'])

# Re-draw the histograms to see the effect of the cleaning steps
plot_histograms(clean_data)

In [None]:
# 2. Boxplots of the cleaned data, one panel per feature, to spot any
#    remaining outliers
plt.figure(figsize=(16, 5))

for _slot, (_column, _heading) in enumerate(
    [
        ('NumberOfRooms', 'Boxplot of Number Of Rooms'),
        ('SquareFootage', 'Boxplot of Square Footage'),
    ],
    start=1,
):
    plt.subplot(1, 2, _slot)
    sns.boxplot(x=clean_data[_column])
    plt.title(_heading)

plt.tight_layout()
plt.show()


In [None]:
# 3. Relationship check: square footage plotted against the number of rooms
plt.figure(figsize=(8, 6))
rooms_vs_area = sns.scatterplot(data=clean_data, x='NumberOfRooms', y='SquareFootage')
# scatterplot() draws on the current axes and returns them, so the title can
# be set on the returned object
rooms_vs_area.set_title('Number of Rooms vs. Square Footage')
plt.show()


In [None]:
# Plot each size-related feature against the price, side by side
plt.figure(figsize=(16, 6))

for _panel, (_feature, _heading) in enumerate(
    [
        ('NumberOfRooms', 'Number of Rooms vs. Price'),
        ('SquareFootage', 'Square Footage vs. Price'),
    ],
    start=1,
):
    plt.subplot(1, 2, _panel)
    sns.scatterplot(x=_feature, y='Price', data=clean_data)
    plt.title(_heading)

plt.tight_layout()
plt.show()

## Linear Regression
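Next we will use a regression model to predict the price of a house based on the number of rooms, the square footage, the construction period and the house type. We will use the cleaned data for this. Note that the first model in the code below is a random forest regressor (`RandomForestRegressor`) rather than a plain linear model.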

We use two metrics to evaluate the models:
- Mean Absolute Error (MAE): The average of the absolute differences between predictions and actual values. (lower is better)
- R-squared: The proportion of the variance in the dependent variable that is predictable from the independent variables. (closer to 1 is better)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Select the features and target variable
X = clean_data[['NumberOfRooms', 'SquareFootage', 'YearBuilt', 'Type']]
y = clean_data['Price']

# Split data into training and testing sets (80/20, seeded so the split is
# the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the preprocessing steps: standardise the numeric columns and one-hot
# encode the categorical ones. handle_unknown='ignore' makes a category that
# only occurs in the test split encode as all zeros instead of raising an
# error during predict().
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['NumberOfRooms', 'SquareFootage']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Type', 'YearBuilt'])
    ])

# Create a pipeline with preprocessing and model. The forest is seeded too;
# without a seed the reported metrics would differ between runs even though
# the train/test split is fixed.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor(random_state=42))])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.0f}")
print(f"R-squared: {r2:.2f}")

# Plot actual vs predicted values using Seaborn
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual vs Predicted Housing Prices")

# Plot the perfect line (predicted == actual); points above it are
# over-predicted, points below it under-predicted
max_val = max(np.max(y_test), np.max(y_pred))
plt.plot([0, max_val], [0, max_val], color='green', linestyle='--')

plt.show()

## Introducing Polynomial Regression
Polynomial regression is a form of regression analysis in which the relationship between the independent variable x and the dependent variable y is modelled as an nth degree polynomial in x. It is used when the relationship between the independent variable and the dependent variable is curvilinear.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

# Single-feature setup: SquareFootage is the only input, Price the target.
# The model is fitted on log(Price); predictions are mapped back with exp().
X_train_sqft = X_train.loc[:, ['SquareFootage']]
X_test_sqft = X_test.loc[:, ['SquareFootage']]
y_train_log = np.log(y_train)

# Degree-3 polynomial expansion -> standardisation -> ordinary least squares
poly_pipeline = Pipeline(steps=[
    ('poly_features', PolynomialFeatures(degree=3, include_bias=False)),
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])

# Fit on the training rows, then predict the held-out rows (fit() returns the
# pipeline itself, so the two calls can be chained)
y_pred_log = poly_pipeline.fit(X_train_sqft, y_train_log).predict(X_test_sqft)
y_pred = np.exp(y_pred_log)  # back from log scale to prices

# Score the back-transformed predictions against the real prices
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f}")
print(f"R-squared: {r2:.2f}")

# 500 evenly spaced square-footage values spanning the test set, used to draw
# the fitted curve
sqft_low = X_test_sqft['SquareFootage'].min()
sqft_high = X_test_sqft['SquareFootage'].max()
X_range = pd.DataFrame({'SquareFootage': np.linspace(sqft_low, sqft_high, 500)})
y_range_log = poly_pipeline.predict(X_range)
y_range = np.exp(y_range_log)  # back from log scale to prices

# Figure 1: test observations with the fitted polynomial curve on top
plt.figure(figsize=(10, 6))
plt.scatter(X_test_sqft, y_test, label="Actual Data", alpha=0.5)
plt.plot(X_range, y_range, color="red", linewidth=2, label="Polynomial Model (Degree 3)")
plt.xlabel("Square Footage")
plt.ylabel("Price")
plt.title("Price vs. Square Footage with Polynomial Regression")
plt.legend()
plt.show()

# Figure 2: actual vs predicted prices for the test set
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred)
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.title("Actual vs Predicted Housing Prices")

# Dashed diagonal where predicted == actual
max_val = max(np.max(y_test), np.max(y_pred))
plt.plot([0, max_val], [0, max_val], color='green', linestyle='--')

plt.show()