In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib

In [None]:
# Location of the raw dataset; the file is assumed to be named 'sydney_data.csv'
# and to sit in the notebook's working directory.
DATA_PATH = 'sydney_data.csv'

# Read the whole CSV into the DataFrame used by every later cell
df = pd.read_csv(DATA_PATH)

In [None]:
# Check for missing values before cleaning.
# print() is needed here: only the last expression of a cell is displayed.
print(df.isnull().sum())

time_col = 'Time to CBD (Public Transport) [Town Hall St]'

# Clean the "Time to CBD (Public Transport) [Town Hall St]" column.
# Capture the number and tolerate an optional trailing unit word starting with
# "min" (so "15 minuntes", "15 minutes" and a bare "15" all parse) rather than
# relying on one exact spelling of the suffix. astype(str) makes the cell safe
# to re-run after the column has already been converted to numbers.
time_text = df[time_col].astype(str).str.extract(
    r'^\s*(\d+(?:\.\d+)?)\s*(?:min\w*)?\s*$', expand=False
)

# Convert data types; anything that did not match the pattern becomes NaN
df[time_col] = pd.to_numeric(time_text, errors='coerce')

# Check for missing values again after cleaning and conversion
df.isnull().sum()

In [None]:
# Show the dtype pandas currently holds for every column
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Print the distinct values of the two columns under inspection, in order:
# first the public-transport time to the CBD, then the 2020 median house price.
columns_to_inspect = (
    'Time to CBD (Public Transport) [Town Hall St]',
    'Median House Price (2020)',
)
for column_name in columns_to_inspect:
    print(df[column_name].unique())

In [None]:
price_col = 'Median House Price (2020)'

# Remove dollar signs and commas from the "Median House Price (2020)" column.
# The pattern is a raw string so that "\$" reaches the regex engine unchanged
# instead of being an invalid Python string escape. astype(str) lets the cell
# be re-run after the column has already been converted to numbers.
price_text = df[price_col].astype(str).str.replace(r'[\$,]', '', regex=True)

# Convert the column to a numeric type; unparseable entries become NaN
df[price_col] = pd.to_numeric(price_text, errors='coerce')

In [None]:
# Columns compared in this cell
time_col = 'Time to CBD (Public Transport) [Town Hall St]'
price_col = 'Median House Price (2020)'

# Correlation between travel time and price, as computed by Series.corr
correlation_2020 = df[time_col].corr(df[price_col])

print(f"Correlation with Median House Price (2020): {correlation_2020}")

# Scatter plot of the same two columns to visualize the relationship
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x=time_col, y=price_col, ax=ax)
ax.set_title('Scatter Plot of Time to CBD vs. Median House Price (2020)')
ax.set_xlabel('Time to CBD (Public Transport) [Town Hall St]')
ax.set_ylabel('Median House Price (2020)')
plt.show()


The correlation between "Time to CBD via public transport" and "Median House Price (2020)" is approximately -0.259. This negative correlation suggests that as the time to reach the Central Business District (CBD) via public transport increases, the median house prices tend to decrease, albeit not very strongly.

### Histogram of House Prices: Visualize the distribution of median house prices for a better understanding of the data.

In [None]:
# Histogram (20 bins) of the 2020 median prices with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Median House Price (2020)'], bins=20, kde=True, ax=ax)
ax.set(
    title='Distribution of Median House Prices (2020)',
    xlabel='Median House Price (2020)',
    ylabel='Frequency',
)
plt.show()


### Box Plot of House Prices by Region: Explore how median house prices vary by region.

In [None]:
# One box per region showing the spread of 2020 median house prices
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Region', y='Median House Price (2020)', ax=ax)
ax.set_title('Median House Prices (2020) by Region')
# Vertical tick labels so the region names do not run into each other
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Region')
ax.set_ylabel('Median House Price (2020)')
plt.show()


### Scatter Plot Matrix: Create a matrix of scatter plots to visualize relationships between multiple numeric variables.

In [None]:
# Columns included in the pairwise scatter plot matrix.
# NOTE(review): the last four are assumed to be numeric scores — confirm
# against the output of the df.dtypes cell.
numeric_columns = [
    'Median House Price (2020)',
    'Time to CBD (Public Transport) [Town Hall St]',
    'Traffic',
    'Public Transport',
    'Nature',
    'Noise'
]

sns.pairplot(df[numeric_columns], height=2)
# y slightly above 1 lifts the title clear of the grid; at the default
# position a figure-level title can overlap the top row of panels.
plt.suptitle('Pairwise Scatter Plot Matrix', y=1.02)
plt.show()

In [None]:
# Feature Engineering
# Adds the column 'Proximity to Train Station', defined as 1 / (t + 1) where t
# is the public-transport time to the CBD. For non-negative t the value lies
# in (0, 1], and the +1 keeps the denominator away from zero when t == 0.
# NOTE(review): despite its name, this feature is derived from CBD travel time,
# not from any train-station distance — confirm the naming is intended.

time_to_cbd = df['Time to CBD (Public Transport) [Town Hall St]']
df['Proximity to Train Station'] = 1 / (time_to_cbd + 1)

In [None]:
# Statistical Tests

proximity_col = 'Proximity to Train Station'
price_col = 'Median House Price (2020)'

# Check for missing values in 'Proximity to Train Station' and 'Median House Price (2020)'
print("Missing values in 'Proximity to Train Station':", df[proximity_col].isnull().sum())
print("Missing values in 'Median House Price (2020)':", df[price_col].isnull().sum())

# Keep only rows where BOTH variables are present: a NaN in either column
# prevents pearsonr from producing a meaningful result. .copy() makes
# df_cleaned an independent frame rather than a slice of df.
df_cleaned = df.dropna(subset=[proximity_col, price_col]).copy()

# No epsilon offset is applied to the proximity column: nothing is divided by
# it here, and Pearson's r is unchanged by adding a constant to a variable.

# Test the correlation on the cleaned data
correlation, p_value = pearsonr(df_cleaned[proximity_col], df_cleaned[price_col])
print(f"Correlation: {correlation}")
print(f"P-Value: {p_value}")


Summary of the results:

Missing values in 'Proximity to Train Station': 0
Missing values in 'Median House Price (2020)': 3
Correlation: 0.0699 (approximately)
P-Value: 0.1539
The correlation between "Proximity to Train Station" and "Median House Price (2020)" is approximately 0.0699, with a p-value of 0.1539. Because the p-value is greater than 0.05, the correlation is not statistically significant at the conventional 5% significance level: the data are consistent with there being no linear relationship at all. At most, the result hints at a very weak positive relationship between the two variables, which may still be worth exploring alongside other features.

Hypothesis Testing:

Perform hypothesis tests to answer specific questions. For example, you can test hypotheses like:

1. Is there a statistically significant difference in median house prices between different regions?
2. Does the proximity to train stations have a significant impact on house prices?

In [None]:
# Hypothesis test for regional differences in house prices
price_col = 'Median House Price (2020)'

# ttest_ind propagates NaN by default, so a single missing price would turn a
# region's p-values into NaN. Rows with no region are dropped as well: groupby
# skips NaN keys, so looking such a region up with get_group would fail.
priced = df.dropna(subset=['Region', price_col])
region_groups = priced.groupby('Region')[price_col]
region_names = priced['Region'].unique()

# Independent two-sample t-test for every unordered pair of regions.
# NOTE: the p-values are not corrected for multiple comparisons, so with many
# pairs some will fall below 0.05 by chance alone.
for i, first_region in enumerate(region_names):
    for second_region in region_names[i + 1:]:
        region1 = region_groups.get_group(first_region)
        region2 = region_groups.get_group(second_region)
        t_stat, p_value = ttest_ind(region1, region2)
        print(f'Test between {first_region} and {second_region}: p-value = {p_value}')

Correlation Heatmap:

Create a heatmap to visualize the correlations between all numerical variables in the dataset.

In [None]:
# Correlation is only defined for numeric data. Selecting the numeric columns
# explicitly keeps any non-numeric column out of corr(), which on recent
# pandas versions raises instead of silently skipping such columns.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

# Annotated heatmap of the pairwise correlations, two decimals per cell
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

Residual Analysis:

Analyze the residuals of your regression model to check for patterns.

In [None]:
# Reshape the arrays to be 1-dimensional
y_pred = y_pred.flatten()
residuals = residuals.flatten()

# Create a scatter plot of residuals vs. predicted values
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.title('Residual Plot')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.show()


In [None]:
# Define the features and target variable
X = df[['Proximity to Train Station', 'Traffic', 'Public Transport', 'Nature', 'Noise']]
y = df['Median House Price (2020)']

# Check for and handle missing values in the target variable 'y'
if y.isnull().any():
    # Handle missing values in 'y' by imputing with the mean
    imputer = SimpleImputer(strategy='mean')
    y = imputer.fit_transform(y.values.reshape(-1, 1))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the imputer to handle missing values in features
imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on the training data
X_train_imputed = imputer.fit_transform(X_train)

# Transform the test data with the same imputer
X_test_imputed = imputer.transform(X_test)

# Initialize the linear regression model
model = LinearRegression()

# Fit the model to the training data with imputed features
model.fit(X_train_imputed, y_train)

# Make predictions on the test data with imputed features
y_pred = model.predict(X_test_imputed)

# Evaluate the model using MSE and MAE
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")


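In the model-fitting code above:

1. We check for missing values in the target variable 'y' using y.isnull().any(), and if there are missing values, we impute them with the mean using SimpleImputer. Note that imputing the target is a simplification: imputed prices carry no real information and can bias both the fit and the evaluation metrics, so dropping those rows is usually the safer choice.
2. After handling any missing values in the target variable, we split the data into training and test sets, impute missing feature values using means learned from the training set only, fit the linear regression model, and evaluate it on the test set using MSE and MAE.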

In [None]:
# Model Visualization
# Test-set predictions plotted against the actual prices.

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual House Prices")
ax.set_ylabel("Predicted House Prices")
ax.set_title("Actual vs. Predicted House Prices")
plt.show()

In [None]:
# Residual Distribution Plot
# Histogram (20 bins) of the model residuals with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(15, 10))
sns.histplot(residuals, bins=20, kde=True, ax=ax)
ax.set(
    title='Distribution of Residuals',
    xlabel='Residuals',
    ylabel='Frequency',
)
plt.show()

This plot shows the distribution of residuals. A normal distribution of residuals around zero is a good sign for a linear regression model.

In [None]:
#QQ Plot for Residuals
# Pass an explicit Axes to qqplot: without `ax` it creates a figure of its
# own, so a separately created figure would stay empty and its figsize would
# not apply to the QQ plot.
fig, ax = plt.subplots(figsize=(8, 6))
sm.qqplot(residuals, line='r', ax=ax)
ax.set_title('QQ Plot of Residuals')
plt.show()

A QQ plot helps you assess if the residuals follow a normal distribution. Points should closely follow the red line.

In [None]:
# Model Coefficients Visualization
# flatten() yields a 1-D array whether coef_ is stored as (n_features,) or
# (1, n_features), so it lines up with the feature names in X.columns.
coefficients = model.coef_.flatten()

# Bar chart with one bar per feature
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(X.columns, coefficients)
ax.set_title('Model Coefficients')
ax.set_xlabel('Features')
ax.set_ylabel('Coefficient Value')
# Tilt the feature names so the longer labels stay readable
ax.tick_params(axis='x', labelrotation=45)
plt.show()


This chart displays the coefficients of the features in the model. Positive coefficients indicate a positive relationship with house prices, while negative coefficients indicate a negative relationship.

In [None]:
# Persist the fitted model to disk
model_filename = 'linear_regression_model.pkl'
joblib.dump(model, model_filename)
print(f"Model saved as {model_filename}")

# Read the model back from the file that was just written.
# joblib files are pickle-based, so only load files from a trusted source.
loaded_model = joblib.load(model_filename)