<a href="https://colab.research.google.com/github/konerjonlar/Akbank-Makine-Ogrenmesi-Bootcamp/blob/main/Housing_Price_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Required Libraries

In [1]:
# for data analysis
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# for z-score
from scipy import stats
# for modelling and evaluating the performance of the models
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import Lasso, LinearRegression, Ridge, ElasticNet
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Project Definition

For this project, we need to load the Melbourne Housing dataset into our project. The
quality and amount of data we collect will determine how good our predictive model
can be. For this reason, we need to examine the dataset very carefully. We will
estimate the price of a house using the Melbourne Housing dataset, which is a
real-life example. Before evaluating any cost, we will start by analyzing the data
using preprocessing techniques. We will then build our models and measure their
performance to complete the project.

# Gathering and Observing Data

## Load the dataset to the project with the help of read_csv() and observe the first 5 columns

In [2]:
# Read the Melbourne housing CSV (relative path, so it is resolved against the
# current working directory) into a DataFrame
df = pd.read_csv('Melbourne_housing_FULL.csv')

# Preview: restrict to the first five columns, then show the top rows of that slice
first_five_columns = df.iloc[:, :5]
first_five_columns.head()


FileNotFoundError: ignored

## Find the shape, number of columns and size of the dataset

In [None]:
# Gather the dataset's dimensions first, then report them in a single print call:
# the (rows, columns) shape, the number of columns, and the total number of cells
dataset_shape = df.shape
column_count = len(df.columns)
cell_count = df.size
print("\nShape of the Dataset:\n", dataset_shape,
      "\nNumber of Columns:\n", column_count,
      "\nSize of the Dataset:\n", cell_count)

## Show the information of the dataset, which contains the number of columns, column labels, column data types, memory usage, range index, and the number of cells in each column (non-null values).

In [None]:
# Build (label, value) pairs describing the dataset, then print every label and
# value on its own line
overview = [
    ("\nColumn Labels:", df.columns.tolist()),
    ("\nColumn Data Types:", df.dtypes),
    ("\nMemory Usage:", df.memory_usage()),
    ("\nRange Index:", df.index),
    ("\nNumber of cells in each column (non-null values):", df.count()),
]

# Flatten the pairs back into label, value, label, value, ... order for print
overview_parts = []
for label, value in overview:
    overview_parts.extend((label, value))

print(*overview_parts, sep="\n")

# Exploratory Data Analysis

## Examine the descriptive statistics of dataset

In [None]:
# Descriptive statistics of the dataset. Called with default arguments, pandas
# limits the summary (count, mean, std, min, quartiles, max) to the numeric
# columns of a mixed-dtype frame. The expression is not assigned to a name.
df.describe()

## The values of some variables are given as objects. At the same time, we observe that there are also categorical values. This might give us trouble when examining the dataset. Therefore, in such cases, we need to define the variables categorically.

In [None]:
# Collect the names of the columns whose dtype is 'object'; these are the columns
# treated as categorical later on. Only the names are gathered here -- the
# columns themselves are left unchanged.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']

## Check for duplicate data. If there are duplicate data, clear them from the dataset.

In [None]:
# Boolean mask: True for every row that repeats an earlier row
duplicates = df.duplicated()

# Report how many repeated rows were found
print("Number of duplicate rows:", duplicates.sum())

# Keep only the rows that are not flagged, and continue working with that frame
df_no_duplicates = df.loc[~duplicates]
df = df_no_duplicates

## Clear outlier data in the dataset. When you examine the dataset, you will observe that the outlier data is generally in the "Landsize" and "BuildingArea" variables.

In [None]:
# Remove rows whose 'Landsize' or 'BuildingArea' lies more than
# `z_score_threshold` standard deviations away from the column mean.
#
# nan_policy='omit' leaves missing entries out of the mean/std computation.
# With scipy's default ('propagate'), a single NaN in a column turns every
# z-score of that column into NaN, every comparison below becomes False, and
# no outlier is ever removed.
z_scores_landsize = np.abs(
    np.asarray(stats.zscore(df['Landsize'], nan_policy='omit'), dtype=float)
)
z_scores_building_area = np.abs(
    np.asarray(stats.zscore(df['BuildingArea'], nan_policy='omit'), dtype=float)
)

# Define a z-score threshold for identifying outliers
z_score_threshold = 3

# Boolean masks of outlier rows. A missing value still has a NaN z-score, and
# NaN > threshold is False, so rows with missing values are kept at this step.
outliers_landsize = z_scores_landsize > z_score_threshold
outliers_building_area = z_scores_building_area > z_score_threshold

# Drop every row flagged as an outlier in either column
df = df[~(outliers_landsize | outliers_building_area)]

## Find and remove the missing values in the dataset

In [None]:
# Find: report how many values are missing in each column
print("Missing values per column:", df.isna().sum(), sep="\n")

# Remove: drop every row that has a missing value in any column, then renumber
# the index.
#
# This replaces a dropna() limited to 'Bathroom' and 'Car' followed by two
# fillna() calls on those same columns. Those fills could never run on a missing
# value (the rows had just been dropped), and they used mode().idxmax(), which
# returns an index label of the mode result rather than the most frequent value.
# NOTE(review): dropping on all columns also clears missing values from the
# remaining features and from 'Price', which are later passed to scikit-learn
# estimators that reject NaN input. It removes more rows than before -- confirm
# this is acceptable, or impute selected columns instead.
df = df.dropna().reset_index(drop=True)

## Data Visualization

### Build a Histogram to visualize price distribution

In [None]:
# Histogram of the 'Price' column, drawn through an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['Price'], bins=30, color='skyblue', edgecolor='black')

# Title and axis labels
ax.set_title('Price Distribution')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')

# Render the figure
plt.show()

### Draw a pair plot to see the relationship between all numerical variables and the price variable

In [None]:
# Numerical columns to compare against each other (the target 'Price' is last)
numerical_columns = [
    'Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',
    'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Price',
]

# Restrict the frame to those columns and draw one scatter/histogram panel per pair
pair_plot_data = df[numerical_columns]
sns.pairplot(pair_plot_data)
plt.show()

### Draw a correlation matrix by using a heatmap on seaborn

In [None]:
# Pairwise correlations between the numeric columns only
df_numeric = df.select_dtypes(include='number')
correlation_matrix = df_numeric.corr()

# Draw the heatmap straight from the DataFrame so that seaborn labels the rows
# and columns with the feature names. Converting the matrix to a bare NumPy
# array first discards those labels and leaves only positional tick indices,
# which makes the plot unreadable.
plt.figure(figsize=(10, 8))  # Set the figure size
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix Heatmap')  # Set the title
plt.show()

### Implement Label Encoder and One Hot encoder for categorical variables

In [None]:
# A single encoder instance is reused; fit_transform is called afresh per column
label_encoder = LabelEncoder()

# Split the object-typed columns by cardinality: at most five distinct values
# versus more than five
low_cardinality = [name for name in categorical_columns if df[name].nunique() <= 5]
high_cardinality = [name for name in categorical_columns if df[name].nunique() > 5]

# Low-cardinality columns are replaced in place by integer codes
for column in low_cardinality:
    df[column] = label_encoder.fit_transform(df[column])

# High-cardinality columns are expanded into indicator columns; the first level
# of each is dropped
df = pd.get_dummies(df, columns=high_cardinality, drop_first=True)

# Model Selection

## Since we are going to make a price estimation, we need to determine our x and y variables correctly.

## Splitting our data into training and test sets so that the trained models can be evaluated on data they have not seen

In [None]:
# Target: the sale price. Features: every remaining column.
y = df["Price"]
X = df.drop(columns="Price")

# Hold out 25% of the rows for testing; the seed fixes which rows those are
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Candidate regressors, keyed by the name used when reporting their scores.
# The randomized ensembles are seeded so that repeated runs give the same
# scores and the comparison between models is reproducible. The train/test
# split and AdaBoost were already seeded; RandomForest and GradientBoosting
# were not.
models = {
    'Lasso': Lasso(),
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'AdaBoostRegressor': AdaBoostRegressor(
        n_estimators=10, learning_rate=1, loss='square', random_state=2
    ),
}

In [None]:
# Fit every candidate on the training split and score it on the held-out test
# split. Each metric is printed with its name, and the scores are kept in
# `results` (model name -> metric name -> value) so the models can be compared
# after the loop instead of being lost on the next iteration.
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    predict = model.predict(X_test)

    mae = mean_absolute_error(y_test, predict)
    mse = mean_squared_error(y_test, predict)
    rmse = np.sqrt(mse)  # same units as the target, unlike the MSE
    r2 = r2_score(y_test, predict)

    results[model_name] = {"MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2}

    print(f"##########{model_name}##########")
    print(f"MAE : {mae}")
    print(f"MSE : {mse}")
    print(f"RMSE: {rmse}")
    print(f"R2  : {r2}")

# Model Evaluation

## Comparing the models with each other

## Choose the best performing model by using evaluation metrics(MAE, MSE, RMSE, R2)