<a href="https://colab.research.google.com/github/incinurcetin/MelbHousing/blob/main/Akbank_Bootcamp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import Required Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor




In [3]:
# STEP 3 Project Definition

## Load the Melbourne Housing dataset
data=pd.read_csv('Melbourne_housing_FULL.csv')

In [4]:
# STEP 4 Gathering and Observing Data

## Load the dataset using pd.read_csv() and observe the first 5 columns
first_5_columns = data.head()
print("First 5 Columns:")
print(first_5_columns)

## Find the shape, number of columns, and size of the dataset
shape = data.shape
num_columns = shape[1]
size = shape[0] * shape[1]
print("\nShape of the Dataset:")
print(f"Number of Rows: {shape[0]}")
print(f"Number of Columns: {num_columns}")
print(f"Size of the Dataset: {size}")

## Show the information of the dataset
data_info = data.info()
print("\nDataset Information:")
print(data_info)


First 5 Columns:
       Suburb             Address  Rooms Type      Price Method SellerG  \
0  Abbotsford       68 Studley St      2    h        NaN     SS  Jellis   
1  Abbotsford        85 Turner St      2    h  1480000.0      S  Biggin   
2  Abbotsford     25 Bloomburg St      2    h  1035000.0      S  Biggin   
3  Abbotsford  18/659 Victoria St      3    u        NaN     VB  Rounds   
4  Abbotsford        5 Charles St      3    h  1465000.0     SP  Biggin   

        Date  Distance  Postcode  ...  Bathroom  Car  Landsize  BuildingArea  \
0  3/09/2016       2.5    3067.0  ...       1.0  1.0     126.0           NaN   
1  3/12/2016       2.5    3067.0  ...       1.0  1.0     202.0           NaN   
2  4/02/2016       2.5    3067.0  ...       1.0  0.0     156.0          79.0   
3  4/02/2016       2.5    3067.0  ...       2.0  1.0       0.0           NaN   
4  4/03/2017       2.5    3067.0  ...       2.0  0.0     134.0         150.0   

   YearBuilt         CouncilArea Lattitude  Longtit

In [5]:
# STEP 5 Exploratory Data Analysis

## Examine Descriptive Statistics
numerical_stats = data.describe()
print("Descriptive Statistics for Numerical Columns:")
print(numerical_stats)

## Convert Categorical Variables to Categorical Data Types
## Identify categorical columns and convert them
categorical_columns = ['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method','SellerG','Date','Distance','Postcode', 'Bathroom','Car','Landsize','BuildingArea','YearBuilt','CouncilArea','Lattitude','Longtitude','Regionname','Propertycount']
data[categorical_columns] = data[categorical_columns].astype('category')

## Check for Duplicate Data
# Identify and count duplicate rows
duplicate_rows = data[data.duplicated()]
num_duplicates = len(duplicate_rows)
# Remove duplicate rows
data_no_duplicates = data.drop_duplicates()
# Print number of duplicate rows and the resulting dataset without duplicates
print(f"\nNumber of Duplicate Rows: {num_duplicates}")
print(f"Dataset Shape after Removing Duplicates: {data_no_duplicates.shape}")

##Z-score
# Define the columns with outliers
columns_with_outliers = ['Landsize', 'BuildingArea']
# Create a StandardScaler object
scaler = StandardScaler()
# Calculate Z-scores for the selected columns
data[columns_with_outliers] = scaler.fit_transform(data[columns_with_outliers])
# Print the dataset with Z-scores
print("Dataset with Z-scores:")
print(data[columns_with_outliers])

## Define the columns with missing values
columns_with_missing_values = ['Bathroom', 'Car']
# Fill missing values with the mode
for column in columns_with_missing_values:
    data[column] = data[column].fillna(data[column].mode().iloc[0])
# Display the dataset with missing values filled
print("Dataset with Missing Values Filled:")
print(data)


Descriptive Statistics for Numerical Columns:
              Rooms         Price      Distance      Postcode      Bedroom2  \
count  34857.000000  2.724700e+04  34856.000000  34856.000000  26640.000000   
mean       3.031012  1.050173e+06     11.184929   3116.062859      3.084647   
std        0.969933  6.414671e+05      6.788892    109.023903      0.980690   
min        1.000000  8.500000e+04      0.000000   3000.000000      0.000000   
25%        2.000000  6.350000e+05      6.400000   3051.000000      2.000000   
50%        3.000000  8.700000e+05     10.300000   3103.000000      3.000000   
75%        4.000000  1.295000e+06     14.000000   3156.000000      4.000000   
max       16.000000  1.120000e+07     48.100000   3978.000000     30.000000   

           Bathroom           Car       Landsize  BuildingArea     YearBuilt  \
count  26631.000000  26129.000000   23047.000000   13742.00000  15551.000000   
mean       1.624798      1.728845     593.598993     160.25640   1965.289885   
st

In [None]:
### DATA VISUALIZATION
plt.title('Price Distribution Histogram')
plt.xlabel('Price')
sns.histplot(data['Price'],kde=True)
plt.show()

numerical_variables=['Rooms','Price','Distance','Bathroom','Car','Landsize','BuildingArea','Lattitude','Longtitude','Propertycount']

sns.set(style="ticks")
sns.pairplot(data[numerical_variables], height=2, plot_kws={'alpha': 0.5})
plt.suptitle('Pair Plot of Numerical Variables and Price', y=1.02)
plt.show()

correlation_matrix = data[numerical_variables].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()



In [6]:

categorical_variables= ['Suburb','Address','Type','Method','SellerG','Date','Postcode','YearBuilt','CouncilArea','Regionname']


label_encoder = LabelEncoder()
for column in categorical_variables :
  data[column] = label_encoder.fit_transform(data[column])
one_hot_encoder = OneHotEncoder(sparse=False)
encoded_data = one_hot_encoder.fit_transform(data[categorical_variables].values.reshape(-1, 1))
print(data)



       Suburb  Address Rooms  Type      Price  Method  SellerG  Date Distance  \
0           0    29458     2     0        NaN       6      155    58      2.5   
1           0    32512     2     0  1480000.0       2       33    60      2.5   
2           0    15389     2     0  1035000.0       2       33    63      2.5   
3           0     9768     3     2        NaN       7      296    63      2.5   
4           0    25128     3     0  1465000.0       5       33    64      2.5   
...       ...      ...   ...   ...        ...     ...      ...   ...      ...   
34852     348     5862     4     0  1480000.0       0      152    38      6.3   
34853     348    16982     2     0   888000.0       5      321    38      6.3   
34854     348     7350     2     1   705000.0       2      152    38      6.3   
34855     348     5421     3     0  1140000.0       5      370    38      6.3   
34856     348    17743     2     0  1020000.0       0      276    38      6.3   

       Postcode  ...  Bathr

In [9]:
# STEP 6 Model Selection

X = encoded_data.drop('Price', axis=1)
y = encoded_data['Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dictionary of regression models
models = {
    'Lasso': {
    'model': Lasso()
    },
    'LinearRegression': {
        'model': LinearRegression()
    },
    'Ridge': {
        'model': Ridge()
    },
    'ElasticNet': {
        'model': ElasticNet()
    },
    'KNeighborsRegressor': {
        'model': KNeighborsRegressor()
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor()
    },
    'GradientBoostingRegressor': {
        'model': GradientBoostingRegressor()
    },
    'AdaBoostRegressor': {
        'model': AdaBoostRegressor(n_estimators=5, learning_rate=1.2, loss='exponential', random_state=2)
    }
}

# Training
for model_name, model_info in models.items():
    model = model_info['model']
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(f'{model_name} - R-squared: {score:.4f}')

AttributeError: ignored