# Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Data understanding

## Reading the dataset (csv file) into Pandas dataframe

In [2]:
# NOTE(review): this is an absolute, user-specific path; the recorded run of this cell
# ended in FileNotFoundError, so point it at the CSV's real location before running.
housing_file_path = "/Users/killercookie/Documents/GitHub/School code/School-Code/Machine lab/DataSet_LakasArak_labeled.csv"
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
housing = pd.read_csv(housing_file_path)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/killercookie/Documents/GitHub/School code/School-Code/Machine lab/DataSet_LakasArak_labeled.csv'

## Exploring the dataframe

In [186]:
# Preview the first rows to sanity-check that the columns were parsed as expected
housing.head()

Unnamed: 0,county,city,postcode,property_type,property_subtype,property_condition_type,property_floor,building_floor_count,view_type,orientation,...,room_cnt,small_room_cnt,created_at,property_area,balcony_area,price_created_at,ad_view_cnt,active_days,nr,split
0,Budapest,Budapest XII.,,flat,brick flat (for sale),good,1,,street view,,...,2.0,1.0,2015-02-09,65.0,0.0,23.5,605.0,119.0,4,test
1,Budapest,Budapest I.,1016.0,flat,brick flat (for sale),novel,2,,street view,,...,1.0,1.0,2015-02-09,45.0,0.0,20.0,49.0,25.0,12,train
2,Budapest,Budapest XVI.,1164.0,flat,brick flat (for sale),novel,1,,garden view,,...,2.0,1.0,2015-02-09,60.0,0.0,22.0,77.0,77.0,14,train
3,Budapest,Budapest X.,,flat,brick flat (for sale),good,4,,garden view,,...,2.0,0.0,2015-02-09,55.0,4.0,11.0,139.0,18.0,21,train
4,Budapest,Budapest XVIII.,1181.0,flat,prefabricated panel flat (for sale),renewed,6,,,,...,2.0,1.0,2015-02-09,60.0,3.0,10.2,176.0,69.0,31,test


In [187]:
# (number of rows, number of columns) of the loaded DataFrame
housing.shape

(78539, 23)

In [188]:
# housing.shape is a (rows, columns) pair: rows are the instances, columns the features.
num_instances, num_features = housing.shape


In [189]:
# Per-column dtype and non-null count; a count below the number of rows means missing values
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78539 entries, 0 to 78538
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   county                   78539 non-null  object 
 1   city                     77980 non-null  object 
 2   postcode                 49585 non-null  float64
 3   property_type            78539 non-null  object 
 4   property_subtype         76880 non-null  object 
 5   property_condition_type  78539 non-null  object 
 6   property_floor           74746 non-null  object 
 7   building_floor_count     36429 non-null  object 
 8   view_type                42878 non-null  object 
 9   orientation              47647 non-null  object 
 10  garden_access            17200 non-null  object 
 11  heating_type             67233 non-null  object 
 12  elevator_type            64388 non-null  object 
 13  room_cnt                 78539 non-null  float64
 14  small_room_cnt        

In [190]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
housing.describe()

Unnamed: 0,postcode,room_cnt,small_room_cnt,property_area,balcony_area,price_created_at,ad_view_cnt,active_days,nr
count,49585.0,78539.0,78539.0,78539.0,78539.0,78539.0,78539.0,78539.0,78539.0
mean,1103.35898,1.467666,0.559875,48.440584,1.953182,19.341475,259.59932,44.173684,196334.09324
std,50.769326,0.59984,0.737015,12.716653,4.677227,8.900296,512.351553,47.821006,113305.083861
min,1011.0,0.0,0.0,5.0,0.0,0.2,0.0,1.0,4.0
25%,1064.0,1.0,0.0,40.0,0.0,13.2,42.0,11.0,97959.0
50%,1101.0,1.0,0.0,50.0,0.0,16.9,103.0,28.0,196095.0
75%,1142.0,2.0,1.0,60.0,3.0,23.9,263.0,61.0,294516.5
max,1239.0,7.0,4.0,70.0,97.0,99.6,28096.0,537.0,394181.0


## Features and ground truth labels

In [191]:
# The task is to predict prices from the other information we have, so the column that
# holds the prices is the ground-truth label.
# 'price_created_at' is that column; it is used as the label in training and testing later.
gt_feature = 'price_created_at'

# Data preparation

In [192]:
# As we learned earlier, prepare the data for the next steps (e.g. train and test).
# You might use all the available features or part of them. Please justify your choices.
# Attention!!! Do not drop any N/A value.
# ...


# Function to impute missing textual data while preserving distribution
def impute_categorical_with_distribution(df, columns, random_state=None):
    """Fill missing values in categorical columns by sampling observed categories.

    For every column that contains missing values, replacements are drawn at
    random from the column's non-missing values, with each category weighted by
    its relative frequency, so the filled column keeps roughly the same
    category distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    columns : iterable of str
        Names of the columns to process.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global ``numpy.random`` state is used, so ``np.random.seed`` still
        controls the result.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    A column with no observed value at all is left untouched, because there is
    no distribution to sample from.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for col in columns:
        # Compute the mask once; it is reused for counting and for assignment.
        missing = df[col].isnull()
        n_missing = int(missing.sum())
        if n_missing == 0:
            continue

        # Relative frequency of each observed category (missing values are excluded).
        value_counts = df[col].value_counts(normalize=True)
        if value_counts.empty:
            # Entirely-missing column: sampling from an empty population would raise.
            continue

        # Impute missing values by sampling from the observed distribution
        imputed_values = rng.choice(value_counts.index.to_numpy(),
                                    size=n_missing,
                                    p=value_counts.to_numpy())
        df.loc[missing, col] = imputed_values

    return df

# Seed NumPy's global generator so the random imputation below produces the same
# values on every Restart & Run All.
np.random.seed(42)

# List of categorical/textual columns (every object-dtype column; only those that
# actually contain missing values are changed by the imputation)
categorical_columns = housing.select_dtypes(include=['object']).columns
housing = impute_categorical_with_distribution(housing, categorical_columns)

# Function to impute missing values based on skewness
def impute_numerical_with_distribution(df, columns, random_state=None):
    """Fill missing values in numerical columns, choosing the strategy by skewness.

    For every column that contains missing values:

    * if the absolute skewness of the observed values is below 0.5, each gap is
      filled with the column mean plus Gaussian noise whose standard deviation
      equals the column's standard deviation;
    * otherwise (including when the skewness is undefined) each gap is filled
      with a value drawn uniformly at random, with replacement, from the
      observed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    columns : iterable of str
        Names of the columns to process.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global ``numpy.random`` state is used, so ``np.random.seed`` still
        controls the result.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    A column with no observed value at all is left untouched, because there is
    nothing to estimate a distribution from.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for col in columns:
        missing = df[col].isnull()
        n_missing = int(missing.sum())
        if n_missing == 0:
            continue

        observed_values = df.loc[~missing, col]
        if observed_values.empty:
            # Entirely-missing column: sampling from an empty population would raise.
            continue

        skewness = observed_values.skew()
        if abs(skewness) < 0.5:  # Low skewness: mean plus noise at the column's own spread
            noise = rng.normal(loc=0, scale=observed_values.std(), size=n_missing)
            df.loc[missing, col] = observed_values.mean() + noise
        else:  # High (or undefined) skewness: resample the observed values
            df.loc[missing, col] = rng.choice(observed_values.to_numpy(), size=n_missing)

    return df

# Seed NumPy's global generator so the random numerical imputation below is
# reproducible on every Restart & Run All.
np.random.seed(42)

# List of numerical columns (only those that contain missing values are changed)
numerical_columns = housing.select_dtypes(include=['float64', 'int64']).columns
housing = impute_numerical_with_distribution(housing, numerical_columns)

# File path where the new dataset will be saved
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
new_file_path = "/Users/killercookie/Documents/GitHub/School code/School-Code/Machine lab/new_dataset.csv"

# Save the modified (imputed) dataset to a CSV file
# index=False keeps the DataFrame's row index out of the written file.
housing.to_csv(new_file_path, index=False)

## Holding out a test set for performance evaluation

In [193]:
# 1- We need to decide how much of the data is used for testing.
#    In this experiment the data is labeled beforehand, we have 30% of the data for testing purposes.
# 2- How many instances do we have for training and testing?

# Each row carries its own assignment in the 'split' column, so every subset is a
# boolean filter on that column.
train_set = housing.loc[housing['split'].eq('train')]
test_set = housing.loc[housing['split'].eq('test')]

train_set.shape, test_set.shape

((54977, 23), (23562, 23))

In [194]:
# The following is just to assert that the data is complete and none of the instances was dropped
test_perc = 0.3
train_perc = 1 - test_perc

# Every row must land in exactly one of the two splits.
assert (len(train_set) + len(test_set)) == num_instances
# The train split must hold the expected share of rows (product truncated to a whole number).
assert (len(train_set)) == int(train_perc*num_instances)
# The test split is whatever remains after the train rows.
assert (len(test_set)) == (num_instances - len(train_set))

# Model selection

In [195]:
# After the data preparation/preprocessing step, the list of selected features (as strings) should be saved into a list in the form:
# features = [feature1, feature2, ...]
# Select only numerical features (int64 and float64 types); the dtype filter also
# leaves out text columns such as 'split'.

numerical_features = housing.select_dtypes(include=['int64', 'float64']).columns

# Exclude the ground-truth column so the label never ends up among the model inputs.
# gt_feature is used instead of repeating the column name, keeping a single source of truth.
# NOTE(review): 'nr' looks like a record identifier rather than a property attribute —
# confirm that it should be a model input.
features = [col for col in numerical_features if col != gt_feature]
print(features)

['postcode', 'room_cnt', 'small_room_cnt', 'property_area', 'balcony_area', 'ad_view_cnt', 'active_days', 'nr']


In [196]:
# We need to create features and ground truth sets for both train and test splits that we have. Use 'features' and 'gt_feature'.

# Model inputs (the selected feature columns) and labels (the ground-truth column)
# for each split.
X_train, y_train = train_set[features], train_set[gt_feature]
X_test, y_test = test_set[features], test_set[gt_feature]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(54977, 8) (54977,) (23562, 8) (23562,)


In [197]:
# For prediction, you should use the following function. As you can see, it is incomplete, please fill the gaps.
# The selected model has to learn the features in the data before giving an educated prediction. Why?
# We first fit the model using the train data, then use it to predict labels (prices) for the test instances. Why?

def model_predict(model, X_train, y_train, X_test):
    """Fit ``model`` on the training data, then predict for ``X_test``.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training inputs and labels passed to ``model.fit``.
    X_test
        Inputs passed to ``model.predict`` after fitting.

    Returns
    -------
    tuple
        ``(model, predictions)`` — the same, now fitted, model object and
        whatever ``model.predict(X_test)`` returned.
    """
    # Fitting must come first: prediction relies on what fit() learned.
    model.fit(X_train, y_train)
    return model, model.predict(X_test)

In [198]:
# Using the selected models, You can make the predictions using 'model_predict' function. Please save the returned values so we can check their performance.
# ...

# Initialize models
# A fixed random_state makes the two tree ensembles reproducible across runs;
# LinearRegression has no random component and therefore takes no seed.
model_1 = LinearRegression()
model_2 = RandomForestRegressor(random_state=42)
model_3 = GradientBoostingRegressor(random_state=42)

# Make predictions using the defined model_predict function
model_1, pred_1 = model_predict(model_1, X_train, y_train, X_test)
model_2, pred_2 = model_predict(model_2, X_train, y_train, X_test)
model_3, pred_3 = model_predict(model_3, X_train, y_train, X_test)

# Check predictions (optional): first five predicted prices of each model
print(pred_1[:5], pred_2[:5], pred_3[:5])

[22.31898931 17.67666373 11.54571735 13.71106863 22.02962761] [21.299 15.663  9.53  17.127 17.881] [21.22572185 15.73415942  9.74853171 17.14417257 18.80181525]


# Evaluation

In [199]:
# For evaluation we use Mean Absolute Error (MAE), Mean Absolute Percentage Error (MAPE), and Root Mean Square Error (RMSE).
# Please complete the following function:

def model_evaluate(pred, target):
    """Compute MAE, MAPE (in percent) and RMSE of predictions against targets.

    Both inputs are converted to flat float arrays before any arithmetic, so
    values are always compared position by position. Without this, a pandas
    ``pred`` whose index differs from ``target``'s would be index-aligned in
    the MAPE term and silently give NaN or a wrong value.

    Parameters
    ----------
    pred : array-like
        Predicted values.
    target : array-like
        Ground-truth values, same length as ``pred``.

    Returns
    -------
    tuple of float
        ``(mae, mape, rmse)``. MAPE divides by the target, so a target equal
        to zero yields ``inf`` or ``nan``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    pred_values = np.asarray(pred, dtype=float).ravel()
    target_values = np.asarray(target, dtype=float).ravel()
    if pred_values.shape != target_values.shape:
        raise ValueError(
            f"pred and target must have the same length, got "
            f"{pred_values.size} and {target_values.size}"
        )
    if target_values.size == 0:
        raise ValueError("pred and target must not be empty")

    errors = target_values - pred_values
    mae = float(np.mean(np.abs(errors)))
    mape = float(np.mean(np.abs(errors / target_values)) * 100)  # MAPE calculation
    rmse = float(np.sqrt(np.mean(errors ** 2)))

    return mae, mape, rmse

In [200]:
# Check and compare the performance of all the models. Do you find any interesting observation(s)? What are your conclusion(s)?

In [201]:
# Score each model's predictions against the held-out test labels
mae_1, mape_1, rmse_1 = model_evaluate(pred_1, y_test)  # Linear Regression
mae_2, mape_2, rmse_2 = model_evaluate(pred_2, y_test)  # Random Forest
mae_3, mape_3, rmse_3 = model_evaluate(pred_3, y_test)  # Gradient Boosting

# Report the three metric sets together, in model order, for easy comparison
print(f"Model 1 (Linear Regression): MAE={mae_1}, MAPE={mape_1}, RMSE={rmse_1}")
print(f"Model 2 (Random Forest): MAE={mae_2}, MAPE={mape_2}, RMSE={rmse_2}")
print(f"Model 3 (Gradient Boosting): MAE={mae_3}, MAPE={mape_3}, RMSE={rmse_3}")


Model 1 (Linear Regression): MAE=5.356422305362718, MAPE=29.388705395841235, RMSE=7.4067527226305065
Model 2 (Random Forest): MAE=3.889860241066124, MAPE=21.364579499769, RMSE=5.926298361187138
Model 3 (Gradient Boosting): MAE=4.391367562711816, MAPE=24.100257216727986, RMSE=6.333144479104173
