In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance


In [3]:

# Load the dataset
def load_and_preprocess_data(file_path):
    """Load and preprocess the Malmö housing dataset."""
    df = pd.read_csv(file_path, na_values='NA')
    
    # Handle outliers in final_price
    df = df[df['final_price'] > 100000]  # Remove suspiciously low prices
    df = df[df['final_price'] < 15000000]  # Cap at reasonable upper limit
    
    # Extract neighborhood from location
    df['neighborhood'] = df['location'].apply(lambda x: x.split(',')[0].strip() if isinstance(x, str) else None)
    
    # Clean up year_of_construction
    df['year_of_construction'] = df['year_of_construction'].apply(
        lambda x: x if (pd.notnull(x) and 1800 <= x <= 2024) else np.nan
    )
    
    return df

def prepare_features(df):
    """Prepare features for modeling."""
    # Identify categorical and numerical columns
    categorical_features = ['ownership_form', 'neighborhood']
    numerical_features = [
        'number_of_rooms', 'living_area', 'year_of_construction', 
        'fee', 'floor_number', 'top_floor_number', 'elevator_presence',
        'latitude', 'longitude', 'sale_year'
    ]
    
    # Create preprocessor
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )
    
    return preprocessor, numerical_features, categorical_features

def build_model(df, preprocessor):
    """Build and evaluate the Random Forest model."""
    X = df.drop(['final_price', 'location', 'operational_cost', 'leasehold_fee', 
                 'housing_association', 'sale_month', 'sale_day', 'balcony'], axis=1)
    y = df['final_price']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create and train pipeline
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
    ])
    
    # Fit the model
    model.fit(X_train, y_train)
    
    # Evaluate the model
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    print(f"Mean Absolute Error: {mae:.2f} SEK")
    print(f"Root Mean Squared Error: {rmse:.2f} SEK")
    print(f"R² Score: {r2:.4f}")
    
    # Cross-validation score
    cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f"Cross-validation R² scores: {cv_scores}")
    print(f"Mean CV R² score: {cv_scores.mean():.4f}")
    
    return model, X_train, X_test, y_train, y_test

def get_feature_importance(model, X_train, numerical_features, categorical_features):
    """Extract feature importance from the model."""
    # Get feature names after preprocessing
    feature_names = []
    
    # Add numerical feature names
    feature_names.extend(numerical_features)
    
    # Get categorical feature names from the pipeline
    preprocessor = model.named_steps['preprocessor']
    ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
    categorical_feature_names = ohe.get_feature_names_out(categorical_features).tolist()
    feature_names.extend(categorical_feature_names)
    
    # Extract feature importances
    importances = model.named_steps['regressor'].feature_importances_
    
    # Sort importance values
    indices = np.argsort(importances)[::-1]
    
    # Due to the mismatch in dimensions, we'll just get the top 20 importance values
    top_indices = indices[:min(20, len(importances))]
    top_features = [feature_names[i] if i < len(feature_names) else f"Feature_{i}" for i in top_indices]
    top_importances = [importances[i] for i in top_indices]
    
    importance_df = pd.DataFrame({
        'Feature': top_features,
        'Importance': top_importances
    })
    
    return importance_df

def plot_feature_importance(importance_df):
    """Plot feature importance."""
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title('Feature Importance for Housing Price Prediction')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

def analyze_top_neighborhoods(df):
    """Analyze top and bottom neighborhoods by median price."""
    neighborhood_stats = df.groupby('neighborhood')['final_price'].agg(['count', 'median'])
    neighborhood_stats = neighborhood_stats[neighborhood_stats['count'] >= 100]  # Only consider neighborhoods with enough data
    
    top_neighborhoods = neighborhood_stats.sort_values('median', ascending=False).head(10)
    bottom_neighborhoods = neighborhood_stats.sort_values('median').head(10)
    
    return top_neighborhoods, bottom_neighborhoods

def price_prediction_function(model, input_data):
    """Function for predicting prices based on input features."""
    # Convert the input dictionary to a DataFrame
    input_df = pd.DataFrame([input_data])
    
    # Make the prediction
    predicted_price = model.predict(input_df)[0]
    
    return predicted_price

In [5]:

df = load_and_preprocess_data('temp/hemnet_properties.csv')

In [6]:
df

Unnamed: 0,final_price,location,ownership_form,number_of_rooms,living_area,balcony,year_of_construction,fee,operational_cost,leasehold_fee,housing_association,sale_year,sale_month,sale_day,floor_number,top_floor_number,elevator_presence,latitude,longitude,neighborhood
0,1895000,"Gamla Väster, Malmö kommun",Bostadsrätt,1.0,30.0,0.0,1929.0,1715.0,,,,2024,2,26,2.0,3.0,0.0,55.605517,12.993356,Gamla Väster
1,1600000,"Dockan, Malmö kommun",Bostadsrätt,1.0,33.0,1.0,2023.0,2944.0,2280.0,,,2024,2,20,4.0,7.0,1.0,55.613085,12.988022,Dockan
2,925000,"Ellstorp, Malmö kommun",Bostadsrätt,1.0,34.0,1.0,1938.0,3035.0,,,,2024,2,18,3.0,6.0,1.0,55.603517,13.029341,Ellstorp
3,1675000,"Västra Hamnen, Malmö kommun",Bostadsrätt,1.0,34.0,1.0,2017.0,3374.0,2280.0,,,2024,2,11,4.0,5.0,1.0,55.611881,12.979457,Västra Hamnen
4,600000,"Lönngården, Malmö kommun",Bostadsrätt,1.0,31.0,1.0,1953.0,3126.0,,,,2024,2,11,1.0,4.0,0.0,55.583507,13.023618,Lönngården
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44176,2700000,"Slottsstaden, Malmö kommun",Bostadsrätt,3.0,75.0,1.0,1951.0,4770.0,,,,2024,5,24,2.0,7.0,1.0,55.593241,12.968267,Slottsstaden
44177,3600000,"S:t Knut, Malmö kommun",Bostadsrätt,3.0,69.0,,1932.0,5565.0,11200.0,,,2024,5,22,,,,55.597030,13.015556,S:t Knut
44178,3100000,"Hyllie, Malmö kommun",Bostadsrätt,3.0,68.0,0.0,2019.0,4055.0,,,,2024,5,22,1.0,6.0,1.0,55.559682,12.982279,Hyllie
44179,4575000,"Dockan, Malmö kommun",Bostadsrätt,4.0,102.0,1.0,2021.0,6328.0,8577.0,,,2024,5,17,,,,55.613945,12.987258,Dockan


In [7]:
# Preprocess the data
preprocessor, numerical_features, categorical_features = prepare_features(df)

In [8]:
# use build_model
model, X_train, X_test, y_train, y_test = build_model(df, preprocessor)

Mean Absolute Error: 205711.01 SEK
Root Mean Squared Error: 329078.24 SEK
R² Score: 0.9224
Cross-validation R² scores: [0.82645903 0.86488575 0.8089238  0.86088901 0.52032324]
Mean CV R² score: 0.7763


In [9]:
# get feature importance
importance_df = get_feature_importance(model, X_train, numerical_features, categorical_features)

In [10]:
# plot feature importance
plot_feature_importance(importance_df)

In [11]:
importance_df

Unnamed: 0,Feature,Importance
0,living_area,0.432905
1,latitude,0.172589
2,longitude,0.153163
3,sale_year,0.086582
4,year_of_construction,0.081257
5,fee,0.024783
6,number_of_rooms,0.014123
7,floor_number,0.008363
8,top_floor_number,0.00482
9,neighborhood_Davidshall,0.001656


In [12]:
# analyze top neighborhoods
top_neighborhoods, bottom_neighborhoods = analyze_top_neighborhoods(df)

In [13]:
top_neighborhoods


Unnamed: 0_level_0,count,median
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Gamla Väster,497,3650000.0
Davidshall,464,3500000.0
Dockan,963,3425000.0
Limhamns Sjöstad,402,3000000.0
City,830,2997500.0
Västra Hamnen,1770,2800000.0
Ön,120,2772500.0
Drottningtorget,535,2675000.0
Lugnet,251,2595000.0
Klagshamn,144,2572500.0
