# Airbnb Prices — Outlier Analysis

Full step-by-step outlier analysis notebook.

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
# Notebook-wide default figure size, given as (width, height) in inches.
plt.rcParams.update({'figure.figsize': (10, 5)})


## Load data

In [None]:

# CSV file to analyse. The path is relative, so it is resolved against the
# kernel's working directory; change it here if the file lives elsewhere.
DATA_PATH = 'airbnb_europe_prices.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Show the first five rows as the cell output.
df.head(n=5)


## Summary statistics

In [None]:

# Summary statistics, transposed so each described column becomes one row.
df.describe().transpose()


## Visual inspection

In [None]:

# Plot only the non-missing prices. Series.hist already skips NaN, but
# matplotlib's boxplot does not: a NaN in its input turns the quartiles into
# NaN and the box is not drawn. Using one cleaned series keeps both figures
# based on the same values.
prices = df['price_total'].dropna()

# Histogram of the raw price distribution.
fig, ax = plt.subplots()
prices.hist(bins=80, ax=ax)
ax.set_title('price_total distribution')
ax.set_xlabel('price_total')
ax.set_ylabel('count')
plt.show()

# Horizontal boxplot of the same values; points beyond the whiskers are the
# candidates examined in the outlier sections below.
fig, ax = plt.subplots()
ax.boxplot(prices, vert=False)
ax.set_title('price_total boxplot')
ax.set_xlabel('price_total')
plt.show()


## IQR / Tukey outliers

In [None]:

# Tukey's rule: a value is an outlier when it lies more than TUKEY_K * IQR
# below the first quartile or above the third quartile.
TUKEY_K = 1.5

q1 = df['price_total'].quantile(0.25)
q3 = df['price_total'].quantile(0.75)
iqr = q3 - q1
lower = q1 - TUKEY_K * iqr
upper = q3 + TUKEY_K * iqr

# Flag both tails, not just the expensive one. Comparisons with NaN are False,
# so rows with a missing price are never reported as outliers.
is_outlier = (df['price_total'] < lower) | (df['price_total'] > upper)
outliers_iqr = df[is_outlier]
outliers_iqr.head()


## Log transform

In [None]:

# Store log(1 + price_total) as a new column; log1p is defined at zero.
df['log_price'] = np.log1p(df['price_total'])

# Histogram of the transformed prices on an explicitly created Axes.
fig, ax = plt.subplots()
df['log_price'].hist(bins=80, ax=ax)
ax.set_title('log(price_total + 1)')
plt.show()


## Simple model comparison

In [None]:

# Predictors shared by both models; missing feature values are replaced by 0.
features = ['max_guests','num_bedrooms','distance_city_center']
X = df[features].fillna(0)
y = df['price_total']

# Split once and reuse the same hold-out rows for both models. Splitting a
# second time under the same names would leave y_train / y_test holding
# log-scale values and make the raw-scale split unrecoverable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Model 1: regress the price directly.
model_raw = LinearRegression().fit(X_train, y_train)
pred_raw = model_raw.predict(X_test)
print('MAE raw:', mean_absolute_error(y_test, pred_raw))

# Model 2: regress log(1 + price), then map predictions back with expm1 so the
# error is measured in price units against the untouched y_test.
model_log = LinearRegression().fit(X_train, np.log1p(y_train))
pred_log = np.expm1(model_log.predict(X_test))
print('MAE log:', mean_absolute_error(y_test, pred_log))
