In [None]:
# Airbnb price prediction: load the listing data, clean it, explore it, and fit a random-forest regressor.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Silence every warning category so the notebook output stays uncluttered.
# NOTE(review): this blanket filter also hides deprecation / future warnings
# raised by the libraries used below, so API changes will go unnoticed.
import warnings
warnings.filterwarnings("ignore")

# Directory that load_data() reads the three CSV files from.
# NOTE(review): presumably the Kaggle input mount for this dataset - adjust
# when running outside that environment.
dataset_path = "/kaggle/input/tensorlabs-2025-internships"

### 1 Load the datasets ###
def load_data(base_path=None):
    """Read the three dataset CSV files and return them as DataFrames.

    Args:
        base_path: Directory that contains the CSV files. When omitted (or
            ``None``), the module-level ``dataset_path`` is used, which is
            the behaviour callers relied on before this parameter existed.

    Returns:
        A tuple ``(property_df, reviews_df, places_df)`` read from
        ``Detailed_Property.csv``, ``Property_Reviews.csv`` and
        ``property_by_place.csv`` respectively.

    Raises:
        FileNotFoundError: If any of the three files is missing. A short
            message is printed first, then the original exception is
            re-raised unchanged.
    """
    if base_path is None:
        # Resolved at call time so a later reassignment of dataset_path is honoured.
        base_path = dataset_path

    file_names = (
        "Detailed_Property.csv",
        "Property_Reviews.csv",
        "property_by_place.csv",
    )
    try:
        frames = [pd.read_csv(f"{base_path}/{name}") for name in file_names]
    except FileNotFoundError:
        print("Error: One or more dataset files not found.")
        # Bare raise re-raises the active exception with its traceback intact.
        raise
    return tuple(frames)

# 1 Load data
property_df, reviews_df, places_df = load_data()

### 2 Data Preprocessing ###
# Rename 'id' in places_df to 'property_id' so it shares the merge key
# used by the other two frames.
places_df.rename(columns={"id": "property_id"}, inplace=True)

# Left-merge so every row of property_df is kept even when it has no match.
# NOTE(review): if reviews_df holds several rows per property_id, the second
# merge repeats each property once per review - confirm that is intended.
df = property_df.merge(places_df, on="property_id", how="left")
df = df.merge(reviews_df, on="property_id", how="left")

# Drop the unnamed columns (presumably saved index columns); ignored if absent.
df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore', inplace=True)

# Convert price to numeric (remove currency symbols if present).
# Checking "not numeric" instead of "== object" also covers pandas string
# dtypes, which would otherwise skip the cleaning step.
if not pd.api.types.is_numeric_dtype(df["price"]):
    # Keep digits and dots only; anything that still fails to parse becomes NaN.
    df["price"] = df["price"].astype(str).str.replace(r"[^\d.]", "", regex=True)
    df["price"] = pd.to_numeric(df["price"], errors="coerce")

# Impute missing star ratings with the column mean.
# Assign the result back: calling fillna(..., inplace=True) on df[col] acts on
# a temporary Series and does not update df under pandas Copy-on-Write.
df["starRating"] = df["starRating"].fillna(df["starRating"].mean())

# Drop rows with critical missing values.
# 'price' is the prediction target, so rows without it are removed rather than
# filled with the median - imputing the target would fabricate training labels.
df.dropna(subset=['price', 'bedrooms', 'bathrooms', 'personCapacity'], inplace=True)

# Convert categorical variables into numerical (Label Encoding for property type).
# label_encoder is kept so the integer codes can be mapped back to names.
if "propertyType" in df.columns:
    label_encoder = LabelEncoder()
    df["propertyType"] = label_encoder.fit_transform(df["propertyType"])

### 3 Exploratory Data Analysis (EDA) ###

# Histogram of listing prices (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['price'], bins=50, kde=True, ax=ax)
ax.set(title="Price Distribution", xlabel="Price ($)", ylabel="Count")
plt.show()

# Box plot of price per encoded property type (descriptive only - this is
# the raw data distribution, not model output).
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='propertyType', y='price', ax=ax)
# Tilt the category labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set(
    title="Price by Property Type (EDA - Data Distribution)",
    xlabel="Encoded Property Types",
    ylabel="Price ($)",
)
plt.show()

### 4 Machine Learning Model (Predicting Price) ###
# Predictor columns fed to the regressor; 'price' is the target.
features = ['starRating', 'bedrooms', 'bathrooms', 'personCapacity', 'propertyType']

# Keep only rows where every predictor and the target are present.
required_columns = [*features, 'price']
df = df.dropna(subset=required_columns)

X, y = df[features], df['price']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random-forest hyper-parameters, gathered in one place.
rf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "min_samples_split": 5,
    "random_state": 42,
}
# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Predicted prices for the held-out rows.
y_pred = model.predict(X_test)

### 5 Model Evaluation ###
# Error metrics on the held-out set.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\n Model Performance:")
print(f" Mean Absolute Error (MAE): {mae:.2f}")
print(f" Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f" R² Score: {r2:.2f}")

# Horizontal bar chart of the fitted forest's per-feature importances.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=model.feature_importances_, y=features, ax=ax)
ax.set(
    title="Feature Importance in Price Prediction",
    xlabel="Importance",
    ylabel="Features",
)
plt.show()

# Actual vs. predicted prices; the dashed red diagonal is where a perfect
# prediction (predicted == actual) would fall.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, ax=ax)
diagonal = [min(y_test), max(y_test)]
ax.plot(diagonal, diagonal, color='red', linestyle='--')
ax.set(
    title="Actual vs. Predicted Prices",
    xlabel="Actual Price ($)",
    ylabel="Predicted Price ($)",
)
plt.show()
