In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load training data
train_df = pd.read_csv('../data/train.csv')

In [3]:
# Display basic info
print(f"Training data shape: {train_df.shape}")
print(f"\nFirst few rows:")
print(train_df.head())
print(f"\nColumn names:")
print(train_df.columns.tolist())

Training data shape: (1460, 81)

First few rows:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType 

In [4]:
train_df = train_df.iloc[:, 1:]

In [5]:
# Separate features and target
# Assuming 'SalePrice' is the target column name (adjust if different)
target_col = 'SalePrice'

In [6]:
X = train_df.drop(columns=[target_col])
y = train_df[target_col]

In [7]:
# Handle missing values
# For numeric columns: fill with median
numeric_cols = X.select_dtypes(include=[np.number]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

In [8]:
# For categorical columns: fill with mode or 'Missing'
categorical_cols = X.select_dtypes(include=['object']).columns
for col in categorical_cols:
    X[col] = X[col].fillna('Missing')

In [9]:
# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

In [10]:
print(f"\nFeatures shape after preprocessing: {X.shape}")
print(f"Target shape: {y.shape}")


Features shape after preprocessing: (1460, 79)
Target shape: (1460,)


In [11]:
# Split data - 75% train, 25% validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=0)
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")

Training set size: 1095
Validation set size: 365


In [12]:
# 1. Instantiate
dtr = DecisionTreeRegressor(random_state=0)

In [13]:
# 2. Fit
dtr.fit(X_train, y_train)

In [14]:
# 3. Predict
y_pred = dtr.predict(X_val)

In [15]:
# Calculate metrics
mae = metrics.mean_absolute_error(y_val, y_pred)
mse = metrics.mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(y_val, y_pred)

print("Decision Tree Performance:")
print(f"Mean Absolute Error: ${mae:,.2f}")
print(f"Root Mean Squared Error: ${rmse:,.2f}")
print(f"R² Score: {r2:.4f}")

Decision Tree Performance:
Mean Absolute Error: $25,708.20
Root Mean Squared Error: $43,882.20
R² Score: 0.7083


In [16]:
# Load test data
test_df = pd.read_csv('../data/test.csv')

print(f"\nTest data shape: {test_df.shape}")


Test data shape: (1459, 80)


In [17]:
# Remove ID column (first column)
test_df = test_df.iloc[:, 1:]

print(f"\nTest data shape: {test_df.shape}")


Test data shape: (1459, 79)


In [18]:
# Preprocess test data (same as training data)
X_test = test_df.copy()

In [19]:
# Handle missing values for numeric columns
X_test[numeric_cols] = X_test[numeric_cols].fillna(X_test[numeric_cols].median())

In [20]:
# Handle missing values and encode categorical columns
for col in categorical_cols:
    X_test[col] = X_test[col].fillna('Missing')
    # Handle unseen categories
    le = label_encoders[col]
    X_test[col] = X_test[col].apply(
        lambda x: le.transform([str(x)])[0] if str(x) in le.classes_ else -1
    )

In [21]:
# Make predictions on test set
test_predictions = dtr.predict(X_test)

print(f"\nTest Predictions Summary:")
print(f"Number of predictions: {len(test_predictions)}")
print(f"Mean predicted price: ${test_predictions.mean():,.2f}")
print(f"Min predicted price: ${test_predictions.min():,.2f}")
print(f"Max predicted price: ${test_predictions.max():,.2f}")


Test Predictions Summary:
Number of predictions: 1459
Mean predicted price: $179,841.93
Min predicted price: $34,900.00
Max predicted price: $755,000.00


In [22]:
# Create submission dataframe
submission = pd.DataFrame({
    'Id': range(len(test_predictions)),
    'SalePrice': test_predictions
})

print("\nFirst few predictions:")
print(submission.head(10))


First few predictions:
   Id  SalePrice
0   0   144000.0
1   1   181000.0
2   2   216500.0
3   3   216500.0
4   4   220000.0
5   5   187000.0
6   6   155000.0
7   7   177000.0
8   8   149000.0
9   9   120500.0


In [23]:
print("\n" + "="*60)
print("MODEL COMPARISON SUMMARY")
print("="*60)
print(f"{'Model':<30} {'MAE':<15} {'RMSE':<15} {'R²':<10}")
print("-"*60)
print(f"{'Basic Decision Tree':<30} ${mae:>13,.2f} ${rmse:>13,.2f} {r2:>9.4f}")
print("="*60)


MODEL COMPARISON SUMMARY
Model                          MAE             RMSE            R²        
------------------------------------------------------------
Basic Decision Tree            $    25,708.20 $    43,882.20    0.7083
