In [16]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [17]:
# Step 2: Load and process Data
# Read the training CSV and discard its 'Id' column in a single chained expression.
df = pd.read_csv('../data/train.csv').drop(columns=['Id'])

In [18]:
from project.src.utils import classify_price

# Step 3: Create price categories
# Each SalePrice value is passed through classify_price to obtain its label.
df['PriceCategory'] = df['SalePrice'].apply(classify_price)

# Report how the rows spread across the labels: absolute counts first,
# then the same breakdown expressed as percentages.
price_category = df['PriceCategory']
print("Price category distribution:")
print(price_category.value_counts())
print(f"\nPercentages:")
print(price_category.value_counts(normalize=True) * 100)

Price category distribution:
PriceCategory
medium    494
high      483
low       483
Name: count, dtype: int64

Percentages:
PriceCategory
medium    33.835616
high      33.082192
low       33.082192
Name: proportion, dtype: float64


In [19]:
# Step 4: Select Features and Target
# Predictors are every column except the raw price and the label derived from it.
non_feature_columns = ['SalePrice', 'PriceCategory']
X = df.drop(columns=non_feature_columns)
y = df['PriceCategory']

# Partition the predictor column names by dtype: numeric vs. object
numeric_features = list(X.select_dtypes(include=[np.number]).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

print(f"Numeric features: {numeric_features}")
print(f"Categorical features: {len(categorical_features)}")

Numeric features: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
Categorical features: 43


In [20]:
# Step 5: Preprocess Features
from sklearn.preprocessing import LabelEncoder

# Numeric columns: every NaN is replaced by the median of its own column.
# NOTE(review): these medians (and the encoders fitted below) are computed over
# all rows, i.e. before the train/validation/test split — confirm that is intended.
numeric_block = X[numeric_features]
X_numeric = numeric_block.fillna(numeric_block.median())

# Categorical columns: NaN becomes its own 'Missing' level, then each column's
# levels are mapped to integer codes by a dedicated LabelEncoder.
categorical_block = X[categorical_features].fillna('Missing')

# Keep the fitted encoder of every column so the mapping stays available.
label_encoders = {}
encoded_columns = {}
for col in categorical_features:
    encoder = LabelEncoder()
    encoded_columns[col] = encoder.fit_transform(categorical_block[col])
    label_encoders[col] = encoder
X_categorical = pd.DataFrame(encoded_columns, index=X.index)

# Final feature matrix: numeric columns first, encoded categoricals after
X_processed = pd.concat([X_numeric, X_categorical], axis=1)

print(f"\nTotal features after preprocessing: {X_processed.shape[1]}")
print(f"Total samples: {X_processed.shape[0]}")


Total features after preprocessing: 79
Total samples: 1460


In [21]:
# Step 6: Split Data (60% Train, 20% Validation, 20% Test)

# First cut: hold out 40% of the rows; stratify keeps the class mix of y
X_train, X_temp, y_train, y_temp = train_test_split(
    X_processed,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)

# Second cut: halve the held-out 40% into validation and test (20% each overall)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Report each split's row count and its share of the full dataset
print(f"Training set size: {X_train.shape[0]} ({X_train.shape[0]/len(df)*100:.1f}%)")
print(f"Validation set size: {X_val.shape[0]} ({X_val.shape[0]/len(df)*100:.1f}%)")
print(f"Test set size: {X_test.shape[0]} ({X_test.shape[0]/len(df)*100:.1f}%)")

Training set size: 876 (60.0%)
Validation set size: 292 (20.0%)
Test set size: 292 (20.0%)


In [22]:
# Step 7: Train Naive Bayes Model
# Fit a Gaussian Naive Bayes classifier on the training split only.
nb_model = GaussianNB()
nb_model.fit(X=X_train, y=y_train)

In [23]:
# Step 8: Evaluate on Validation Set

# Score the validation split with the fitted model and derive all metrics up front
y_val_pred = nb_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
val_confusion = confusion_matrix(y_val, y_val_pred)

# Emit the results: overall accuracy, per-class report, then the confusion matrix
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("\nClassification Report (Validation):")
print(val_report)
print("\nConfusion Matrix (Validation):")
print(val_confusion)

Validation Accuracy: 0.7637

Classification Report (Validation):
              precision    recall  f1-score   support

        high       0.81      0.81      0.81        96
         low       0.73      1.00      0.84        97
      medium       0.76      0.48      0.59        99

    accuracy                           0.76       292
   macro avg       0.77      0.77      0.75       292
weighted avg       0.77      0.76      0.75       292


Confusion Matrix (Validation):
[[78  3 15]
 [ 0 97  0]
 [18 33 48]]


In [24]:
# Step 9: Evaluate on Test Set

# Score the test split with the fitted model and derive all metrics up front
y_test_pred = nb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)

# Emit the results: overall accuracy, per-class report, then the confusion matrix
print(f"Test Accuracy: {test_accuracy:.4f}")
print("\nClassification Report (Test):")
print(test_report)
print("\nConfusion Matrix (Test):")
print(test_confusion)

Test Accuracy: 0.7260

Classification Report (Test):
              precision    recall  f1-score   support

        high       0.83      0.81      0.82        97
         low       0.69      0.88      0.77        96
      medium       0.65      0.49      0.56        99

    accuracy                           0.73       292
   macro avg       0.72      0.73      0.72       292
weighted avg       0.72      0.73      0.72       292


Confusion Matrix (Test):
[[79  4 14]
 [ 0 84 12]
 [16 34 49]]


In [25]:
# Step 10: Summary
# Gather the report line by line, then write it out with a single print call.
rule = "=" * 50
summary_lines = [
    rule,
    "SUMMARY",
    rule,
    f"Model: Gaussian Naive Bayes",
    f"Numeric features: {len(numeric_features)}",
    f"Categorical features (encoded): {len(categorical_features)}",
    f"Total features: {X_processed.shape[1]}",
    f"Training samples: {X_train.shape[0]}",
    f"Validation samples: {X_val.shape[0]}",
    f"Test samples: {X_test.shape[0]}",
    f"\nValidation Accuracy: {val_accuracy:.4f}",
    f"Test Accuracy: {test_accuracy:.4f}",
    rule,
]
print("\n".join(summary_lines))

SUMMARY
Model: Gaussian Naive Bayes
Numeric features: 36
Categorical features (encoded): 43
Total features: 79
Training samples: 876
Validation samples: 292
Test samples: 292

Validation Accuracy: 0.7637
Test Accuracy: 0.7260
