<a href="https://colab.research.google.com/github/imwaseem93/deep-learning-ai/blob/main/Ames_Housing_Prices_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load Ames Housing Dataset from local CSV
# Replace 'path/to/AmesHousing.csv' with the actual file path (e.g., 'C:/Users/YourName/Downloads/AmesHousing.csv')
url = '/content/AmesHousing.csv'
data = pd.read_csv(url)
# Define features with correct column names from the dataset
features = ['Overall Qual', 'Gr Liv Area', 'Garage Cars', 'Total Bsmt SF', 'Full Bath', 'Year Built']
# Check if all features exist
missing_cols = [col for col in features if col not in data.columns]
if missing_cols:
    print(f"Error: Columns {missing_cols} not found in dataset. Available columns: {data.columns.tolist()}")
    exit()

print("Dataset shape: ", data.shape)
print(data.describe)

FileNotFoundError: [Errno 2] No such file or directory: '/content/AmesHousing.csv'

In [None]:
# Data Preprocessing
# Create binary target: 1 if SalePrice > median, 0 otherwise
median_price = data['SalePrice'].median()
data['HighPrice'] = (data['SalePrice'] > median_price).astype(int)
# Drop rows with missing values in selected features or SalePrice
data = data[features + ['SalePrice', 'HighPrice']].dropna()
print(f'Dataset size after cleaning: {len(data)}')
print('Missing values after cleaning:\n', data.isnull().sum())

Dataset size after cleaning: 2928
Missing values after cleaning:
 Overall Qual     0
Gr Liv Area      0
Garage Cars      0
Total Bsmt SF    0
Full Bath        0
Year Built       0
SalePrice        0
HighPrice        0
dtype: int64


In [None]:
# Visualize distributions
plt.figure(figsize=(12, 8))
data[features].hist(bins=30, figsize=(12, 8))
plt.tight_layout()
plt.savefig('feature_distributions.png')
plt.close()


<Figure size 1200x800 with 0 Axes>

In [None]:
# Correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(data[features + ['HighPrice']].corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.close()

In [None]:
# Split data
X = data[features]
y = data['HighPrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}
results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
    results[name] = {'accuracy': accuracy, 'auc': auc, 'cv_mean': cv_scores.mean(), 'cv_std': cv_scores.std()}
    print(f'{name}:\nAccuracy: {accuracy:.3f}, AUC: {auc:.3f}, CV Accuracy: {cv_scores.mean():.3f} ± {cv_scores.std():.3f}')


Logistic Regression:
Accuracy: 0.892, AUC: 0.893, CV Accuracy: 0.902 ± 0.005
Random Forest:
Accuracy: 0.911, AUC: 0.911, CV Accuracy: 0.910 ± 0.009
Gradient Boosting:
Accuracy: 0.901, AUC: 0.901, CV Accuracy: 0.913 ± 0.008


In [None]:
# Plot model comparison
plt.figure(figsize=(10, 6))
metrics = pd.DataFrame({
    'Model': results.keys(),
    'Accuracy': [results[m]['accuracy'] for m in results],
    'AUC': [results[m]['auc'] for m in results]
})
metrics.plot(x='Model', kind='bar', figsize=(10, 6))
plt.title('Model Performance Comparison')
plt.ylabel('Score')
plt.savefig('model_performance.png')
plt.close()

<Figure size 1000x600 with 0 Axes>

In [None]:
# Feature importance for Random Forest
rf_model = models['Random Forest']
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)
plt.figure(figsize=(8, 5))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Random Forest Feature Importance')
plt.savefig('feature_importance.png')
plt.close()


In [None]:
# Confusion matrix for Gradient Boosting
gb_model = models['Gradient Boosting']
y_pred_gb = gb_model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred_gb)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Gradient Boosting Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix.png')
plt.close()


In [None]:
print('Analysis complete. Visualizations saved.')