# Part 5: Data Analysis and Modeling (If Time Permits)

In this optional notebook, we'll explore basic data analysis techniques and simple modeling.

## Topics Covered:
- Summary statistics
- Data visualization
- Correlation analysis
- Linear regression modeling
- Model evaluation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Reproducibility: seed NumPy's global random generator so that the random
# draws made later in the notebook are repeatable across runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Plot defaults: seaborn's white-grid theme and a 10x6 inch default figure.
sns.set_style('whitegrid')
plt.rc('figure', figsize=(10, 6))

print("Libraries loaded successfully!")

## 1. Creating a Sample Dataset

For this analysis, we'll create a hypothetical dataset about house prices:

In [None]:
# Generate synthetic house price data.
#
# Re-seed the global NumPy generator at the top of this cell so that re-running
# the cell on its own always regenerates the *same* dataset. The value matches
# the seed used in the setup cell, so the data produced by a fresh
# "Restart & Run All" is unchanged.
np.random.seed(42)

n_samples = 200

# Features. Note: np.random.randint excludes the upper bound, so e.g.
# bedrooms ranges over 1..5 and garage_spaces over 0..3.
square_feet = np.random.randint(800, 4000, n_samples)
bedrooms = np.random.randint(1, 6, n_samples)
bathrooms = np.random.choice([1, 1.5, 2, 2.5, 3, 3.5, 4], n_samples)
age_years = np.random.randint(0, 50, n_samples)
garage_spaces = np.random.randint(0, 4, n_samples)

# "True" pricing rule used to build the target. Because the data is generated
# from these numbers, the fitted regression coefficients later in the notebook
# can be checked against them.
base_price = 50000
price_per_sqft = 150
price_per_bedroom = 20000
price_per_bathroom = 15000
depreciation_per_year = 500   # subtracted: older houses are cheaper
price_per_garage = 10000

# Target variable (price): linear combination of the features plus Gaussian
# noise with a standard deviation of $50,000.
price = (base_price +
         price_per_sqft * square_feet +
         price_per_bedroom * bedrooms +
         price_per_bathroom * bathrooms -
         depreciation_per_year * age_years +
         price_per_garage * garage_spaces +
         np.random.randn(n_samples) * 50000)  # Add noise

# Assemble everything into a single DataFrame (one row per house)
housing_df = pd.DataFrame({
    'square_feet': square_feet,
    'bedrooms': bedrooms,
    'bathrooms': bathrooms,
    'age_years': age_years,
    'garage_spaces': garage_spaces,
    'price': price
})

print("Housing dataset:")
print(housing_df.head(10))
print(f"\nDataset shape: {housing_df.shape}")

## 2. Summary Statistics

Understanding the basic statistics of your data is crucial:

In [None]:
# Print three views of the numeric columns: the full describe() table,
# then per-column medians, then per-column standard deviations.
# Each entry pairs the heading to print with the DataFrame method to call.
stat_sections = [
    ("Summary statistics:", housing_df.describe),
    ("\nMedian values:", housing_df.median),
    ("\nStandard deviation:", housing_df.std),
]

for heading, compute_stat in stat_sections:
    print(heading)
    print(compute_stat())

In [None]:
# Price statistics (mean, median, group size) broken down by a categorical-like
# feature: first by bedroom count, then by bathroom count.
price_stats = ['mean', 'median', 'count']
grouped_reports = [
    ("Average price by number of bedrooms:", 'bedrooms'),
    ("\nAverage price by number of bathrooms:", 'bathrooms'),
]

for heading, group_column in grouped_reports:
    price_by_group = housing_df.groupby(group_column)['price']
    print(heading)
    print(price_by_group.agg(price_stats))

## 3. Data Visualization

Visual exploration helps understand patterns and relationships:

In [None]:
# Distribution of house prices: histogram on the left, boxplot on the right.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: how often each price range occurs
hist_ax.hist(housing_df['price'], bins=30, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Price ($)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of House Prices')
hist_ax.ticklabel_format(style='plain', axis='x')  # no scientific notation

# Right panel: median, quartiles and outliers at a glance
box_ax.boxplot(housing_df['price'])
box_ax.set_ylabel('Price ($)')
box_ax.set_title('House Price Boxplot')
box_ax.ticklabel_format(style='plain', axis='y')  # no scientific notation

fig.tight_layout()
plt.show()

In [None]:
# Scatter plots of price against four features, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (column, x-axis label, panel title) for each panel, in row-major order:
# top-left, top-right, bottom-left, bottom-right.
scatter_panels = [
    ('square_feet', 'Square Feet', 'Price vs Square Feet'),
    ('bedrooms', 'Bedrooms', 'Price vs Bedrooms'),
    ('age_years', 'Age (years)', 'Price vs Age'),
    ('bathrooms', 'Bathrooms', 'Price vs Bathrooms'),
]

for panel_ax, (column, x_label, panel_title) in zip(axes.flat, scatter_panels):
    panel_ax.scatter(housing_df[column], housing_df['price'], alpha=0.5)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Price ($)')
    panel_ax.set_title(panel_title)
    # Show full dollar amounts on the price axis instead of scientific notation
    panel_ax.ticklabel_format(style='plain', axis='y')

fig.tight_layout()
plt.show()

## 4. Correlation Analysis

Understanding how features relate to each other and the target variable:

In [None]:
# Pairwise correlations between every pair of columns
correlation_matrix = housing_df.corr()

# Rank the columns by how strongly they correlate with price
price_correlations = correlation_matrix['price'].sort_values(ascending=False)
print("Correlation with price:")
print(price_correlations)

# Heatmap of the full matrix; the diverging colormap is centered on 0 so
# positive and negative correlations get opposite colors.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    ax=ax,
)
ax.set_title('Correlation Matrix Heatmap')
fig.tight_layout()
plt.show()

## 5. Linear Regression Model

Building a simple model to predict house prices:

In [None]:
# Split the DataFrame into model inputs (X) and the value to predict (y).
feature_columns = [
    'square_feet',
    'bedrooms',
    'bathrooms',
    'age_years',
    'garage_spaces',
]
target_column = 'price'

X = housing_df.loc[:, feature_columns]
y = housing_df[target_column]

print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("\nFeature names:", list(X.columns))

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# same rows land in each split on every run.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {len(X_train)} samples")
print(f"Testing set: {len(X_test)} samples")

In [None]:
# Fit an ordinary least squares model on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

print("Model trained successfully!")
print(f"\nIntercept: ${model.intercept_:,.2f}")

# One learned weight per input column, in the same order as X's columns
print("\nCoefficients:")
coefficients_by_feature = dict(zip(X.columns, model.coef_))
for feature_name, weight in coefficients_by_feature.items():
    print(f"  {feature_name}: ${weight:,.2f}")

## 6. Model Evaluation

Assessing how well our model performs:

In [None]:
# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)                        # back in dollars, unlike MSE
mae = mean_absolute_error(y_test, y_pred)  # average absolute miss in dollars
r2 = r2_score(y_test, y_pred)              # share of price variance explained

print("Model Performance Metrics:")
print(f"  R² Score: {r2:.4f}")
print(f"  Mean Absolute Error: ${mae:,.2f}")
print(f"  Root Mean Squared Error: ${rmse:,.2f}")
print("\nInterpretation:")
print(f"  The model explains {r2*100:.2f}% of the variance in house prices.")
print(f"  On average, predictions are off by ${mae:,.2f}.")

In [None]:
# Actual vs predicted prices on the test set. Points on the dashed red
# diagonal are perfect predictions; distance from it is the error.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

# Reference line y = x spanning the range of actual prices
actual_price_range = [y_test.min(), y_test.max()]
ax.plot(actual_price_range, actual_price_range, 'r--', lw=2)

ax.set_xlabel('Actual Price ($)')
ax.set_ylabel('Predicted Price ($)')
ax.set_title('Actual vs Predicted House Prices')
ax.ticklabel_format(style='plain', axis='both')  # no scientific notation
fig.tight_layout()
plt.show()

In [None]:
# Residuals = actual minus predicted price for each test-set house
residuals = y_test - y_pred

# Plot residuals against predictions; the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=0.5)
ax.axhline(y=0, color='r', linestyle='--', lw=2)
ax.set_xlabel('Predicted Price ($)')
ax.set_ylabel('Residuals ($)')
ax.set_title('Residual Plot')
ax.ticklabel_format(style='plain', axis='both')  # no scientific notation
fig.tight_layout()
plt.show()

# Numeric summary of the errors
residual_mean = residuals.mean()
residual_std = residuals.std()
print(f"Mean residual: ${residual_mean:,.2f}")
print(f"Std of residuals: ${residual_std:,.2f}")

## 7. Making Predictions on New Data

Using the model to predict prices for new houses:

In [None]:
# Create some example houses. The columns must have the same names and order
# as the features the model was trained on.
new_houses = pd.DataFrame({
    'square_feet': [1500, 2500, 3500],
    'bedrooms': [2, 3, 4],
    'bathrooms': [2, 2.5, 3],
    'age_years': [5, 10, 2],
    'garage_spaces': [1, 2, 2]
})

print("New houses to predict:")
print(new_houses)

# Make predictions (one price per row of new_houses)
predictions = model.predict(new_houses)

print("\nPredicted prices:")
# Use itertuples() rather than iterrows(): iterrows() packs each row into a
# single Series, which upcasts the integer columns to float because
# 'bathrooms' is float, and would print "1500.0 sqft, 2.0 bed".
# itertuples() keeps each column's own dtype.
house_rows = new_houses.itertuples(index=False)
for house_number, (house, predicted_price) in enumerate(zip(house_rows, predictions), start=1):
    print(f"\nHouse {house_number}:")
    # ':g' drops a trailing '.0' so bathrooms reads "2" or "2.5"
    print(f"  {house.square_feet} sqft, {house.bedrooms} bed, "
          f"{house.bathrooms:g} bath, {house.age_years} years old, "
          f"{house.garage_spaces} garage")
    print(f"  Predicted Price: ${predicted_price:,.2f}")

## 8. Feature Importance

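Understanding which features have the most impact. Keep in mind that each raw coefficient is expressed in its feature's own units (dollars per square foot, dollars per bedroom, ...), so coefficient magnitudes are only comparable once each feature's scale is taken into account: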

In [None]:
# Visualize feature importance.
#
# Raw coefficients are in different units (dollars per square foot vs. dollars
# per bedroom), so comparing their magnitudes directly is misleading: a small
# per-unit coefficient on a feature that spans thousands of units can move the
# price far more than a large coefficient on a feature that spans only a few.
# To compare features fairly, scale each coefficient by the standard deviation
# of its feature in the training data. The result is the price change
# associated with a one-standard-deviation change in that feature.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': model.coef_
})
# X_train has the same column order as X, so the positions line up with coef_
feature_importance['effect_per_std'] = (
    feature_importance['coefficient'] * X_train.std().to_numpy()
)

# Rank by the size of the effect regardless of its sign (largest first)
importance_order = (
    feature_importance['effect_per_std'].abs().sort_values(ascending=False).index
)
feature_importance = feature_importance.reindex(importance_order)

# barh draws the first row at the bottom, so reverse the order for plotting
# to put the most important feature at the top of the chart.
bars_bottom_to_top = feature_importance.iloc[::-1]

plt.figure(figsize=(10, 6))
plt.barh(bars_bottom_to_top['feature'], bars_bottom_to_top['effect_per_std'])
plt.axvline(x=0, color='black', lw=1)  # separates price-raising from price-lowering features
plt.xlabel('Price Change per 1 Std. Dev. of Feature ($)')
plt.ylabel('Feature')
plt.title('Feature Importance (Scale-Adjusted Linear Regression Coefficients)')
plt.ticklabel_format(style='plain', axis='x')
plt.tight_layout()
plt.show()

print("Feature importance (sorted by absolute scale-adjusted effect):")
print(feature_importance)

## Practice Exercises

In [None]:
# Exercise 1: Calculate the average price for houses with:
# - More than 3 bedrooms
# - Less than 10 years old
# - At least 2 garage spaces

# Your code here:

In [None]:
# Exercise 2: Create a scatter plot showing price vs square_feet
# Color the points by number of bedrooms

# Your code here:

In [None]:
# Exercise 3: Use the trained model to predict the price of YOUR dream house
# Define the features and make a prediction

# Your code here: