In [None]:
# Question 1: King County Housing Data Analysis
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Read the King County housing CSV into the notebook-wide DataFrame `df`
df = pd.read_csv(filepath_or_buffer='kc_house_data.csv')
print("Data loaded successfully!")
# Preview the first five rows as a quick sanity check of the load
print(df.head(n=5))

In [None]:
# 1.1: Report the dtype pandas assigned to every column of df
column_dtypes = df.dtypes
print("1.1: Data Types", column_dtypes, sep="\n")

In [None]:
# 1.2: Drop the "id" and "Unnamed: 0" columns and run describe()
print("\n1.2: Drop columns and describe")
# DataFrame.drop(..., inplace=True) returns None, so assigning its result would
# leave df_cleaned as None. Use the copy-returning form so df_cleaned really is
# the frame without the two identifier columns.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone from df, and a strict drop would raise KeyError.
df_cleaned = df.drop(columns=['id', 'Unnamed: 0'], errors='ignore')
# Rebind df so every later cell keeps working on the frame without these
# columns, matching what the in-place drop provided.
df = df_cleaned
print(df.describe())

In [None]:
# 1.3: Tally how many rows carry each distinct 'floors' value and hold the
# tallies in a DataFrame rather than a Series
print("\n1.3: Value counts for floors column")
floors_column = df['floors']
floors_tally = floors_column.value_counts()
floors_counts = floors_tally.to_frame()
print(floors_counts)

In [None]:
# 1.4: Box plot of price grouped by the waterfront flag, to compare price
# outliers for houses with and without a waterfront view
print("\n1.4: Boxplot - Price outliers by waterfront views")
plt.figure(figsize=(10, 6))
# seaborn draws on the current axes and returns them; label through that
# Axes object instead of the pyplot state machine
waterfront_axes = sns.boxplot(data=df, x='waterfront', y='price')
waterfront_axes.set(
    title='Price Distribution by Waterfront View',
    xlabel='Waterfront (0=No, 1=Yes)',
    ylabel='Price',
)
plt.show()

In [None]:
# 1.5: Scatter plot with a fitted regression line for sqft_above against price,
# followed by the correlation coefficient of the same two columns
print("\n1.5: Regplot - Correlation between sqft_above and price")
plt.figure(figsize=(10, 6))
# regplot returns the Axes it drew on; set the labels directly on it
sqft_axes = sns.regplot(data=df, x='sqft_above', y='price')
sqft_axes.set(
    title='Price vs Living Space Above Ground',
    xlabel='Square Feet Above Ground',
    ylabel='Price',
)
plt.show()

# Series.corr with its default method (Pearson) between the two columns
sqft_above_series = df['sqft_above']
price_series = df['price']
correlation = sqft_above_series.corr(price_series)
print(f"Correlation between sqft_above and price: {correlation:.4f}")

In [None]:
# 1.6: Single-feature linear regression (sqft_living -> price), scored with R²
# on the same rows the model was fitted on
print("\n1.6: Linear Regression - sqft_living predicting price")
y = df['price']
# Double brackets keep X two-dimensional, as scikit-learn estimators require
X = df[['sqft_living']]

# fit() returns the estimator itself, so construction and fitting can be chained
model_1_6 = LinearRegression().fit(X, y)

y_pred = model_1_6.predict(X)
r2_1_6 = r2_score(y, y_pred)

# Only one feature was used, so coef_ holds a single slope
slope_1_6 = model_1_6.coef_[0]
intercept_1_6 = model_1_6.intercept_

print(f"R² Score: {r2_1_6:.4f}")
print(f"Coefficient: {slope_1_6:.4f}")
print(f"Intercept: {intercept_1_6:.2f}")

In [None]:
# 1.7: Linear regression on several predictors at once, scored with R² on the
# same rows the model was fitted on
print("\n1.7: Linear Regression - Multiple features predicting price")
# Predictor columns used for this model (date and zipcode are left out)
feature_cols = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]
y = df['price']
X_multi = df[feature_cols]

# fit() returns the estimator itself, so construction and fitting can be chained
model_1_7 = LinearRegression().fit(X_multi, y)

y_pred_multi = model_1_7.predict(X_multi)
r2_1_7 = r2_score(y, y_pred_multi)

print(f"R² Score: {r2_1_7:.4f}")
print("\nFeature Coefficients:")
# coef_ is ordered like the columns of X_multi, i.e. like feature_cols
for position, feature in enumerate(feature_cols):
    coef = model_1_7.coef_[position]
    print(f"  {feature}: {coef:.4f}")

In [None]:
# 1.8: Same multi-feature regression as a scikit-learn Pipeline that
# standardises the inputs before fitting; R² is computed on the fitted rows
print("\n1.8: Pipeline - Multiple features predicting price")
feature_cols = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]
y = df['price']
X_multi = df[feature_cols]

# Two named stages: standardise the features, then ordinary least squares
scaling_step = ('scaler', StandardScaler())
regression_step = ('regressor', LinearRegression())
pipeline = Pipeline(steps=[scaling_step, regression_step])

# Pipeline.fit returns the pipeline, so predict can be chained onto it
y_pred_pipeline = pipeline.fit(X_multi, y).predict(X_multi)
r2_1_8 = r2_score(y, y_pred_pipeline)

print(f"R² Score: {r2_1_8:.4f}")

In [None]:
# 1.9: Ridge regression (alpha=0.1) fitted on a training split and scored with
# R² on the held-out test split
print("\n1.9: Ridge Regression - Train/Test split (alpha=0.1)")
feature_cols = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]
y = df['price']
X_multi = df[feature_cols]

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X_multi,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained
ridge_model = Ridge(alpha=0.1).fit(X_train, y_train)

# Score only on rows the model did not see during fitting
y_pred_test = ridge_model.predict(X_test)
r2_test = r2_score(y_test, y_pred_test)

train_rows = len(X_train)
test_rows = len(X_test)
print(f"R² Score on Test Data: {r2_test:.4f}")
print(f"Training set size: {train_rows}")
print(f"Test set size: {test_rows}")

In [None]:
# 1.10: Degree-2 polynomial expansion of the features, standardisation, then
# Ridge regression (alpha=0.1); R² is reported on the held-out test split
print("\n1.10: Ridge Regression with Polynomial Features (degree=2, alpha=0.1)")
feature_cols = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
]
y = df['price']
X_multi = df[feature_cols]

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X_multi,
    y,
    test_size=0.2,
    random_state=42,
)

# Three named stages. Every stage is fitted on the training rows only, because
# the pipeline as a whole is fitted on X_train.
expansion_step = ('poly_features', PolynomialFeatures(degree=2, include_bias=False))
scaling_step = ('scaler', StandardScaler())
ridge_step = ('ridge_regressor', Ridge(alpha=0.1))
poly_pipeline = Pipeline(steps=[expansion_step, scaling_step, ridge_step])

poly_pipeline.fit(X_train, y_train)

# Score only on rows the pipeline did not see during fitting
y_pred_poly_test = poly_pipeline.predict(X_test)
r2_poly_test = r2_score(y_test, y_pred_poly_test)

print(f"R² Score on Test Data (with Polynomial Features): {r2_poly_test:.4f}")
# The fitted expansion stage records how many columns it produces
fitted_expansion = poly_pipeline.named_steps['poly_features']
print(f"Number of features after polynomial transform: {fitted_expansion.n_output_features_}")