In [3]:
import pandas as pd
import numpy as np

# Read the dataset from a CSV file located in the working directory.
data_path = "kc_house_data.csv"
df = pd.read_csv(data_path)

# Columns excluded from the analysis, and the response column.
ignore_cols = ["id", "date", "zipcode"]
y_col = "price"

# Every remaining column is treated as a feature; the exclusion list is
# built once instead of being re-created for each column tested.
non_features = ignore_cols + [y_col]
feature_cols = [col for col in df.columns if col not in non_features]
features = df[feature_cols]

# Question 1: per-feature mean, min, max and variance.
# ddof=0 gives the population variance (divide by N, not N - 1).
summary = pd.concat(
    [
        features.mean(),
        features.min(),
        features.max(),
        features.var(ddof=0),
    ],
    axis=1,
    keys=["mean", "min", "max", "variance"],
)

print("Summary stats:")
display(summary)

# Features at the extremes of the mean and variance columns.
mean_by_feature = summary["mean"]
var_by_feature = summary["variance"]

lowest_mean_feat, highest_mean_feat = (
    mean_by_feature.idxmin(),
    mean_by_feature.idxmax(),
)
lowest_var_feat, highest_var_feat = (
    var_by_feature.idxmin(),
    var_by_feature.idxmax(),
)

print("\nLowest mean:", lowest_mean_feat, summary.at[lowest_mean_feat, "mean"])
print("Highest mean:", highest_mean_feat, summary.at[highest_mean_feat, "mean"])
print("Lowest variance:", lowest_var_feat, summary.at[lowest_var_feat, "variance"])
print("Highest variance:", highest_var_feat, summary.at[highest_var_feat, "variance"])

# Question 2: correlation of each feature with the response (price),
# ordered from the largest coefficient to the smallest.
corr = features.corrwith(df[y_col]).sort_values(ascending=False)
corr_table = corr.to_frame(name="corr_with_price")

print("\nCorrelation with price:")
display(corr_table)

# Split the feature names by the sign of their coefficient; a coefficient
# that is exactly zero or NaN lands in neither list.
positive = [name for name, coef in corr.items() if coef > 0]
negative = [name for name, coef in corr.items() if coef < 0]

print("\nPositively correlated features:", positive)
print("Negatively correlated features:", negative)
print("Highest positive correlation:", corr.idxmax(), corr.max())


Summary stats:


Unnamed: 0,mean,min,max,variance
bedrooms,3.370842,0.0,33.0,0.864975
bathrooms,2.114757,0.0,8.0,0.5931238
sqft_living,2079.899736,290.0,13540.0,843494.7
sqft_lot,15106.967566,520.0,1651359.0,1715579000.0
floors,1.494309,1.0,3.5,0.2915745
waterfront,0.007542,0.0,1.0,0.007484879
view,0.234303,0.0,4.0,0.5872154
condition,3.40943,1.0,5.0,0.4234469
grade,7.656873,1.0,13.0,1.381639
sqft_above,1788.390691,290.0,9410.0,685702.9



Lowest mean: long -122.21389640494147
Highest mean: sqft_lot 15106.967565816869
Lowest variance: waterfront 0.0074848791729106
Highest variance: sqft_lot 1715579393.3040266

Correlation with price:


Unnamed: 0,corr_with_price
sqft_living,0.702035
grade,0.667434
sqft_above,0.605567
sqft_living15,0.585379
bathrooms,0.525138
view,0.397293
sqft_basement,0.323816
bedrooms,0.30835
lat,0.307003
waterfront,0.266369



Positively correlated features: ['sqft_living', 'grade', 'sqft_above', 'sqft_living15', 'bathrooms', 'view', 'sqft_basement', 'bedrooms', 'lat', 'waterfront', 'floors', 'yr_renovated', 'sqft_lot', 'sqft_lot15', 'yr_built', 'condition', 'long']
Negatively correlated features: []
Highest positive correlation: sqft_living 0.7020350546118003
