In [41]:
import pandas as pd
import numpy as np

# Read the raw table from disk, then build the working frame used by the
# following cells by removing the columns listed in cols_to_drop.
df = pd.read_csv('kc_house_data.csv')

cols_to_drop = ['id', 'date', 'zipcode']
df_analysis = df.drop(cols_to_drop, axis=1)

# Name of the response column.
response = 'price'
# Every remaining column name; nothing here removes the response column,
# so it is still part of this list.
feature_cols = df_analysis.columns.tolist()

print("Shape after dropping id, date, zipcode:", df_analysis.shape)
print("Features:", feature_cols)

# Last expression: preview of the first rows as the cell's displayed output.
df_analysis.head()

Shape after dropping id, date, zipcode: (21613, 18)
Features: ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15']


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800,7503


## Part 1: Average, Min, Max, and Variance for Each Feature

In [42]:
# Summary table: one row per column of df_analysis, one column per statistic.
# Keys are the output column labels, values are the DataFrame reduction
# methods that produce them (var() is called with its pandas defaults).
stat_methods = {
    'average': 'mean',
    'min': 'min',
    'max': 'max',
    'variance': 'var',
}

stats = pd.DataFrame(
    {label: getattr(df_analysis, method)() for label, method in stat_methods.items()}
)
stats

Unnamed: 0,average,min,max,variance
price,540088.141767,75000.0,7700000.0,134782400000.0
bedrooms,3.370842,0.0,33.0,0.865015
bathrooms,2.114757,0.0,8.0,0.5931513
sqft_living,2079.899736,290.0,13540.0,843533.7
sqft_lot,15106.967566,520.0,1651359.0,1715659000.0
floors,1.494309,1.0,3.5,0.291588
waterfront,0.007542,0.0,1.0,0.007485226
view,0.234303,0.0,4.0,0.5872426
condition,3.40943,1.0,5.0,0.4234665
grade,7.656873,1.0,13.0,1.381703


In [43]:
# Summary rows for the predictors only: drop the response row. The row label
# comes from the `response` variable set in the setup cell rather than a
# repeated 'price' literal, so this cell follows the configured response.
# errors='ignore' keeps the lenient behaviour when that row is absent.
stats_predictors = stats.drop(index=response, errors='ignore')

# Report the extreme averages twice: predictors only, then every column.
average_reports = [
    (f"Among predictor features (excluding {response}):", stats_predictors['average']),
    (f"If including {response} (all features):", stats['average']),
]
for report_number, (heading, averages) in enumerate(average_reports):
    if report_number:
        print()  # blank line between the two reports
    print(heading)
    print("Lowest average:", averages.idxmin(), "=", averages.min())
    print("Highest average:", averages.idxmax(), "=", averages.max())

Among predictor features (excluding price):
Lowest average: long = -122.21389640494147
Highest average: sqft_lot = 15106.967565816869

If including price (all features):
Lowest average: long = -122.21389640494147
Highest average: price = 540088.1417665294


In [44]:
# Variance extremes, first without the response row, then with every row.
variance_reports = (
    ("Among predictor features (excluding price):", stats_predictors),
    ("If including price (all features):", stats),
)
for report_number, (heading, summary) in enumerate(variance_reports):
    if report_number > 0:
        print()  # blank separator line between the two reports
    variances = summary['variance']
    print(heading)
    print("Lowest variance:", variances.idxmin(), "=", variances.min())
    print("Highest variance:", variances.idxmax(), "=", variances.max())

Among predictor features (excluding price):
Lowest variance: waterfront = 0.007485225502686407
Highest variance: sqft_lot = 1715658774.1754704

If including price (all features):
Lowest variance: waterfront = 0.007485225502686407
Highest variance: price = 134782378397.24687


## Part 2: Correlation of Each Feature with the Response (price)

In [45]:
# Every column of the working frame except the response is a predictor.
all_columns = df_analysis.columns
predictor_cols = all_columns[all_columns != response].tolist()

# Correlation of each predictor column with the response column
# (corrwith is called with its pandas defaults).
response_values = df_analysis[response]
correlations = df_analysis[predictor_cols].corrwith(response_values)

# The same numbers as a one-column table indexed by feature name, for display.
corr_table = correlations.to_frame(name='Correlation with price').rename_axis('Feature')

print("Correlation of each feature with price:")
corr_table

Correlation of each feature with price:


Unnamed: 0_level_0,Correlation with price
Feature,Unnamed: 1_level_1
bedrooms,0.30835
bathrooms,0.525138
sqft_living,0.702035
sqft_lot,0.089661
floors,0.256794
waterfront,0.266369
view,0.397293
condition,0.036362
grade,0.667434
sqft_above,0.605567


In [46]:
# Keep only the features whose correlation with price is strictly above zero.
is_positive = correlations > 0
positive_corr = correlations[is_positive]

print("Features that are positively correlated with price:")
ranked_positive = positive_corr.sort_values(ascending=False)  # strongest first
print(ranked_positive.to_string())

Features that are positively correlated with price:
sqft_living      0.702035
grade            0.667434
sqft_above       0.605567
sqft_living15    0.585379
bathrooms        0.525138
view             0.397293
sqft_basement    0.323816
bedrooms         0.308350
lat              0.307003
waterfront       0.266369
floors           0.256794
yr_renovated     0.126434
sqft_lot         0.089661
sqft_lot15       0.082447
yr_built         0.054012
condition        0.036362
long             0.021626


In [47]:
# Restrict the search to strictly positive correlations so the label printed
# below is always true. Taking idxmax()/max() over every correlation would
# present a zero or negative value as the "highest positive" one whenever no
# feature is positively correlated, and idxmax() raises on an empty Series.
positive_candidates = correlations[correlations > 0]

if positive_candidates.empty:
    # Nothing qualifies: keep both names defined in case later cells read them.
    highest_pos_corr_feature = None
    highest_pos_corr_value = float('nan')
    print("No feature has a positive correlation with price.")
else:
    highest_pos_corr_feature = positive_candidates.idxmax()
    highest_pos_corr_value = positive_candidates.max()
    print("Highest positive correlation with price:", highest_pos_corr_feature, "=", highest_pos_corr_value)

Highest positive correlation with price: sqft_living = 0.7020350546118003


## Part 3: Features with Negative Correlation with the Response

In [48]:
# Features whose correlation with price is strictly below zero.
negative_corr = correlations.loc[correlations.lt(0)]

if negative_corr.empty:
    print("No. None of the features have a negative correlation with price.")
else:
    print("Yes. These are negatively correlated with price:")
    # Default ascending sort lists the most negative correlation first.
    print(negative_corr.sort_values().to_string())

No. None of the features have a negative correlation with price.
