### Problem 5: Average, Variance, and Correlation

In [12]:
import pandas as pd
# Render floats with thousands separators and three decimals in printed frames.
pd.options.display.float_format = "{:,.3f}".format

# Load the data set, keep the response column (price) aside, and drop the
# identifier/date/zipcode columns together with price from the feature frame.
df = pd.read_csv("kc_house_data.csv")
df_price = df.loc[:, "price"]
df = df.drop(columns=["id", "date", "zipcode", "price"])

# Per-feature summary: one row per remaining column, one column per statistic.
# var() is the pandas default (sample variance, ddof=1).
stats = pd.concat(
    [df.mean(), df.min(), df.max(), df.var()],
    axis=1,
    keys=["average", "min", "max", "variance"],
)

# Rounded copy used for presentation.
stats_clean = stats.round(decimals=3)

print(stats_clean)

# Find the features with the lowest/highest average and variance.
# The labels are chosen from the unrounded `stats` table: after rounding to
# three decimals, values that differ only beyond the third decimal become ties,
# and idxmin/idxmax would then return the first tied row instead of the true
# extreme. The rounded table is still used for the values that are printed.
lowest_avg = stats["average"].idxmin()
highest_avg = stats["average"].idxmax()

lowest_var = stats["variance"].idxmin()
highest_var = stats["variance"].idxmax()

print(f"\nLowest average: {lowest_avg} = {stats_clean.loc[lowest_avg, 'average']}")
print(f"Highest average: {highest_avg} = {stats_clean.loc[highest_avg, 'average']}")
print(f"Lowest variance: {lowest_var} = {stats_clean.loc[lowest_var, 'variance']}")
print(f"Highest variance: {highest_var} = {stats_clean.loc[highest_var, 'variance']}")


                 average       min           max          variance
bedrooms           3.371     0.000        33.000             0.865
bathrooms          2.115     0.000         8.000             0.593
sqft_living    2,079.900   290.000    13,540.000       843,533.681
sqft_lot      15,106.968   520.000 1,651,359.000 1,715,658,774.175
floors             1.494     1.000         3.500             0.292
waterfront         0.008     0.000         1.000             0.007
view               0.234     0.000         4.000             0.587
condition          3.409     1.000         5.000             0.423
grade              7.657     1.000        13.000             1.382
sqft_above     1,788.391   290.000     9,410.000       685,734.667
sqft_basement    291.509     0.000     4,820.000       195,872.668
yr_built       1,971.005 1,900.000     2,015.000           862.797
yr_renovated      84.402     0.000     2,015.000       161,346.212
lat               47.560    47.156        47.778             0

In [22]:
# Correlation of every feature column in `df` with the response `df_price`
# (corrwith uses the Pearson coefficient by default).
correlations = df.corrwith(df_price)

# One-column table, sorted from the largest coefficient to the smallest.
corr_table = pd.DataFrame({
    "correlation_with_price": correlations
}).sort_values(by="correlation_with_price", ascending=False)

print(corr_table)

# Keep only strictly positive coefficients, strongest first.
positive_corr = correlations[correlations > 0].sort_values(ascending=False)

print("\nFeatures positively correlated with price:")
print(positive_corr)

if positive_corr.empty:
    # idxmax() raises ValueError on an empty Series, so report the situation
    # explicitly instead of crashing the cell.
    highest_feature = None
    highest_value = float("nan")
    print("\nNo feature is positively correlated with price.")
else:
    highest_feature = positive_corr.idxmax()
    highest_value = positive_corr.max()

    print("\nFeature with highest positive correlation to price:")
    print(highest_feature, ":", highest_value)

               correlation_with_price
sqft_living                     0.702
grade                           0.667
sqft_above                      0.606
sqft_living15                   0.585
bathrooms                       0.525
view                            0.397
sqft_basement                   0.324
bedrooms                        0.308
lat                             0.307
waterfront                      0.266
floors                          0.257
yr_renovated                    0.126
sqft_lot                        0.090
sqft_lot15                      0.082
yr_built                        0.054
condition                       0.036
long                            0.022

Features positively correlated with price:
sqft_living     0.702
grade           0.667
sqft_above      0.606
sqft_living15   0.585
bathrooms       0.525
view            0.397
sqft_basement   0.324
bedrooms        0.308
lat             0.307
waterfront      0.266
floors          0.257
yr_renovated    0.126
sqft_lot

In [23]:
# Features whose correlation with price is strictly negative, listed from the
# least negative to the most negative.
negative_corr = correlations[correlations < 0].sort_values(ascending=False)

if not negative_corr.empty:
    print(negative_corr)
elif (correlations > 0).all():
    # Only state that everything is positive when every coefficient really is > 0.
    print("\nThere are no features with a negative correlation with the response variable (price). All features show weak to strong positive correlations.")
else:
    # No negative coefficients, but some are exactly zero or NaN (both compare
    # False against "< 0" and "> 0"), so "all positive" would be incorrect.
    not_positive = correlations[~(correlations > 0)]
    print("\nThere are no features with a negative correlation with the response variable (price), but the following features have a zero or undefined correlation:")
    print(not_positive)

Series([], dtype: float64)
