In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

In [None]:
#make a graph showing the sold prices of houses based on the number of bedrooms
'''
def bedroom_graph(city, cityName):
    columns_to_graph = ['beds', 'price', 'listingDate']
    city = city[(city['beds'] != 0) & (city['beds'] <= 6)]  #second condition not needed, but without looks crazy for now
    bedroom_group = city[columns_to_graph].groupby('beds')
    city_dfs = {beds: group for beds, group in bedroom_group}
    plt.figure(figsize=(10, 6))
    plt.xticks(rotation = 25)
    for beds, df in city_dfs.items(): #make separate dataframes based on number of bedrooms
        df = df.sort_values(by = ['listingDate'], ascending = True)
        #grouping bi-weekly with 2W, if want to group by month, use ME
        city_grouped = df.groupby(pd.Grouper(key = 'listingDate', freq = '2W')).agg({
            'price':'mean',
        }).reset_index() 
        plt.scatter(city_grouped['listingDate'], city_grouped['price'], label = f'{beds} bedrooms')
   
    plt.legend()
    plt.xlabel('Date')
    plt.ylabel('Sold Price in millions')
    plt.title(f'Housing price based on number of bedrooms in {cityName}')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'graphs/bedroom-graph')

#make a graph based on the price per sqft 
def pricePerSqFt_graph(city, cityName):
    city = city[city['pricePerSqFt'] > 0]
    city = city.sort_values(by = ['listingDate'], ascending = True)
    city_grouped = city.groupby(pd.Grouper(key = 'listingDate', freq = 'W')).agg({
        'pricePerSqFt':'mean',
    }).reset_index()

    plt.figure(figsize=(10, 6))
    plt.xticks(rotation = 25)
    plt.scatter(city_grouped['listingDate'], city_grouped['pricePerSqFt'])
    plt.xlabel('Date')
    plt.ylabel('Price per sqft')
    plt.title(f'Price per Square Feet in {cityName}')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'graphs/pricePerSqft-graph')
'''

def bedroom_sqft_graph(city, cityName):
    """Show a bar chart of the mean price per square foot for each bedroom count.

    Parameters
    ----------
    city : pandas.DataFrame
        Listings; the 'beds' and 'pricePerSqFt' columns are read.
    cityName : str
        Region name interpolated into the chart title.

    Returns
    -------
    None
        The chart is displayed with ``plt.show()``; nothing is saved.
    """
    # one row per bedroom count holding that group's mean price per sqft
    mean_by_beds = city.groupby('beds', as_index=False)['pricePerSqFt'].mean()
    bed_counts = mean_by_beds['beds']
    mean_prices = mean_by_beds['pricePerSqFt']

    plt.figure(figsize = (10, 6))
    plt.bar(bed_counts, mean_prices)
    plt.xlabel('Number of Bedrooms')
    plt.ylabel('Average price/sqft')
    plt.title(f'Average price/sqft by number of bedrooms in {cityName}')

    plt.show()

def _distance_price_graph(city, distance_col, max_distance, title, xlabel, out_path,
                          bin_width=200, max_price=4e7):
    """Plot listing price against a distance column and return the binned average prices.

    Shared implementation behind ``distToCenter_graph`` and ``distToSchool_graph``.

    Parameters
    ----------
    city : pandas.DataFrame
        Listings; ``distance_col`` and 'listingPrice' are read.
    distance_col : str
        Name of the distance column to bin on.
    max_distance : float
        Rows with a distance at or above this value are treated as outliers and dropped.
    title, xlabel : str
        Figure title and x-axis label.
    out_path : str
        Path handed to ``savefig``.
    bin_width : int, default 200
        Width of each distance bin, in the same unit as ``distance_col``.
    max_price : float, default 4e7
        Rows with a listing price at or above this value are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns 'listingPrice' (mean price of the bin) and 'distance_bin' (the interval),
        one row per bin that contains at least one listing. Empty when no row survives
        the outlier filter (no figure is written in that case).
    """
    # cut out outliers; rows with a missing distance or price fail both comparisons and go too
    city = city[(city[distance_col] < max_distance) & (city['listingPrice'] < max_price)].copy()
    if city.empty:
        # nothing to bin or plot -- int(NaN) on the empty max used to raise here
        return pd.DataFrame(columns=['listingPrice', 'distance_bin'])

    # Round the largest distance UP to a whole number of bins. Truncating it first
    # (int(max)) left e.g. 400.5 outside the last (200, 400] bin, silently dropping it.
    n_bins = max(1, int(np.ceil(city[distance_col].max() / bin_width)))
    edges = [i * bin_width for i in range(n_bins + 1)]
    city['distance_bin'] = pd.cut(city[distance_col], bins=edges)

    avgPrice = city.groupby('distance_bin', observed = True)['listingPrice'].mean().reset_index()
    # mid-point of each bin, used as the x position of the average-price line
    avgPrice['bin_center'] = [interval.mid for interval in avgPrice['distance_bin']]

    # explicit figure so the plot never lands on whatever figure happens to be current
    fig, ax = plt.subplots()
    ax.scatter(city[distance_col], city['listingPrice'], alpha = 0.5)
    ax.plot(avgPrice['bin_center'], avgPrice['listingPrice'], color = 'red', label = 'Average Price')
    ax.set_title(title)
    ax.grid(True)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Price')
    ax.legend()
    fig.tight_layout()

    fig.savefig(out_path)
    plt.close(fig)

    return avgPrice[['listingPrice', 'distance_bin']]


def distToCenter_graph(city, cityName):
    """Save a price vs. distance-to-city-center plot and return average price per 200-unit bin.

    Listings with 'distanceToCenter' >= 25000 or 'listingPrice' >= 4e7 are excluded.
    The figure is written to ``graphs/distanceToCityCenter/{cityName}_dist``.

    Returns a DataFrame with columns 'listingPrice' and 'distance_bin'.
    """
    return _distance_price_graph(
        city,
        distance_col='distanceToCenter',
        max_distance=25000,
        title=f'Listing Price vs. Distance to City Center in {cityName}',
        xlabel='Distance to City Center (meters)',
        out_path=f'graphs/distanceToCityCenter/{cityName}_dist',
    )


def distToSchool_graph(city, cityName):
    """Save a price vs. distance-to-nearest-school plot and return average price per 200-unit bin.

    Listings with 'distanceToSchool' >= 6000 or 'listingPrice' >= 4e7 are excluded.
    The figure is written to ``graphs/distanceToNearestSchool/{cityName}_dist``.

    Returns a DataFrame with columns 'listingPrice' and 'distance_bin'.
    """
    return _distance_price_graph(
        city,
        distance_col='distanceToSchool',
        max_distance=6000,
        title=f'Listing Price vs. Distance to Nearest School in {cityName}',
        xlabel='Distance to Nearest School (meters)',
        out_path=f'graphs/distanceToNearestSchool/{cityName}_dist',
    )


In [None]:
# Load the cleaned, combined listings and parse both date columns.
# errors='coerce' turns unparseable dates into NaT instead of raising.
total = pd.read_csv('data/combined/total_clean.csv').assign(
    soldDate=lambda frame: pd.to_datetime(frame['soldDate'], errors='coerce'),
    listingDate=lambda frame: pd.to_datetime(frame['listingDate'], errors='coerce'),
)

In [None]:
# Keep only rows that have every field the graphs need, then draw the graphs.
total1 = total.dropna(subset = ['beds', 'price', 'soldDate', 'pricePerSqFt']).copy()

region = 'the Lower Mainland'
bedroom_sqft_graph(total1, region)

# Region-wide binned averages are kept for the statistical comparison further down.
center_dist_prices = distToCenter_graph(total1, region)
school_dist_prices = distToSchool_graph(total1, region)

# One pair of distance graphs per city; the return values are not needed here.
for cityName, group in total1.groupby('city'):
    for draw_graph in (distToCenter_graph, distToSchool_graph):
        draw_graph(group, cityName)

In [None]:
# Mann-Whitney U test comparing the binned average prices by distance to the
# city center against those by distance to the nearest school.
max_distance = 5000

# keep only the bins whose upper edge lies within max_distance
center_in_range = center_dist_prices['distance_bin'].apply(lambda interval: interval.right <= max_distance)
center_dist_prices = center_dist_prices[center_in_range]
school_in_range = school_dist_prices['distance_bin'].apply(lambda interval: interval.right <= max_distance)
school_dist_prices = school_dist_prices[school_in_range]

center_avg_prices = center_dist_prices['listingPrice']
school_avg_prices = school_dist_prices['listingPrice']

# Levene's test is printed first as a check on equality of variances
isEqual = stats.levene(center_avg_prices, school_avg_prices)
print(isEqual.pvalue)
u_stat, p = stats.mannwhitneyu(center_avg_prices, school_avg_prices)
print(u_stat, p)

In [None]:
def transform(v):
    """Return the natural logarithm of ``v``.

    Used with ``Series.apply`` to log-transform prices before the normality
    checks and regressions. Accepts anything ``np.log`` accepts (scalar or array).
    """
    return np.log(v)

In [None]:
# Compare the price-per-sqft distribution of homes with <= 3 bedrooms against homes
# with > 3 bedrooms, before and after the log transform, to judge normality.
beds_prices = total[['beds', 'pricePerSqFt']].copy()
# Drop NaN/inf and non-positive prices: the log transform is only finite for positive values.
usable_price = np.isfinite(beds_prices['pricePerSqFt']) & (beds_prices['pricePerSqFt'] > 0)
# .copy() so the new column is added to an independent frame, not to a slice of the filter result
beds_prices = beds_prices[usable_price].copy()
beds_prices['priceSqFtTransformed'] = beds_prices['pricePerSqFt'].apply(transform)

less_beds = beds_prices[beds_prices['beds'] <= 3]
more_beds = beds_prices[beds_prices['beds'] > 3]

fig, axes = plt.subplots(2, 2, figsize=(16,5))
axes = axes.ravel()

# (data, title, x-axis label) per panel: top row raw prices, bottom row log-transformed.
# Previously every panel was labelled 'log(Price per SqFt)', which was wrong for the top row.
panels = [
    (less_beds['pricePerSqFt'], 'Price per sqft of homes: number of bedrooms ≤ 3', 'Price per SqFt'),
    (more_beds['pricePerSqFt'], 'Price per sqft of homes: number of bedrooms > 3', 'Price per SqFt'),
    (less_beds['priceSqFtTransformed'], 'Transformed price per sqft of homes: number of bedrooms ≤ 3', 'log(Price per SqFt)'),
    (more_beds['priceSqFtTransformed'], 'Transformed price per sqft of homes: number of bedrooms > 3', 'log(Price per SqFt)'),
]
for ax, (panel_data, panel_title, panel_xlabel) in zip(axes, panels):
    ax.hist(panel_data, bins=25, edgecolor='black', alpha=0.8)
    ax.set_title(panel_title)
    ax.set_xlabel(panel_xlabel)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.savefig('graphs/beds_normality.png')

In [None]:
# Disabled alternative (its names, less_baths / more_baths, are not defined in the cells shown here):
# print(stats.mannwhitneyu(less_baths['priceTransformed'], more_baths['priceTransformed']).pvalue)

# t-test without the equal-variance assumption (equal_var=False) on the
# log-transformed price per sqft of the two bedroom groups.
beds_ttest = stats.ttest_ind(
    less_beds['priceSqFtTransformed'],
    more_beds['priceSqFtTransformed'],
    equal_var=False,
)
print(beds_ttest)

# Price Trend

In [None]:
from scipy.stats import ttest_ind, f_oneway, chi2_contingency

In [None]:
# Sales from 2023 onwards, each tagged with the start of the week it was sold in.
filtered_week_total = total[total['soldDate'].dt.year >= 2023].copy()
filtered_week_total['yearWeek'] = filtered_week_total['soldDate'].dt.to_period('W').dt.to_timestamp()

# numeric x for the regression: the week start as a proleptic Gregorian ordinal
filtered_week_total['ordinalWeek'] = filtered_week_total['yearWeek'].apply(lambda x: x.toordinal())

filtered_week_total['pricePerSqFtLog'] = filtered_week_total['pricePerSqFt'].apply(transform)

# Fit only on rows with a finite log price: a single NaN/inf in the input (missing or
# non-positive pricePerSqFt) would otherwise turn slope, intercept and p-value into NaN.
fit_rows = filtered_week_total[np.isfinite(filtered_week_total['pricePerSqFtLog'])]
m, b, _, p, _ = stats.linregress(fit_rows['ordinalWeek'], fit_rows['pricePerSqFtLog'])

# predicted value for every row (same length as filtered_week_total) for the plots below
y_regress = m * filtered_week_total['ordinalWeek'] + b

print(f'Regression slope: {m}')
print(f'Regression intercept: {b}')
print(f'Regression p-value: {p}')

In [None]:
# Scatter of log price per sqft by sale week with the fitted regression line on top.
plt.figure(figsize=(10, 6))
plt.scatter(filtered_week_total['yearWeek'], filtered_week_total['pricePerSqFtLog'], alpha=0.7)
plt.plot(filtered_week_total['yearWeek'], y_regress, color='red', label='Linear Regression')
plt.xlabel('Week')
plt.ylabel('Sold Price Per SqFt (log scaled)')
plt.title('Weekly Sold Price/SqFt')
plt.xticks(rotation=45)
plt.grid(True)
# the line's label is only rendered once a legend is drawn
plt.legend()
plt.tight_layout()
plt.savefig('graphs/weekly-price-per-sqft-linregress.png')

In [None]:
# Histogram of the regression residuals (observed log price per sqft minus fitted value).
pricePerSqFtRes = filtered_week_total['pricePerSqFtLog'].sub(y_regress)

plt.figure(figsize=(8, 6))
plt.hist(pricePerSqFtRes, bins=30, edgecolor='black', alpha=0.7)
plt.grid(True)
plt.xlabel('Residuals', fontsize=10)
plt.ylabel('Frequency', fontsize=10)
plt.title('Residuals of Log-Transformed Price Per Square Foot', fontsize=11)
plt.tight_layout()
plt.savefig('graphs/pricePerSqFtResiduals.png')

In [None]:
# Same weekly regression as for price per sqft, this time on the log of the sold price.
filtered_week_total['priceLog'] = filtered_week_total['price'].apply(transform)

# Fit only on rows with a finite log price: a single NaN/inf in the input (missing or
# non-positive price) would otherwise turn slope, intercept and p-value into NaN.
fit_rows = filtered_week_total[np.isfinite(filtered_week_total['priceLog'])]
m, b, _, p, _ = stats.linregress(fit_rows['ordinalWeek'], fit_rows['priceLog'])

# predicted value for every row (same length as filtered_week_total) for the plots below
y_regress = m * filtered_week_total['ordinalWeek'] + b
print(f'Regression slope: {m}')
print(f'Regression intercept: {b}')
print(f'Regression p-value: {p}')

In [None]:
# Scatter of log sold price by sale week with the fitted regression line on top.
plt.figure(figsize=(10, 6))
plt.scatter(filtered_week_total['yearWeek'], filtered_week_total['priceLog'], alpha=0.7)
plt.plot(filtered_week_total['yearWeek'], y_regress, color='red', label='Linear Regression')
plt.xlabel('Week')
plt.ylabel('Sold Price (log scaled)')
plt.title('Weekly Sold Price')
plt.xticks(rotation=45)
plt.grid(True)
# the line's label is only rendered once a legend is drawn
plt.legend()
plt.tight_layout()
plt.savefig('graphs/weekly-price-linregress.png')

In [None]:
# Histogram of the regression residuals (observed log price minus fitted value).
# NOTE: the variable keeps the name pricePerSqFtRes, but here it holds the residuals of
# the plain price regression (priceLog), not of price per sqft.
pricePerSqFtRes = filtered_week_total['priceLog'].sub(y_regress)

plt.figure(figsize=(8, 6))
plt.hist(pricePerSqFtRes, bins=30, edgecolor='black', alpha=0.7)
plt.grid(True)
plt.xlabel('Residuals', fontsize=10)
plt.ylabel('Frequency', fontsize=10)
plt.title('Residuals of Log-Transformed Price', fontsize=11)
plt.tight_layout()
plt.savefig('graphs/priceResiduals.png')

# Location Price

In [None]:
import matplotlib.colors as mcolors

In [None]:
# Reload the listings and derive a short postal code: the first four characters of postalCode.
total_pc = pd.read_csv('data/combined/total_clean.csv')
# .str slicing leaves a missing postal code as NaN; indexing the raw value with code[:4]
# raised TypeError as soon as one postalCode was NaN (a float).
total_pc['postalShortCode'] = total_pc['postalCode'].str[:4]
# mean price per sqft for each short postal code (rows without a short code are not grouped)
postal_prices = total_pc.groupby(['postalShortCode']).agg({'pricePerSqFt': 'mean'}).reset_index()

In [None]:
def _most_frequent_value(group_values):
    """Return the most common non-null value of a group, or NaN if the group has none."""
    modes = group_values.mode()
    # mode() is empty when every value in the group is NaN; mode()[0] raised in that case
    return modes.iloc[0] if not modes.empty else np.nan

# Label each short postal code with its most frequent city and neighbourhood.
# Drop labels left over from a previous run of this cell first: merging again onto a frame
# that already has 'city'/'neighborhood' would create city_x/city_y and break later cells.
postal_prices = postal_prices.drop(columns=['city', 'neighborhood'], errors='ignore')

postal_city = total_pc[['postalShortCode', 'city']]
most_frequent_city = postal_city.groupby('postalShortCode')['city'].agg(_most_frequent_value).reset_index()
postal_prices = postal_prices.merge(most_frequent_city, on='postalShortCode')

postal_neigh = total_pc[['postalShortCode', 'neighborhood']]
most_frequent_neigh = postal_neigh.groupby('postalShortCode')['neighborhood'].agg(_most_frequent_value).reset_index()
postal_prices = postal_prices.merge(most_frequent_neigh, on='postalShortCode')

In [None]:
# Postal codes ordered from most to least expensive, plus one colour per city for the bar chart.
values = postal_prices.sort_values(by='pricePerSqFt', ascending=False)
cities = total_pc['city'].unique()

# bars are placed two units apart (they are drawn one unit wide), leaving a gap between them
bar_range = np.arange(len(values['pricePerSqFt'])) * 2
colors = plt.get_cmap('tab20').colors
# Cycle through the palette so every city gets a colour: zip(cities, colors) stopped at the
# end of the palette and left any further city unmapped.
color_map = {city: colors[position % len(colors)] for position, city in enumerate(cities)}

In [None]:
# Bar chart of average price per sqft by short postal code, coloured by city and
# annotated with each postal code's most frequent neighbourhood.
plt.figure(figsize=(17, 14))

# separate name so the palette stored in `colors` is not overwritten by a Series
bar_colors = values['city'].map(color_map)
bars = plt.bar(bar_range, values['pricePerSqFt'], align='center', width=1, color=bar_colors)
plt.title('Average Home Price (Per SqFt) by Postal Code', fontsize=20)
plt.xlabel('Postal Code', fontsize=16)
plt.ylabel('Average Price per SqFt ($)', fontsize=16)
plt.xticks(bar_range, values['postalShortCode'], rotation=75, ha='center')

# neighbourhood name just above each bar, starting at the bar's horizontal centre
for bar, neighborhood in zip(bars, values['neighborhood']):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), neighborhood, ha='left', va='bottom', fontsize=9, rotation=60)

# legend built by hand: one coloured patch per city in color_map
handles = [plt.Rectangle((0,0),1,1, color=color_map[city]) for city in color_map]
labels = list(color_map.keys())
plt.legend(handles, labels, title='City')
plt.tight_layout()

plt.savefig('graphs/postal_code_prices.png')