# Boston Housing Data Analysis
This notebook contains visualizations and statistical analyses to support housing market decisions in Boston, MA.

In [None]:
from js import fetch
import io
import pandas as pd

URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ST0151EN-SkillsNetwork/labs/boston_housing.csv'
resp = await fetch(URL)
boston_url = io.BytesIO((await resp.arrayBuffer()).to_py())
df = pd.read_csv(boston_url)

# Rename columns
# Swap the terse data-set codes for descriptive names. `rename` returns a new
# frame (no in-place mutation); columns not listed here, including MEDV, keep
# their original names.
df = df.rename(columns={
    'CHAS': 'Charles_River',
    'NOX': 'Nitric_Oxide',
    'INDUS': 'Non_Retail_Business_Acres',
    'AGE': 'Pre_1940_Owner_Occupied',
    'DIS': 'Distance_to_Employment_Centres',
    'PTRATIO': 'Pupil_Teacher_Ratio',
})

# Discretize AGE
# Three bands: [0, 35], (35, 70], (70, 100]. include_lowest=True closes the
# first interval on the left, so a value of exactly 0 is labelled '<=35'
# instead of silently becoming NaN.
df['AGE_Group'] = pd.cut(
    df['Pre_1940_Owner_Occupied'],
    bins=[0, 35, 70, 100],
    labels=['<=35', '35-70', '>70'],
    include_lowest=True,
)

# Last expression of the cell: preview the first rows of the prepared frame.
df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression

# Use seaborn's "whitegrid" look for every figure drawn after this point.
sns.set(style="whitegrid")

# Visualizations
# Figure 1: spread of the median home value (MEDV) as a single boxplot.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, y='MEDV', ax=ax)
ax.set_title('Boxplot of Median Value of Owner-Occupied Homes')
ax.set_ylabel('MEDV ($1000s)')
plt.show()

# Bar chart: how many rows fall on each value of the Charles River indicator.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='Charles_River', ax=ax)
ax.set_title('Bar Plot of Charles River Variable')
ax.set_xlabel('Bounds Charles River (1 = Yes, 0 = No)')
ax.set_ylabel('Count of Houses')
plt.show()

# MEDV split by the three AGE_Group bands created with pd.cut.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='AGE_Group', y='MEDV', ax=ax)
ax.set_title('Boxplot of Median Value by Age Group of Homes')
# The binned column is a 0-100 share of owner-occupied units built before 1940,
# not a number of years, so the axis label names that quantity.
ax.set_xlabel('Share of Owner-Occupied Units Built Prior to 1940 (%)')
ax.set_ylabel('MEDV ($1000s)')
plt.show()

# Scatter plot: nitric-oxide concentration against non-retail business acreage.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='Non_Retail_Business_Acres', y='Nitric_Oxide', ax=ax)
ax.set_title('Scatter Plot of NOX vs Non-Retail Business Acres')
ax.set_xlabel('Non-Retail Business Acres per Town')
ax.set_ylabel('Nitric Oxide Concentration (NOX)')
plt.show()

# Histogram (20 bins) of the pupil-teacher ratio with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x='Pupil_Teacher_Ratio', bins=20, kde=True, ax=ax)
ax.set_title('Histogram of Pupil-Teacher Ratio')
ax.set_xlabel('Pupil-Teacher Ratio')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Statistical Analysis
# Significance level shared by every hypothesis test in this cell.
ALPHA = 0.05

# --- T-test: MEDV for rows that do not (0) / do (1) bound the Charles River ---
chas_0 = df.loc[df['Charles_River'] == 0, 'MEDV']
chas_1 = df.loc[df['Charles_River'] == 1, 'MEDV']
# Student's t-test assumes both groups share one variance. Check that with
# Levene's test first and switch to Welch's t-test when the assumption is rejected.
levene_stat, p_val_levene = stats.levene(chas_0, chas_1)
equal_var = bool(p_val_levene >= ALPHA)
t_stat, p_val_ttest = stats.ttest_ind(chas_0, chas_1, equal_var=equal_var)

# --- One-way ANOVA: MEDV across the three AGE_Group bands ---
age_labels = ['<=35', '35-70', '>70']
anova_result = stats.f_oneway(
    *(df.loc[df['AGE_Group'] == label, 'MEDV'] for label in age_labels)
)

# --- Pearson correlation between NOX and INDUS ---
r_val, p_val_corr = stats.pearsonr(df['Nitric_Oxide'], df['Non_Retail_Business_Acres'])

# --- Simple linear regression: MEDV on distance to employment centres ---
X = df[['Distance_to_Employment_Centres']]  # 2-D frame, as scikit-learn expects
y = df['MEDV']
model = LinearRegression().fit(X, y)
regression_coef = model.coef_[0]
intercept = model.intercept_
r_squared = model.score(X, y)

# --- Report ---
print("=== T-Test: Charles River vs MEDV ===")
print(f"Levene P-value: {p_val_levene:.6f} -> {'Student' if equal_var else 'Welch'} t-test (equal_var={equal_var})")
print(f"T-statistic: {t_stat:.4f}, P-value: {p_val_ttest:.6f}")
print("Conclusion:", "Significant difference" if p_val_ttest < ALPHA else "No significant difference")

print("\n=== ANOVA: MEDV Across AGE Groups ===")
print(f"F-statistic: {anova_result.statistic:.4f}, P-value: {anova_result.pvalue:.6f}")
print("Conclusion:", "At least one group differs significantly" if anova_result.pvalue < ALPHA else "No significant difference among groups")

print("\n=== Pearson Correlation: NOX vs INDUS ===")
print(f"Correlation (r): {r_val:.4f}, P-value: {p_val_corr:.6f}")
print("Conclusion:", "Significant relationship" if p_val_corr < ALPHA else "No significant relationship")

print("\n=== Regression: Distance to Employment vs MEDV ===")
print(f"Coefficient: {regression_coef:.4f}, Intercept: {intercept:.4f}, R²: {r_squared:.4f}")
print(f"Interpretation: Each additional unit of distance is associated with a {regression_coef:.2f} change in MEDV")