In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the Boston housing data; the path is relative to the current working directory.
boston_df = pd.read_csv('./boston.csv')

# Unpack the (rows, columns) shape once and report it from those names,
# so the row/column counts are not computed twice and then left unused.
boston_rows, boston_cols = boston_df.shape
print(f"Dataset dimensions: {boston_rows} rows × {boston_cols} columns")

Dataset dimensions: 506 rows × 13 columns


In [4]:
# Find the row label where NOX is smallest (idxmin returns the first such
# label), then read that row's NOX and MEDV values.
nox_column = boston_df['NOX']
min_nox_idx = nox_column.idxmin()
min_nox_value = nox_column.loc[min_nox_idx]
medv_min_nox = boston_df['MEDV'].loc[min_nox_idx]

print(f"Lowest NOX concentration: {min_nox_value}")
print(f"MEDV at lowest NOX: ${medv_min_nox}k")

Lowest NOX concentration: 0.385
MEDV at lowest NOX: $20.1k


In [19]:
# Horizontal boxplot of the CRIM column; the figure is written to a PNG
# and then closed.
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(boston_df['CRIM'], vert=False)
ax.set_xlabel('Crime Rate (CRIM)')
ax.set_title('Boxplot of Per Capita Crime Rate')
ax.grid(axis='x', alpha=0.5)
fig.savefig('./2_3_boxplot_crim.png', dpi=300, bbox_inches='tight')
plt.close(fig)

In [None]:
# Flag CRIM outliers with the 1.5 × IQR rule: anything below Q1 - 1.5*IQR
# or above Q3 + 1.5*IQR counts as an outlier.
Q1 = boston_df['CRIM'].quantile(0.25)
Q3 = boston_df['CRIM'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
is_outlier = (boston_df['CRIM'] < lower_bound) | (boston_df['CRIM'] > upper_bound)

# Split the rows with the boolean mask. The outlier subset must be bound to
# `outliers_df`, the name the mean below reads; binding it to any other name
# makes this cell raise NameError on a fresh kernel.
outliers_df = boston_df[is_outlier]
non_outliers_df = boston_df[~is_outlier]

# Compare the mean AGE of the two groups.
mean_age_outliers = outliers_df['AGE'].mean()
mean_age_non_outliers = non_outliers_df['AGE'].mean()

print(f"Mean AGE for outliers: {mean_age_outliers:.2f}")
print(f"Mean AGE for non-outliers: {mean_age_non_outliers:.2f}")
print(f"Difference: {mean_age_outliers - mean_age_non_outliers:.2f}")


Mean AGE for outliers: 94.23
Mean AGE for non-outliers: 64.73
Difference: 29.51


In [None]:
# Scatter of DIS (x) against NOX (y); the figure is saved to a PNG and closed.
dis_values = boston_df['DIS']
nox_values = boston_df['NOX']

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(dis_values, nox_values)
ax.set(
    xlabel='Distance to Employment Centers (DIS)',
    ylabel='Nitric Oxide Concentration (NOX)',
    title='Relationship between Distance to Employment Centers and NOX Levels',
)
fig.savefig('./2_5_scatter_dis_nox.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# Correlation between the two plotted columns (Series.corr with its default method).
correlation_dis_nox = dis_values.corr(nox_values)
print(f"Correlation between DIS and NOX: {correlation_dis_nox:.4f}")


Correlation between DIS and NOX: -0.7692


In [None]:
# Scatter of RAD (x) against TAX (y); the figure is saved to a PNG and closed.
plt.figure(figsize=(10, 6))
plt.scatter(boston_df['RAD'], boston_df['TAX'])
plt.xlabel('Highway Accessibility Index (RAD)')
plt.ylabel('Property Tax Rate (TAX)')
plt.title('Relationship between Highway Accessibility and Property Tax Rate')
plt.savefig('./2_6_scatter_rad_tax.png', dpi=300, bbox_inches='tight')
plt.close()

# Correlation between the two plotted columns (Series.corr with its default method).
correlation_rad_tax = boston_df['RAD'].corr(boston_df['TAX'])
print(f"Correlation between RAD and TAX: {correlation_rad_tax:.4f}")

# Choose the strength label and the conclusion together so they can never
# contradict each other: a negative coefficient must not be followed by a
# "higher tax rates" conclusion, and a zero/NaN coefficient is not "negative".
if correlation_rad_tax > 0:
    strength = 'strong positive' if correlation_rad_tax > 0.7 else 'positive'
    print(f"Interpretation: The {strength} correlation")
    print("suggests that areas with better highway accessibility tend to have higher property tax rates.")
elif correlation_rad_tax < 0:
    print("Interpretation: The negative correlation")
    print("suggests that areas with better highway accessibility tend to have lower property tax rates.")
else:
    # Exactly zero, or NaN (every comparison with NaN is False).
    print("Interpretation: No linear correlation between RAD and TAX could be established.")


Correlation between RAD and TAX: 0.9102
Interpretation: The strong positive correlation
suggests that areas with better highway accessibility tend to have higher property tax rates.
