In [4]:
import pandas as pd
import numpy as np

# Load the Boston housing table from the CMU StatLib archive and assemble it
# into a single DataFrame (`boston`) with 13 feature columns plus "MEDV".
data_url = "http://lib.stat.cmu.edu/datasets/boston"

# The separator is a regular expression, so it is written as a raw string:
# "\s" inside a plain string literal is an invalid escape sequence and makes
# recent Python versions emit a DeprecationWarning / SyntaxWarning.
# NOTE(review): skiprows=22 presumably skips the descriptive text that
# precedes the numeric rows in the file — confirm against the source file.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Every record occupies two consecutive rows of raw_df:
#   - even-indexed rows contribute all of their columns as features,
#   - odd-indexed rows contribute their first two columns as features,
#   - the third column of each odd-indexed row is the target value.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Names for the 13 feature columns, in the order produced by the hstack above.
columns = [
    "CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO",
    "B","LSTAT"
]
boston = pd.DataFrame(data, columns=columns)
boston["MEDV"] = target



In [5]:
# Dimensions of the dataset as (rows, columns)
dataset_shape = boston.shape
print("Shape of dataset:", dataset_shape)

# Preview of the first five rows
first_rows = boston.head()
print("First five rows:\n", first_rows)

# Descriptive statistics for every column
summary_stats = boston.describe()
print("Summary statistics:\n", summary_stats)


Shape of dataset: (506, 14)
First five rows:
       CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  
Summary statistics:
              CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695    6.284634   
std      8.601545   23.322453    6.860353    0.

In [None]:
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

# Draw pairwise scatter plots for every pair of columns in `boston`.
# A large square canvas keeps the individual panels readable.
pair_plot_size = (12, 12)
scatter_matrix(boston, figsize=pair_plot_size)
plt.show()


In [None]:
# Correlation of every column with CRIM, displayed in ascending order.
crim_correlations = boston.corr()["CRIM"]
crim_correlations.sort_values()


In [None]:
# Print the largest observed value of each selected column.
max_reports = (
    ("Max CRIM:", "CRIM"),
    ("Max TAX:", "TAX"),
    ("Max PTRATIO:", "PTRATIO"),
)
for label, column in max_reports:
    print(label, boston[column].max())


In [None]:
# Total of the CHAS column.
# NOTE(review): CHAS looks like a 0/1 indicator, in which case this total is
# the number of flagged rows — confirm against the dataset description.
chas_values = boston["CHAS"]
chas_values.sum()


In [None]:
# Median of the PTRATIO column across all rows.
ptratio_values = boston["PTRATIO"]
ptratio_values.median()


In [None]:
# Find the row with the smallest MEDV and display all of its column values.
# idxmin() yields the index label of the first occurrence of the minimum.
medv_values = boston["MEDV"]
lowest_medv_idx = medv_values.idxmin()
boston.loc[lowest_medv_idx]


In [None]:
# Count the rows whose RM value is strictly greater than each threshold.
rm_values = boston["RM"]
rm_reports = (
    ("Suburbs with RM > 7:", 7),
    ("Suburbs with RM > 8:", 8),
)
for label, threshold in rm_reports:
    print(label, rm_values.gt(threshold).sum())
