In [1]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/rickiepark/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and the extracted files.
        It is created (including parents) when it does not exist yet.
    """
    # exist_ok=True replaces the separate isdir() check, which could race with
    # another process creating the directory in between.
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even when extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only point this at a trusted URL (or pass an extraction filter on Python
    # versions that support one).
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)

In [2]:
# Download and unpack the dataset using the default URL and target directory.
fetch_housing_data()

In [3]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [4]:
# Read housing.csv into a DataFrame and preview the first rows.
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
import numpy as np

In [6]:
# Work on the "population" column; its length is the number of observations N.
data = housing["population"]
count = len(data)

모평균

In [7]:
# Population mean by hand: add up every value, then divide by N.
# The accumulator is called `total` so the built-in sum() is not shadowed for
# the rest of the kernel session.
total = 0
for value in data.to_numpy():  # positional iteration, independent of index labels
    total += value

population_mean = total / count
population_mean

1425.4767441860465

In [8]:
# Cross-check the hand-computed mean against NumPy's implementation.
num_mean = np.mean(data)
num_mean

1425.4767441860465

모분산

In [17]:
# Population variance by hand: average squared deviation from the mean,
# dividing by N (not N - 1).
# A dedicated accumulator name keeps the built-in sum() usable afterwards.
squared_dev_total = 0
for value in data.to_numpy():  # positional iteration, independent of index labels
    squared_dev_total += (value - population_mean) ** 2

population_variance = squared_dev_total / count
population_variance

1282408.3220366791

In [18]:
# Cross-check the hand-computed variance against NumPy's implementation.
num_var = np.var(data)
num_var

1282408.3220366866

공분산

In [19]:
# Second variable (median income) for the covariance and correlation cells.
data2 = housing["median_income"]

In [20]:
# Mean of median_income, used for the deviations in the covariance sum.
num_mean2 = np.mean(data2)

In [23]:
# Population covariance by hand: average product of the paired deviations of
# median_income and population from their means, dividing by N.
cross_dev_total = 0
for income, people in zip(data2, data):
    cross_dev_total += (income - num_mean2) * (people - num_mean)

covariance = cross_dev_total / count
covariance

10.400475316443558

In [27]:
# Restrict to numeric columns explicitly: on pandas >= 2.0, DataFrame.cov() no
# longer drops the string column "ocean_proximity" silently and raises instead.
# Note that pandas normalises by N - 1, so values differ slightly from the
# hand-computed population covariance.
housing.select_dtypes(include="number").cov()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,4.014139,-3.957054,-2.728244,194.8037,58.76851,226.3778,42.36807,-0.057765,-10627.43
latitude,-3.957054,4.562293,0.300346,-168.2178,-60.29962,-263.1378,-58.01024,-0.32386,-35532.56
housing_median_age,-2.728244,0.300346,158.39626,-9919.12,-1700.313,-4222.271,-1457.581,-2.84614,153398.8
total_rooms,194.80375,-168.217847,-9919.12006,4759445.0,856730.6,2117613.0,766104.6,820.85241,33772890.0
total_bedrooms,58.768508,-60.299623,-1700.312817,856730.6,177565.4,419139.1,157829.5,-6.180851,2416878.0
population,226.377839,-263.137814,-4222.270582,2117613.0,419139.1,1282470.0,392803.6,10.400979,-3221249.0
households,42.368072,-58.010245,-1457.58129,766104.6,157829.5,392803.6,146176.0,9.466667,2904924.0
median_income,-0.057765,-0.32386,-2.84614,820.8524,-6.180851,10.40098,9.466667,3.609323,150847.5
median_house_value,-10627.425205,-35532.559074,153398.801329,33772890.0,2416878.0,-3221249.0,2904924.0,150847.482793,13316150000.0


상관 계수

In [28]:
# Variance of median_income, needed for the correlation denominator.
num_var2 = np.var(data2)
num_var2

3.609147689697444

In [30]:
import math
# Pearson correlation: covariance scaled by the product of both standard deviations.
std_population = math.sqrt(num_var)
std_income = math.sqrt(num_var2)
corr = covariance / (std_population * std_income)
corr

0.004834345627652911

In [31]:
# Restrict to numeric columns explicitly: on pandas >= 2.0, DataFrame.corr() no
# longer drops the string column "ocean_proximity" silently and raises instead.
housing.select_dtypes(include="number").corr()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.924664,-0.108197,0.044568,0.069608,0.099773,0.05531,-0.015176,-0.045967
latitude,-0.924664,1.0,0.011173,-0.0361,-0.066983,-0.108785,-0.071035,-0.079809,-0.14416
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.320451,-0.296244,-0.302916,-0.119034,0.105623
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.93038,0.857126,0.918484,0.19805,0.134153
total_bedrooms,0.069608,-0.066983,-0.320451,0.93038,1.0,0.877747,0.979728,-0.007723,0.049686
population,0.099773,-0.108785,-0.296244,0.857126,0.877747,1.0,0.907222,0.004834,-0.02465
households,0.05531,-0.071035,-0.302916,0.918484,0.979728,0.907222,1.0,0.013033,0.065843
median_income,-0.015176,-0.079809,-0.119034,0.19805,-0.007723,0.004834,0.013033,1.0,0.688075
median_house_value,-0.045967,-0.14416,0.105623,0.134153,0.049686,-0.02465,0.065843,0.688075,1.0
