In [1]:
import pandas as pd

# Load the King County house-sales table; the CSV path is relative to the
# notebook's working directory.
data = pd.read_csv('kc_house_data.csv')

In [2]:
# Remove pandas' column-display limit so wide frames are shown with every column.
pd.set_option("display.max_columns", None)

In [3]:
# Preview the first three rows to check the load and see which columns exist.
data.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062


#### 우편번호(zipcode)별로 거래가의 중위값을 계산해 '지역별 중위 거래가' 피처를 만든다.

In [4]:
# Median sale price per zipcode, kept as a one-column DataFrame indexed by zipcode.
zipprice = data.groupby('zipcode')['price'].median().to_frame()
zipprice.head(3)

Unnamed: 0_level_0,price
zipcode,Unnamed: 1_level_1
98001,260000.0
98002,235000.0
98003,267475.0


In [5]:
# Copy the zipcode index into an ordinary column named 'code'.
zipprice = zipprice.assign(code=zipprice.index)
zipprice.head(3)

Unnamed: 0_level_0,price,code
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1
98001,260000.0,98001
98002,235000.0,98002
98003,267475.0,98003


In [6]:
# Order zipcodes from the highest to the lowest median price and number them
# 0, 1, 2, ... in that order (rank 0 = most expensive zipcode).
zipprice = (
    zipprice
    .sort_values(by="price", ascending=False)
    .assign(rank=lambda ordered: range(len(ordered)))
)
zipprice.head(3)

Unnamed: 0_level_0,price,code,rank
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
98039,1895000.0,98039,0
98004,1150000.0,98004,1
98040,993750.0,98040,2


In [7]:
# Sanity check: one row per distinct zipcode; the columns are price, code and rank.
zipprice.shape

(70, 3)

#### 위에서 구한 우편번호별 중위값과 순위를 data에 mapping 해주자.

방법 : 
> 1. dictionary로 묶어준다.  
> 2. map을 사용한다.

In [8]:
# Lookup tables keyed by zipcode, used to map zipcode-level values onto `data`.
# Each dict is built from a single value column indexed by 'code', so the
# zipcode keys stay integers. Extracting 'price' and 'code' together through
# `.values` would upcast both columns to float (98039 -> 98039.0).
median_price = zipprice.set_index('code')['price'].to_dict()
median_ranking = zipprice.set_index('code')['rank'].to_dict()

In [9]:
# Show the first two (zipcode, value) pairs of each lookup table.
list(median_price.items())[0:2], list(median_ranking.items())[0:2]

([(98039.0, 1895000.0), (98004.0, 1150000.0)], [(98039, 0), (98004, 1)])

In [10]:
# Attach the zipcode-level median price and its rank to every sale row,
# looking each value up by the row's zipcode.
data = data.assign(
    median_price=data['zipcode'].map(median_price),
    median_ranking=data['zipcode'].map(median_ranking),
)

In [11]:
# Check that median_price and median_ranking now appear as the last columns.
data.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,median_price,median_ranking
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,278277.0,60
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,425000.0,38
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,445000.0,36


In [12]:
# Confirm the new columns were added without changing the number of rows.
data.shape

(21613, 23)

***

### 클러스터링 인수설정

In [13]:
# Columns the clustering is based on: location plus the zipcode-level median price.
기준컬럼 = ['lat', 'long', 'median_price']
# `tmp` is bound to the same unscaled selection as `clt`.
clt = tmp = data.loc[:, 기준컬럼]

> 근접한 동네인데도 충분히 빈부 격차가 크게 발생하는 곳도 더러 있으므로, 기준칼럼에 우편번호에 따른 중위 집가격을 추가했다.

In [14]:
# Preview the unscaled clustering columns.
clt.head(3)

Unnamed: 0,lat,long,median_price
0,47.5112,-122.257,278277.0
1,47.721,-122.319,425000.0
2,47.7379,-122.233,445000.0


In [15]:
from sklearn.preprocessing import scale
# Pass every clustering column through sklearn's scale() and rebuild a
# DataFrame with the same column labels (scale() returns a bare array).
clt = pd.DataFrame(
    data=scale(clt),
    columns=clt.columns,
)
clt.head(3)

Unnamed: 0,lat,long,median_price
0,-0.352572,-0.306079,-1.055402
1,1.161568,-0.746341,-0.309462
2,1.283537,-0.135655,-0.207782


> 위도·경도와 집값은 단위와 값의 범위가 크게 다르므로, 거리 기반 클러스터링을 하기 전에 scaling은 기본이다.

In [16]:
import numpy as np

# Per-column weights, in the same order as 기준컬럼: lat, long, median_price.
컬럼당가중치 = [1, 1, 2]
# Multiply each scaled column by its weight (broadcast across all rows).
clt_scale = pd.DataFrame(
    np.multiply(clt.values, 컬럼당가중치),
    columns=기준컬럼,
)

> 위치(lat, long)보다 우편번호에 따른 중위 집 가격에 2배의 가중치를 주었음을 알 수 있다.

In [17]:
# Only median_price should differ from the scaled frame (it is doubled).
clt_scale.head(3)

Unnamed: 0,lat,long,median_price
0,-0.352572,-0.306079,-2.110803
1,1.161568,-0.746341,-0.618924
2,1.283537,-0.135655,-0.415563


***

## 클러스터링

In [18]:
def clus(Type, num_of_cluster, X, eps=2, random_state=None):
    """Cluster the rows of ``X`` and return the fitted estimator's labels.

    Parameters
    ----------
    Type : str
        Which algorithm to run. ``'분할기법'`` selects ``KMeans`` and
        ``'계층기법'`` selects ``AgglomerativeClustering``. Any other value
        falls through to ``DBSCAN``, so a misspelled name silently runs
        DBSCAN rather than raising.
    num_of_cluster : int
        Number of clusters for KMeans and AgglomerativeClustering. It is not
        passed to DBSCAN.
    X : array-like
        Feature matrix, handed unchanged to the estimator's ``fit``.
    eps : float, default 2
        DBSCAN neighbourhood size expressed in tenths: the estimator receives
        ``eps * 0.1``. Ignored by the other two algorithms.
    random_state : int or None, default None
        Seed forwarded to KMeans only, so repeated runs can give the same
        labels. ``None`` keeps the estimator's own default (unseeded)
        behaviour.

    Returns
    -------
    The ``labels_`` attribute of the fitted estimator (one label per row of ``X``).
    """
    # Each estimator is imported only in the branch that uses it.
    if Type == '분할기법':
        from sklearn.cluster import KMeans
        model = KMeans(n_clusters=num_of_cluster, random_state=random_state)
    elif Type == '계층기법':
        from sklearn.cluster import AgglomerativeClustering
        model = AgglomerativeClustering(n_clusters=num_of_cluster)
    else:
        from sklearn.cluster import DBSCAN
        model = DBSCAN(eps=eps * 0.1)

    model.fit(X)
    return model.labels_

## 1. KMeans 방법

임의로 초기 중심점(centroid)을 클러스터 개수만큼 잡고, 모든 데이터를 가장 가까운 중심점에 배정한다.  
배정된 데이터들의 평균 위치로 중심점을 옮긴 뒤, 배정이 더 이상 바뀌지 않을 때까지 위의 과정을 반복한다.

> https://www.youtube.com/watch?v=_aWzGGNrcic  
> https://www.youtube.com/watch?v=4b5d3muPQmA

## 2. AGGlomerative 방법

가장 가까운 두 점(또는 군집)을 찾아 하나로 합치는 과정을, 원하는 군집 개수가 될 때까지 순서대로 반복한다.

> https://datascienceschool.net/view-notebook/094bcb7b86574711a2e8d81f26bce2f5/  
> https://www.youtube.com/watch?v=XJ3194AmH40

## 3. DBScan 방법

각 점에서 설정된 거리(eps) 안에 있는 이웃 점들을 같은 군집으로 묶고, 이웃을 따라 군집을 최대한 확장시켜서 나누는 것.

> https://www.youtube.com/watch?v=5E097ZLE9Sg

***

In [19]:
# Run the agglomerative ('계층기법') branch on the weighted features, asking for four clusters.
labels = clus(Type='계층기법', num_of_cluster=4, X=clt_scale)
labels

array([3, 0, 0, ..., 0, 1, 0], dtype=int64)

> 일반적으로 도시 권역을 4개로 구분하므로, 클러스터 개수로 4를 사용했다.

In [20]:
# There should be exactly one label per row of `data`.
labels.shape

(21613,)

In [21]:
# Store each row's cluster label next to the original sale record.
data = data.assign(labels=labels)
data.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,median_price,median_ranking,labels
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,278277.0,60,3
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,425000.0,38,0
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,445000.0,36,0


***

# 결과분석

In [22]:
# Median sale price within each cluster, used to characterise the clusters.
data['price'].groupby(data['labels']).median()

labels
0     465000.0
1     602000.0
2    1050000.0
3     285000.0
Name: price, dtype: float64

0번 클러스터는 Seattle 지역\
1번 클러스터는 Bellevue 를 위시한 시애틀의 베드타운\
2번 클러스터는 Medina와 그 주변의 시애틀 최대의 부촌 지역\
3번 클러스터는 공업도시 타코마 인근의 평범한 거주지역

***

## 시각화

In [23]:
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, GMapOptions
from bokeh.plotting import gmap

In [25]:
import os

# gmap() needs a Google Maps API key. No visible cell defines `key`, so when
# it is missing read it from the GOOGLE_API_KEY environment variable instead
# of failing with a NameError. Do not hard-code the key in the notebook.
if "key" not in globals():
    key = os.environ["GOOGLE_API_KEY"]

# Centre the map on the mean coordinates of all sales; the +0.1 on longitude
# shifts the view slightly east.
map_options = GMapOptions(lat=data.lat.mean(), lng=data.long.mean()+0.1, map_type="terrain", zoom=10)

p = gmap(key, map_options, title="King County Map with classification")
p.plot_height = 500
p.plot_width = 980

# One colour per cluster label; every cluster is drawn with the same marker settings.
cluster_colors = [(0, 'red'), (1, 'green'), (2, 'blue'), (3, 'black')]

# p.circle() already attaches the renderer it returns to the plot, so the
# renderers are not passed to p.add_layout() again. Re-adding them registers
# each glyph a second time, which can draw the points twice and make them
# look more opaque than fill_alpha=0.2.
dot1, dot2, dot3, dot4 = [
    p.circle(
        x="long",
        y="lat",
        size=3,
        fill_color=color,
        fill_alpha=0.2,
        source=data[data['labels'] == label],
        line_width=0,
    )
    for label, color in cluster_colors
]

output_file("King_county_classification.html")
show(p)

![im](참고/4label.PNG)