- 학습 목표
- 서울시내 중학교 진학률 데이터 세트 (지역에 따른 군집)
- LabelEncoder , OneHotEncoder 필요
- 지도 시각화(위도, 경도) -> folium 

In [2]:
from sklearn.cluster import KMeans

import pandas as pd
import numpy  as np
import folium

import matplotlib.pyplot as plt
import seaborn          as sns
%matplotlib inline

In [12]:
path = '../data/middle_shcool_graduates_report.xlsx'
df   = pd.read_excel(path, header=0)
df.head()

# print(pd.__version__)

Unnamed: 0,지역,학교명,코드,유형,주야,남학생수,여학생수,일반고,특성화고,과학고,외고_국제고,예고_체고,마이스터고,자사고,자공고,기타진학,취업,미상,위도,경도
0,성북구,서울대학교사범대학부설중학교,3,국립,주간,277,0,0.585,0.148,0.018,0.007,0.0,0.011,0.227,0.0,0.004,0,0.0,37.594942,127.038909
1,종로구,서울대학교사범대학부설여자중학교,3,국립,주간,0,256,0.68,0.199,0.0,0.035,0.008,0.0,0.043,0.004,0.031,0,0.0,37.577473,127.003857
2,강남구,개원중학교,3,공립,주간,170,152,0.817,0.047,0.009,0.012,0.003,0.006,0.09,0.003,0.009,0,0.003,37.491637,127.071744
3,강남구,개포중학교,3,공립,주간,83,72,0.755,0.097,0.013,0.013,0.019,0.019,0.065,0.0,0.019,0,0.0,37.480439,127.062201
4,서초구,경원중학교,3,공립,주간,199,212,0.669,0.017,0.007,0.01,0.005,0.0,0.282,0.0,0.01,0,0.0,37.51075,127.0089


In [13]:
print(df.columns.values)

['지역' '학교명' '코드' '유형' '주야' '남학생수' '여학생수' '일반고' '특성화고' '과학고' '외고_국제고'
 '예고_체고' '마이스터고' '자사고' '자공고' '기타진학' '취업' '미상' '위도' '경도']


In [17]:
# 지도에 위치 표시
school_map = folium.Map(location=[37.55 , 126.98] , titles='Stamen Terrain', 
                        zoom_start=12) 
for name , lat, lng in zip(df.학교명 , df.위도, df.경도) :
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        color  = 'green',
        fill   = True,
        fill_color = 'balck',
        fill_opacity = 0.5,
        popup = name
    ).add_to(school_map)

school_map

In [None]:
# 진행절차 
# 데이터 전처리 - 원핫인코딩(지역, 코드, 유형, 주야)
# 군집모형 생성 - 분석에 사용할 피처는 과학고, 외고국제고, 자사고 진학률
# 표준화
# 모형 객체 생성
# 모형 학습
# 예측
# 예측 결과를 데이터 프레임에 추가
# 클러스터 값으로 그룹화, 그룹별 내용 출력
# 지도 그래프로 시각화 


In [20]:
# 원핫인코딩(더미 변수)
from sklearn import preprocessing    

label_encoder = preprocessing.LabelEncoder()     # label encoder 생성
onehot_encoder = preprocessing.OneHotEncoder()   # one hot encoder 생성

onehot_location = label_encoder.fit_transform(df['지역'])
onehot_code = label_encoder.fit_transform(df['코드'])
onehot_type = label_encoder.fit_transform(df['유형'])
onehot_day = label_encoder.fit_transform(df['주야'])
# print(onehot_location)
# print(onehot_code)
# print(onehot_type)
# print(onehot_day)
df['location'] = onehot_location
df['code'] = onehot_code
df['type'] = onehot_type
df['day'] = onehot_day


In [24]:
print(df.columns.values)

['지역' '학교명' '코드' '유형' '주야' '남학생수' '여학생수' '일반고' '특성화고' '과학고' '외고_국제고'
 '예고_체고' '마이스터고' '자사고' '자공고' '기타진학' '취업' '미상' '위도' '경도' 'location' 'code'
 'type' 'day']


In [25]:
# sklearn 라이브러리에서 cluster 군집 모형 가져오기 
from sklearn.cluster import KMeans

# 분석에 사용할 속성을 선택 (과학고, 외고국제고, 자사고 진학률)
columns_list = [9, 10, 13]
X = df.iloc[:, columns_list]
X

Unnamed: 0,과학고,외고_국제고,자사고
0,0.018,0.007,0.227
1,0.000,0.035,0.043
2,0.009,0.012,0.090
3,0.013,0.013,0.065
4,0.007,0.010,0.282
...,...,...,...
410,0.000,0.000,0.000
411,0.000,0.000,0.000
412,0.000,0.000,0.000
413,0.000,0.000,0.000


In [27]:
# 설명 변수 데이터를 정규화
X = preprocessing.StandardScaler().fit_transform(X)

# 모형 객체 생성
kmeans = KMeans(n_clusters = 3 ,init = 'k-means++' , max_iter = 300 , n_init=10)

# 모형 학습
kmeans.fit(X)   
 


KMeans(n_clusters=3)

In [30]:
# 예측 (군집) 
cluster_label = kmeans.labels_   
print(cluster_label)


[0 1 1 1 0 2 0 0 0 1 0 1 1 1 0 1 0 2 0 0 2 0 1 1 1 0 0 0 2 2 0 0 1 1 2 2 1
 1 1 0 0 0 0 1 2 0 2 2 1 1 1 1 2 0 2 2 2 0 2 2 1 2 2 2 0 2 2 2 1 1 2 1 1 1
 2 2 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 1 0 1 2 2 1 2 2 1 2 1 1 2 2 2 2 2 2 1 0
 2 0 0 1 2 2 2 2 2 2 2 2 2 1 0 2 2 2 2 2 2 2 2 1 2 2 1 1 2 2 2 2 2 2 2 2 1
 2 1 0 0 2 2 2 2 2 0 1 1 2 2 2 2 1 2 2 1 1 2 1 2 1 1 1 2 2 2 2 2 2 2 2 2 2
 2 2 2 1 0 0 2 2 2 2 1 0 1 1 0 2 2 2 2 2 2 2 1 1 2 1 2 1 1 2 2 2 1 1 2 1 1
 1 2 1 1 1 1 0 2 2 2 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 2 2 2 1 2 1 1 2 2 1 2 2
 2 2 1 0 2 1 0 2 1 2 2 2 1 2 2 2 0 1 1 0 2 2 2 2 2 2 2 2 0 2 2 1 1 2 0 1 2
 1 1 2 2 1 2 1 2 2 2 2 1 2 1 2 1 1 2 1 2 1 1 2 1 0 1 1 2 1 1 2 1 1 1 1 2 2
 2 1 2 1 2 1 2 2 2 2 1 2 2 0 2 2 2 2 1 0 2 2 2 2 1 0 0 0 2 1 1 1 1 0 2 1 0
 1 1 2 1 2 1 1 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 1]


In [32]:
# 예측 결과를 데이터프레임에 추가
df['cluster_id'] = cluster_label
df.head()   


Unnamed: 0,지역,학교명,코드,유형,주야,남학생수,여학생수,일반고,특성화고,과학고,...,기타진학,취업,미상,위도,경도,location,code,type,day,cluster_id
0,성북구,서울대학교사범대학부설중학교,3,국립,주간,277,0,0.585,0.148,0.018,...,0.004,0,0.0,37.594942,127.038909,16,0,1,0,0
1,종로구,서울대학교사범대학부설여자중학교,3,국립,주간,0,256,0.68,0.199,0.0,...,0.031,0,0.0,37.577473,127.003857,22,0,1,0,1
2,강남구,개원중학교,3,공립,주간,170,152,0.817,0.047,0.009,...,0.009,0,0.003,37.491637,127.071744,0,0,0,0,1
3,강남구,개포중학교,3,공립,주간,83,72,0.755,0.097,0.013,...,0.019,0,0.0,37.480439,127.062201,0,0,0,0,1
4,서초구,경원중학교,3,공립,주간,199,212,0.669,0.017,0.007,...,0.01,0,0.0,37.51075,127.0089,14,0,0,0,0


In [41]:
# 클러스터 값으로 그룹화하고, 그룹별로 내용 출력 (첫 5행만 출력)
grouped_cols = [0, 1, 2] 
grouped = df.groupby('cluster_id')
for key, group in grouped:
    print('* key :', key)
    print('* number :', len(group))    
    print(group.iloc[:, grouped_cols])
    print('\n')



* key : 0
* number : 51
       지역                    학교명  코드
0     성북구         서울대학교사범대학부설중학교   3
4     서초구                  경원중학교   3
6     강남구                 압구정중학교   3
7     강남구         단국대학교사범대학부속중학교   3
8     강남구                  대명중학교   3
10    강남구                  대청중학교   3
14    서초구                  반포중학교   3
16    강남구                  봉은중학교   3
18    서초구                  서운중학교   3
19    서초구                  서일중학교   3
21    서초구                세화여자중학교   3
25    서초구                  신동중학교   3
26    서초구                 신반포중학교   3
27    강남구                  신사중학교   3
30    강남구                  언주중학교   3
31    강남구                  역삼중학교   3
39    강남구                  휘문중학교   3
40    송파구                  가락중학교   3
41    송파구                  가원중학교   3
42    강동구                  강일중학교   3
45    강동구                  동북중학교   3
53    강동구                  배재중학교   3
57    송파구                  석촌중학교   3
64    송파구                  신천중학교   3
92    양천구                  목일중학교   3
110   양천구     

In [36]:
# 그래프로 표현 - 시각화
colors = {0:'coral', 1:'blue', 2:'green'}

cluster_map = folium.Map(location=[37.55,126.98], tiles='Stamen Terrain', 
                        zoom_start=12)

for name, lat, lng, clus in zip(df.학교명, df.위도, df.경도, df.cluster_id):  
    folium.CircleMarker([lat, lng],
                        radius=5,                   # 원의 반지름
                        color=colors[clus],         # 원의 둘레 색상
                        fill=True,
                        fill_color=colors[clus],    # 원을 채우는 색
                        fill_opacity=0.7,           # 투명도    
                        popup=name
    ).add_to(cluster_map)
cluster_map