# 0. Initialization On Colab

In [None]:
# Standard-library import first, then the Colab helper.
import sys
from google.colab import drive

# Mount Google Drive (forcing a remount) so the project folder is reachable.
drive.mount('/content/drive', force_remount=True)

# Project folder, given relative to "My Drive".
FOLDERNAME = 'Colab Notebooks/21_BigCon'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Make the datasets folder importable.
sys.path.append(f'/content/drive/My Drive/{FOLDERNAME}/datasets')

# 1. Call Library And Setting Working Directory

In [None]:
import matplotlib
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os
import warnings
warnings.filterwarnings("ignore")

In [None]:
 # Switch the working directory to the datasets folder; the CSV files below are read by bare file name.
 # NOTE(review): this path omits the 'Colab Notebooks/' prefix that FOLDERNAME carries — confirm which location is correct.
 os.chdir('/content/drive/My Drive/21_BigCon/datasets')

# 2. Load Data

In [None]:
# Load the main dataset from the current working directory.
data = pd.read_csv('data.csv')

In [None]:
# Set aside the rows whose district is '알수없음' (unknown) and save them separately.
is_unknown = data['emd_nm'] == '알수없음'
unknown_data = data[is_unknown]
unknown_data.to_csv('unknown.csv')

# Keep only the known districts. reset_index() also stores the previous
# index in a new 'index' column.
data = data[~is_unknown].reset_index()

# 3. Time Series Analysis

분석 결과:
여름철에 음식물 쓰레기가 많이 배출되고
2020년 2월 (코로나 발생) 이후로 급격하게 줄어든 모습을 보임

In [None]:
# Sorted list of the distinct districts remaining in the data.
emd = np.unique(data['emd_nm'].to_numpy()).tolist()
len(set(emd))  # author's note: 41 districts (the unknown one is excluded)

## 4개 행정동 음식물 쓰레기 plot & peak date 확인

In [None]:
# Overlay the em_g series of four districts on a single chart.
plt.xticks(rotation=90)
plt.xlabel('base_date', fontsize=7)
plt.ylabel('em_g')
for district in ('한경면', '구좌읍', '조천읍', '한림읍'):
    district_rows = data[data['emd_nm'] == district]
    # base_date is passed as text so the x axis is categorical
    plt.plot([str(d) for d in district_rows['base_date']], district_rows['em_g'])

## 랜덤으로 선택한 행정동에 대한 em_g 데이터플롯

In [None]:
import random

# Pick one district at random and work only with its rows from here on.
random_emd = random.choice(np.unique(data['emd_nm']).tolist())
emd_rows = data[data['emd_nm'] == random_emd]

# Time-series plot of em_g for the selected district.
plt.xticks(rotation=90)
plt.xlabel('base_date', fontsize=7)
plt.ylabel('em_g')
plt.plot([str(d) for d in emd_rows['base_date']], emd_rows['em_g'])

# Peak lookup is restricted to the selected district: matching em_g values
# against the whole table can also return dates that belong to other
# districts which happen to share the same value.

## Months with the most food waste
print(random_emd + ' 음식물 쓰레기 배출량 상위 3개 달')
print(emd_rows.nlargest(3, 'em_g')['base_date'])

## Months with the least food waste
print(random_emd + ' 음식물 쓰레기 배출량 하위 3개 달')
print(emd_rows.nsmallest(3, 'em_g')['base_date'])

## 월별, 연도별 시각화 보기

In [None]:
#월별 쓰레기량 보기
data_m=data[['base_date','em_g','emd_nm']]
data_m['base_date']=[str(data_m['base_date'][d]) for d in range(len(data_m))]
data_m['base_date']=[data_m['base_date'].iloc[d][4:6] for d in range(len(data_m))]
data_m=data_m.groupby('base_date').mean().reset_index()
plt.plot(data_m['base_date'],data_m['em_g'])
plt.xlabel('month', fontsize=15)
plt.ylabel('em_g') #여름철이 압도적으로 많다.

In [None]:
#년도별 쓰레기량 보기
data_y=data[['base_date','em_g','emd_nm']]
data_y['base_date']=[str(data_y['base_date'][d]) for d in range(len(data_y))]
data_y['base_date']=[data_y['base_date'].iloc[d][:4] for d in range(len(data_y))]
data_y=data_y.groupby('base_date').sum().reset_index()
plt.plot(data_y['base_date'],data_y['em_g'])
plt.xlabel('year', fontsize=15)
plt.ylabel('em_g') #21년에 감소하는 것으로 보이나 7월 이후 데이터가 없어 판단 불가

## 알수없음 

In [None]:
# Remove every column that contains a missing value, rebuild the index,
# then discard the two leftover index columns.
unknown_data = unknown_data.dropna(axis=1)
unknown_data = unknown_data.reset_index()
unknown_data = unknown_data.drop(columns=['index', 'Unnamed: 0'])

# 4. 내외국민 행정구역별 유동인구 총량 계산

In [None]:
# Total population per category = the '_lf' column plus every gender/age column.
# The previous loop reset the total for each gender (so only 'm' survived) and
# added the running total to itself on every step, which doubled it repeatedly
# instead of producing a plain sum.
age_bands = range(0, 90, 10)
for cat in ['resd', 'work', 'visit']:
    part_columns = [cat + '_pop_cnt_' + s + str(age)
                    for s in ('w', 'm')
                    for age in age_bands]
    # skipna=False keeps the NaN propagation that column-by-column "+" would give
    data[cat + '_pop_cnt'] = (data[cat + '_pop_cnt_lf']
                              + data[part_columns].sum(axis=1, skipna=False))

# 5. 행정구역별로 특이성이 다른 것을 파악



*   행정구역별 'em_g'와 인구별 상관관계 비교 


In [None]:
# Per-district Pearson correlation between em_g and the resident population.
# Only the two relevant columns are correlated: building the full matrix is
# wasteful, and DataFrame.corr() raises on the text column emd_nm on pandas >= 2.0.
for i in emd:
    district_rows = data[data['emd_nm'] == i]
    print(i, ':', district_rows['em_g'].corr(district_rows['resd_pop_cnt']))

# 6. 각 행정구역별 유동인구와 em_g 시간에 따른 correlation 계산

In [None]:
# Per-district correlation between em_g and each population total.
# Each district is filtered once and only the needed column pairs are
# correlated, instead of recomputing a full correlation matrix (which also
# fails on the text column emd_nm on pandas >= 2.0) for every category.
corr_rows = []
for nm in np.unique(data['emd_nm']):
    emd_nm_data = data[data['emd_nm'] == nm]
    corr_row = {'emd_nm': nm}
    for cat in ['resd', 'work', 'visit']:
        corr_row[cat + '_emg_corr'] = emd_nm_data['em_g'].corr(emd_nm_data[cat + '_pop_cnt'])
    corr_rows.append(corr_row)

data_corr = pd.DataFrame(corr_rows,
                         columns=['emd_nm', 'resd_emg_corr', 'work_emg_corr', 'visit_emg_corr'])

data_corr.head()

# 7. K-Means Clustering 진행


*   'resd_pop_cnt', 'work_pop_cnt', 'visit_pop_cnt'에 대해 PCA 진행
*   진행한 PCA 기반으로 가중 평균한 column 생성
*   생성한 column 기반으로 K-means Clustering 진행 (k는 2~5로 진행)

In [None]:
# Columns that feed the PCA: the three population totals.
pca_columns = ['resd_pop_cnt', 'work_pop_cnt', 'visit_pop_cnt']
data_for_pca = data[pca_columns]
data_for_pca

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the inputs before PCA.
sc = StandardScaler()
data_for_pca_std = sc.fit_transform(data_for_pca)

# Fit the PCA and keep the projected data.
pca = PCA()
data_pca_x = pca.fit_transform(data_for_pca_std)

# explained_variance_ratio_ is ordered by principal component, not by input
# column: each component is a combination of all inputs, so the ratios are
# labelled PC1, PC2, ... rather than with the original column names.
for i, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    print('PC' + str(i), '설명된 분산 비율', ratio)

In [None]:
from sklearn.cluster import KMeans

# Clustering uses only the em_g / resident-population correlation.
data_corr = data_corr[['emd_nm', 'resd_emg_corr']]
data_corr_x = data_corr['resd_emg_corr']

# Elbow method: record the inertia for k = 1..5.
cluster_counts = range(1, 6)
distortions = []
for n_clusters in cluster_counts:
    km = KMeans(n_clusters=n_clusters,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=0)
    km.fit(np.array(data_corr_x).reshape(-1, 1))
    distortions.append(km.inertia_)

plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Distortion')
plt.show()  # author's reading of the chart: k = 3 looks appropriate

In [None]:
# K-means with k = 3 on the single correlation feature.
cluster_input = np.array(data_corr_x).reshape(-1, 1)
km = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=0)
y_km = km.fit_predict(cluster_input)

# Store each district's cluster label next to its name.
data_corr['cluster'] = y_km
data_corr.head()

In [None]:
# District names belonging to each of the three clusters.
emd_nm_cluster = [data_corr[data_corr['cluster'] == label]['emd_nm'] for label in range(3)]

# One sub-table of `data` per cluster.
data_cluster0, data_cluster1, data_cluster2 = [
    data[data['emd_nm'].isin(names)] for names in emd_nm_cluster
]

data_cluster0.head()

In [None]:
# Correlation between resd_pop_cnt and em_g inside each cluster.
# Only the two columns are correlated; DataFrame.corr() on the whole table
# raises on the text column emd_nm on pandas >= 2.0.
for label, cluster_rows in enumerate((data_cluster0, data_cluster1, data_cluster2)):
    print('corr(resd_pop_cnt, em_g) for cluster' + str(label) + ' : ',
          cluster_rows['em_g'].corr(cluster_rows['resd_pop_cnt']))

# 8. 성별(gender), 연령대(age)와 'em_g' 간의 correlation 계산

## 8-1. 각 성별의 'resd_pop_cnt', 'work_pop_cnt', 'visit_pop_cnt'과 'em_g' 간의 correlation 계산

In [None]:
# Population per category and gender, summed over all age bands.
# The previous loop reset the accumulator to 0 inside the age loop, so only
# the last band (80) ended up in the result.
# .copy() makes data_gender an independent table before columns are added.
data_gender = data[['emd_nm', 'base_date', 'em_g', 'em_cnt', 'pay_amt', 'use_cnt', 'use_amt']].copy()

for i in ['resd', 'work', 'visit']:
    for j in ['w', 'm']:
        age_columns = [i + '_pop_cnt_' + j + str(k * 10) for k in range(0, 9)]
        # skipna=False keeps the NaN propagation of column-by-column "+"
        data_gender[i + '_pop_cnt_' + j] = data[age_columns].sum(axis=1, skipna=False)

data_gender

In [None]:
# Per-district correlation between em_g and each category/gender population.
# Each district is filtered once and only the required column pairs are
# correlated (the full matrix used before was rebuilt for every pair and
# fails on the text column emd_nm on pandas >= 2.0).
gender_corr_rows = []
for nm in np.unique(data['emd_nm']):
    emd_nm_data = data_gender[data_gender['emd_nm'] == nm]
    corr_row = {'emd_nm': nm}
    for cat in ['resd', 'work', 'visit']:
        for gd in ['w', 'm']:
            corr_row[cat + '_pop_' + gd + '_emg_corr'] = emd_nm_data['em_g'].corr(
                emd_nm_data[cat + '_pop_cnt' + '_' + gd])
    gender_corr_rows.append(corr_row)

data_gender_corr = pd.DataFrame(gender_corr_rows)

data_gender_corr

## 8-2. 각 연령대의 'resd_pop_cnt', 'work_pop_cnt', 'visit_pop_cnt'과 'em_g' 간의 correlation 계산

In [None]:
# Population per category and age band (men + women).
# The gender loop that used to wrap this was redundant: both genders are
# already added explicitly, so every column was simply computed twice.
# .copy() makes data_age an independent table before columns are added.
data_age = data[['emd_nm', 'base_date', 'em_g', 'em_cnt', 'pay_amt', 'use_cnt', 'use_amt']].copy()
for i in ['resd', 'work', 'visit']:
    for k in range(0, 9):
        age = str(k * 10)
        data_age[i + '_pop_cnt_' + age] = data[i + '_pop_cnt_m' + age] + data[i + '_pop_cnt_w' + age]
data_age

In [None]:
# Per-district correlation between em_g and each category/age-band population.
# Each district is filtered once and only the required column pairs are
# correlated (the full matrix used before was rebuilt for every pair and
# fails on the text column emd_nm on pandas >= 2.0).
age_corr_rows = []
for nm in np.unique(data['emd_nm']):
    emd_nm_data = data_age[data_age['emd_nm'] == nm]
    corr_row = {'emd_nm': nm}
    for cat in ['resd', 'work', 'visit']:
        for age in [0, 10, 20, 30, 40, 50, 60, 70, 80]:
            corr_row[cat + '_pop_' + str(age) + '_emg_corr'] = emd_nm_data['em_g'].corr(
                emd_nm_data[cat + '_pop_cnt' + '_' + str(age)])
    age_corr_rows.append(corr_row)

data_age_corr = pd.DataFrame(age_corr_rows)

data_age_corr.head()

# 9. 정제된 dataset을 바탕으로 상관 분석 및 PCA 실시

## 9-1. 데이터 정제하기

In [None]:
# data_gender and data_age were both sliced from `data`, so their rows line up
# with `data` by index. Attaching the new columns by index avoids the implicit
# natural join on every shared column (including float measurements), which
# would multiply rows whenever the key columns contain duplicates.
shared_columns = ['emd_nm', 'base_date', 'em_g', 'em_cnt', 'pay_amt', 'use_cnt', 'use_amt']
data = pd.concat([data,
                  data_gender.drop(columns=shared_columns),
                  data_age.drop(columns=shared_columns)],
                 axis=1)

# Drop the leftover index columns and the raw gender-by-age columns, which are
# now represented by the per-gender and per-age aggregates.
discard_columns = ['index', 'Unnamed: 0']
for i in ['resd', 'work', 'visit']:
    for j in ['w', 'm']:
        for k in range(9):
            discard_columns.append(i + '_pop_cnt_' + j + str(10 * k))
data.drop(discard_columns, axis=1, inplace=True)

## 9-2. 시간 차원을 축소해 행정구역과 feature 간의 상관관계 분석

In [None]:
# For every district: correlation of each numeric column with em_g.
# DataFrame.iteritems() no longer exists in pandas 2.0, and corr() must be
# restricted to numeric columns there, so the numeric columns are selected
# explicitly and one row per district is collected.
numeric_columns = data.select_dtypes(include='number').columns
district_corrs = []
for nm in np.unique(data['emd_nm']):
    district_rows = data.loc[data['emd_nm'] == nm, numeric_columns]
    district_corrs.append(district_rows.corr()['em_g'].rename(nm))

# Rows are labelled with the district name; expose it as the 'emd_nm' column.
data_full_corr = pd.DataFrame(district_corrs).rename_axis('emd_nm').reset_index()

data_full_corr

In [None]:
# Keep the district name and the 'total_pop_w' column of the per-district correlation table.
# NOTE(review): 'total_pop_w' is not created in the cells shown here — confirm it is a column of data.csv.
data_for_clustering=data_full_corr[['emd_nm','total_pop_w']]

In [None]:
# Elbow method for K-means on the 'total_pop_w' correlation: inertia for k = 1..5.
from sklearn.cluster import KMeans

cluster_counts = range(1, 6)
distortions = []
for n_clusters in cluster_counts:
    km = KMeans(n_clusters=n_clusters,
                init='k-means++',
                n_init=10,
                max_iter=500,
                random_state=0)
    km.fit(np.array(data_full_corr['total_pop_w']).reshape(-1, 1))
    distortions.append(km.inertia_)

plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Distortion')
plt.show()  # author's reading of the chart: k = 3 looks appropriate

In [None]:
# K-means with k = 3 on the single 'total_pop_w' feature.
cluster_input = np.array(data_for_clustering['total_pop_w']).reshape(-1, 1)
km = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=500, random_state=0)
y_km = km.fit_predict(cluster_input)

# Number of districts assigned to each cluster label.
np.bincount(y_km)

In [None]:
# Attach the cluster label to every district.
data_for_clustering['cluster'] = y_km

# District names belonging to each of the three clusters.
emd_nm_cluster = [
    data_for_clustering[data_for_clustering['cluster'] == label]['emd_nm']
    for label in range(3)
]

# One sub-table of `data` per cluster.
data_cluster0, data_cluster1, data_cluster2 = [
    data[data['emd_nm'].isin(names)] for names in emd_nm_cluster
]

data_cluster0

In [None]:
# Write each cluster's rows to data_cluster<label>.csv.
for label, cluster_rows in enumerate((data_cluster0, data_cluster1, data_cluster2)):
    cluster_rows.to_csv('data_cluster' + str(label) + '.csv')

## 9-3. 'em_g'와 나머지 feature들 간의 correlation 분석

In [None]:
# Sum every remaining column per base_date; the non-numeric 'emd_nm' column
# is removed first so that only numeric columns are aggregated.
without_district = data.drop(columns=['emd_nm'])
data_for_ts = without_district.groupby(['base_date']).sum()

In [None]:
# Correlation of every aggregated column with em_g, stored as a {column: coefficient} dict.
data_emg_ts = data_for_ts.corr()['em_g'].to_dict()

## 9-4. 주요 feature를 고르기 위한 PCA 및 Feature_Selection

PCA 결과, 24개의 feature로 전체 분산의 약 90%를 설명할 수 있음을 확인

In [None]:
# Feature table for PCA: everything except em_g, the em_cnt / pay_amt / use_cnt
# columns and the two key columns.
non_feature_columns = ['em_cnt', 'pay_amt', 'use_cnt', 'em_g', 'base_date', 'emd_nm']
data_for_pca = data.drop(non_feature_columns, axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the inputs before PCA.
sc = StandardScaler()
data_for_pca_std = sc.fit_transform(data_for_pca)

# Only the fitted model is needed here, so fit() replaces fit_transform().
pca = PCA()
pca.fit(data_for_pca_std)

# The cumulative ratio used to be computed and thrown away; print it next to
# each component's own ratio so the "first 24 components" figure is visible.
# min() guards against tables with fewer than 24 components.
explained = pca.explained_variance_ratio_
cumulative = np.cumsum(explained)
for i in range(min(24, len(explained))):
    print(explained[i], cumulative[i])

24개의 변수를 선택 (f_regression 방식 사용)

In [None]:
# Keep the 24 columns with the highest f_regression score against em_g
# and print their names.
from sklearn.feature_selection import SelectKBest, f_regression

selectK = SelectKBest(score_func=f_regression, k=24)
X = selectK.fit_transform(data_for_pca, data['em_g'])

# Positional indices of the selected columns.
selector = selectK.get_support(indices=True)

col = list(data_for_pca.columns)
for a in selector:
    print(col[a])

In [None]:
# Remove every column whose name contains 'pop', plus 'use_amt'.
pop_columns = list(data_for_pca.filter(regex='pop'))
data_for_pca = data_for_pca.drop(columns=pop_columns)
data_for_pca = data_for_pca.drop(columns=['use_amt'])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the inputs before PCA.
sc = StandardScaler()
data_for_pca_std = sc.fit_transform(data_for_pca)

# Only the fitted model is needed here, so fit() replaces fit_transform().
pca = PCA()
pca.fit(data_for_pca_std)

# Print each component's ratio together with the cumulative ratio (the
# cumulative sum used to be computed and discarded). min() guards against
# tables with fewer than 2 components.
explained = pca.explained_variance_ratio_
cumulative = np.cumsum(explained)
for i in range(min(2, len(explained))):
    print(explained[i], cumulative[i])

In [None]:
# Keep the 2 columns with the highest f_regression score against em_g
# and print their names.
from sklearn.feature_selection import SelectKBest, f_regression

selectK = SelectKBest(score_func=f_regression, k=2)
X = selectK.fit_transform(data_for_pca, data['em_g'])

# Positional indices of the selected columns.
selector = selectK.get_support(indices=True)

col = list(data_for_pca.columns)
for a in selector:
    print(col[a])

# 10. 외부 데이터를 추가해 PCA 및 Feature Selection 실행

In [None]:
# COVID table: keep the date and total-count columns, reduce the date to its
# first six characters as an integer, and total the counts per key.
covid = pd.read_csv('covid.csv')
covid = covid[['일자', '계(명)']]
covid['일자'] = covid['일자'].astype(str).str[:6].astype(int)
covid = covid.groupby(['일자']).sum().reset_index()

# Inner join with the main table on the date key.
a = pd.merge(data, covid, left_on='base_date', right_on='일자')

In [None]:
# Weather table: drop the saved index column and turn the date text
# (with '-' removed) into an integer key.
weather = pd.read_csv('weather.csv')
weather = weather.drop(columns=['Unnamed: 0'])
weather['일시'] = weather['일시'].str.replace('-', '', regex=False).astype(int)

# NOTE(review): `a` is rebuilt from `data` here, so the columns merged from the
# COVID table in the previous cell are not carried over — confirm that is intended.
a = pd.merge(data, weather, left_on='base_date', right_on='일시')

In [None]:
# CSI table: drop the saved index column and shorten base_date to its first
# six characters as an integer so it can be matched against `a`.
csi = pd.read_csv('CSI.csv')
csi = csi.drop(columns='Unnamed: 0')
csi['base_date'] = csi['base_date'].str[:6].astype(int)

# Natural join on the columns shared by `a` and `csi`.
a = pd.merge(a, csi)
a

In [None]:
# Passenger table: strip the dots and all whitespace from base_date and
# convert what is left to an integer key.
passengers = pd.read_csv('passenger.csv', encoding='utf-8')
passengers_a = passengers[['base_date', 'passengers']]
passengers_a['base_date'] = passengers_a['base_date'].map(
    lambda text: int(''.join(text.replace('.', '').split())))
passengers_a

In [None]:
# Passenger table: strip the dots and all whitespace from base_date and
# convert what is left to an integer key.
passengers = pd.read_csv('passenger.csv', encoding='utf-8')
passengers_a = passengers[['base_date', 'passengers']].copy()
passengers_a['base_date'] = passengers_a['base_date'].map(
    lambda text: int(''.join(text.replace('.', '').split())))

# Extend the already-merged table `a` instead of starting again from `data`:
# merging from `data` discarded the weather and CSI columns added earlier,
# which the later feature-selection step expects to be present.
a = pd.merge(a, passengers_a)

In [None]:
# Correlation of every numeric column with 'passengers'. Text columns such as
# emd_nm are excluded explicitly because DataFrame.corr() raises on them on pandas >= 2.0.
a.select_dtypes(include='number').corr()['passengers']

In [None]:
# For every district in the extended table: correlation of each numeric
# column with em_g.
# DataFrame.iteritems() no longer exists in pandas 2.0, and corr() must be
# restricted to numeric columns there, so the numeric columns are selected
# explicitly and one row per district is collected.
numeric_columns_ = a.select_dtypes(include='number').columns
district_corrs_ = []
for nm in np.unique(a['emd_nm']):
    district_rows_ = a.loc[a['emd_nm'] == nm, numeric_columns_]
    district_corrs_.append(district_rows_.corr()['em_g'].rename(nm))

# Rows are labelled with the district name; expose it as the 'emd_nm' column.
data_full_corr_ = pd.DataFrame(district_corrs_).rename_axis('emd_nm').reset_index()

data_full_corr_

In [None]:
# Feature selection on the table extended with the external variables.
from sklearn.feature_selection import SelectKBest, f_regression

a_for_pca = a.drop(columns=['em_cnt', 'pay_amt', 'use_cnt', 'em_g', 'base_date', 'emd_nm'])

# The target must come from the same table as the features: `a` is the result
# of merges, so its rows need not match `data` in number or order.
selectK = SelectKBest(score_func=f_regression, k=24)
X = selectK.fit_transform(a_for_pca, a['em_g'])

# Positional indices of the selected columns.
selector = selectK.get_support(indices=True)

col = list(a_for_pca.columns)
for b in selector:
    print(col[b])