# 지역사회건강조사 데이터 수집

In [None]:
from sas7bdat import SAS7BDAT

from sqlalchemy import create_engine
from pandas.io import sql
import numpy as np
import pandas as pd
import os

# Module-level database handle. `engine` is reused by the to_sql() call at the
# end of this cell. sas_sql() opens its own connection, so the module-level
# `cnn` is not referenced anywhere else in the visible code.
# NOTE(review): the connection URL is an empty string here - presumably
# redacted; supply the real URL before running.
engine = create_engine('')
cnn = engine.connect()

# Reference table only (a bare string literal, it has no runtime effect):
# Korean survey item name -> English column name. Most of these names are the
# ones assigned to the merged survey frame in sas_data_merge().
'''
@ 건강수준 : hLabel
@ 질병경험 : dExper
@ 질병일수 : dDate
@ 결근결석경험 : abExper
@ 결근결석일수 : abDate
@ 운동능력 : exAbility
@ 자기관리 : selfManagement
@ 일상활동 : dailyActivities
@ 통증불편 : painInconvenience
@ 불안우울 : anxiousDepress
@ VAS : VAS
@ 평균 수면시간 : avgSleep
@ 주관적 스트레스 : subjectStress
@ Year : Year
'''

# Community Health Survey data (first batch of yearly files)
def sas_datainput(data_dir=""):
    """Build per-city yearly means from the survey SAS files in *data_dir*.

    Every entry of the directory is opened as a SAS file. Files are processed
    in sorted filename order and numbered consecutively from 2008 (the
    original comment says this batch covers 2008-2011; the upper bound is not
    enforced). Assumes one file per year whose names sort chronologically -
    TODO confirm.

    Args:
        data_dir: Directory holding the SAS files. The default is the empty
            placeholder found in this file; pass the real directory.

    Returns:
        DataFrame indexed by city name holding the mean (rounded to 2
        decimals) of the eleven survey columns plus a ``Year`` column. An
        empty DataFrame is returned when the directory contains no files.
    """
    city_names = ['서울', '부산', '대구', '인천', '광주', '대전', '울산', '경기',
                  '강원', '충북', '충남', '전북', '전남', '경북', '경남', '제주']
    value_columns = ['qoa_01z1', 'qoa_02z1', 'qoa_04z1', 'qoc_01z1',
                     'qoc_02z1', 'qoc_03z1', 'qoc_04z1', 'qoc_05z1',
                     'qoc_06z1', 'mtc_01z1', 'mta_01z1']

    yearly_frames = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # file -> year assignment deterministic.
    for year, filename in enumerate(sorted(os.listdir(data_dir)), start=2008):
        with SAS7BDAT(os.path.join(data_dir, filename)) as f:
            df = f.to_data_frame()
        print(year)

        # Replace city codes with city names. The codes are paired with the
        # names in order of first appearance, so this presumably relies on the
        # rows being ordered by city - verify against the data files.
        city_codes = df['city_cd'].unique().tolist()
        df['city_cd'] = df['city_cd'].replace(city_codes, city_names)

        # Per-city mean of the survey columns. The columns are selected with a
        # list because tuple-style selection is rejected by current pandas.
        yearly = df.groupby('city_cd')[value_columns].mean().round(2)
        yearly['Year'] = year
        yearly_frames.append(yearly)

    if not yearly_frames:
        return pd.DataFrame()
    # Stack once at the end; DataFrame.append no longer exists in pandas 2.x.
    return pd.concat(yearly_frames)

# Community Health Survey data (second batch of yearly files, incl. Sejong)
def sas2_datainput(data_dir=""):
    """Build per-city yearly means from the later survey SAS files.

    Same processing as the first batch, except that the city list has a 17th
    entry ('세종'), years are numbered from 2012 upward, and the '세종' rows
    are removed from the result. Files are processed in sorted filename order;
    assumes one file per year whose names sort chronologically - TODO confirm.

    Args:
        data_dir: Directory holding the SAS files. The default is the empty
            placeholder found in this file; pass the real directory.

    Returns:
        DataFrame indexed by city name (without '세종') holding the mean
        (rounded to 2 decimals) of the eleven survey columns plus a ``Year``
        column. An empty DataFrame is returned when the directory is empty.
    """
    city_names = ['서울', '부산', '대구', '인천', '광주', '대전', '울산', '경기',
                  '강원', '충북', '충남', '전북', '전남', '경북', '경남', '제주',
                  '세종']
    value_columns = ['qoa_01z1', 'qoa_02z1', 'qoa_04z1', 'qoc_01z1',
                     'qoc_02z1', 'qoc_03z1', 'qoc_04z1', 'qoc_05z1',
                     'qoc_06z1', 'mtc_01z1', 'mta_01z1']

    yearly_frames = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # file -> year assignment deterministic.
    for year, filename in enumerate(sorted(os.listdir(data_dir)), start=2012):
        with SAS7BDAT(os.path.join(data_dir, filename)) as f:
            df = f.to_data_frame()
        print(year)

        # Replace city codes with city names. The codes are paired with the
        # names in order of first appearance, so this presumably relies on the
        # row order of the files - verify against the data files.
        city_codes = df['city_cd'].unique().tolist()
        df['city_cd'] = df['city_cd'].replace(city_codes, city_names)

        # Per-city mean of the survey columns. The columns are selected with a
        # list because tuple-style selection is rejected by current pandas.
        yearly = df.groupby('city_cd')[value_columns].mean().round(2)
        yearly['Year'] = year
        yearly_frames.append(yearly)

    if not yearly_frames:
        return pd.DataFrame()
    # Stack once, then remove Sejong in a single pass instead of dropping it
    # from the accumulated frame on every iteration.
    data_df = pd.concat(yearly_frames)
    return data_df.drop('세종', errors='ignore')

def sas_data_merge():
    """Stack both survey batches and index them by (Location, Year).

    The outputs of sas_datainput() and sas2_datainput() are concatenated in
    that order, the city index is turned back into a column, and all thirteen
    columns are renamed positionally to the English names used downstream.

    Returns:
        DataFrame indexed by ``['Location', 'Year']``.
    """
    english_names = ['Location', 'hLabel', 'dExper', 'abExper', 'exAbility',
                     'selfManagement', 'dailyActivities', 'painInconvenience',
                     'anxiousDepress', 'VAS', 'avgSleep', 'subjectStress',
                     'Year']

    # The list literal is evaluated left to right, so the first batch is
    # loaded before the second one.
    batches = [sas_datainput(), sas2_datainput()]
    merged = pd.concat(batches).reset_index()
    merged.columns = english_names
    return merged.set_index(['Location', 'Year'])

def sas_sql(db_url=''):
    """Load yearly air-pollution data and average it per (Location, Year).

    Reads the ``year_total_air_poll`` table, renames its columns positionally
    (the table is therefore expected to have exactly seven columns in the
    order Location, Year, SO2, CO, O3, NO2, PM10), keeps the years 2008-2013
    and averages the five pollutant columns.

    Args:
        db_url: SQLAlchemy connection URL. The default is the empty
            placeholder found in this file; pass the real URL.

    Returns:
        DataFrame indexed by ``['Location', 'Year']`` with the pollutant means
        rounded to 4 decimals.
    """
    engine = create_engine(db_url)
    # pandas.io.sql.read_frame no longer exists; read_sql_table is the
    # supported way to load a whole table. The context manager makes sure the
    # connection is released, which the original never did.
    with engine.connect() as cnn:
        cnData = pd.read_sql_table("year_total_air_poll", cnn)
    cnData.columns = ['Location', 'Year', 'SO2', 'CO', 'O3', 'NO2', 'PM10']

    # Years are compared as strings - presumably the Year column is stored as
    # text; verify against the table schema.
    year_list = ['2008', '2009', '2010', '2011', '2012', '2013']
    cnData = cnData[cnData['Year'].isin(year_list)]

    # The columns are selected with a list because tuple-style selection on a
    # groupby is rejected by current pandas.
    pollutant_columns = ['SO2', 'CO', 'O3', 'NO2', 'PM10']
    cnData = cnData.groupby(['Location', 'Year'])[pollutant_columns].mean().round(4)
    return cnData

def kn_sas(data_dir=""):
    """Aggregate the nutrition-survey SAS files per (Location, Year).

    Every entry of *data_dir* is opened as a SAS file. Region codes are
    replaced by region names, ten columns are kept and renamed, rows with any
    missing value in those columns are dropped, and the remaining numeric
    columns are averaged per region and year.

    Args:
        data_dir: Directory holding the SAS files. The default is the empty
            placeholder found in this file; pass the real directory.

    Returns:
        DataFrame indexed by ``['Location', 'Year']`` with means rounded to 2
        decimals. An empty DataFrame is returned when the directory is empty.
    """
    region_names = ['서울', '부산', '대구', '인천', '광주', '대전', '울산', '경기',
                    '강원', '충북', '충남', '전북', '전남', '경북', '경남', '제주']
    source_columns = ['year', 'age', 'region', 'mh_stress', 'mh_melan',
                      'pa_high', 'pa_mid', 'pa_walk', 'LQ_5EQL', 'BP8']
    renamed_columns = ['Year', 'Age', 'Location', 'mh_stress', 'mh_melan',
                       'pa_high', 'pa_mid', 'pa_walk', 'LQ_5EQL', 'BP8']
    value_columns = ['Age', 'mh_stress', 'mh_melan', 'pa_high', 'pa_mid',
                     'pa_walk', 'LQ_5EQL', 'BP8']

    frames = []
    # Sorted so the row order of the result is reproducible across machines.
    for filename in sorted(os.listdir(data_dir)):
        with SAS7BDAT(os.path.join(data_dir, filename)) as f:
            df = f.to_data_frame()

        # Region codes are paired with names in order of first appearance, so
        # this presumably relies on the row order of the files - verify.
        region_codes = df['region'].unique().tolist()
        df['region'] = df['region'].replace(region_codes, region_names)

        subset = pd.DataFrame(df, columns=source_columns)
        subset.columns = renamed_columns
        subset = subset.dropna(subset=renamed_columns)

        # The age-band column the original computed here was never part of the
        # aggregation below, so it is no longer built.
        frames.append(
            subset.groupby(['Location', 'Year'])[value_columns].mean().round(2)
        )

    if not frames:
        return pd.DataFrame()
    # Stack once at the end; DataFrame.append no longer exists in pandas 2.x.
    return pd.concat(frames)

def preprocessing_Lifemain():
    """Combine survey, air-pollution and nutrition data per (Location, Year).

    The three sources are aligned on their (Location, Year) key before
    combine_first() fills in the columns each one contributes. The original
    reset every index first, which made combine_first() pair rows purely by
    position even though the sources are ordered differently (the survey
    frame is stacked year by year, the air data is grouped by location), so
    values could land on the wrong city/year.

    Assumes every source is non-empty and indexed by (Location, Year) in that
    order, and that the sources spell location names the same way - TODO
    confirm for the air-pollution table.

    Returns:
        DataFrame with ``Location`` and ``Year`` as ordinary columns followed
        by the union of all value columns.
    """
    keys = ['Location', 'Year']

    def _keyed(frame):
        # Re-label the two index levels in case their names were lost while
        # the yearly frames were stacked, and store Year as int everywhere so
        # that keys from different sources compare equal.
        keyed = frame.rename_axis(keys).reset_index()
        keyed['Year'] = keyed['Year'].astype(int)
        return keyed.set_index(keys)

    # Same loading order as before: air data, survey data, nutrition data.
    cnData = _keyed(sas_sql())
    result = _keyed(sas_data_merge())
    kn_data = _keyed(kn_sas())

    total_result = result.combine_first(cnData)
    total_result = total_result.combine_first(kn_data)
    return total_result.reset_index()

# Build the combined dataset (this runs every loader defined above).
data_total=preprocessing_Lifemain()

# Save to the database, appending to the table if it already exists.
# The `flavor` keyword is no longer passed: it was never used with an
# SQLAlchemy engine and current pandas rejects it.
# NOTE(review): the table name is an empty string here - presumably redacted;
# supply the real table name.
data_total.to_sql('', engine, if_exists='append')

In [None]:
from sqlalchemy import create_engine
from pandas.io import sql
import numpy as np
import pandas as pd
import os

from matplotlib.pyplot import savefig
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
%matplotlib inline

# Load the saved quality-of-life dataset from the working directory.
# Presumably this is the combined table built by preprocessing_Lifemain() -
# TODO confirm how the pickle was produced.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
data=pd.read_pickle('삶의질 데이터.pickle')

# 지역사회건강조사 상관관계 분석

In [None]:
# 목록 분할
test=pd.DataFrame(data, columns=['Year','Location','PM10','hLabel','exAbility','selfManagement', \
                                  'dailyActivities','painInconvenience','anxiousDepress','VAS','avgSleep', \
                                  'subjectStress'])

# 지역별 변형
#test=test[test.Location.isin(['서울','경기']) ]

%matplotlib inline
sns.set(font_scale=1.5)
f, ax = plt.subplots(figsize=(20,10))
sns_heatmap=sns.heatmap(test.corr().round(2),annot=True, fmt=".2f", linewidths=.5, cmap="RdBu_r")
sns_heatmap

x=(list(range(0,12,1)))
fig, ax = plt.subplots(figsize=(16,10))
plt.plot(test.corr().values[0])
plt.xticks(x,['PM10','hLabel','dExper','abExper','exAbility','selfManagement', \
              'dailyActivities','painInconvenience','anxiousDepress','VAS','avgSleep', \
              'subjectStress'], rotation=70)
plt.title("PM10, Location Health Research Correlation")

## 지역별 그룹 분석

In [None]:
# Use a Korean-capable font so the Hangul labels render
# (hard-coded Windows path to Malgun Gothic).
font_name = matplotlib.font_manager.FontProperties(fname='C:/Windows/Fonts/malgun.ttf').get_name()
matplotlib.rc('font', family=font_name)

value_columns = ['PM10','hLabel','exAbility','selfManagement','dailyActivities',
                 'painInconvenience','anxiousDepress','VAS','avgSleep','subjectStress']
total_data=pd.DataFrame(data, columns=['Year','Location'] + value_columns)

# (group label, member regions, bar colour)
region_groups = [
    ('서울, 경기', ['서울','경기'], 'b'),
    ('경상, 전라', ['경남','경북','전남','전북'], 'g'),
    ('강원, 제주', ['강원','제주'], 'r'),
]


def _yearly_group_mean(label, locations):
    """Return the per-year mean of the value columns for one region group."""
    members = total_data[total_data.Location.isin(locations)]
    # The columns are selected with a list because tuple-style selection on a
    # groupby is rejected by current pandas.
    yearly = members.groupby(['Year'])[value_columns].mean().round(2)
    yearly['Location'] = label
    return yearly


group_frames = [_yearly_group_mean(label, locations) for label, locations, _ in region_groups]
data_S, data_G, data_K = group_frames

frame = pd.concat(group_frames)
frame.columns = ['미세먼지','주관적건강수준','운동능력','자기관리','일상활동','통증불편','불안우울','주관적건강상태','평균수면시간',
                 '주관적스트레스수준','Location']
frame=frame.reset_index()

# Bar label: "<group>-<year>년"
frame['지역별 년도']= frame['Location']+'-'+frame.Year.apply(str)+"년"

quality_columns = ['주관적건강수준','운동능력','자기관리','일상활동','통증불편','불안우울','주관적건강상태','평균수면시간',
                   '주관적스트레스수준']
print("삶의 질 명 항목: 주관적건강수준, 운동능력, 자기관리, 일상활동, 통증불편, 불안우울, 주관적건강상태, 평균수면시간, 주관적스트레스수준")
inputType=input("삶의 질 명을 입력하세요: ").strip()
if inputType not in quality_columns:
    # Fail with a clear message instead of a KeyError deep inside the plot call.
    raise ValueError("지원하지 않는 삶의 질 명입니다: %s" % inputType)

# One colour per bar, derived from the bar's group, so the colouring stays
# correct even if the groups do not all have the same number of years.
group_colors = {label: colour for label, _, colour in region_groups}
colors=[group_colors[location] for location in frame['Location']]

fig = plt.figure(figsize=(16,10))
ax = fig.add_subplot(111)
ax2 = ax.twinx()

# Bars are drawn with matplotlib directly: the `colors=` keyword of the pandas
# plot call is not accepted by current pandas.
positions = list(range(len(frame)))
ax.bar(positions, frame[inputType], width=0.5, color=colors)
ax.set_xticks(positions)
ax.set_xticklabels(frame['지역별 년도'], rotation=65)
ax.set_title('삶의 질 데이터 중 %s와 미세먼지간의 비교'%inputType)
# Kept for compatibility: the pandas plot call used to return this axes object.
frame_total = ax
ax.set_ylabel("%s"%inputType)
ax.set_xlabel("지역-년도")

# PM10 as a line on the secondary axis; x is the 0..n-1 row index of `frame`,
# which matches the bar positions above.
ax2.plot(frame.미세먼지, linestyle='-',marker='o', linewidth=4.0, color='black')
ax2.legend(['미세먼지'])
ax2.set_ylabel("미세먼지")
ax2.set_ylim(40,60)

# 국민건강영양조사 부분

In [None]:
from matplotlib.pyplot import savefig
import matplotlib.pyplot as plt
import seaborn as sns

test=pd.DataFrame(data, columns=['PM10','Year','Age','Location','mh_stress','mh_melan','pa_high','pa_mid','pa_walk','LQ_5EQL','BP8'])

# Ten-year age bands [0,10), [10,20), ... [90,100), each labelled by its
# lower bound ('0', '10', ..., '90').
labels = [str(lower) for lower in range(0, 100, 10)]
test['cateAge']=pd.cut(test.Age, range(0,105,10),right=False, labels=labels)

del test['Age']

# Restrict the analysis to the chosen region and age band. The original
# evaluated this filter but discarded the result, so the heatmap was always
# drawn over every region and age band.
# NOTE(review): confirm that filtering is the intended behaviour.
test = test[test.Location.isin(['서울']) & test.cateAge.isin(['40'])]

sns.set(font_scale=1.5)
f, ax = plt.subplots(figsize=(20,10))
# Correlate numeric columns only: Location and cateAge are not numeric, and
# current pandas raises on corr() when such columns are present.
sns_heatmap=sns.heatmap(test.select_dtypes(include=[np.number]).corr(),annot=True, fmt=".2f", linewidths=.5, cmap="RdBu_r")