In [1]:
import os
import pandas as pd

In [101]:
folder_path = "S-DoT_NATURE_2022년(2022.01.03~2023.01.01)"

csv_files = sorted([f for f in os.listdir(folder_path) if f.endswith('.csv')])

# 필요한 컬럼만 선택
required_cols = ['시리얼', '등록일자', '기온(℃)', '상대습도( %)']

# 병합을 위한 빈 DataFrame 초기화
merged_df = pd.DataFrame(columns=required_cols)

for file in csv_files:
    file_path = os.path.join(folder_path, file)
    try:
        # 열 먼저 확인
        header_check = pd.read_csv(file_path, encoding='cp949', index_col=False,nrows=1)
        if not all(col in header_check.columns for col in required_cols):
            print(f"{file} → 필요한 열 누락, 건너뜀")
            continue

        temp_df = pd.read_csv(file_path, encoding='cp949',index_col=False, usecols=required_cols, low_memory=False)
        
        # 수치형 컬럼을 명시적으로 변환
        temp_df['기온(℃)'] = pd.to_numeric(temp_df['기온(℃)'], errors='coerce')
        temp_df['상대습도( %)'] = pd.to_numeric(temp_df['상대습도( %)'], errors='coerce')
        temp_df['시리얼'] = temp_df['시리얼'].astype(str)
        temp_df['등록일자'] = temp_df['등록일자'].astype(str)


        # 미리보기 
        print(f"\n📄 {file} → 행 개수: {len(temp_df)}")

        print(f"\n📄 {file} → 하위 5개 행 미리보기:")
        print(temp_df.tail(5))

        
        merged_df = pd.concat([merged_df, temp_df], ignore_index=True)
        print(f"{file} → 병합 완료")

    except Exception as e:
        print(f"{file} 처리 중 오류 발생: {e}")

# 최종 Parquet로 저장
merged_df.to_parquet("merged_2022.parquet", index=False)
print("Parquet 저장 완료: merged_2022.parquet")



📄 S-DoT_NATURE_2022.01.03-01.09.csv → 행 개수: 168989

📄 S-DoT_NATURE_2022.01.03-01.09.csv → 하위 5개 행 미리보기:
                시리얼  기온(℃)  상대습도( %)              등록일자
168984  V02Q1940955    2.4        60  2022-01-09 23:07
168985  V02Q1940953    1.1        64  2022-01-09 23:07
168986  V02Q1940952    3.6        57  2022-01-09 23:07
168987  V02Q1941015    2.8        62  2022-01-09 23:07
168988  V02Q1940954    3.6        54  2022-01-09 23:07
S-DoT_NATURE_2022.01.03-01.09.csv → 병합 완료


  merged_df = pd.concat([merged_df, temp_df], ignore_index=True)



📄 S-DoT_NATURE_2022.01.10-01.16.csv → 행 개수: 170282

📄 S-DoT_NATURE_2022.01.10-01.16.csv → 하위 5개 행 미리보기:
                시리얼  기온(℃)  상대습도( %)              등록일자
170277  V02Q1940952   -2.9        40  2022-01-16 23:07
170278  V02Q1940951   -3.5        45  2022-01-16 23:07
170279  V02Q1940955   -3.8        41  2022-01-16 23:07
170280  V02Q1941015   -3.7        39  2022-01-16 23:07
170281  V02Q1940953   -2.9        36  2022-01-16 23:07
S-DoT_NATURE_2022.01.10-01.16.csv → 병합 완료

📄 S-DoT_NATURE_2022.01.17-01.23.csv → 행 개수: 169219

📄 S-DoT_NATURE_2022.01.17-01.23.csv → 하위 5개 행 미리보기:
                시리얼  기온(℃)  상대습도( %)              등록일자
169214  V02Q1940955    3.5        51  2022-01-23 23:07
169215  V02Q1941013    4.5        50  2022-01-23 23:07
169216  V02Q1941006    3.0        56  2022-01-23 23:07
169217  V02Q1941015    3.9        55  2022-01-23 23:07
169218  V02Q1941000    4.2        52  2022-01-23 23:08
S-DoT_NATURE_2022.01.17-01.23.csv → 병합 완료

📄 S-DoT_NATURE_2022.01.24-01.30.csv → 행 개수: 1

In [2]:
merged_2022=pd.read_parquet("merged_2022.parquet")

In [3]:
merged_2022

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
0,OC3CL200017,2022-01-03 0:07,-3.0,52.0
1,OC3CL200014,2022-01-03 0:07,-2.4,42.0
2,OC3CL200011,2022-01-03 0:07,21.6,85.0
3,OC3CL200029,2022-01-03 0:07,-1.2,44.0
4,OC3CL200027,2022-01-03 0:07,-4.0,37.0
...,...,...,...,...
7520444,V02Q1941015,2022-12-28 23:07,-2.4,49.0
7520445,V02Q1940954,2022-12-28 23:07,,
7520446,V02Q1940955,2022-12-28 23:07,-2.5,49.0
7520447,V02Q1941000,2022-12-28 23:07,-2.1,48.0


In [4]:
df_20221212=pd.read_csv(r"S-DoT_NATURE_2022년(2022.01.03~2023.01.01)\S-DoT_NATURE_2022.12.12-12.13.csv",encoding='cp949')
df_20221229=pd.read_csv(r"S-DoT_NATURE_2022년(2022.01.03~2023.01.01)\S-DoT_NATURE_2022.12.29-01.01.csv",encoding='cp949')
df_20220101=pd.read_csv(r"S-DoT_NATURE_2021년(2021.01.04~2022.01.02)\S-DoT_NATURE_2021.12.27-01.02.csv",encoding='cp949')

In [5]:
df_20221205=pd.read_csv(r"S-DoT_NATURE_2022년(2022.01.03~2023.01.01)\S-DoT_NATURE_2022.12.05-12.11.csv",encoding='cp949', sep='\t')


In [6]:
df_20221205

Unnamed: 0,기관 명,모델명,시리얼,구분,기온(℃),상대습도( %),풍향(°),풍속(m/s),돌풍 풍향(°),돌풍 풍속(m/s),...,소음(dB),진동_x(g),진동_y(g),진동_z(g),진동_x 최대(g),진동_y 최대(g),진동_z 최대(g),흑구 운도(℃),전송시간,등록일자
0,서울시,SDOT001,OC3CL2000087,1,-3.7,40.0,,,,,...,,,,,,,,,2.022120e+11,2022-12-05 0:00
1,서울시,SDOT001,OC3CL2000088,1,-3.8,40.0,,,,,...,,,,,,,,,2.022120e+11,2022-12-05 0:00
2,서울시,SDOT001,OC3CL2000089,1,-5.4,42.0,,,,,...,,,,,,,,,2.022120e+11,2022-12-05 0:00
3,서울시,SDOT001,OC3CL2000090,1,-4.4,41.0,,,,,...,,,,,,,,,2.022120e+11,2022-12-05 0:00
4,서울시,SDOT001,OC3CL200010,1,-6.3,57.0,,,,,...,35.0,,,,,,,924.0,2.022120e+11,2022-12-05 0:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560029,서울시,SDOT001,V02Q1940955,3,2.7,47.0,,,,,...,40.0,2.0,5.0,101.0,6.0,11.0,108.0,,2.022120e+11,2022-12-11 23:00
560030,서울시,SDOT001,V02Q1941000,3,2.9,50.0,,,,,...,,,,,,,,,2.022120e+11,2022-12-11 23:00
560031,서울시,SDOT001,V02Q1941006,3,2.2,52.0,,,,,...,,,,,,,,,2.022120e+11,2022-12-11 23:00
560032,서울시,SDOT001,V02Q1941013,3,3.8,47.0,,,,,...,,,,,,,,,2.022120e+11,2022-12-11 23:00


In [7]:
df_20221205=df_20221205[['시리얼', '등록일자', '기온(℃)', '상대습도( %)']]
df_20221205

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
0,OC3CL2000087,2022-12-05 0:00,-3.7,40.0
1,OC3CL2000088,2022-12-05 0:00,-3.8,40.0
2,OC3CL2000089,2022-12-05 0:00,-5.4,42.0
3,OC3CL2000090,2022-12-05 0:00,-4.4,41.0
4,OC3CL200010,2022-12-05 0:00,-6.3,57.0
...,...,...,...,...
560029,V02Q1940955,2022-12-11 23:00,2.7,47.0
560030,V02Q1941000,2022-12-11 23:00,2.9,50.0
560031,V02Q1941006,2022-12-11 23:00,2.2,52.0
560032,V02Q1941013,2022-12-11 23:00,3.8,47.0


In [8]:
df_20221212.head()

Unnamed: 0,모델번호,시리얼,측정시간,지역,자치구,행정동,온도 최대(℃),온도 평균(℃),온도 최소(℃),습도 최대(%),...,암모니아 최대(pPM),암모니아 평균(pPM),암모니아 최소(pPM),황화수소 최대(pPM),황화수소 평균(pPM),황화수소 최소(pPM),오존 최대(pPM),오존 평균(pPM),오존 최소(pPM),등록일시
0,SDOT001,OC3CL200010,2022-12-12 0:00,parks,Seoul_Grand_Park,meeting_bridge2,-4.6,-4.8,-4.9,100.0,...,0.0,0.0,0.0,0.001,0.0,0.0,,,,2022-12-12 01:03:04AM
1,SDOT001,OC3CL200011,2022-12-12 0:00,parks,Seoul_Grand_Park,valet_parking1,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,2022-12-12 01:03:04AM
2,SDOT001,OC3CL200012,2022-12-12 0:00,traditional_markets,Gwangjin-gu,Hwayang-dong,2.2,1.7,1.1,52.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,2022-12-12 01:03:04AM
3,SDOT001,OC3CL200013,2022-12-12 0:00,main_street,Jongno-gu,"SAMcheong-dong""",0.1,-0.3,-0.6,60.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,2022-12-12 01:03:04AM
4,SDOT001,OC3CL200014,2022-12-12 0:00,main_street,Jung-gu,Myeong-dong,4.6,4.1,3.9,51.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,2022-12-12 01:03:04AM


In [9]:
# 형식 맞추기
df_20221212.rename(columns={
    '측정시간': '등록일자',
     '온도 평균(℃)' :'기온(℃)',
    '습도 평균(%)':'상대습도( %)'
}, inplace=True)

In [10]:
df_20221212[['시리얼', '등록일자', '기온(℃)', '상대습도( %)']].head()

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
0,OC3CL200010,2022-12-12 0:00,-4.8,100.0
1,OC3CL200011,2022-12-12 0:00,,
2,OC3CL200012,2022-12-12 0:00,1.7,50.0
3,OC3CL200013,2022-12-12 0:00,-0.3,59.0
4,OC3CL200014,2022-12-12 0:00,4.1,50.0


In [11]:
df_20221212=df_20221212[['시리얼', '등록일자', '기온(℃)', '상대습도( %)']]

In [12]:
df_20221229.head()

Unnamed: 0,기관명,모델번호,시리얼,측정시간,지역,자치구,행정동,온도 최대(℃),온도 평균(℃),온도 최소(℃),...,암모니아 최대(ppm),암모니아 평균(ppm),암모니아 최소(ppm),황화수소 최대(ppm),황화수소 평균(ppm),황화수소 최소(ppm),오존 최대(ppm),오존 평균(ppm),오존 최소(ppm),등록일시
0,서울시,SDOT001,V02Q1940541,2022-12-29_12:07:00,residential_area,Jung-gu,Sungin1-dong,-0.5,-0.8,-1.1,...,,,,,,,,,,2022-12-29 12:07
1,서울시,SDOT001,V02Q1940235,2022-12-29_12:07:00,residential_area,Jungnang-gu,Muk2-dong,-0.2,-1.0,-1.9,...,,,,,,,,,,2022-12-29 12:07
2,서울시,SDOT001,V02Q1940254,2022-12-29_12:07:00,residential_area,Jungnang-gu,Sinnae2-dong,-0.8,-1.9,-2.8,...,,,,,,,,,,2022-12-29 12:07
3,서울시,SDOT001,V02Q1940233,2022-12-29_12:07:00,residential_area,Jungnang-gu,Muk1-dong,-1.5,-1.9,-2.1,...,,,,,,,,,,2022-12-29 12:07
4,서울시,SDOT001,V02Q1940172,2022-12-29_12:07:00,roads_and_parks,Jungnang-gu,Myeonmok4-dong,1.3,-0.2,-0.9,...,,,,,,,,,,2022-12-29 12:07


In [13]:
# 형식 맞추기
df_20221229.rename(columns={
    '측정시간': '등록일자',
     '온도 평균(℃)' :'기온(℃)',
    '습도 평균(%)':'상대습도( %)'
}, inplace=True)

In [14]:
df_20221229['등록일자'] = df_20221229['등록일자'].astype(str).str.replace('_', ' ').str[:16]

In [15]:
df_20221229[['시리얼', '등록일자', '기온(℃)', '상대습도( %)']].head()

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
0,V02Q1940541,2022-12-29 12:07,-0.8,35
1,V02Q1940235,2022-12-29 12:07,-1.0,39
2,V02Q1940254,2022-12-29 12:07,-1.9,44
3,V02Q1940233,2022-12-29 12:07,-1.9,41
4,V02Q1940172,2022-12-29 12:07,-0.2,34


In [16]:
df_20221229=df_20221229[['시리얼', '등록일자', '기온(℃)', '상대습도( %)']]

In [17]:
df_20220101.head()

Unnamed: 0,기관 명,모델명,시리얼,구분,기온(℃),상대습도( %),풍향(°),풍속(m/s),돌풍 풍향(°),돌풍 풍속(m/s),...,소음(dB),진동_x(g),진동_y(g),진동_z(g),진동_x 최대(g),진동_y 최대(g),진동_z 최대(g),흑구 운도(℃),전송시간,등록일자
0,서울시,SDOT001,OC3CL200025,1,-11.3,47,,,,,...,49.0,,,,,,,,202112000000.0,2021-12-27 0:07
1,서울시,SDOT001,OC3CL200014,1,-10.2,44,,,,,...,58.0,,,,,,,-11.1,202112000000.0,2021-12-27 0:07
2,서울시,SDOT001,OC3CL200023,1,-10.1,42,,,,,...,59.0,,,,,,,,202112000000.0,2021-12-27 0:07
3,서울시,SDOT001,OC3CL200017,1,-10.2,42,,,,,...,62.0,,,,,,,-11.5,202112000000.0,2021-12-27 0:07
4,서울시,SDOT001,OC3CL200021,1,-10.6,44,,,,,...,63.0,,,,,,,,202112000000.0,2021-12-27 0:07


In [18]:
df_20220101 = df_20220101[df_20220101['등록일자'].str.startswith('2022')]

In [19]:
df_20220101[['시리얼', '등록일자', '기온(℃)', '상대습도( %)']].head()

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
121671,OC3CL200011,2022-01-01 0:07,21.6,85
121672,OC3CL200013,2022-01-01 0:07,-8.2,37
121673,OC3CL200017,2022-01-01 0:07,-6.1,33
121674,OC3CL200016,2022-01-01 0:07,-5.8,33
121675,OC3CL200012,2022-01-01 0:07,-5.9,31


In [20]:
df_20220101=df_20220101[['시리얼', '등록일자', '기온(℃)', '상대습도( %)']]

In [21]:
df_221214=pd.read_csv(r"S-DoT_NATURE_2022.12.14-12.18.csv",encoding='cp949')
df_221214

Unnamed: 0,기관 명,모델명,시리얼,구분,기온(℃),상대습도( %),풍향(°),풍속(m/s),돌풍 풍향(°),돌풍 풍속(m/s),...,진동_x(g),진동_y(g),진동_z(g),진동_x 최대(g),진동_y 최대(g),진동_z 최대(g),흑구 운도(℃),전송시간,등록일자,Unnamed: 22
0,서울시,서울시,SDOT001,OC3CL200008,19.0,-7.5,38.0,,,,...,,,,,,,,19.0,2.022120e+11,2022-12-14 10:07
1,서울시,서울시,SDOT001,OC3CL200008,29.0,-6.5,36.0,,,,...,,,,,,,,29.0,2.022120e+11,2022-12-14 10:07
2,서울시,서울시,SDOT001,OC3CL200011,13.0,,,,,,...,54.0,,,,,,,19.0,2.022120e+11,2022-12-14 10:07
3,서울시,서울시,SDOT001,OC3CL200012,0.0,-6.0,34.0,,,,...,59.0,,,,,,,0.0,2.022120e+11,2022-12-14 10:07
4,서울시,서울시,SDOT001,OC3CL200025,0.0,-9.8,48.0,,,,...,69.0,,,,,,,0.0,2.022120e+11,2022-12-14 10:07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109991,서울시,서울시,SDOT001,V02Q1940953,1.0,-7.1,46.0,,,,...,48.0,0.09,0.02,0.98,0.14,0.11,1.06,3.0,2.022120e+11,2022-12-18 23:07
109992,서울시,서울시,SDOT001,V02Q1940954,3.0,,,,,,...,54.0,0.08,0.10,1.05,0.13,0.15,1.12,5.0,2.022120e+11,2022-12-18 23:07
109993,서울시,서울시,SDOT001,V02Q1941006,1.0,-7.9,51.0,,,,...,,,,,,,,2.0,2.022120e+11,2022-12-18 23:07
109994,서울시,서울시,SDOT001,V02Q1941015,4.0,-7.5,48.0,,,,...,,,,,,,,8.0,2.022120e+11,2022-12-18 23:07


In [22]:
print(df_221214.columns.tolist())

['기관 명', '모델명', '시리얼', '구분', '기온(℃)', '상대습도( %)', '풍향(°)', '풍속(m/s)', '돌풍 풍향(°)', '돌풍 풍속(m/s)', '조도(lux)', '자외선(UVI)', '소음(dB)', '진동_x(g)', '진동_y(g)', '진동_z(g)', '진동_x 최대(g)', '진동_y 최대(g)', '진동_z 최대(g)', '흑구 운도(℃)', '전송시간', '등록일자', 'Unnamed: 22']


In [23]:
# 형식 맞추기
df_221214.rename(columns={
    '시리얼': '모델명',
    '구분': '시리얼',
    '등록일자' :'등록일자x',
     'Unnamed: 22' :'등록일자',
}, inplace=True)

In [24]:
df_221214=df_221214[['시리얼', '등록일자', '기온(℃)', '상대습도( %)']]
df_221214

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
0,OC3CL200008,2022-12-14 10:07,19.0,-7.5
1,OC3CL200008,2022-12-14 10:07,29.0,-6.5
2,OC3CL200011,2022-12-14 10:07,13.0,
3,OC3CL200012,2022-12-14 10:07,0.0,-6.0
4,OC3CL200025,2022-12-14 10:07,0.0,-9.8
...,...,...,...,...
109991,V02Q1940953,2022-12-18 23:07,1.0,-7.1
109992,V02Q1940954,2022-12-18 23:07,3.0,
109993,V02Q1941006,2022-12-18 23:07,1.0,-7.9
109994,V02Q1941015,2022-12-18 23:07,4.0,-7.5


In [25]:
df_220228=pd.read_csv(r"S-DoT_NATURE_2022.02.28-03.06.csv",encoding='cp949')
df_220228.head()

  df_220228=pd.read_csv(r"S-DoT_NATURE_2022.02.28-03.06.csv",encoding='cp949')


Unnamed: 0,기관 명,모델명,시리얼,구분,기온(℃),상대습도( %),풍향(°),풍속(m/s),돌풍 풍향(°),돌풍 풍속(m/s),...,진동_y 최대(g),진동_z 최대(g),흑구 운도(℃),전송시간,등록일자,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,서울시,SDOT001,OC3CL200011,1,21.6,85.0,,,,,...,,,16.0,202202000000.0,2022-02-28 0:07,,,,,
1,서울시,SDOT001,OC3CL200014,1,5.7,89.0,,,,,...,,,6.1,202202000000.0,2022-02-28 0:07,,,,,
2,서울시,SDOT001,OC3CL200013,1,2.7,100.0,,,,,...,,,3.7,202202000000.0,2022-02-28 0:07,,,,,
3,서울시,SDOT001,OC3CL200012,1,5.6,79.0,,,,,...,,,5.3,202202000000.0,2022-02-28 0:07,,,,,
4,서울시,SDOT001,OC3CL200016,1,5.5,86.0,,,,,...,,,5.7,202202000000.0,2022-02-28 0:07,,,,,


In [26]:
# 기관명이 서울시인 데이터
df_seoul = df_220228[df_220228['기관 명'] == '서울시']

# 기관명이 서울시가 아닌 데이터
df_non_seoul = df_220228[df_220228['기관 명'] != '서울시']


In [27]:
df_seoul.head()

Unnamed: 0,기관 명,모델명,시리얼,구분,기온(℃),상대습도( %),풍향(°),풍속(m/s),돌풍 풍향(°),돌풍 풍속(m/s),...,진동_y 최대(g),진동_z 최대(g),흑구 운도(℃),전송시간,등록일자,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,서울시,SDOT001,OC3CL200011,1,21.6,85.0,,,,,...,,,16.0,202202000000.0,2022-02-28 0:07,,,,,
1,서울시,SDOT001,OC3CL200014,1,5.7,89.0,,,,,...,,,6.1,202202000000.0,2022-02-28 0:07,,,,,
2,서울시,SDOT001,OC3CL200013,1,2.7,100.0,,,,,...,,,3.7,202202000000.0,2022-02-28 0:07,,,,,
3,서울시,SDOT001,OC3CL200012,1,5.6,79.0,,,,,...,,,5.3,202202000000.0,2022-02-28 0:07,,,,,
4,서울시,SDOT001,OC3CL200016,1,5.5,86.0,,,,,...,,,5.7,202202000000.0,2022-02-28 0:07,,,,,


In [28]:
df_seoul=df_seoul[['시리얼', '등록일자', '기온(℃)', '상대습도( %)']]
df_seoul

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
0,OC3CL200011,2022-02-28 0:07,21.6,85.0
1,OC3CL200014,2022-02-28 0:07,5.7,89.0
2,OC3CL200013,2022-02-28 0:07,2.7,100.0
3,OC3CL200012,2022-02-28 0:07,5.6,79.0
4,OC3CL200016,2022-02-28 0:07,5.5,86.0
...,...,...,...,...
108287,V02Q1940949,2022-03-06 23:07,3.8,40.0
108288,V02Q1940951,2022-03-06 23:08,3.6,43.0
108289,V02Q1940952,2022-03-06 23:08,4.0,45.0
108290,V02Q1940954,2022-03-06 23:08,4.0,44.0


In [29]:
df_non_seoul.head()

Unnamed: 0,기관 명,모델명,시리얼,구분,기온(℃),상대습도( %),풍향(°),풍속(m/s),돌풍 풍향(°),돌풍 풍속(m/s),...,진동_y 최대(g),진동_z 최대(g),흑구 운도(℃),전송시간,등록일자,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
25599,SDOT001,OC3CL200010,1,56,91.0,,,,,,...,,4.8,52.0,,,,0.0,0.0,,2022-03-05 0:00
25600,SDOT001,OC3CL200011,1,47,85.0,,,,,,...,,16.4,51.0,,,,0.0,0.0,,2022-03-05 0:00
25601,SDOT001,OC3CL200012,1,55,73.0,,,,,,...,,6.4,51.0,,,,0.0,0.0,,2022-03-05 0:00
25602,SDOT001,OC3CL200013,1,51,97.0,,,,,,...,,6.1,46.0,,,,0.0,0.0,,2022-03-05 0:00
25603,SDOT001,OC3CL200014,1,48,83.0,,,,,,...,,6.9,46.0,,,,0.0,0.0,,2022-03-05 0:00


In [30]:
df_non_seoul.columns.tolist()

['기관 명',
 '모델명',
 '시리얼',
 '구분',
 '기온(℃)',
 '상대습도( %)',
 '풍향(°)',
 '풍속(m/s)',
 '돌풍 풍향(°)',
 '돌풍 풍속(m/s)',
 '조도(lux)',
 '자외선(UVI)',
 '소음(dB)',
 '진동_x(g)',
 '진동_y(g)',
 '진동_z(g)',
 '진동_x 최대(g)',
 '진동_y 최대(g)',
 '진동_z 최대(g)',
 '흑구 운도(℃)',
 '전송시간',
 '등록일자',
 'Unnamed: 22',
 'Unnamed: 23',
 'Unnamed: 24',
 'Unnamed: 25',
 'Unnamed: 26']

In [31]:
# 형식 맞추기
df_non_seoul.rename(columns={
    '기관 명': '모델명',
    '시리얼':'시리얼x',
    '모델명': '시리얼',
    '구분' :'기온(℃)',
     '기온(℃)' :'상대습도( %)',
    '상대습도( %)':'상대습도( %)x',
     'Unnamed: 26':'등록일자',
    '등록일자' :'등록일자x'
}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_seoul.rename(columns={


In [32]:
df_non_seoul=df_non_seoul[['시리얼', '등록일자', '기온(℃)', '상대습도( %)']]
df_non_seoul

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
25599,OC3CL200010,2022-03-05 0:00,56,91.0
25600,OC3CL200011,2022-03-05 0:00,47,85.0
25601,OC3CL200012,2022-03-05 0:00,55,73.0
25602,OC3CL200013,2022-03-05 0:00,51,97.0
25603,OC3CL200014,2022-03-05 0:00,48,83.0
...,...,...,...,...
105385,V02Q1940955,2022-03-05 23:00,2,32.0
105386,V02Q1941000,2022-03-05 23:00,0,33.0
105387,V02Q1941006,2022-03-05 23:00,0,37.0
105388,V02Q1941013,2022-03-05 23:00,1,34.0


In [49]:
merged_2022 = pd.concat([merged_2022,df_20220101,df_20221205,df_20221212,df_221214, df_20221229,df_seoul,df_non_seoul], ignore_index=True)

In [50]:
merged_2022

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
0,OC3CL200017,2022-01-03 0:07,-3.0,52.0
1,OC3CL200014,2022-01-03 0:07,-2.4,42.0
2,OC3CL200011,2022-01-03 0:07,21.6,85.0
3,OC3CL200029,2022-01-03 0:07,-1.2,44.0
4,OC3CL200027,2022-01-03 0:07,-4.0,37.0
...,...,...,...,...
10737394,V02Q1940955,2022-03-05 23:00,2.0,32.0
10737395,V02Q1941000,2022-03-05 23:00,0.0,33.0
10737396,V02Q1941006,2022-03-05 23:00,0.0,37.0
10737397,V02Q1941013,2022-03-05 23:00,1.0,34.0


In [51]:
non_datetime_values = merged_2022[~merged_2022['등록일자'].astype(str).str.match(r'\d{4}-\d{2}-\d{2}')]
display(non_datetime_values)


Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)


In [52]:
merged_2022.to_pickle("merged_2022_after.pkl")

In [64]:
merged_2022 = pd.read_pickle("merged_2022_after.pkl")

In [65]:
merged_2022.isnull().sum()


시리얼              0
등록일자             0
기온(℃)       137805
상대습도( %)    126525
dtype: int64

In [66]:
merged_2022[merged_2022.isnull().any(axis=1)]


Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
590691,V02Q1940756,2022-01-27 9:07,,
591776,V02Q1940756,2022-01-27 10:07,,
592852,V02Q1940756,2022-01-27 11:07,,
593952,V02Q1940756,2022-01-27 12:07,,
595026,V02Q1940756,2022-01-27 13:07,,
...,...,...,...,...
10628937,V02Q1940059,2022-12-31 11:07,,xxx
10628990,V02Q1940954,2022-12-31 11:07,,xxx
10628992,V02Q1940384,2022-12-31 11:07,,xxx
10628997,V02Q1940954,2022-12-31 11:07,,xxx


In [67]:
# 온도/습도 평균이 모두 null인 시리얼별 합산
null_only_serials = (
    merged_2022.groupby("시리얼")[['기온(℃)', '상대습도( %)']]
    .apply(lambda x: x.isnull().all().all())
)

# True인 시리얼만 필터링
null_only_serials = null_only_serials[null_only_serials].index.tolist()


In [68]:
null_only_serials

['OT3CL110000']

In [69]:
# 널값만 있는 시리얼 제거
cleaned_df_2022 = merged_2022[~merged_2022['시리얼'].isin(null_only_serials)]
cleaned_df_2022

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
0,OC3CL200017,2022-01-03 0:07,-3.0,52.0
1,OC3CL200014,2022-01-03 0:07,-2.4,42.0
2,OC3CL200011,2022-01-03 0:07,21.6,85.0
3,OC3CL200029,2022-01-03 0:07,-1.2,44.0
4,OC3CL200027,2022-01-03 0:07,-4.0,37.0
...,...,...,...,...
10737394,V02Q1940955,2022-03-05 23:00,2.0,32.0
10737395,V02Q1941000,2022-03-05 23:00,0.0,33.0
10737396,V02Q1941006,2022-03-05 23:00,0.0,37.0
10737397,V02Q1941013,2022-03-05 23:00,1.0,34.0


In [70]:
cleaned_df_2022.isnull().sum()

시리얼              0
등록일자             0
기온(℃)       137804
상대습도( %)    126524
dtype: int64

In [71]:
# 1. 수치형 변환 (기온, 습도 컬럼을 문자열 → 숫자)
cleaned_df_2022['기온(℃)'] = pd.to_numeric(cleaned_df_2022['기온(℃)'], errors='coerce')
cleaned_df_2022['상대습도( %)'] = pd.to_numeric(cleaned_df_2022['상대습도( %)'], errors='coerce')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df_2022['기온(℃)'] = pd.to_numeric(cleaned_df_2022['기온(℃)'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df_2022['상대습도( %)'] = pd.to_numeric(cleaned_df_2022['상대습도( %)'], errors='coerce')


In [72]:
# 2. 날짜 컬럼이 없다면 추가 (예: 등록일자에서 날짜만 추출)
cleaned_df_2022['날짜'] = pd.to_datetime(cleaned_df_2022['등록일자'], format="%Y-%m-%d %H:%M", errors='coerce').dt.date



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df_2022['날짜'] = pd.to_datetime(cleaned_df_2022['등록일자'], format="%Y-%m-%d %H:%M", errors='coerce').dt.date


In [73]:
# 3. 그룹별 평균 계산 (시리얼 + 날짜)
group_avg = cleaned_df_2022.groupby(['시리얼', '날짜'])[['기온(℃)', '상대습도( %)']].transform('mean')



In [74]:
group_avg

Unnamed: 0,기온(℃),상대습도( %)
0,-1.608696,61.347826
1,-0.556522,51.347826
2,21.600000,85.000000
3,-0.386957,52.391304
4,-2.039130,50.434783
...,...,...
10737394,7.263889,34.861111
10737395,2.583333,33.972222
10737396,1.388889,36.708333
10737397,3.333333,33.888889


In [75]:
# 4. 결측값 채우기
cleaned_df_2022['기온(℃)'] = cleaned_df_2022['기온(℃)'].fillna(group_avg['기온(℃)'])
cleaned_df_2022['상대습도( %)'] = cleaned_df_2022['상대습도( %)'].fillna(group_avg['상대습도( %)'])

# 5. 필요 없다면 날짜 컬럼 삭제
cleaned_df_2022.drop(columns='날짜', inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df_2022['기온(℃)'] = cleaned_df_2022['기온(℃)'].fillna(group_avg['기온(℃)'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df_2022['상대습도( %)'] = cleaned_df_2022['상대습도( %)'].fillna(group_avg['상대습도( %)'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df_2022.drop(columns='날짜', inplace=True)


In [76]:
cleaned_df_2022

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
0,OC3CL200017,2022-01-03 0:07,-3.0,52.0
1,OC3CL200014,2022-01-03 0:07,-2.4,42.0
2,OC3CL200011,2022-01-03 0:07,21.6,85.0
3,OC3CL200029,2022-01-03 0:07,-1.2,44.0
4,OC3CL200027,2022-01-03 0:07,-4.0,37.0
...,...,...,...,...
10737394,V02Q1940955,2022-03-05 23:00,2.0,32.0
10737395,V02Q1941000,2022-03-05 23:00,0.0,33.0
10737396,V02Q1941006,2022-03-05 23:00,0.0,37.0
10737397,V02Q1941013,2022-03-05 23:00,1.0,34.0


In [77]:
group_avg.isnull().sum()

기온(℃)       132614
상대습도( %)    135428
dtype: int64

In [78]:
cleaned_df_2022.isnull().sum()

시리얼              0
등록일자             0
기온(℃)       132614
상대습도( %)    135428
dtype: int64

In [79]:
null_rows_2022 =cleaned_df_2022[cleaned_df_2022.isnull().any(axis=1)]
null_rows_2022

Unnamed: 0,시리얼,등록일자,기온(℃),상대습도( %)
606991,V02Q1940756,2022-01-28 0:07,,
608095,V02Q1940756,2022-01-28 1:07,,
608720,V02Q1940756,2022-01-28 3:07,,
609696,V02Q1940756,2022-01-28 4:07,,
610776,V02Q1940756,2022-01-28 5:07,,
...,...,...,...,...
10628937,V02Q1940059,2022-12-31 11:07,,
10628990,V02Q1940954,2022-12-31 11:07,,
10628992,V02Q1940384,2022-12-31 11:07,,
10628997,V02Q1940954,2022-12-31 11:07,,


In [81]:
null_by_serial = (
    null_rows_2022
    .groupby("시리얼")[['기온(℃)', '상대습도( %)']]
    .apply(lambda x: x.isnull().sum())
)

# 총 결측 수 계산
null_by_serial['총 결측 수'] = null_by_serial.sum(axis=1)

# 정렬
null_by_serial = null_by_serial.sort_values(by='총 결측 수', ascending=False)
null_by_serial

Unnamed: 0_level_0,기온(℃),상대습도( %),총 결측 수
시리얼,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OC3CL200011,7723,8023,15746
OC3CL200017,5781,6075,11856
V02Q1940946,5741,6047,11788
V02Q1940423,5746,6040,11786
V02Q1940059,5733,6027,11760
V02Q1940700,5727,6027,11754
V02Q1940954,5705,5996,11701
V02Q1940703,5693,5993,11686
V02Q1940568,5662,5962,11624
OC3CL200126,5251,5545,10796


In [82]:
# 시리얼별 전체 측정 횟수
total_count_by_serial = cleaned_df_2022['시리얼'].value_counts().sort_index()

# 결측률 (%) 계산
partial_nulls = null_by_serial.copy()
partial_nulls['총 행 수'] = total_count_by_serial* 2
partial_nulls['결측률 (%)'] = (partial_nulls['총 결측 수'] / (partial_nulls['총 행 수'])) * 100


In [83]:
partial_nulls

Unnamed: 0_level_0,기온(℃),상대습도( %),총 결측 수,총 행 수,결측률 (%)
시리얼,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
OC3CL200011,7723,8023,15746,19688,79.977651
OC3CL200017,5781,6075,11856,19722,60.115607
V02Q1940946,5741,6047,11788,19622,60.075426
V02Q1940423,5746,6040,11786,19504,60.42863
V02Q1940059,5733,6027,11760,19582,60.055153
V02Q1940700,5727,6027,11754,19528,60.190496
V02Q1940954,5705,5996,11701,19714,59.353759
V02Q1940703,5693,5993,11686,19426,60.156491
V02Q1940568,5662,5962,11624,19498,59.616371
OC3CL200126,5251,5545,10796,19458,55.483606


In [84]:
serial_null_2022=partial_nulls.sort_values("결측률 (%)",ascending=False)
serial_null_2022

Unnamed: 0_level_0,기온(℃),상대습도( %),총 결측 수,총 행 수,결측률 (%)
시리얼,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
OC3CL200011,7723,8023,15746,19688,79.977651
V02Q1940856,4932,5226,10158,16492,61.5935
V02Q1940423,5746,6040,11786,19504,60.42863
V02Q1940700,5727,6027,11754,19528,60.190496
V02Q1940703,5693,5993,11686,19426,60.156491
OC3CL200017,5781,6075,11856,19722,60.115607
V02Q1940946,5741,6047,11788,19622,60.075426
V02Q1940059,5733,6027,11760,19582,60.055153
OC3CL210100,4991,5294,10285,17156,59.949872
V02Q1940568,5662,5962,11624,19498,59.616371


In [85]:
serial_null_2022.to_pickle("serial_null_2022.pkl")

In [86]:
cleaned_df_2022.to_pickle("cleaned_df_2022.pkl")

In [58]:
filtered_df = merged_2023[merged_2023['시리얼'].isin("OC3CL210100")]
filtered_df

Unnamed: 0,시리얼,측정시간,자치구,행정동,온도 평균(℃),습도 평균(%)
246,OC3CL210100,2023-01-01_01:03:03,Gangdong-gu,Amsa1(il)-dong,,
1340,OC3CL210100,2023-01-01_02:03:05,Gangdong-gu,Amsa1(il)-dong,,
2432,OC3CL210100,2023-01-01_03:03:05,Gangdong-gu,Amsa1(il)-dong,,
3526,OC3CL210100,2023-01-01_04:03:05,Gangdong-gu,Amsa1(il)-dong,,
4621,OC3CL210100,2023-01-01_05:03:05,Gangdong-gu,Amsa1(il)-dong,,
...,...,...,...,...,...,...
7627818,OC3CL210100,2023-12-31_07:00:00,Gangdong-gu,Amsa1-dong,,
7628836,OC3CL210100,2023-12-31_08:00:00,Gangdong-gu,Amsa1-dong,,
7629855,OC3CL210100,2023-12-31_09:00:00,Gangdong-gu,Amsa1-dong,,
7630875,OC3CL210100,2023-12-31_10:00:00,Gangdong-gu,Amsa1-dong,,
