In [1]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

In [2]:
def horizontal_average(dataframe, datetime, column, threshold = 10, digits = 1):
    '''
    동일한 시간의 여러 지점 데이터를 확인한 후 해당 데이터들의 평균값을 반환하는 함수
    threshold 값을 입력하여 허용하는 NaN 값의 최대치를 설정할 수 있음
    '''  
    # 입력된 일시와 컬럼명에 해당하는 데이터 변수화
    data = dataframe.loc[datetime, column]
    
    # 임시로 nan값 입력
    mean = np.nan
    
    # data에 NaN값의 수가 threshold를 초과하는지 확인
    if data.isna().sum() <= threshold:
        # nan값을 제외한 값으로만 리스트 생성
        value_list = [i for i in data if not pd.isna(i)]
        
        # 평균값 산출 이후 반올림
        mean = np.mean(value_list)
        mean = round(mean, digits)
               
    return mean

In [3]:
def average_angle_ignore_nan(degrees):
    """
    주어진 각도의 리스트에서 NaN 값을 무시하고 평균을 계산합니다.
    
    :param degrees: 각도의 리스트 (0-360도)
    :return: 평균 각도 (0-360도)
    """
    # NaN 값을 무시하고 유효한 각도만 선택
    valid_degrees = [deg for deg in degrees if not pd.isna(deg)]
    
    if not valid_degrees:
        return np.nan  # 유효한 각도가 없는 경우 NaN 반환
    
    # 유효한 각도를 라디안으로 변환
    radians = np.deg2rad(valid_degrees)
    
    # x, y 좌표 계산
    x_coords = np.cos(radians)
    y_coords = np.sin(radians)
    
    # x, y 좌표의 평균 계산
    x_mean = np.mean(x_coords)
    y_mean = np.mean(y_coords)
    
    # 평균 좌표를 각도로 변환
    mean_rad = np.arctan2(y_mean, x_mean)
    mean_deg = np.rad2deg(mean_rad)
    
    # 결과를 0-360도 사이의 값으로 변환
    mean_deg = mean_deg % 360
    if mean_deg > 360:
        mean_deg = mean_deg - 360
    
    return round(mean_deg, -1)

# 데이터 불러오기

In [4]:
df = pd.read_csv("./데이터/2015~2024_국내기상데이터.csv")
df.head()

Unnamed: 0,지점,지점명,일시,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),지면온도(°C)
0,90,속초,2015-01-01 00:00,0.0,0.0,3.5,290.0,36.0,1.5,-18.05,1014.25,1016.55,-3.65
1,90,속초,2015-01-01 01:00,-5.0,0.0,3.9,270.0,36.0,1.5,-17.8,1014.1,1016.4,-3.4
2,90,속초,2015-01-01 02:00,-5.6,0.0,2.7,320.0,36.0,1.5,-18.3,1014.4,1016.7,-3.9
3,90,속초,2015-01-01 03:00,-6.2,0.0,2.1,270.0,37.0,1.4,-18.5,1014.9,1017.2,-4.3
4,90,속초,2015-01-01 04:00,-6.5,0.0,1.7,230.0,35.0,1.3,-19.4,1014.7,1017.0,-4.7


# 데이터 확인

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7816071 entries, 0 to 7816070
Data columns (total 13 columns):
 #   Column     Dtype  
---  ------     -----  
 0   지점         int64  
 1   지점명        object 
 2   일시         object 
 3   기온(°C)     float64
 4   강수량(mm)    float64
 5   풍속(m/s)    float64
 6   풍향(16방위)   float64
 7   습도(%)      float64
 8   증기압(hPa)   float64
 9   이슬점온도(°C)  float64
 10  현지기압(hPa)  float64
 11  해면기압(hPa)  float64
 12  지면온도(°C)   float64
dtypes: float64(10), int64(1), object(2)
memory usage: 775.2+ MB


In [6]:
df["지점"].value_counts()

지점
112    82248
133    82248
159    82248
156    82248
108    82248
       ...  
296    11879
181     9240
176     3890
187        4
116        1
Name: count, Length: 101, dtype: int64

In [7]:
df["지점"].value_counts().sort_values()[:9]

지점
116        1
187        4
176     3890
181     9240
296    11879
175    40216
239    43573
93     66911
177    74833
Name: count, dtype: int64

In [8]:
df["지점"].value_counts().sort_values()[:9].sum()

250547

- 한 지점당 최대 82248개의 행을 가지고 있음
    - 80000개의 행 미만으로 가지고있는 지점은 총 9개

In [9]:
idx = df["지점"].value_counts().sort_values()[:9].index

In [10]:
for i in idx:
    dfi = df.loc[df["지점"] == i]
    print(dfi["지점명"].value_counts())

지점명
관악산    1
Name: count, dtype: int64
지점명
성산    4
Name: count, dtype: int64
지점명
대구(기)    3890
Name: count, dtype: int64
지점명
서청주    9240
Name: count, dtype: int64
지점명
북부산    11879
Name: count, dtype: int64
지점명
진도(첨찰산)    40216
Name: count, dtype: int64
지점명
세종    43573
Name: count, dtype: int64
지점명
북춘천    66911
Name: count, dtype: int64
지점명
홍성    74833
Name: count, dtype: int64


In [11]:
df.loc[df["지점명"] == "홍성"]

Unnamed: 0,지점,지점명,일시,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),지면온도(°C)
710867,177,홍성,2015-11-03 18:00,14.4,0.0,0.4,0.0,61.0,9.9,6.9,1026.0,1028.4,11.8
710868,177,홍성,2015-11-03 19:00,10.8,0.0,0.2,0.0,81.0,10.4,7.6,1026.3,1028.8,10.2
710869,177,홍성,2015-11-03 20:00,9.2,0.0,1.7,160.0,88.0,10.2,7.3,1026.6,1029.1,8.9
710870,177,홍성,2015-11-03 21:00,7.3,0.0,0.1,0.0,94.0,9.5,6.3,1026.8,1029.3,8.0
710871,177,홍성,2015-11-03 22:00,6.5,0.0,0.0,0.0,95.0,9.2,5.7,1027.2,1029.7,7.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7790530,177,홍성,2024-05-19 19:00,23.1,0.0,1.9,230.0,50.0,14.1,12.1,1010.8,1014.0,22.7
7790531,177,홍성,2024-05-19 20:00,21.8,0.0,1.0,200.0,51.0,13.3,11.2,1011.5,1014.7,19.9
7790532,177,홍성,2024-05-19 21:00,20.3,0.0,1.9,230.0,61.0,14.5,12.5,1012.5,1015.7,18.5
7790533,177,홍성,2024-05-19 22:00,18.9,0.0,1.6,360.0,71.0,15.5,13.5,1013.3,1016.5,17.7


- 9개 지점을 확인해 보았을 때 15년 데이터가 제대로 들어가있지 않으며 데이터가 적음
    - 9개 지점 삭제

In [12]:
df.loc[df["지점"].isin(idx)]

Unnamed: 0,지점,지점명,일시,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),지면온도(°C)
29016,175,진도(첨찰산),2015-01-01 00:00,0.0,0.0,11.3,290.0,76.5,3.45,-7.875,987.35,1027.825,-2.3
29017,175,진도(첨찰산),2015-01-01 01:00,-4.0,0.0,8.3,320.0,95.0,4.30,-4.600,962.60,1022.100,-0.2
29018,175,진도(첨찰산),2015-01-01 02:00,-5.0,0.0,7.4,340.0,95.0,4.00,-5.600,962.90,1022.700,-0.1
29019,175,진도(첨찰산),2015-01-01 03:00,-5.3,0.9,7.4,340.0,94.0,3.90,-6.100,963.50,1023.400,-0.1
29020,175,진도(첨찰산),2015-01-01 04:00,-5.8,0.0,7.0,340.0,93.0,3.70,-6.700,963.60,1023.600,-0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7816066,296,북부산,2024-05-19 19:00,23.1,0.0,0.8,200.0,54.0,15.20,13.200,1012.80,1013.100,20.7
7816067,296,북부산,2024-05-19 20:00,21.1,0.0,1.0,320.0,59.0,14.70,12.700,1013.30,1013.600,18.6
7816068,296,북부산,2024-05-19 21:00,19.3,0.0,1.3,340.0,56.0,12.40,10.200,1013.90,1014.200,17.2
7816069,296,북부산,2024-05-19 22:00,18.3,0.0,1.2,360.0,60.0,12.50,10.300,1014.00,1014.400,16.1


In [13]:
df92 = df.drop(df.loc[df["지점"].isin(idx)].index, axis = 0)
df92 = df92.reset_index(drop = True)
df92

Unnamed: 0,지점,지점명,일시,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),지면온도(°C)
0,90,속초,2015-01-01 00:00,0.0,0.0,3.5,290.0,36.0,1.5,-18.05,1014.25,1016.55,-3.65
1,90,속초,2015-01-01 01:00,-5.0,0.0,3.9,270.0,36.0,1.5,-17.80,1014.10,1016.40,-3.40
2,90,속초,2015-01-01 02:00,-5.6,0.0,2.7,320.0,36.0,1.5,-18.30,1014.40,1016.70,-3.90
3,90,속초,2015-01-01 03:00,-6.2,0.0,2.1,270.0,37.0,1.4,-18.50,1014.90,1017.20,-4.30
4,90,속초,2015-01-01 04:00,-6.5,0.0,1.7,230.0,35.0,1.3,-19.40,1014.70,1017.00,-4.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7565519,295,남해,2024-05-19 19:00,24.5,0.0,1.3,340.0,39.0,11.9,9.60,1007.40,1012.60,23.20
7565520,295,남해,2024-05-19 20:00,22.7,0.0,0.9,360.0,42.0,11.6,9.10,1007.80,1013.10,19.90
7565521,295,남해,2024-05-19 21:00,21.1,0.0,0.6,250.0,49.0,12.2,9.90,1008.40,1013.70,18.60
7565522,295,남해,2024-05-19 22:00,23.1,0.0,1.8,290.0,43.0,12.1,9.80,1008.40,1013.60,18.50


In [14]:
df92.isna().sum()

지점           0
지점명          0
일시           0
기온(°C)       0
강수량(mm)      0
풍속(m/s)      0
풍향(16방위)     0
습도(%)        0
증기압(hPa)     0
이슬점온도(°C)    0
현지기압(hPa)    0
해면기압(hPa)    0
지면온도(°C)     0
dtype: int64

In [15]:
# 19개 지점만 사용
"""
백령도 102, 서울 108, 강화 201, 수원 119, 속초 90, 강릉 105,
충주 127, 천안 232, 대전 133, 부산 159, 울산 152, 구미 279,
광주 156, 목포 165, 여수 168, 제주184, 춘천 101, 영덕 277, 군산 140
"""
# 사용할 지점 리스트
re_list = [102, 108, 201, 119, 90, 105, 127, 232, 133, 159, 152, 279, 156, 165, 168, 184, 101, 277, 140]

In [16]:
df19 = df92.loc[df92["지점"].isin(re_list)]
df19 = df19.reset_index(drop = True)
df19

Unnamed: 0,지점,지점명,일시,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),지면온도(°C)
0,90,속초,2015-01-01 00:00,0.0,0.0,3.5,290.0,36.0,1.5,-18.05,1014.25,1016.55,-3.65
1,90,속초,2015-01-01 01:00,-5.0,0.0,3.9,270.0,36.0,1.5,-17.80,1014.10,1016.40,-3.40
2,90,속초,2015-01-01 02:00,-5.6,0.0,2.7,320.0,36.0,1.5,-18.30,1014.40,1016.70,-3.90
3,90,속초,2015-01-01 03:00,-6.2,0.0,2.1,270.0,37.0,1.4,-18.50,1014.90,1017.20,-4.30
4,90,속초,2015-01-01 04:00,-6.5,0.0,1.7,230.0,35.0,1.3,-19.40,1014.70,1017.00,-4.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562671,279,구미,2024-05-19 19:00,25.5,0.0,1.1,270.0,40.0,12.9,10.80,1006.70,1012.30,26.30
1562672,279,구미,2024-05-19 20:00,24.2,0.0,1.5,250.0,44.0,13.2,11.10,1007.30,1012.90,23.40
1562673,279,구미,2024-05-19 21:00,23.0,0.0,1.6,270.0,47.0,13.1,11.00,1008.30,1013.90,21.80
1562674,279,구미,2024-05-19 22:00,22.7,0.0,1.4,270.0,46.0,12.6,10.40,1008.60,1014.30,20.80


# 지점별 3시간씩 데이터 결합

In [17]:
# 3시간씩 결합하기 위해 일시컬럼 datetime으로 타입변경
df19["일시"] = pd.to_datetime(df19["일시"])

In [18]:
df19["지점"].unique()

array([ 90, 101, 102, 105, 108, 119, 127, 133, 140, 152, 156, 159, 165,
       168, 184, 201, 232, 277, 279], dtype=int64)

In [19]:
# 각 지점별로 나눠주기(시간 데이터프레임과 결합하기 위하여)
df_list = []
for name in df19["지점"].unique():
    df_name = df19.loc[df19["지점"] == name]
    df_list.append(df_name)

In [20]:
df_list

[         지점 지점명                  일시  기온(°C)  강수량(mm)  풍속(m/s)  풍향(16방위)  \
 0        90  속초 2015-01-01 00:00:00     0.0      0.0      3.5     290.0   
 1        90  속초 2015-01-01 01:00:00    -5.0      0.0      3.9     270.0   
 2        90  속초 2015-01-01 02:00:00    -5.6      0.0      2.7     320.0   
 3        90  속초 2015-01-01 03:00:00    -6.2      0.0      2.1     270.0   
 4        90  속초 2015-01-01 04:00:00    -6.5      0.0      1.7     230.0   
 ...      ..  ..                 ...     ...      ...      ...       ...   
 1554463  90  속초 2024-05-19 19:00:00    16.4      0.0      1.5      70.0   
 1554464  90  속초 2024-05-19 20:00:00    15.6      0.0      1.0     140.0   
 1554465  90  속초 2024-05-19 21:00:00    15.0      0.0      0.1       0.0   
 1554466  90  속초 2024-05-19 22:00:00    14.2      0.0      1.6     270.0   
 1554467  90  속초 2024-05-19 23:00:00    13.6      0.0      1.6     270.0   
 
          습도(%)  증기압(hPa)  이슬점온도(°C)  현지기압(hPa)  해면기압(hPa)  지면온도(°C)  
 0         36.0

In [21]:
# timedelta 함수를 사용하여 요일과시간에따라 묶을 구룹 컬럼 정리

# 15년 1월 1일 0시부터 24년 5월 19일 23시까지 82248 시간이므로 range를 82248까지
time_list = [df19.loc[0, "일시"] + (timedelta(hours = 1) * i) for i in range(0, 82248)]
time_df = pd.DataFrame(time_list)
time_df.columns = ["일시"]
time_df["group"] = (time_df.index + 2) // 3 # 3시간씩 묶기위한 group 생성
time_df

Unnamed: 0,일시,group
0,2015-01-01 00:00:00,0
1,2015-01-01 01:00:00,1
2,2015-01-01 02:00:00,1
3,2015-01-01 03:00:00,1
4,2015-01-01 04:00:00,2
...,...,...
82243,2024-05-19 19:00:00,27415
82244,2024-05-19 20:00:00,27415
82245,2024-05-19 21:00:00,27415
82246,2024-05-19 22:00:00,27416


In [22]:
# 지점별로 시간 데이터프레임과 merge후 리스트에 저장

concat_list = []
for dfi in df_list:
    concat_dfi = pd.merge(time_df, dfi, how = "left", on = "일시")
    concat_dfi["지점"] = int(list(dfi["지점"])[0])
    concat_dfi["지점명"] = list(dfi["지점명"])[0]
    concat_list.append(concat_dfi)

In [23]:
# 시간 데이터프레임과 결합된 지점들 concat
total_df = pd.concat(concat_list, ignore_index = True)

In [24]:
total_df

Unnamed: 0,일시,group,지점,지점명,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),증기압(hPa),이슬점온도(°C),현지기압(hPa),해면기압(hPa),지면온도(°C)
0,2015-01-01 00:00:00,0,90,속초,0.0,0.0,3.5,290.0,36.0,1.5,-18.05,1014.25,1016.55,-3.65
1,2015-01-01 01:00:00,1,90,속초,-5.0,0.0,3.9,270.0,36.0,1.5,-17.80,1014.10,1016.40,-3.40
2,2015-01-01 02:00:00,1,90,속초,-5.6,0.0,2.7,320.0,36.0,1.5,-18.30,1014.40,1016.70,-3.90
3,2015-01-01 03:00:00,1,90,속초,-6.2,0.0,2.1,270.0,37.0,1.4,-18.50,1014.90,1017.20,-4.30
4,2015-01-01 04:00:00,2,90,속초,-6.5,0.0,1.7,230.0,35.0,1.3,-19.40,1014.70,1017.00,-4.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562707,2024-05-19 19:00:00,27415,279,구미,25.5,0.0,1.1,270.0,40.0,12.9,10.80,1006.70,1012.30,26.30
1562708,2024-05-19 20:00:00,27415,279,구미,24.2,0.0,1.5,250.0,44.0,13.2,11.10,1007.30,1012.90,23.40
1562709,2024-05-19 21:00:00,27415,279,구미,23.0,0.0,1.6,270.0,47.0,13.1,11.00,1008.30,1013.90,21.80
1562710,2024-05-19 22:00:00,27416,279,구미,22.7,0.0,1.4,270.0,46.0,12.6,10.40,1008.60,1014.30,20.80


In [25]:
# 지점과 group으로 그룹바이하여 지점별로3시간씩 결합

group_df = pd.DataFrame()
group_df["지점"] = total_df.groupby(["지점", "group"])["지점"].max()
group_df["지점명"] = total_df.groupby(["지점", "group"])["지점명"].value_counts().index.map(lambda x: x[2])
group_df["일시"] = total_df.groupby(["지점", "group"])["일시"].max() # 일시는 3개씩 묶였을때 마지막 시간이 나와야해서 max사용
group_df["기온"] = round(total_df.groupby(["지점", "group"])["기온(°C)"].mean(), 1)
group_df["강수량"] = total_df.groupby(["지점", "group"])["강수량(mm)"].sum() # 강수량은 3시간의 합산이므로 sum 사용
group_df["풍속"] = round(total_df.groupby(["지점", "group"])["풍속(m/s)"].mean(), 1)

# 풍향은 0~350까지 10단위의 방위임으로 위에 작성 범위를 라디안으로 변환한뒤 구하는 함수를 사용
group_df["풍향"] = round(total_df.groupby(["지점", "group"])["풍향(16방위)"].apply(average_angle_ignore_nan), -1)
group_df["습도"] = round(total_df.groupby(["지점", "group"])["습도(%)"].mean(), 0)
group_df["증기압"] = round(total_df.groupby(["지점", "group"])["증기압(hPa)"].mean(), 1)
group_df["이슬점온도"] = round(total_df.groupby(["지점", "group"])["이슬점온도(°C)"].mean(), 1)
group_df["현지기압"] = round(total_df.groupby(["지점", "group"])["현지기압(hPa)"].mean(), 1)
group_df["해면기압"] = round(total_df.groupby(["지점", "group"])["해면기압(hPa)"].mean(), 1)
group_df["지면온도"] = round(total_df.groupby(["지점", "group"])["지면온도(°C)"].mean(), 1)

group_df = group_df.reset_index(drop = True)
group_df

Unnamed: 0,지점,지점명,일시,기온,강수량,풍속,풍향,습도,증기압,이슬점온도,현지기압,해면기압,지면온도
0,90,속초,2015-01-01 00:00:00,0.0,0.0,3.5,290.0,36.0,1.5,-18.05,1014.25,1016.55,-3.65
1,90,속초,2015-01-01 03:00:00,-5.6,0.0,2.9,290.0,36.0,1.5,-18.20,1014.47,1016.77,-3.87
2,90,속초,2015-01-01 06:00:00,-6.8,0.0,2.0,240.0,35.0,1.3,-19.77,1014.63,1016.97,-5.07
3,90,속초,2015-01-01 09:00:00,-7.1,0.0,1.3,270.0,33.0,1.2,-20.60,1016.17,1018.53,-5.43
4,90,속초,2015-01-01 12:00:00,-3.5,0.0,2.2,280.0,26.0,1.2,-20.33,1016.77,1019.07,-1.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...
520918,279,구미,2024-05-19 12:00:00,26.9,0.0,1.2,330.0,39.0,13.7,11.60,1009.63,1015.20,43.33
520919,279,구미,2024-05-19 15:00:00,29.9,0.0,1.0,330.0,30.0,12.7,10.53,1007.33,1012.83,49.73
520920,279,구미,2024-05-19 18:00:00,29.8,0.0,1.2,310.0,28.0,11.7,9.27,1006.17,1011.67,38.37
520921,279,구미,2024-05-19 21:00:00,24.2,0.0,1.4,260.0,44.0,13.1,10.97,1007.43,1013.03,23.83


In [26]:
group_df.isna().sum()

지점       0
지점명      0
일시       0
기온       1
강수량      1
풍속       1
풍향       1
습도       1
증기압      1
이슬점온도    1
현지기압     1
해면기압     1
지면온도     1
dtype: int64

- 3시간단위로 묶기전에 없던 결측치가 생김
    - 3시간단위로 묶으면서 일부지점에 없는 시간대가 생기면서 결측치가 만들어짐
        - 결측 컬럼 250들은 각각 동일한 행의 결측

In [27]:
group_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520923 entries, 0 to 520922
Data columns (total 13 columns):
 #   Column  Non-Null Count   Dtype         
---  ------  --------------   -----         
 0   지점      520923 non-null  int64         
 1   지점명     520923 non-null  object        
 2   일시      520923 non-null  datetime64[ns]
 3   기온      520922 non-null  float64       
 4   강수량     520922 non-null  float64       
 5   풍속      520922 non-null  float64       
 6   풍향      520922 non-null  float64       
 7   습도      520922 non-null  float64       
 8   증기압     520922 non-null  float64       
 9   이슬점온도   520922 non-null  float64       
 10  현지기압    520922 non-null  float64       
 11  해면기압    520922 non-null  float64       
 12  지면온도    520922 non-null  float64       
dtypes: datetime64[ns](1), float64(10), int64(1), object(1)
memory usage: 51.7+ MB


In [28]:
# 같은지점에 같은시간이 들어가있는지 확인
group_df.duplicated(["지점", "일시"]).sum()

0

In [29]:
group_df.loc[group_df["기온"].isna()]

Unnamed: 0,지점,지점명,일시,기온,강수량,풍속,풍향,습도,증기압,이슬점온도,현지기압,해면기압,지면온도
174289,127,충주,2018-05-08 09:00:00,,,,,,,,,,


- 지점, 일시 중복 행 없음

In [30]:
# 3시간씩 묶었을 때 나와야하는 행 수
3427 * 8 * 19 + 19

520923

# 00시 23시를 제외한 나머지 묶이면 안되는 시간이 있는지 확인

In [31]:
drop_time_list = [0]

In [32]:
a = range(25)

In [33]:
for i in a:
    if (i % 3) != 0:
        drop_time_list.append(i)

In [34]:
drop_time_list

[0, 1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20, 22, 23]

In [35]:
drop_time_list[1:-1]

[1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20, 22]

In [36]:
group_df.loc[group_df["일시"].map(lambda x : x.hour in drop_time_list[1:-1])]

Unnamed: 0,지점,지점명,일시,기온,강수량,풍속,풍향,습도,증기압,이슬점온도,현지기압,해면기압,지면온도


# 24년 5월 19일 23시 - 총 19개

- 위 시간대에 외에 들어있는 23시 확인

In [37]:
group_df.loc[group_df["일시"] == "2015-01-01 00:00:00"]

Unnamed: 0,지점,지점명,일시,기온,강수량,풍속,풍향,습도,증기압,이슬점온도,현지기압,해면기압,지면온도
0,90,속초,2015-01-01,0.0,0.0,3.5,290.0,36.0,1.5,-18.05,1014.25,1016.55,-3.65
27417,101,춘천,2015-01-01,0.0,0.0,3.2,250.0,55.0,1.7,-16.4,973.95,1027.92,-3.72
54834,102,백령도,2015-01-01,0.0,0.0,8.7,290.0,62.0,2.7,-11.35,1014.85,1029.28,-1.95
82251,105,강릉,2015-01-01,0.0,0.0,3.6,340.0,45.0,2.0,-15.52,1017.18,1024.0,-1.3
109668,108,서울,2015-01-01,0.0,0.0,6.1,290.0,51.0,2.4,-14.15,1018.9,1027.07,-4.32
137085,119,수원,2015-01-01,0.0,0.0,4.2,320.0,44.0,2.1,-14.78,1010.95,1027.35,-3.32
164502,127,충주,2015-01-01,0.0,0.0,5.6,290.0,48.0,1.8,-16.2,1004.9,1027.92,-1.2
191919,133,대전,2015-01-01,0.0,0.3,4.2,320.0,61.0,2.6,-11.4,1020.65,1028.85,-1.12
219336,140,군산,2015-01-01,0.0,0.2,9.1,270.0,53.0,2.7,-10.8,1026.1,1027.75,-3.08
246753,152,울산,2015-01-01,0.0,0.0,6.5,290.0,55.0,2.8,-10.75,1020.3,1026.0,-3.12


In [38]:
group_df.loc[group_df["일시"].map(lambda x : x.hour in [23])]

Unnamed: 0,지점,지점명,일시,기온,강수량,풍속,풍향,습도,증기압,이슬점온도,현지기압,해면기압,지면온도
27416,90,속초,2024-05-19 23:00:00,13.9,0.0,1.6,270.0,82.0,13.0,10.8,1012.1,1014.2,14.0
54833,101,춘천,2024-05-19 23:00:00,20.4,0.0,2.2,240.0,56.0,13.4,11.35,1005.1,1013.85,20.85
82250,102,백령도,2024-05-19 23:00:00,15.3,0.0,2.7,180.0,89.0,15.4,13.45,1008.25,1012.55,17.4
109667,105,강릉,2024-05-19 23:00:00,16.0,0.0,0.7,230.0,73.0,13.2,11.15,1010.95,1014.15,16.15
137084,108,서울,2024-05-19 23:00:00,19.5,0.0,1.4,280.0,69.0,15.6,13.6,1004.8,1014.75,18.4
164501,119,수원,2024-05-19 23:00:00,17.8,0.0,0.6,240.0,77.0,15.6,13.65,1010.75,1015.45,17.85
191918,127,충주,2024-05-19 23:00:00,17.6,0.0,0.8,100.0,72.0,14.6,12.55,1001.0,1014.4,18.6
219335,133,대전,2024-05-19 23:00:00,19.0,0.0,0.8,340.0,48.0,10.5,7.7,1007.6,1015.5,17.85
246752,140,군산,2024-05-19 23:00:00,16.7,0.0,0.4,300.0,89.0,16.9,14.85,1013.2,1016.5,16.15
274169,152,울산,2024-05-19 23:00:00,17.4,0.0,0.7,20.0,62.0,12.1,9.8,1004.35,1013.85,17.35


- 19개 이외에 다른 23시는 없음

# 15년 1월 1일 00시, 24년 5월 19일 23시 제거

In [39]:
group_df = group_df.drop(group_df.loc[group_df["일시"] == "2015-01-01 00:00:00"].index, axis = 0)
group_df = group_df.drop(group_df.loc[group_df["일시"] == "2024-05-19 23:00:00"].index, axis = 0)
group_df = group_df.reset_index(drop = True)

In [40]:
group_df

Unnamed: 0,지점,지점명,일시,기온,강수량,풍속,풍향,습도,증기압,이슬점온도,현지기압,해면기압,지면온도
0,90,속초,2015-01-01 03:00:00,-5.6,0.0,2.9,290.0,36.0,1.5,-18.20,1014.47,1016.77,-3.87
1,90,속초,2015-01-01 06:00:00,-6.8,0.0,2.0,240.0,35.0,1.3,-19.77,1014.63,1016.97,-5.07
2,90,속초,2015-01-01 09:00:00,-7.1,0.0,1.3,270.0,33.0,1.2,-20.60,1016.17,1018.53,-5.43
3,90,속초,2015-01-01 12:00:00,-3.5,0.0,2.2,280.0,26.0,1.2,-20.33,1016.77,1019.07,-1.23
4,90,속초,2015-01-01 15:00:00,-2.4,0.0,3.5,290.0,27.0,1.4,-19.03,1016.33,1018.63,3.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
520880,279,구미,2024-05-19 09:00:00,19.2,0.0,0.6,340.0,69.0,15.2,13.23,1011.00,1016.73,21.37
520881,279,구미,2024-05-19 12:00:00,26.9,0.0,1.2,330.0,39.0,13.7,11.60,1009.63,1015.20,43.33
520882,279,구미,2024-05-19 15:00:00,29.9,0.0,1.0,330.0,30.0,12.7,10.53,1007.33,1012.83,49.73
520883,279,구미,2024-05-19 18:00:00,29.8,0.0,1.2,310.0,28.0,11.7,9.27,1006.17,1011.67,38.37


# 결측이 있는 행 확인

In [41]:
nan_df = group_df.loc[group_df["기온"].isna()]

In [42]:
nan_df.reset_index(inplace = True)

In [43]:
# 기존 인덱스를 남겨두는 이유
# 3시간단위로 그룹되어있는 전체 데이터에서의 인덱스를 알기 위하여
nan_df.rename(columns = {"index" : "group_df index"}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_df.rename(columns = {"index" : "group_df index"}, inplace = True)


In [44]:
nan_df

Unnamed: 0,group_df index,지점,지점명,일시,기온,강수량,풍속,풍향,습도,증기압,이슬점온도,현지기압,해면기압,지면온도
0,174276,127,충주,2018-05-08 09:00:00,,,,,,,,,,


In [45]:
# 결측이 있는 행의 시간대 확인
nan_df["일시"].value_counts().index

DatetimeIndex(['2018-05-08 09:00:00'], dtype='datetime64[ns]', name='일시', freq=None)

In [46]:
len(nan_df["일시"].value_counts().index)

1

In [47]:
nan_i = []
for i in nan_df["일시"].value_counts().index:
    nan_i.append(i)

In [48]:
nan_i

[Timestamp('2018-05-08 09:00:00')]

In [49]:
# 각 시간대별 어떤지점의 결측이 있는지 확인
nan_detail_list = []
for i in nan_df["일시"].value_counts().index:
    df = nan_df.loc[nan_df["일시"] == i]
    nan_detail_list.append(df)

In [50]:
nan_detail_list[0]["group_df index"].unique()

array([174276], dtype=int64)

- 해당시간에 따라 결측이없는 지점의 해당시간대의 평균으로 값 대체

# 결측이 없는 행 추출

In [51]:
not_nan_df = group_df.loc[~group_df["기온"].isna()]

In [52]:
for i in nan_detail_list:
    print(group_df.loc[i["group_df index"].unique()]["기온"])

174276   NaN
Name: 기온, dtype: float64


# 컬럼의 결측값 처리

- 해당 일시(시간)의 다른 지점 평균으로 대체

In [53]:
def df_fillna(data):
    
    ndf = data.loc[data["기온"].isna()]
    ndf.reset_index(inplace = True)
    ndf.rename(columns = {"index" : "df_index"}, inplace = True)

    not_ndf = data.loc[~data["기온"].isna()]

    ndf_list = [] # ndf에서 일시별로 리스트
    for i in ndf["일시"].value_counts().index:
        df = ndf.loc[ndf["일시"] == i]
        ndf_list.append(df)

    for i in ndf_list:
        for j in not_ndf.columns[-10:]:
            # 소수점 1번째 자리 컬럼
            if j in ["기온", "강수량", "풍속", "습도", "증기압", "이슬점온도", "현지기압", "해면기압", "지면온도"]:
                data.loc[i["df_index"].unique(), j] = data.loc[i["df_index"].unique(), j].apply(lambda x : round(not_ndf.loc[not_ndf["일시"] == str(i["일시"].unique()[0])][j].mean(), 1))
                
            # 풍향은 average_angle_ignore_nan 함수를 사용
            else:
                data.loc[i["df_index"], j] = average_angle_ignore_nan(not_ndf.loc[not_ndf["일시"] == i.iloc[0, 3]]["풍향"]) 
            
    return data

In [54]:
df_fillna(group_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndf.rename(columns = {"index" : "df_index"}, inplace = True)


Unnamed: 0,지점,지점명,일시,기온,강수량,풍속,풍향,습도,증기압,이슬점온도,현지기압,해면기압,지면온도
0,90,속초,2015-01-01 03:00:00,-5.6,0.0,2.9,290.0,36.0,1.5,-18.20,1014.47,1016.77,-3.87
1,90,속초,2015-01-01 06:00:00,-6.8,0.0,2.0,240.0,35.0,1.3,-19.77,1014.63,1016.97,-5.07
2,90,속초,2015-01-01 09:00:00,-7.1,0.0,1.3,270.0,33.0,1.2,-20.60,1016.17,1018.53,-5.43
3,90,속초,2015-01-01 12:00:00,-3.5,0.0,2.2,280.0,26.0,1.2,-20.33,1016.77,1019.07,-1.23
4,90,속초,2015-01-01 15:00:00,-2.4,0.0,3.5,290.0,27.0,1.4,-19.03,1016.33,1018.63,3.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
520880,279,구미,2024-05-19 09:00:00,19.2,0.0,0.6,340.0,69.0,15.2,13.23,1011.00,1016.73,21.37
520881,279,구미,2024-05-19 12:00:00,26.9,0.0,1.2,330.0,39.0,13.7,11.60,1009.63,1015.20,43.33
520882,279,구미,2024-05-19 15:00:00,29.9,0.0,1.0,330.0,30.0,12.7,10.53,1007.33,1012.83,49.73
520883,279,구미,2024-05-19 18:00:00,29.8,0.0,1.2,310.0,28.0,11.7,9.27,1006.17,1011.67,38.37


In [55]:
group_df.isna().sum()

지점       0
지점명      0
일시       0
기온       0
강수량      0
풍속       0
풍향       0
습도       0
증기압      0
이슬점온도    0
현지기압     0
해면기압     0
지면온도     0
dtype: int64

In [60]:
# 풍향의 360 = 0 과 같으므로 360을 0 으로 변경
group_df.loc[group_df.loc[group_df["풍향"] == 360].index, "풍향"] = 0.0

In [59]:
group_df

Unnamed: 0,지점,지점명,일시,기온,강수량,풍속,풍향,습도,증기압,이슬점온도,현지기압,해면기압,지면온도,month,hour
0,90,속초,2015-01-01 03:00:00,-5.6,0.0,2.9,290.0,36.0,1.5,-18.20,1014.47,1016.77,-3.87,1,3
1,90,속초,2015-01-01 06:00:00,-6.8,0.0,2.0,240.0,35.0,1.3,-19.77,1014.63,1016.97,-5.07,1,6
2,90,속초,2015-01-01 09:00:00,-7.1,0.0,1.3,270.0,33.0,1.2,-20.60,1016.17,1018.53,-5.43,1,9
3,90,속초,2015-01-01 12:00:00,-3.5,0.0,2.2,280.0,26.0,1.2,-20.33,1016.77,1019.07,-1.23,1,12
4,90,속초,2015-01-01 15:00:00,-2.4,0.0,3.5,290.0,27.0,1.4,-19.03,1016.33,1018.63,3.50,1,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520880,279,구미,2024-05-19 09:00:00,19.2,0.0,0.6,340.0,69.0,15.2,13.23,1011.00,1016.73,21.37,5,9
520881,279,구미,2024-05-19 12:00:00,26.9,0.0,1.2,330.0,39.0,13.7,11.60,1009.63,1015.20,43.33,5,12
520882,279,구미,2024-05-19 15:00:00,29.9,0.0,1.0,330.0,30.0,12.7,10.53,1007.33,1012.83,49.73,5,15
520883,279,구미,2024-05-19 18:00:00,29.8,0.0,1.2,310.0,28.0,11.7,9.27,1006.17,1011.67,38.37,5,18


# group_df를 피봇테이블로 변경

In [62]:
pivot_df = group_df.pivot_table(index = "일시", columns = "지점", values = group_df.columns[3:])

In [63]:
pivot_df

Unnamed: 0_level_0,hour,hour,hour,hour,hour,hour,hour,hour,hour,hour,...,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압
지점,90,101,102,105,108,119,127,133,140,152,...,152,156,159,165,168,184,201,232,277,279
일시,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 03:00:00,3,3,3,3,3,3,3,3,3,3,...,1013.47,1014.10,1009.77,1018.53,1012.43,1020.57,1017.53,1019.93,1010.97,1014.93
2015-01-01 06:00:00,6,6,6,6,6,6,6,6,6,6,...,1013.97,1014.33,1010.57,1018.73,1013.03,1020.97,1017.80,1020.17,1011.30,1015.47
2015-01-01 09:00:00,9,9,9,9,9,9,9,9,9,9,...,1015.40,1015.30,1011.63,1019.43,1014.23,1021.83,1018.67,1021.33,1013.03,1016.37
2015-01-01 12:00:00,12,12,12,12,12,12,12,12,12,12,...,1015.70,1016.97,1012.20,1021.10,1015.00,1022.93,1019.83,1022.47,1014.00,1017.07
2015-01-01 15:00:00,15,15,15,15,15,15,15,15,15,15,...,1015.10,1016.10,1011.43,1020.43,1013.83,1022.57,1018.50,1021.73,1013.07,1016.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,9,9,9,9,9,9,9,9,9,9,...,1005.63,1008.20,1006.97,1011.00,1008.60,1013.10,1011.77,1006.93,1009.77,1011.00
2024-05-19 12:00:00,12,12,12,12,12,12,12,12,12,12,...,1004.63,1007.73,1006.20,1010.93,1008.00,1012.87,1011.20,1006.13,1009.43,1009.63
2024-05-19 15:00:00,15,15,15,15,15,15,15,15,15,15,...,1003.37,1006.00,1004.97,1009.57,1006.50,1012.03,1009.90,1004.27,1009.07,1007.33
2024-05-19 18:00:00,18,18,18,18,18,18,18,18,18,18,...,1002.57,1005.13,1004.13,1008.97,1005.37,1011.63,1009.13,1003.53,1009.03,1006.17


# 피봇테이블에서 풍향컬럼을 범주화

In [64]:
pd.get_dummies(pivot_df["풍향"])

지점,90,101,102,105,108,119,127,133,140,152,156,159,165,168,184,201,232,277,279
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-01 03:00:00,290.0,260.0,290.0,320.0,290.0,320.0,280.0,330.0,290.0,300.0,20.0,300.0,70.0,350.0,320.0,130.0,320.0,280.0,310.0
2015-01-01 06:00:00,240.0,280.0,290.0,310.0,290.0,320.0,270.0,320.0,280.0,290.0,50.0,300.0,310.0,270.0,330.0,170.0,300.0,290.0,310.0
2015-01-01 09:00:00,270.0,330.0,290.0,300.0,280.0,320.0,270.0,340.0,280.0,330.0,50.0,310.0,340.0,310.0,320.0,220.0,310.0,280.0,310.0
2015-01-01 12:00:00,280.0,120.0,300.0,300.0,290.0,310.0,280.0,330.0,290.0,320.0,20.0,310.0,340.0,330.0,330.0,230.0,300.0,290.0,300.0
2015-01-01 15:00:00,290.0,240.0,300.0,290.0,290.0,280.0,280.0,320.0,280.0,330.0,30.0,310.0,340.0,340.0,330.0,230.0,280.0,270.0,290.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,20.0,320.0,150.0,260.0,80.0,300.0,20.0,180.0,130.0,130.0,20.0,220.0,130.0,110.0,50.0,140.0,130.0,310.0,340.0
2024-05-19 12:00:00,350.0,260.0,150.0,80.0,230.0,220.0,330.0,150.0,320.0,190.0,240.0,200.0,260.0,210.0,40.0,210.0,160.0,30.0,330.0
2024-05-19 15:00:00,0.0,260.0,170.0,60.0,260.0,280.0,320.0,20.0,280.0,170.0,240.0,190.0,290.0,260.0,20.0,250.0,260.0,50.0,330.0
2024-05-19 18:00:00,0.0,200.0,170.0,40.0,260.0,280.0,290.0,300.0,280.0,170.0,320.0,190.0,290.0,240.0,20.0,260.0,320.0,60.0,310.0


In [67]:
dummie_list = []

# 지점과 시간별로 풍향이 여러개이기때문에 범주화
# 풍향의 각 컬럼별로 범주화 후 리스트에 저장
for loc in pivot_df["풍향"].columns:
    dum = pd.get_dummies(pivot_df["풍향"][loc])
    dum.columns = [(loc, i) for i in dum.columns]
    dummie_list.append(dum)

In [68]:
dummie_list[0]

Unnamed: 0_level_0,"(90, 0.0)","(90, 10.0)","(90, 20.0)","(90, 30.0)","(90, 40.0)","(90, 50.0)","(90, 60.0)","(90, 70.0)","(90, 80.0)","(90, 90.0)",...,"(90, 260.0)","(90, 270.0)","(90, 280.0)","(90, 290.0)","(90, 300.0)","(90, 310.0)","(90, 320.0)","(90, 330.0)","(90, 340.0)","(90, 350.0)"
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 03:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2015-01-01 06:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2015-01-01 09:00:00,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
2015-01-01 12:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2015-01-01 15:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2024-05-19 12:00:00,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2024-05-19 15:00:00,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2024-05-19 18:00:00,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [84]:
# 풍향범주화 리스트 concat
dum_df = pd.concat(dummie_list, axis = 1)

# column명을 ("풍향", ("지점", "방위"))로 변경
dum_df.columns = [("풍향", i) for i in dum_df.columns]

In [86]:
dum_df

Unnamed: 0_level_0,"(풍향, (90, 0.0))","(풍향, (90, 10.0))","(풍향, (90, 20.0))","(풍향, (90, 30.0))","(풍향, (90, 40.0))","(풍향, (90, 50.0))","(풍향, (90, 60.0))","(풍향, (90, 70.0))","(풍향, (90, 80.0))","(풍향, (90, 90.0))",...,"(풍향, (279, 260.0))","(풍향, (279, 270.0))","(풍향, (279, 280.0))","(풍향, (279, 290.0))","(풍향, (279, 300.0))","(풍향, (279, 310.0))","(풍향, (279, 320.0))","(풍향, (279, 330.0))","(풍향, (279, 340.0))","(풍향, (279, 350.0))"
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 03:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2015-01-01 06:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2015-01-01 09:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2015-01-01 12:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2015-01-01 15:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2024-05-19 12:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2024-05-19 15:00:00,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2024-05-19 18:00:00,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [89]:
# 기존 피봇테이블에서의 풍향 컬럼 삭제
pivot_df = pivot_df.drop("풍향", axis = 1)

In [90]:
# 피봇테이블과 풍향범주화테이블 concat
total_df = pd.concat([pivot_df, dum_df], axis = 1)

In [91]:
total_df

Unnamed: 0_level_0,hour,hour,hour,hour,hour,hour,hour,hour,hour,hour,...,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향
Unnamed: 0_level_1,90,101,102,105,108,119,127,133,140,152,...,"(279, 260.0)","(279, 270.0)","(279, 280.0)","(279, 290.0)","(279, 300.0)","(279, 310.0)","(279, 320.0)","(279, 330.0)","(279, 340.0)","(279, 350.0)"
일시,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 03:00:00,3,3,3,3,3,3,3,3,3,3,...,0,0,0,0,0,1,0,0,0,0
2015-01-01 06:00:00,6,6,6,6,6,6,6,6,6,6,...,0,0,0,0,0,1,0,0,0,0
2015-01-01 09:00:00,9,9,9,9,9,9,9,9,9,9,...,0,0,0,0,0,1,0,0,0,0
2015-01-01 12:00:00,12,12,12,12,12,12,12,12,12,12,...,0,0,0,0,1,0,0,0,0,0
2015-01-01 15:00:00,15,15,15,15,15,15,15,15,15,15,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,9,9,9,9,9,9,9,9,9,9,...,0,0,0,0,0,0,0,0,1,0
2024-05-19 12:00:00,12,12,12,12,12,12,12,12,12,12,...,0,0,0,0,0,0,0,1,0,0
2024-05-19 15:00:00,15,15,15,15,15,15,15,15,15,15,...,0,0,0,0,0,0,0,1,0,0
2024-05-19 18:00:00,18,18,18,18,18,18,18,18,18,18,...,0,0,0,0,0,1,0,0,0,0


In [94]:
total_df.isna().sum().sum()

0

In [95]:
total_df.to_csv("./국내기상데이터최종.csv")