In [1]:
import pandas as pd
import numpy as np
import glob
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

In [9]:
# 불러올 csv 파일들이 있는 베이스 경로 설정
directory = 'C:/Users/Jayden Jeong/OneDrive/바탕 화면/project/final_project/git/데이터/중국기상데이터/'

# 디렉토리 내의 모든 csv 파일 매칭하기
csv_files = glob.glob(directory + '*.csv')

# 모든 csv 파일을 데이터프레임으로 읽어오기
dfs = []
for csv_file in csv_files:
    df = pd.read_csv(csv_file, encoding ='cp949')
    dfs.append(df)

# 하나의 데이터프레임으로 병합
total_df = pd.concat(dfs, ignore_index=True)

total_df.head()

Unnamed: 0,지점,지점명,일시,습도,16 방위 풍향,풍속,강수량,현지기압,해면기압,변화량 기압,...,18,6,Unnamed: 6,950.8,1037.4,.2,-24,-20.7,Unnamed: 12,Unnamed: 13
0,50527.0,HAILAR,2015-01-01 00:00,73.9,16.0,5.0,,945.6,1034.0,0.4,...,,,,,,,,,,
1,50527.0,HAILAR,2015-01-01 03:00,75.0,18.0,3.0,,,1032.1,1.6,...,,,,,,,,,,
2,50527.0,HAILAR,2015-01-01 06:00,77.1,22.0,3.0,,942.0,1029.6,2.0,...,,,,,,,,,,
3,50527.0,HAILAR,2015-01-01 09:00,76.0,20.0,2.0,,,1030.6,0.6,...,,,,,,,,,,
4,50527.0,HAILAR,2015-01-01 12:00,74.2,25.0,2.0,,944.0,1032.9,1.4,...,,,,,,,,,,


In [10]:
# 컬럼명들 확인해보니 뒤에 이상한 컬럼들이 있음
total_df.columns

Index(['지점', '지점명', '일시', '습도', '16 방위 풍향', '풍속', '강수량', '현지기압', '해면기압',
       '변화량 기압', '이슬점 온도', '기온', '최고 기온', '최저 기온', '50527', 'HAILAR',
       '2015-02-01 00:00', '74.8', '18', '6', 'Unnamed: 6', '950.8', '1037.4',
       '.2', '-24', '-20.7', 'Unnamed: 12', 'Unnamed: 13'],
      dtype='object')

In [11]:
# 이상한 컬럼들 삭제
total_df = total_df.iloc[:,:14]

In [12]:
total_df.head()

Unnamed: 0,지점,지점명,일시,습도,16 방위 풍향,풍속,강수량,현지기압,해면기압,변화량 기압,이슬점 온도,기온,최고 기온,최저 기온
0,50527.0,HAILAR,2015-01-01 00:00,73.9,16.0,5.0,,945.6,1034.0,0.4,-31.8,-28.6,,
1,50527.0,HAILAR,2015-01-01 03:00,75.0,18.0,3.0,,,1032.1,1.6,-29.5,-26.4,,
2,50527.0,HAILAR,2015-01-01 06:00,77.1,22.0,3.0,,942.0,1029.6,2.0,-25.9,-23.0,,-29.6
3,50527.0,HAILAR,2015-01-01 09:00,76.0,20.0,2.0,,,1030.6,0.6,-27.9,-24.9,,
4,50527.0,HAILAR,2015-01-01 12:00,74.2,25.0,2.0,,944.0,1032.9,1.4,-30.1,-26.9,,


In [13]:
total_df.shape

(1968796, 14)

In [14]:
# 19개 지점만 사용
city = ['HAMI', '우한', 'YINCHUAN','MINQIN', '지난', 'YU ZHONG', 'YUSHU', 'YAN AN', '치치하르',
 'JARUD QI', '우루무치', '베이징', '칭따오', '쑤조우', '난징', '상하이', 'DACHEN DAO', '푸조우', '광조우']

In [15]:
# 사용할 지점만 필터링
df_19city = total_df[total_df["지점명"].isin(city)]

In [16]:
# 인덱스 재설정
df_19city = df_19city.reset_index(drop = True)

In [17]:
df_19city.shape

(470697, 14)

In [18]:
# 각 지점별 결측인 날짜를 찾아야함.
# 2015년01월01일부터 2024년 05월 19일 21시까지 3시간단위 날짜 생성
date = pd.date_range(start="2015-01-01", end="2024-05-20", freq = "3H")
date = pd.DataFrame(date)[:-1] # 이렇게해야 19일 21시까지 나옴
date.columns = ["일시"]

In [19]:
# DataFrame을 19번 복제하여 axis=0 방향으로 추가 (지역이 19개라서)
date_list = [date] * 19
result_df = pd.concat(date_list, axis=0).reset_index(drop=True)

In [20]:
# datetime으로 변환 (머지를 위한 전처리)
df_19city['일시'] = pd.to_datetime(df['일시'])

# datetime으로 변환 (머지를 위한 전처리)
result_df['일시'] = pd.to_datetime(result_df['일시'])

# 인덱스기준으로 머지 (left join)
merged_df = pd.merge(result_df, df_19city, left_index=True, right_index=True, how="left")

# 결측치가 있는 행 확인
missing_dates = merged_df[merged_df.isnull().any(axis=1)]

In [21]:
merged_df.shape

(520904, 15)

In [22]:
df_19city.head()

Unnamed: 0,지점,지점명,일시,습도,16 방위 풍향,풍속,강수량,현지기압,해면기압,변화량 기압,이슬점 온도,기온,최고 기온,최저 기온
0,50745.0,치치하르,2024-05-01 00:00:00,78.8,25.0,1.0,,1005.5,1025.7,0.2,-23.9,-21.2,,
1,50745.0,치치하르,2024-05-01 03:00:00,54.1,29.0,2.0,,,1024.5,0.9,-20.8,-13.5,,
2,50745.0,치치하르,2024-05-01 06:00:00,44.9,32.0,2.0,,1002.2,1022.0,2.4,-19.9,-10.2,,-21.4
3,50745.0,치치하르,2024-05-01 09:00:00,65.0,25.0,2.0,,,1022.2,0.1,-20.3,-15.2,,
4,50745.0,치치하르,2024-05-01 12:00:00,56.2,32.0,2.0,,1002.3,1022.3,0.2,-19.8,-12.9,,


In [23]:
merged_df.head()

Unnamed: 0,일시_x,지점,지점명,일시_y,습도,16 방위 풍향,풍속,강수량,현지기압,해면기압,변화량 기압,이슬점 온도,기온,최고 기온,최저 기온
0,2015-01-01 00:00:00,50745.0,치치하르,2024-05-01 00:00:00,78.8,25.0,1.0,,1005.5,1025.7,0.2,-23.9,-21.2,,
1,2015-01-01 03:00:00,50745.0,치치하르,2024-05-01 03:00:00,54.1,29.0,2.0,,,1024.5,0.9,-20.8,-13.5,,
2,2015-01-01 06:00:00,50745.0,치치하르,2024-05-01 06:00:00,44.9,32.0,2.0,,1002.2,1022.0,2.4,-19.9,-10.2,,-21.4
3,2015-01-01 09:00:00,50745.0,치치하르,2024-05-01 09:00:00,65.0,25.0,2.0,,,1022.2,0.1,-20.3,-15.2,,
4,2015-01-01 12:00:00,50745.0,치치하르,2024-05-01 12:00:00,56.2,32.0,2.0,,1002.3,1022.3,0.2,-19.8,-12.9,,


In [24]:
# 최고기온과 최저기온의 결측치가 너무 많음. 일반 "기온"만 사용하는게 나을 듯 함
merged_df.isna().sum()

일시_x             0
지점           50207
지점명          50207
일시_y        510379
습도           57350
16 방위 풍향     50353
풍속           50323
강수량         447024
현지기압        167703
해면기압         77978
변화량 기압       50344
이슬점 온도       50340
기온           50350
최고 기온       278980
최저 기온       278755
dtype: int64

In [25]:
# 최저기온, 최고기온 컬럼 삭제
merged_df = merged_df.drop(["최고 기온", "최저 기온"], axis = 1)

In [26]:
merged_df.head()

Unnamed: 0,일시_x,지점,지점명,일시_y,습도,16 방위 풍향,풍속,강수량,현지기압,해면기압,변화량 기압,이슬점 온도,기온
0,2015-01-01 00:00:00,50745.0,치치하르,2024-05-01 00:00:00,78.8,25.0,1.0,,1005.5,1025.7,0.2,-23.9,-21.2
1,2015-01-01 03:00:00,50745.0,치치하르,2024-05-01 03:00:00,54.1,29.0,2.0,,,1024.5,0.9,-20.8,-13.5
2,2015-01-01 06:00:00,50745.0,치치하르,2024-05-01 06:00:00,44.9,32.0,2.0,,1002.2,1022.0,2.4,-19.9,-10.2
3,2015-01-01 09:00:00,50745.0,치치하르,2024-05-01 09:00:00,65.0,25.0,2.0,,,1022.2,0.1,-20.3,-15.2
4,2015-01-01 12:00:00,50745.0,치치하르,2024-05-01 12:00:00,56.2,32.0,2.0,,1002.3,1022.3,0.2,-19.8,-12.9


In [27]:
# 미리 분석했던 이상치로 보이는 강수량 이상치는 모두 0으로 치환
merged_df["강수량"] = merged_df["강수량"].replace(-999, 0)
merged_df["강수량"] = merged_df["강수량"].replace(-99.8, 0)

In [29]:
(merged_df["강수량"] == -999).sum()

0

In [30]:
merged_df["강수량"].value_counts()

0.0      18690
0.1       6749
2.0       5792
1.0       4420
3.0       3806
         ...  
136.0        1
137.0        1
160.0        1
154.0        1
144.0        1
Name: 강수량, Length: 169, dtype: int64

In [31]:
# 피벗테이블로 변환하여, 지점별로 묶기
pdf = pd.pivot_table(merged_df,
                    index = "일시_x",
                    columns = "지점")

  pdf = pd.pivot_table(merged_df,


In [32]:
pdf.head()

Unnamed: 0_level_0,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,...,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압
지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,...,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 00:00:00,25.0,,,,,29.0,,,14.333333,,...,,1002.55,650.2,,1029.3,1026.0,1009.8,997.2,,
2015-01-01 03:00:00,29.0,,,,,17.5,,,24.0,11.0,...,1019.2,1009.45,651.0,1000.8,1026.6,450.45,1009.4,,,
2015-01-01 06:00:00,32.0,,,,,21.0,,,24.5,29.0,...,,1002.25,651.2,,1028.25,1025.0,1008.3,997.2,,
2015-01-01 09:00:00,25.0,,,,,16.0,,,15.5,27.0,...,1017.0,1008.2,652.4,999.8,1025.633333,450.6,1008.3,,,
2015-01-01 12:00:00,32.0,,,,,16.5,,,29.5,29.0,...,,1002.2,652.0,,1027.85,1025.1,,1000.9,,


In [33]:
pdf.shape

(27416, 171)

In [34]:
# 결측치 확인
pdf.isna().sum()

          지점     
16 방위 풍향  50745.0    11355
          51463.0    11362
          52203.0    11041
          52681.0    10689
          52983.0    10800
                     ...  
현지기압      58238.0    14242
          58362.0    14470
          58666.0    14728
          58847.0    14746
          59287.0    14456
Length: 171, dtype: int64

In [35]:
# 연속된 True 값을 계산하는 함수
def calculate_consecutive_trues(series):
    n = len(series)
    result = [0] * n
    count = 0

    # 첫 번째 패스: 연속된 True의 그룹 길이를 계산
    for i in range(n):
        if series[i]:
            count += 1
        else:
            count = 0
        result[i] = count

    # 두 번째 패스: 그룹의 마지막 True 값에 그룹의 전체 길이를 설정
    final_result = [0] * n
    i = 0
    while i < n:
        if result[i] > 0:
            length = result[i]
            for j in range(length):
                final_result[i - j] = length
            i += length
        else:
            i += 1

    return final_result

In [36]:
# 연속된 결측 확인하기
consecutive_nan_counts = pdf.isna().apply(calculate_consecutive_trues, axis=0)

In [37]:
consecutive_nan_counts

Unnamed: 0_level_0,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,...,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압
지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,...,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 00:00:00,0,128,256,512,512,0,128,256,0,1,...,1,0,0,1,0,0,0,0,32,128
2015-01-01 03:00:00,0,128,256,512,512,0,128,256,0,0,...,0,0,0,0,0,0,0,1,32,128
2015-01-01 06:00:00,0,128,256,512,512,0,128,256,0,0,...,1,0,0,1,0,0,0,0,32,128
2015-01-01 09:00:00,0,128,256,512,512,0,128,256,0,0,...,0,0,0,0,0,0,0,1,32,128
2015-01-01 12:00:00,0,128,256,512,512,0,128,256,0,0,...,1,0,0,1,0,0,32,0,32,128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-05-19 12:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2024-05-19 15:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-05-19 18:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


21/9/16년부터 1024개의 NAN값. (인덱스로치면 196040)

In [38]:
# 풍향의 연속된 결측치확인
pdf.loc["2015-02-01 00:00:00":, "16 방위 풍향"].isna().apply(calculate_consecutive_trues, axis=0)

지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-02-01 00:00:00,64,0,128,256,256,512,0,128,256,0,1,0,0,0,0,0,32,0,0
2015-02-01 03:00:00,64,0,128,256,256,512,0,128,256,0,0,0,0,0,0,0,32,0,0
2015-02-01 06:00:00,64,0,128,256,256,512,0,128,256,0,0,0,0,0,0,0,32,0,0
2015-02-01 09:00:00,64,0,128,256,256,512,0,128,256,0,0,0,0,0,0,0,32,0,0
2015-02-01 12:00:00,64,0,128,256,256,512,0,128,256,0,0,0,0,0,0,0,32,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2024-05-19 12:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2024-05-19 15:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2024-05-19 18:00:00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


interpolate_df를 생성

In [39]:
def horizontal_average(dataframe, datetime, column, threshold=3, digits=1):
    '''
    동일한 시간의 여러 지점 데이터를 확인한 후 해당 데이터들의 평균값을 반환하는 함수
    threshold 값을 입력하여 허용하는 NaN 값의 최대치를 설정할 수 있음
    '''  
    # 입력된 일시와 컬럼명에 해당하는 데이터 변수화
    data = dataframe.loc[datetime, column]
    
    # 임시로 nan값 입력
    mean = np.nan
    
    # data에 NaN값의 수가 threshold를 초과하는지 확인
    if data.isna().sum() <= threshold:
        # nan값을 제외한 값으로만 리스트 생성
        value_list = [i for i in data if not pd.isna(i)]
        
        # 평균값 산출 이후 반올림
        mean = np.mean(value_list)
        mean = round(mean, digits)
               
    return mean

In [40]:
pdf

Unnamed: 0_level_0,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,...,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압
지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,...,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 00:00:00,25.0,,,,,29.0,,,14.333333,,...,,1002.55,650.2,,1029.300000,1026.00,1009.8,997.2,,
2015-01-01 03:00:00,29.0,,,,,17.5,,,24.000000,11.0,...,1019.2,1009.45,651.0,1000.8,1026.600000,450.45,1009.4,,,
2015-01-01 06:00:00,32.0,,,,,21.0,,,24.500000,29.0,...,,1002.25,651.2,,1028.250000,1025.00,1008.3,997.2,,
2015-01-01 09:00:00,25.0,,,,,16.0,,,15.500000,27.0,...,1017.0,1008.20,652.4,999.8,1025.633333,450.60,1008.3,,,
2015-01-01 12:00:00,32.0,,,,,16.5,,,29.500000,29.0,...,,1002.20,652.0,,1027.850000,1025.10,,1000.9,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,,,,,,27.0,,,21.666667,,...,,1010.15,650.9,1001.2,1024.933333,1001.60,1010.7,,,
2024-05-19 12:00:00,,,,,,24.5,,,18.000000,,...,,1001.45,651.3,,1029.750000,1028.90,1011.5,994.8,,
2024-05-19 15:00:00,,,,,,20.0,,,14.666667,,...,,1010.90,649.5,1001.1,1027.166667,451.45,1011.4,,,
2024-05-19 18:00:00,,,,,,9.0,,,21.666667,,...,,1001.80,647.4,,1031.000000,1025.30,1010.3,994.8,,


In [41]:
pdf2 = pdf.copy()

In [42]:
# 연속으로 결측치가 3행 이하인 경우 선형 보간 수행
threshold = 3  # 연속된 결측치의 최대 개수
for column in pdf2.columns:
    consecutive_nulls = 0
    consecutive_indices = []  # 연속된 결측치의 인덱스들을 저장하기 위한 리스트
    for idx, value in enumerate(pdf2[column].isnull()):
        if value:
            consecutive_nulls += 1
            consecutive_indices.append(idx)
        else:
            if consecutive_nulls <= threshold:
                # 연속된 결측치의 개수가 임계치 이하일 경우 보간 수행
                if consecutive_nulls > 0:
                    start_idx = consecutive_indices[0]
                    end_idx = consecutive_indices[-1]
                    pdf2[column].iloc[start_idx:end_idx + 1] = pdf2[column].iloc[start_idx:end_idx + 1].interpolate(method="linear")
            consecutive_nulls = 0
            consecutive_indices = []

# 마지막 행이 결측치로 끝나는 경우를 위해 처리
for column in pdf.columns:
    if pdf2[column].iloc[-threshold:].isnull().all():
        pdf2[column].iloc[-threshold:] = pdf2[column].iloc[-threshold:].interpolate(method="linear")


In [43]:
pdf2

Unnamed: 0_level_0,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,16 방위 풍향,...,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압
지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,...,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 00:00:00,25.0,,,,,29.0,,,14.333333,,...,,1002.55,650.2,,1029.300000,1026.00,1009.8,997.2,,
2015-01-01 03:00:00,29.0,,,,,17.5,,,24.000000,11.0,...,1019.2,1009.45,651.0,1000.8,1026.600000,450.45,1009.4,,,
2015-01-01 06:00:00,32.0,,,,,21.0,,,24.500000,29.0,...,,1002.25,651.2,,1028.250000,1025.00,1008.3,997.2,,
2015-01-01 09:00:00,25.0,,,,,16.0,,,15.500000,27.0,...,1017.0,1008.20,652.4,999.8,1025.633333,450.60,1008.3,,,
2015-01-01 12:00:00,32.0,,,,,16.5,,,29.500000,29.0,...,,1002.20,652.0,,1027.850000,1025.10,,1000.9,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,,,,,,27.0,,,21.666667,,...,,1010.15,650.9,1001.2,1024.933333,1001.60,1010.7,,,
2024-05-19 12:00:00,,,,,,24.5,,,18.000000,,...,,1001.45,651.3,,1029.750000,1028.90,1011.5,994.8,,
2024-05-19 15:00:00,,,,,,20.0,,,14.666667,,...,,1010.90,649.5,1001.1,1027.166667,451.45,1011.4,,,
2024-05-19 18:00:00,,,,,,9.0,,,21.666667,,...,,1001.80,647.4,,1031.000000,1025.30,1010.3,994.8,,


In [44]:
pdf.isna().sum()

          지점     
16 방위 풍향  50745.0    11355
          51463.0    11362
          52203.0    11041
          52681.0    10689
          52983.0    10800
                     ...  
현지기압      58238.0    14242
          58362.0    14470
          58666.0    14728
          58847.0    14746
          59287.0    14456
Length: 171, dtype: int64

In [45]:
pdf2.isna().sum()

          지점     
16 방위 풍향  50745.0    11355
          51463.0    11362
          52203.0    11041
          52681.0    10689
          52983.0    10800
                     ...  
현지기압      58238.0    14242
          58362.0    14470
          58666.0    14728
          58847.0    14746
          59287.0    14456
Length: 171, dtype: int64

In [46]:
# 풍향 제외하고 결측치 전처리하기
pdf_no_wind_direction = pdf2.iloc[:, 19:]

In [47]:
pdf_no_wind_direction.shape

(27416, 152)

In [48]:
ss = StandardScaler()

In [49]:
# KNN 보간법 사용을 위해 스케일링 진행하기 (KNN은 거리에 민감하기때문)
scaled_pdf = ss.fit_transform(pdf_no_wind_direction)

### KNN 보간법 사용

In [50]:
imputer=KNNImputer(n_neighbors=1)

In [51]:
# 보간법 적용된 배열들을 imputer_df에 저장
imputer_df = imputer.fit_transform(scaled_pdf)

In [186]:
imputer_df

array([[-2.90737802,  0.43306492, -1.62640684, ..., -0.00434383,
         0.11940668,  0.06562087],
       [ 0.3762485 ,  0.42682903,  0.45167779, ...,  0.02657802,
         0.18558068,  0.15894197],
       [ 0.37789179,  0.4112393 , -1.62640684, ..., -0.00434383,
         0.20308125,  0.13283428],
       ...,
       [ 0.38249301,  0.41092751, -1.62640684, ...,  0.05664093,
         0.08604622,  0.01007259],
       [ 0.37657716, -2.70670685, -1.62640684, ..., -0.02495839,
         0.03791967,  0.22504442],
       [-2.90737802,  0.40812136,  0.68231931, ...,  0.05621146,
         0.02151289,  0.07784149]])

In [190]:
# 보간법 배열들을 데이터프레임으로 변환
imputer_df = pd.DataFrame(imputer_df, columns=pdf_no_wind_direction.columns)

In [200]:
# 스케일링했던걸 다시 복원
reverse_df = ss.inverse_transform(imputer_df)

In [199]:
imputer_df

Unnamed: 0_level_0,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,...,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압
지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,...,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
0,-2.907378,0.433065,-1.626407,0.504876,0.369330,0.496487,-3.031432,0.444001,0.433376,0.381353,...,0.258792,0.038485,0.617209,-0.012535,0.220651,0.260452,0.031416,-0.004344,0.119407,0.065621
1,0.376248,0.426829,0.451678,0.236607,0.362807,-2.234358,0.365559,0.470275,-1.031249,0.393312,...,0.372176,0.120179,0.854298,-0.028100,0.197595,-6.439323,0.026967,0.026578,0.185581,0.158942
2,0.377892,0.411239,-1.626407,0.236607,0.362807,-2.234358,0.355727,0.438747,0.485997,0.383871,...,0.345918,0.034933,0.913570,-0.008088,0.211684,0.248811,0.014734,-0.004344,0.203081,0.132834
3,0.376248,0.417475,0.682319,0.529069,0.019872,-2.234358,-3.031432,0.458598,0.433376,0.390165,...,0.345918,0.105379,1.269203,-0.039218,0.189340,-6.437577,0.014734,0.045475,0.037920,0.225044
4,0.375920,-2.706707,0.684630,0.512941,0.383061,-2.234358,0.355388,0.438747,0.427529,0.381353,...,0.355467,0.034341,1.150659,-0.026988,0.208269,0.249975,0.050321,0.027437,0.137454,0.096172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27411,0.405499,0.408121,-1.626407,0.507027,0.386494,0.493756,0.017045,0.444001,0.427529,-2.763149,...,0.032026,0.128467,0.824661,-0.023653,0.183363,-0.023580,0.041424,-0.024958,0.040107,0.056733
27412,0.500810,0.420593,-1.626407,0.510253,0.434553,-0.869482,-3.031432,0.473194,0.427529,0.434223,...,0.002188,0.025461,0.943206,-0.001417,0.224493,0.294209,0.050321,-0.024958,0.123782,0.175051
27413,0.382493,0.410928,-1.626407,0.507296,0.362464,0.494848,-3.031432,0.439622,0.428114,0.380724,...,-0.050327,0.137347,0.409756,-0.024765,0.202434,-6.427682,0.049209,0.056641,0.086046,0.010073
27414,0.376577,-2.706707,-1.626407,0.505145,0.362807,0.496487,-3.031432,0.146820,0.491844,0.066651,...,0.170473,0.029605,-0.212602,-0.029212,0.235167,0.252303,0.036976,-0.024958,0.037920,0.225044


In [205]:
merged_df.columns

Index(['일시_x', '지점', '지점명', '일시_y', '습도', '16 방위 풍향', '풍속', '강수량', '현지기압',
       '해면기압', '변화량 기압', '이슬점 온도', '기온'],
      dtype='object')

In [239]:
# KNN Imputer 생성
imputer = KNNImputer(n_neighbors=1)

# 각 지점별로 결측치를 채워나갈 데이터프레임을 생성
imputed_df = pdf_no_wind_direction.copy()

# 컬럼들 나열. 풍향은 제외하고 적용할거기 때문에.
features = ['강수량', '현지기압', '습도', '풍속', '해면기압', '변화량 기압', '이슬점 온도', '기온']

# 각 컬럼별로 KNN Imputer 적용
for feature in features:
    # 해당 특성에 대한 컬럼들을 선택하여 2D 배열로 변환
    feature_cols = [(feature, column) for column in imputed_df.columns.levels[1] if column != '지점']
    feature_data = imputed_df[feature_cols].values
    
    # KNN Imputer를 적용하여 결측치를 채움
    imputed_data = imputer.fit_transform(feature_data)
    
    # 결과를 다시 데이터프레임에 반영
    imputed_df[feature_cols] = imputed_data

In [241]:
imputed_df.head()

Unnamed: 0_level_0,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,...,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압
지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,...,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 00:00:00,0.5,10.0,0.2,2.0,0.7,0.0,0.0,0.0,2.0,12.0,...,996.1,1002.55,650.2,1006.25,1029.3,1026.0,1009.8,997.2,1009.9,1011.233333
2015-01-01 03:00:00,0.9,0.3,0.0,5.0,0.7,0.1,6.0,0.0,4.0,12.0,...,1019.2,1009.45,651.0,1000.8,1026.6,450.45,1009.4,1010.8,1005.3,992.8
2015-01-01 06:00:00,0.9,0.3,0.0,0.0,0.7,0.0,1.0,0.0,0.0,12.0,...,984.1,1002.25,651.2,1002.3,1028.25,1025.0,1008.3,997.2,1009.9,1007.05
2015-01-01 09:00:00,0.9,0.3,0.0,0.0,0.7,0.0,6.0,0.0,4.0,12.0,...,1017.0,1008.2,652.4,999.8,1025.633333,450.6,1008.3,1000.4,1006.65,996.7
2015-01-01 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,4.0,7.0,2.0,0.1,...,986.0,1002.2,652.0,1013.9,1027.85,1025.1,1014.35,1000.9,1003.55,998.7


In [259]:
wind = pdf.loc[:,"16 방위 풍향"]

In [246]:
imputed_df[imputed_df.isna().any(axis = 1)]

Unnamed: 0_level_0,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,...,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압
지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,...,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2


In [260]:
wind.head()

지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-01 00:00:00,25.0,,,,,29.0,,,14.333333,,24.0,25.0,22.0,7.0,14.5,35.0,20.0,,
2015-01-01 03:00:00,29.0,,,,,17.5,,,24.0,11.0,22.666667,27.0,18.0,14.0,18.0,1.0,20.0,,
2015-01-01 06:00:00,32.0,,,,,21.0,,,24.5,29.0,15.333333,27.0,20.0,8.666667,9.0,33.0,20.0,,
2015-01-01 09:00:00,25.0,,,,,16.0,,,15.5,27.0,19.666667,28.0,0.0,18.666667,11.0,1.0,22.0,,
2015-01-01 12:00:00,32.0,,,,,16.5,,,29.5,29.0,13.666667,18.0,0.0,7.666667,7.0,,21.5,,


In [267]:
# 풍향 전처리.
def average_angle_ignore_nan(data):
    # NaN 값을 무시하고 각도의 평균을 계산하는 함수
    data = data.dropna()
    if len(data) == 0:
        return np.nan
    angles = np.deg2rad(data)
    mean_angle = np.arctan2(np.mean(np.sin(angles)), np.mean(np.cos(angles)))
    mean_angle = np.rad2deg(mean_angle)
    if mean_angle < 0:
        mean_angle += 360
    return mean_angle

for time in wind.index:
    # 각 시간대별 데이터 지정
    data = wind.loc[time, :]
    # 데이터에서 na값의 인덱스를 시리즈로 저장
    na_index = data[data.isna()].index
    
    # 각 데이터의 결측치가 10개 이하일 때 진행
    if len(na_index) <= 10:
        # nan값을 제외한 나머지 각도들의 평균 산출
        mean = average_angle_ignore_nan(data)
        # 결측치 평균값으로 대체
        for idx in na_index:
            wind.loc[time, idx] = mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wind.loc[time, idx] = mean


In [268]:
wind.head()

지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-01 00:00:00,25.0,21.588052,21.588052,21.588052,21.588052,29.0,21.588052,21.588052,14.333333,21.588052,24.0,25.0,22.0,7.0,14.5,35.0,20.0,21.588052,21.588052
2015-01-01 03:00:00,29.0,18.395611,18.395611,18.395611,18.395611,17.5,18.395611,18.395611,24.0,11.0,22.666667,27.0,18.0,14.0,18.0,1.0,20.0,18.395611,18.395611
2015-01-01 06:00:00,32.0,21.780098,21.780098,21.780098,21.780098,21.0,21.780098,21.780098,24.5,29.0,15.333333,27.0,20.0,8.666667,9.0,33.0,20.0,21.780098,21.780098
2015-01-01 09:00:00,25.0,16.73749,16.73749,16.73749,16.73749,16.0,16.73749,16.73749,15.5,27.0,19.666667,28.0,0.0,18.666667,11.0,1.0,22.0,16.73749,16.73749
2015-01-01 12:00:00,32.0,17.489009,17.489009,17.489009,17.489009,16.5,17.489009,17.489009,29.5,29.0,13.666667,18.0,0.0,7.666667,7.0,17.489009,21.5,17.489009,17.489009


In [276]:
wind = round(wind, 1)

In [278]:
wind.isna().sum()

지점
50745.0    341
51463.0    283
52203.0    282
52681.0    135
52983.0    134
53614.0    217
53845.0    187
54026.0    163
54511.0    128
54823.0    192
54857.0    193
56029.0    227
57494.0    214
58027.0    321
58238.0    335
58362.0    356
58666.0    349
58847.0    354
59287.0    290
dtype: int64

In [279]:
wind[wind.isna().any(axis=1)]

지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2016-12-16 06:00:00,,19.0,,,,,,,,5.0,19.0,,2.0,24.0,13.7,,8.8,,23.0
2023-09-12 18:00:00,18.7,,6.3,7.0,19.3,,16.5,,2.0,,,,,,,14.3,,,25.0
2023-09-12 21:00:00,17.7,,7.0,10.0,32.7,,2.0,,11.0,,,,,,,14.3,,,19.0
2023-09-13 00:00:00,17.7,,8.0,8.0,25.3,,19.0,,2.0,,,,,,,16.3,,,16.0
2023-09-13 03:00:00,14.7,,12.0,5.0,28.3,,33.0,,36.0,,,,,,,11.7,,,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-05 03:00:00,,,,,28.0,,,,15.3,4.0,27.7,,7.7,9.0,15.3,,27.0,,
2024-05-05 06:00:00,,,,,21.0,,,,22.3,20.0,28.0,,23.3,8.0,15.7,,25.0,,
2024-05-05 09:00:00,,,,,12.5,,,,15.0,11.0,27.7,,13.7,9.0,26.7,,22.0,,
2024-05-05 12:00:00,,,,,23.5,,,,27.3,11.0,28.0,,12.7,8.0,16.3,,22.0,,


In [282]:
# 나머지 결측치는 앞의 값으로 채움
wind2 = wind.fillna(method='ffill')

In [287]:
wind2.head()

지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-01 00:00:00,25.0,21.6,21.6,21.6,21.6,29.0,21.6,21.6,14.3,21.6,24.0,25.0,22.0,7.0,14.5,35.0,20.0,21.6,21.6
2015-01-01 03:00:00,29.0,18.4,18.4,18.4,18.4,17.5,18.4,18.4,24.0,11.0,22.7,27.0,18.0,14.0,18.0,1.0,20.0,18.4,18.4
2015-01-01 06:00:00,32.0,21.8,21.8,21.8,21.8,21.0,21.8,21.8,24.5,29.0,15.3,27.0,20.0,8.7,9.0,33.0,20.0,21.8,21.8
2015-01-01 09:00:00,25.0,16.7,16.7,16.7,16.7,16.0,16.7,16.7,15.5,27.0,19.7,28.0,0.0,18.7,11.0,1.0,22.0,16.7,16.7
2015-01-01 12:00:00,32.0,17.5,17.5,17.5,17.5,16.5,17.5,17.5,29.5,29.0,13.7,18.0,0.0,7.7,7.0,17.5,21.5,17.5,17.5


In [326]:
# 10분의1로 줄어들었기때문에 10만큼 곱해줘야함
wind3 = wind * 10

In [335]:
# 반올림하기
wind3 = round(wind3, -1)

In [343]:
wind3

지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-01 00:00:00,250.0,220.0,220.0,220.0,220.0,290.0,220.0,220.0,140.0,220.0,240.0,250.0,220.0,70.0,140.0,350.0,200.0,220.0,220.0
2015-01-01 03:00:00,290.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,240.0,110.0,230.0,270.0,180.0,140.0,180.0,10.0,200.0,180.0,180.0
2015-01-01 06:00:00,320.0,220.0,220.0,220.0,220.0,210.0,220.0,220.0,240.0,290.0,150.0,270.0,200.0,90.0,90.0,330.0,200.0,220.0,220.0
2015-01-01 09:00:00,250.0,170.0,170.0,170.0,170.0,160.0,170.0,170.0,160.0,270.0,200.0,280.0,0.0,190.0,110.0,10.0,220.0,170.0,170.0
2015-01-01 12:00:00,320.0,180.0,180.0,180.0,180.0,160.0,180.0,180.0,300.0,290.0,140.0,180.0,0.0,80.0,70.0,180.0,220.0,180.0,180.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,140.0,140.0,140.0,140.0,140.0,270.0,140.0,140.0,220.0,140.0,240.0,40.0,0.0,140.0,150.0,20.0,200.0,140.0,140.0
2024-05-19 12:00:00,140.0,140.0,140.0,140.0,140.0,240.0,140.0,140.0,180.0,140.0,210.0,70.0,0.0,160.0,140.0,20.0,220.0,140.0,140.0
2024-05-19 15:00:00,130.0,130.0,130.0,130.0,130.0,200.0,130.0,130.0,150.0,130.0,230.0,110.0,0.0,60.0,220.0,10.0,200.0,130.0,130.0
2024-05-19 18:00:00,130.0,130.0,130.0,130.0,130.0,90.0,130.0,130.0,220.0,130.0,120.0,110.0,0.0,110.0,240.0,20.0,220.0,130.0,130.0


In [361]:
# 풍향이 360인 애들은 모두 0으로 치환
wind3 = wind3.applymap(lambda x: 0 if x == 360 else x)

In [362]:
wind_ohe = pd.get_dummies(wind3.iloc[:,0])

In [363]:
wind_ohe

Unnamed: 0_level_0,0.0,10.0,20.0,30.0,40.0,50.0,60.0,70.0,80.0,90.0,...,260.0,270.0,280.0,290.0,300.0,310.0,320.0,330.0,340.0,350.0
일시_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 03:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2015-01-01 06:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2015-01-01 09:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 12:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-05-19 12:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-05-19 15:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-05-19 18:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [357]:
36 * 19

684

In [None]:
# 원핫인코딩하는 함수

cols = pdf.columns[:19]

for i in range(len(cols)):
    wind_ohe = pd.get_dummies(wind3.iloc[:,i])

In [389]:
wind3[wind3.loc[:,52983.0] < 0]

지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2016-02-03 18:00:00,250.0,250.0,310.0,270.0,-3190.0,160.0,320.0,0.0,80.0,0.0,20.0,250.0,40.0,250.0,250.0,250.0,250.0,320.0,330.0
2017-07-24 18:00:00,200.0,150.0,70.0,250.0,-9980.0,120.0,60.0,300.0,180.0,110.0,180.0,90.0,180.0,180.0,180.0,180.0,180.0,80.0,30.0


In [397]:
wind3[wind3.loc[:,57494.0] < 0]

지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-03-10 18:00:00,140.0,60.0,70.0,140.0,140.0,140.0,140.0,310.0,140.0,140.0,140.0,260.0,-4920.0,260.0,90.0,290.0,110.0,120.0,140.0
2015-03-10 21:00:00,140.0,0.0,90.0,190.0,190.0,190.0,190.0,310.0,190.0,190.0,160.0,230.0,-4910.0,240.0,110.0,320.0,120.0,330.0,190.0
2015-03-11 18:00:00,220.0,160.0,70.0,160.0,160.0,160.0,160.0,140.0,160.0,160.0,180.0,220.0,-4920.0,0.0,110.0,290.0,140.0,200.0,160.0
2015-03-11 21:00:00,200.0,160.0,70.0,160.0,160.0,160.0,160.0,300.0,160.0,160.0,140.0,60.0,-4920.0,170.0,160.0,200.0,150.0,290.0,340.0
2015-03-12 15:00:00,220.0,150.0,80.0,150.0,150.0,150.0,150.0,320.0,150.0,150.0,200.0,240.0,-4920.0,110.0,150.0,90.0,40.0,160.0,320.0


In [393]:
# 이상치는 100으로 채우기
wind3.loc["2016-02-03 15:00:00":, 52983.0] 

일시_x
2016-02-03 15:00:00     100.0
2016-02-03 18:00:00   -3190.0
2016-02-03 21:00:00     150.0
2016-02-04 00:00:00     250.0
2016-02-04 03:00:00     290.0
                        ...  
2024-05-19 09:00:00     140.0
2024-05-19 12:00:00     140.0
2024-05-19 15:00:00     130.0
2024-05-19 18:00:00     130.0
2024-05-19 21:00:00     180.0
Name: 52983.0, Length: 24227, dtype: float64

In [400]:
wind3 = wind3.replace(-3190, 100)

In [394]:
# 이상치는 250으로 채우기
wind3.loc["2017-07-24 15:00:00":, 52983.0] 

일시_x
2017-07-24 15:00:00     250.0
2017-07-24 18:00:00   -9980.0
2017-07-24 21:00:00       0.0
2017-07-25 00:00:00       0.0
2017-07-25 03:00:00      90.0
                        ...  
2024-05-19 09:00:00     140.0
2024-05-19 12:00:00     140.0
2024-05-19 15:00:00     130.0
2024-05-19 18:00:00     130.0
2024-05-19 21:00:00     180.0
Name: 52983.0, Length: 19931, dtype: float64

In [401]:
wind3 = wind3.replace(-9980, 250)

In [398]:
# 이상치는 110으로 채우기
wind3.loc["2015-03-10 15:00:00":, 57494.0] 

일시_x
2015-03-10 15:00:00     110.0
2015-03-10 18:00:00   -4920.0
2015-03-10 21:00:00   -4910.0
2015-03-11 00:00:00     140.0
2015-03-11 03:00:00     140.0
                        ...  
2024-05-19 09:00:00       0.0
2024-05-19 12:00:00       0.0
2024-05-19 15:00:00       0.0
2024-05-19 18:00:00       0.0
2024-05-19 21:00:00     220.0
Name: 57494.0, Length: 26867, dtype: float64

In [402]:
wind3 = wind3.replace(-4920, 110)
wind3 = wind3.replace(-4910, 110)

In [403]:
# 원핫 인코딩된 데이터프레임을 저장할 리스트 초기화
encoded_list = []

# 각 열에 대해 원핫 인코딩 수행
cols = wind3.columns
for col in cols:
    wind_ohe = pd.get_dummies(wind3[col], prefix=str(col))
    encoded_list.append(wind_ohe)

# 모든 원핫 인코딩된 데이터프레임을 axis=1 기준으로 병합
encoded_wind3 = pd.concat(encoded_list, axis=1)

In [408]:
cols

Float64Index([50745.0, 51463.0, 52203.0, 52681.0, 52983.0, 53614.0, 53845.0,
              54026.0, 54511.0, 54823.0, 54857.0, 56029.0, 57494.0, 58027.0,
              58238.0, 58362.0, 58666.0, 58847.0, 59287.0],
             dtype='float64', name='지점')

In [405]:
encoded_wind3

Unnamed: 0_level_0,50745.0_0.0,50745.0_10.0,50745.0_20.0,50745.0_30.0,50745.0_40.0,50745.0_50.0,50745.0_60.0,50745.0_70.0,50745.0_80.0,50745.0_90.0,...,59287.0_260.0,59287.0_270.0,59287.0_280.0,59287.0_290.0,59287.0_300.0,59287.0_310.0,59287.0_320.0,59287.0_330.0,59287.0_340.0,59287.0_350.0
일시_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 00:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 03:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 06:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 09:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 12:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-19 09:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-05-19 12:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-05-19 15:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2024-05-19 18:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [409]:
# 기존 열 인덱스 가져오기
old_columns = encoded_wind3.columns

# 새로운 인덱스 생성
new_index = pd.MultiIndex.from_product([['풍향'], old_columns])

# 데이터프레임에 새로운 멀티 인덱스 설정
encoded_wind3.columns = new_index


In [410]:
encoded_wind3.head()

Unnamed: 0_level_0,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향
Unnamed: 0_level_1,50745.0_0.0,50745.0_10.0,50745.0_20.0,50745.0_30.0,50745.0_40.0,50745.0_50.0,50745.0_60.0,50745.0_70.0,50745.0_80.0,50745.0_90.0,...,59287.0_260.0,59287.0_270.0,59287.0_280.0,59287.0_290.0,59287.0_300.0,59287.0_310.0,59287.0_320.0,59287.0_330.0,59287.0_340.0,59287.0_350.0
일시_x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 00:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 03:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 06:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 09:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 12:00:00,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [411]:
encoded_wind3.shape

(27416, 684)

In [412]:
imputed_df.head()

Unnamed: 0_level_0,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,...,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압,현지기압
지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,...,54823.0,54857.0,56029.0,57494.0,58027.0,58238.0,58362.0,58666.0,58847.0,59287.0
일시_x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 00:00:00,0.5,10.0,0.2,2.0,0.7,0.0,0.0,0.0,2.0,12.0,...,996.1,1002.55,650.2,1006.25,1029.3,1026.0,1009.8,997.2,1009.9,1011.233333
2015-01-01 03:00:00,0.9,0.3,0.0,5.0,0.7,0.1,6.0,0.0,4.0,12.0,...,1019.2,1009.45,651.0,1000.8,1026.6,450.45,1009.4,1010.8,1005.3,992.8
2015-01-01 06:00:00,0.9,0.3,0.0,0.0,0.7,0.0,1.0,0.0,0.0,12.0,...,984.1,1002.25,651.2,1002.3,1028.25,1025.0,1008.3,997.2,1009.9,1007.05
2015-01-01 09:00:00,0.9,0.3,0.0,0.0,0.7,0.0,6.0,0.0,4.0,12.0,...,1017.0,1008.2,652.4,999.8,1025.633333,450.6,1008.3,1000.4,1006.65,996.7
2015-01-01 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,4.0,7.0,2.0,0.1,...,986.0,1002.2,652.0,1013.9,1027.85,1025.1,1014.35,1000.9,1003.55,998.7


In [413]:
imputed_df.shape

(27416, 152)

In [419]:
done_df = pd.concat([imputed_df, encoded_wind3], axis = 1)

In [420]:
done_df.head()

Unnamed: 0_level_0,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,강수량,...,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향,풍향
지점,50745.0,51463.0,52203.0,52681.0,52983.0,53614.0,53845.0,54026.0,54511.0,54823.0,...,59287.0_260.0,59287.0_270.0,59287.0_280.0,59287.0_290.0,59287.0_300.0,59287.0_310.0,59287.0_320.0,59287.0_330.0,59287.0_340.0,59287.0_350.0
일시_x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2015-01-01 00:00:00,0.5,10.0,0.2,2.0,0.7,0.0,0.0,0.0,2.0,12.0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 03:00:00,0.9,0.3,0.0,5.0,0.7,0.1,6.0,0.0,4.0,12.0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 06:00:00,0.9,0.3,0.0,0.0,0.7,0.0,1.0,0.0,0.0,12.0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 09:00:00,0.9,0.3,0.0,0.0,0.7,0.0,6.0,0.0,4.0,12.0,...,0,0,0,0,0,0,0,0,0,0
2015-01-01 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,4.0,7.0,2.0,0.1,...,0,0,0,0,0,0,0,0,0,0


In [422]:
# 만들어진 최종파일 csv로 추출
done_df.to_csv("./china_done.csv")