# 라이브러리

In [1]:
import librosa
import os
from glob import glob
from tqdm import tqdm
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt

# 10초 간격 데이터 추출

In [2]:
file_list = [x[:-4] for x in os.listdir(r'D:\1. 음향기반 강우관측\Dataset\음향 데이터\음향자료_초단위')]

tenmin_list = [x[:-5]+"0" for x in os.listdir(r'D:\1. 음향기반 강우관측\Dataset\음향 데이터\음향자료_초단위/')]

time_list = sorted(list(set(tenmin_list)))

dataset = pd.DataFrame({"time":time_list})

In [3]:
for k in tqdm(range(len(dataset))):

    # 10초 간의 파일 list 생성
    temp_file_list = []
    for sec in range(10):
        temp_time = (datetime.strptime(dataset.loc[k, 'time'], '%Y_%m_%d_%H_%M_%S') + timedelta(seconds = sec))
 
        if temp_time.strftime("%Y_%m_%d_%H_%M_%S") in file_list:
            temp_file_list.append(temp_time.strftime("%Y_%m_%d_%H_%M_%S")+'.npy')

    # 10초 길이의 음향 합치기
    temp_sig = np.load(r'D:\1. 음향기반 강우관측\Dataset\음향 데이터\음향자료_초단위/'+temp_file_list[0])

    for ii in range(1,len(temp_file_list)):
        temp_sig = np.concatenate((temp_sig, np.load(r'D:\1. 음향기반 강우관측\Dataset\음향 데이터\음향자료_초단위/'+temp_file_list[ii])))

    # threshold -20dB 주고, 최고주파수, 평균, 표준편차 값 구하여 dataset 만들기
    threshold = -20
    num_of_row = 500
    n_fft = (num_of_row - 1) * 2
    sr = 44100
    hop_length = 512
    win_length = int(44100/100)

    X = librosa.stft(temp_sig, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False)
    mag = abs(X)

    Xdb = librosa.amplitude_to_db(mag)

    Xdb = np.where(Xdb > threshold, Xdb, -30)

    max_freq_list = []

    for x in range(Xdb.shape[1]):

        for y in range(Xdb.shape[0]):
            if Xdb[y,x] > -30:
                max_freq = y
        max_freq_list.append(max_freq)

    dataset.loc[k,'max_frequency'] = np.mean(max_freq_list)  # 각 window의 max frequency의 평균
    
    temp_mean_list = []
    for row in range(Xdb.shape[0]):
        temp_mean_list.append(np.mean(Xdb[row,:]+30)*row)
    dataset.loc[k,'weighted mean_dB'] = np.mean(temp_mean_list) # 각 frequency별 시계열 주기 세기 평균값에 대한 frequency 가중치 평균
    
    dataset.loc[k,'mean_dB'] = np.mean(Xdb.reshape(Xdb.shape[0]*Xdb.shape[1]))
    dataset.loc[k,'standard deviation_dB'] = np.std(Xdb.reshape(Xdb.shape[0]*Xdb.shape[1]))

    reshaped_array = Xdb.reshape(Xdb.shape[0]*Xdb.shape[1])
    dataset.loc[k,'standard deviation_dB'] = (np.sum((reshaped_array - np.mean(reshaped_array))**2)) / (Xdb.shape[0]*Xdb.shape[1] -1 )

dataset.to_csv(r'D:\1. 음향기반 강우관측\Dataset\음향 데이터\CSV_DATASET/10sec_unbiased standard deviation_20220731_20221128.csv', encoding='cp949', index=None)

100%|██████████| 14510/14510 [35:36<00:00,  6.79it/s] 


# Parsivel 데이터 read

In [3]:
par_file_list = glob(r"D:\1. 음향기반 강우관측\Dataset\음향 데이터\CSV_DATASET/*.mis")

In [4]:
date = []
time = []
intensity = []
dsd = []

for par_file in tqdm(par_file_list):
    with open(par_file, "r") as f:

        while True:
            
            line = f.readline()
            if not line: break
                
            if line.startswith('21'):
                date.append(line[3:-1])
                
            if line.startswith('20'):
                time.append(line[3:-1])
                
            if line.startswith('01'):
                intensity.append(line[3:-1])
                
            if line.startswith('90'):
                dsd.append(line[3:-1])            

all_data = pd.DataFrame({'date':date, 'time':time, 'intensity':intensity, 'dsd':dsd})

for k in tqdm(range(len(all_data))):
    
    date = all_data['date'][k] + ' ' + all_data['time'][k]
    full_date = str(datetime.strptime(date, "%d.%m.%Y %H:%M:%S"))
    all_data.loc[k, 'full_date'] = full_date

100%|██████████| 12/12 [00:02<00:00,  4.20it/s]
100%|██████████| 67012/67012 [01:16<00:00, 874.95it/s] 


# dataset에 파시벨 관측값 추가

In [5]:
dataset = pd.read_csv(r'D:\1. 음향기반 강우관측\Dataset\음향 데이터\CSV_DATASET/dataset_20220731_20230827.csv', encoding='cp949')

In [6]:
for k in tqdm(range(len(dataset))):
    dataset.loc[k,'time'] = datetime.strptime(dataset.loc[k,'time'], '%Y_%m_%d_%H_%M_%S').strftime("%Y-%m-%d %H:%M:%S")

100%|██████████| 66693/66693 [01:37<00:00, 687.23it/s]


In [7]:
for k in tqdm(range(0,7921)):
    temp_time = dataset.loc[k,'time'][:-2] + "00"
    dataset.loc[k,'rainfall intensity'] = float(all_data[all_data['full_date'] == temp_time]["intensity"].values[0])

100%|██████████| 7921/7921 [00:23<00:00, 332.15it/s]


In [8]:
for k in tqdm(range(7921,len(dataset))):
    temp_time = dataset.loc[k,'time']
    
    if len(all_data[all_data['full_date'] == temp_time]["intensity"]) > 0:
        dataset.loc[k,'rainfall intensity'] = float(all_data[all_data['full_date'] == temp_time]["intensity"].values[0])

    else:
        dataset.loc[k,'rainfall intensity'] = np.nan

100%|██████████| 58772/58772 [05:17<00:00, 184.92it/s]


In [9]:
dataset.to_csv(r'D:\1. 음향기반 강우관측\Dataset\음향 데이터\CSV_DATASET/10sec_final_dataset_20220731_20230827.csv', encoding='cp949', index=None)