# Libraries

In [1]:
import librosa
import os
from glob import glob
from tqdm import tqdm
import numpy as np
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import librosa.display

# Extract acoustic features at 1-minute intervals

In [2]:
# Build the list of minutes covered by the recordings in ``second/``.
# Each file name has its last six characters replaced by "00", which collapses
# all files of the same minute onto one key (assumes names end in a two-digit
# seconds field plus a four-character extension such as ".npy" — TODO confirm).
second_dir_entries = os.listdir(r'second')
min_list = [f"{entry[:-6]}00" for entry in second_dir_entries]

# One row per distinct minute, in ascending lexical order.
time_list = sorted(set(min_list))

dataset = pd.DataFrame({"time": time_list})

In [14]:
# Per-minute acoustic feature extraction.
#
# For every minute listed in ``dataset['time']`` the matching arrays in
# ``second/`` are joined into one signal, turned into a thresholded dB
# spectrogram, and summarised into four columns of ``dataset``:
#   'max_frequency', 'weighted mean_dB', 'mean_dB', 'standard deviation_dB'.
# The finished table is written to CSV after the loop.

# Spectrogram parameters are loop-invariant, so they are defined once.
threshold = -20                 # dB; bins at or below this are treated as silent
floor_db = -30                  # value substituted for silent bins
num_of_row = 500                # number of frequency rows in the spectrogram
n_fft = (num_of_row - 1) * 2    # a real FFT of n_fft points yields n_fft/2 + 1 rows
sr = 44100                      # assumed sampling rate of the recordings — TODO confirm
hop_length = 512
win_length = int(sr / 100)      # 441 samples


def _load_minute_signal(file_names):
    """Load the named arrays from ``second/`` and join them end to end."""
    # A single concatenate avoids re-copying the growing signal per file.
    return np.concatenate([np.load(r'second/' + name) for name in file_names])


def _spectral_features(xdb, silent_value):
    """Summarise a thresholded dB spectrogram (rows = frequency bins, columns = frames).

    Returns a dict keyed by the output column names:
      * 'max_frequency': mean over frames of the highest row index whose value
        exceeds ``silent_value``. This is a bin index, not Hz. A frame with no
        such row contributes 0 instead of inheriting a stale value from an
        earlier frame.
      * 'weighted mean_dB': mean over rows of (row mean above ``silent_value``)
        multiplied by the row index.
      * 'mean_dB': mean of all cells.
      * 'standard deviation_dB': unbiased (n - 1) standard deviation of all cells.
    """
    n_rows = xdb.shape[0]
    active = xdb > silent_value

    # argmax on the row-reversed mask finds the LAST active row of each frame.
    top_bin = n_rows - 1 - np.argmax(active[::-1, :], axis=0)
    top_bin = np.where(active.any(axis=0), top_bin, 0)

    row_index = np.arange(n_rows)
    row_level = xdb.mean(axis=1) - silent_value

    return {
        'max_frequency': np.mean(top_bin),
        'weighted mean_dB': np.mean(row_level * row_index),
        'mean_dB': np.mean(xdb),
        # ddof=1 gives the unbiased estimator; np.std takes the square root, so
        # the stored value really is a standard deviation rather than a variance.
        'standard deviation_dB': np.std(xdb, ddof=1),
    }


# List the directory once. Sorting makes the concatenation order deterministic;
# it is chronological provided the timestamps in the names are zero-padded.
second_files = sorted(os.listdir(r'second'))

for k in tqdm(range(len(dataset))):

    # Files of this minute: names containing the timestamp without its seconds
    minute_key = dataset.loc[k, 'time'][:-3]
    temp_file_list = [x for x in second_files if minute_key in x]

    # Combine 1 minute of sound
    temp_sig = _load_minute_signal(temp_file_list)

    X = librosa.stft(temp_sig, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False)
    Xdb = librosa.amplitude_to_db(np.abs(X))

    # Keep only bins louder than the threshold; everything else becomes floor_db
    Xdb = np.where(Xdb > threshold, Xdb, floor_db)

    for column, value in _spectral_features(Xdb, floor_db).items():
        dataset.loc[k, column] = value

dataset.to_csv(r'1min_unbiased standard deviation_20220731_20221128.csv', encoding='cp949', index=None)

100%|██████████| 2420/2420 [2:45:57<00:00,  4.11s/it]  


# Read Parsivel data

In [None]:
def read_parsivel(path):
    """Parse one ``.mis`` file into a DataFrame of its date/time/intensity/dsd fields.

    Each line is classified by its first two characters:
    '21' -> 'date', '20' -> 'time', '01' -> 'intensity', '90' -> 'dsd'.
    The stored value is the rest of the line from the fourth character on,
    without the line terminator. All other lines are ignored.

    Parameters
    ----------
    path : str
        Path of the ``.mis`` file to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'date', 'time', 'intensity', 'dsd', all holding raw strings.

    Raises
    ------
    ValueError
        Raised by the DataFrame constructor when the four fields do not occur
        the same number of times in the file.
    """
    column_by_prefix = {'21': 'date', '20': 'time', '01': 'intensity', '90': 'dsd'}
    columns = {'date': [], 'time': [], 'intensity': [], 'dsd': []}

    with open(path, "r") as f:
        for line in f:
            column = column_by_prefix.get(line[:2])
            if column is not None:
                # Strip only the newline: a final line without one must not
                # lose its last real character.
                columns[column].append(line[3:].rstrip('\n'))

    return pd.DataFrame(columns)


data_23 = read_parsivel(r"20220731.mis")
data_24 = read_parsivel(r"20220801.mis")
data_28 = read_parsivel(r"20221128.mis")
data_29 = read_parsivel(r"20221129.mis")

# Stack the four days in file order with a fresh 0..n-1 index.
all_data = pd.concat([data_23, data_24, data_28, data_29], ignore_index=True)

# Combine the 'date' (dd.mm.YYYY) and 'time' (HH:MM:SS) strings of every record
# into one "YYYY-MM-DD HH:MM:SS" string stored in 'full_date'.
# The column is built in a single assignment instead of row-by-row .loc writes,
# so it exists even when all_data is empty and no module-level name is clobbered.
all_data['full_date'] = [
    str(datetime.datetime.strptime(f"{day} {clock}", "%d.%m.%Y %H:%M:%S"))
    for day, clock in zip(all_data['date'], all_data['time'])
]

# Add the Parsivel rainfall-intensity observations to the dataset

In [None]:
# Load the per-minute feature table (cp949-encoded CSV) that the rainfall
# intensity will be attached to.
min_dataset_path = r'XGBoost train/min_dataset_20220731_20221128.csv'
dataset = pd.read_csv(min_dataset_path, encoding='cp949')

In [None]:
# Rewrite every 'time' entry from "YYYY_mm_dd_HH_MM_SS" to "YYYY-mm-dd HH:MM:SS".
# NOTE: running this a second time raises ValueError, because the values no
# longer match the underscore format.
underscore_format = '%Y_%m_%d_%H_%M_%S'
iso_format = "%Y-%m-%d %H:%M:%S"
dataset['time'] = [
    datetime.datetime.strptime(stamp, underscore_format).strftime(iso_format)
    for stamp in dataset['time']
]

In [None]:
# Attach to each row of ``dataset`` the mean Parsivel intensity of its minute.
#
# Both 'full_date' and 'time' are "YYYY-MM-DD HH:MM:SS" strings, so dropping the
# last three characters (":SS") gives a per-minute key. The Parsivel records
# are averaged once per minute and then looked up, instead of regex-scanning
# every record for every row of ``dataset``.
parsivel_minute = all_data['full_date'].str[:-3]
minute_intensity = all_data['intensity'].astype(float).groupby(parsivel_minute).mean()

# Minutes without any Parsivel record get NaN (no "mean of empty slice" warning).
dataset['rainfall intensity'] = dataset['time'].str[:-3].map(minute_intensity)

In [None]:
# Save the feature table, including the rainfall intensity column, without the
# row index.
final_dataset_path = r'XGBoost train/min_Final_dataset_20220731_20221128.csv'
dataset.to_csv(final_dataset_path, encoding='cp949', index=None)