## process raw data

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic: make the rainfall data folder the working directory, so the
# raw data file can later be opened by its bare file name.
%cd /content/drive/MyDrive/rainfall/201807_202306

In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn import metrics
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.dates as mdates
import matplotlib as mpl
from datetime import datetime, timedelta

%matplotlib inline

In [None]:
# Read the raw rainfall record: whitespace-separated Date / Time / rainfall
# columns, skipping two header lines and one footer line.  Everything is
# read as text first so the sentinel and the HHMM strings can be handled
# explicitly.
df = pd.read_csv(
    'saws201807_202306kp.txt',
    sep=r'\s+',
    names=['Date', 'Time', 'rainfall'],
    dtype=str,
    skiprows=2,
    skipfooter=1,
    engine='python',  # skipfooter is not supported by the C parser
)

# '32767' is the missing-value sentinel in the raw data; turn it into NaN.
# (np.nan, not np.NaN: the latter alias was removed in NumPy 2.0.)
df['rainfall'] = df['rainfall'].replace('32767', np.nan).astype(float)

# Format date and time.
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')

# A '2400' stamp means midnight at the END of that day, i.e. 00:00 of the
# following day.  Capture the mask BEFORE relabelling so that only those
# rows are rolled forward; a reading genuinely stamped '0000' keeps its date.
end_of_day = df['Time'] == '2400'
df.loc[end_of_day, 'Time'] = '0000'
df.loc[end_of_day, 'Date'] = df.loc[end_of_day, 'Date'] + pd.Timedelta(days=1)

# Keep 'Date' as a 'YYYY-MM-DD' string and build the combined timestamp.
df['Date'] = df['Date'].astype(str)
df['DateAndTime'] = pd.to_datetime(
    df['Date'].str.cat(df['Time']), format='%Y-%m-%d%H%M'
)
df

In [None]:
# Split the combined timestamp into separate calendar/time component columns.
df['Year'] = df['DateAndTime'].dt.year
df['Month'] = df['DateAndTime'].dt.month
df['Day'] = df['DateAndTime'].dt.day
df['Hour'] = df['DateAndTime'].dt.hour
df['Minute'] = df['DateAndTime'].dt.minute
df

## export data by year

In [None]:
# Timestamp + rainfall only.  Take an explicit copy so that columns added to
# `data` later are written to an independent frame instead of a slice of
# `df` (which raises SettingWithCopyWarning).  'DateAndTime' is already
# datetime64 from the parsing cell, so it is not parsed again here.
data = df[["DateAndTime", "rainfall"]].copy()
data

In [None]:
# Write one tab-separated file per calendar year.  The 'Year' column is added
# to `data` and is therefore part of every exported file.
data['Year'] = data['DateAndTime'].dt.year
for year, year_data in data.groupby('Year', sort=False):
    year_data.to_csv(f'/content/drive/MyDrive/rainfall/201807_202306/rainfallkp_{year}.txt', sep='\t', index=False)

## check rainfall time series by range

In [None]:
# Check the rainfall time series inside a fixed time window (both ends
# inclusive).
# NOTE: the mask must not be called `range` -- rebinding that name hides the
# builtin and breaks every later `for ... in range(...)` in the notebook.
window_start = pd.to_datetime('2022-12-31 21:00:00')
window_end = pd.to_datetime('2023-01-01 03:00:00')
in_window = data['DateAndTime'].between(window_start, window_end)
range_data = data[in_window]

plt.plot(range_data['DateAndTime'], range_data['rainfall'])
plt.xlabel('Date and Time')
plt.ylabel('Rainfall')
plt.title('Rainfall Data')
plt.xticks(rotation=45)
plt.show()

## check rainfall time series by month or day

In [None]:
def monthfig(month):
    """Draw the rainfall series for one calendar month.

    Rows are taken from the notebook-level ``df`` wherever its ``Month``
    column equals *month*; the year is not part of the filter.  The x-axis
    ticks are formatted as day-month.  Shows the figure and returns None.
    """
    fig, ax = plt.subplots(figsize=(20, 5))
    in_month = df['Month'] == month
    subset = df.loc[in_month, ['DateAndTime', 'rainfall']].copy()
    subset['rainfall'] = subset['rainfall'].astype(float)
    ax.plot(subset['DateAndTime'], subset['rainfall'], linewidth=0.5)
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%d-%m'))
    plt.show()

In [None]:
# One figure per calendar month, 1 through 12.
# NOTE: relies on the builtin `range` still being bound to the builtin.
for month_number in range(1, 13):
    monthfig(month_number)

In [None]:
def dayfig(month, day):
    """Draw the rainfall series for selected days of one calendar month.

    *month* is compared against the ``Month`` column of the notebook-level
    ``df``; *day* is a collection of day-of-month values matched with
    ``isin``.  The x-axis ticks are formatted as hour:minute.  Shows the
    figure and returns None.
    """
    fig, ax = plt.subplots(figsize=(20, 5))
    wanted = (df['Month'] == month) & (df['Day'].isin(day))
    subset = df.loc[wanted, ['DateAndTime', 'rainfall']].copy()
    subset['rainfall'] = subset['rainfall'].astype(float)
    ax.plot(subset['DateAndTime'], subset['rainfall'], linewidth=0.5)
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
    plt.show()

In [None]:
# Plot the records whose month is 2 and day-of-month is 27.
dayfig(month=2, day=[27])

## generate rainfall file for SWMM

In [None]:
# Label every record with a constant station identifier, then list the
# resulting columns.
station_id = 'STA01'
df['Station'] = station_id
df.columns

In [None]:
# Independent frame holding only the columns written to the output files,
# in output order.
output_columns = ['Station', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'rainfall']
newdf = df.loc[:, output_columns].copy()
newdf

In [None]:
# Make sure the rainfall column is floating point, then show the dtypes.
newdf = newdf.astype({'rainfall': float})
newdf.dtypes

In [None]:
# Unit of the raw data is 0.1 mm; convert to 1 mm.
# The value is derived from the untouched df['rainfall'] rather than from
# newdf itself, so re-running this cell cannot divide by 10 a second time.
newdf['rainfall'] = df['rainfall'].astype(float) / 10

In [None]:
# All processed rainfall: comma-separated, with a header row, no index.
newdf.to_csv('/content/drive/MyDrive/rainfall/201807_202306/allrainfall.csv', index=False)

# A certain month (Month == 7): tab-separated, no header, no index.
july_rows = newdf[newdf['Month'] == 7]
july_rows.to_csv('/content/drive/MyDrive/rainfall/201807_202306/prec_mm_Jul.dat', sep='\t', header=False, index=False)

# Certain days (Month == 2, Day 26 or 27): tab-separated, no header, no index.
selected_days = newdf[(newdf['Month'] == 2) & (newdf['Day'].isin([26, 27]))]
selected_days.to_csv('/content/drive/MyDrive/rainfall/201807_202306/prec_mm_2.26.dat', sep='\t', header=False, index=False)