In [1]:
import pandas as pd
from ydata_profiling import ProfileReport

In [2]:
dam_address = "../dataset/raw/dam_occupancy.csv"
weather_address = "../dataset/external/weather.csv"
flood_address = "../dataset/external/flood.csv"
climate_change_address = "../dataset/external/climate_change.csv"

In [3]:
dam_df = pd.read_csv(dam_address)
weather_df = pd.read_csv(weather_address)
flood_df = pd.read_csv(flood_address)
cli_change_df = pd.read_csv(climate_change_address)

## Dam Dataset

In [4]:
dam_df

Unnamed: 0,DATE,GENERAL_DAM_OCCUPANCY_RATE,GENERAL_DAM_RESERVED_WATER
0,2005-01-01,44.62,388
1,2005-01-02,44.62,388
2,2005-01-03,44.47,386
3,2005-01-04,44.42,386
4,2005-01-05,44.35,385
...,...,...,...
5936,2021-04-03,73.66,639
5937,2021-04-04,73.76,640
5938,2021-04-05,74.70,648
5939,2021-04-06,75.88,659


In [5]:
dam_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5941 entries, 0 to 5940
Data columns (total 3 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   DATE                        5941 non-null   object 
 1   GENERAL_DAM_OCCUPANCY_RATE  5941 non-null   float64
 2   GENERAL_DAM_RESERVED_WATER  5941 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 139.4+ KB


In [6]:
def lower_column_names(df: pd.DataFrame):
    df.columns = df.columns.str.lower()
    return df

In [7]:
dam_df = (
    dam_df
    .pipe(lower_column_names)
    .assign(**{"datetime": lambda df: pd.to_datetime(df["date"])})
    .drop(columns="date")
)

## Weather Dataset

In [8]:
weather_df.head(3)

Unnamed: 0,date,weather_code,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,...,sunshine_duration,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
0,2004-12-31 21:00:00+00:00,53.0,10.563,7.363,8.990084,9.071384,4.146955,6.033844,0,0,...,25582.63,3.5,3.5,0.0,10.0,23.507751,37.44,31.536366,5.19,0.814421
1,2005-01-01 21:00:00+00:00,51.0,7.813,4.713,6.660917,4.816814,2.181339,3.76984,0,0,...,22339.408,2.8,2.8,0.0,17.0,13.276144,25.919998,349.9135,3.92,0.644396
2,2005-01-02 21:00:00+00:00,51.0,8.613,2.263,6.03175,5.408062,-1.033089,3.009735,0,0,...,27602.402,0.1,0.1,0.0,1.0,13.70839,26.28,260.3468,6.45,0.863363


In [9]:
weather_df = (
    weather_df
    .assign(**{
        "datetime": lambda df: pd.to_datetime(
            pd.to_datetime(df["date"]).dt.tz_localize(None).dt.date
        )
    })
    .drop(columns="date")
)

In [10]:
weather_df.head(3)

Unnamed: 0,weather_code,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,sunrise,sunset,daylight_duration,...,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration,datetime
0,53.0,10.563,7.363,8.990084,9.071384,4.146955,6.033844,0,0,33432.535,...,3.5,3.5,0.0,10.0,23.507751,37.44,31.536366,5.19,0.814421,2004-12-31
1,51.0,7.813,4.713,6.660917,4.816814,2.181339,3.76984,0,0,33479.133,...,2.8,2.8,0.0,17.0,13.276144,25.919998,349.9135,3.92,0.644396,2005-01-01
2,51.0,8.613,2.263,6.03175,5.408062,-1.033089,3.009735,0,0,33529.406,...,0.1,0.1,0.0,1.0,13.70839,26.28,260.3468,6.45,0.863363,2005-01-02


In [11]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5941 entries, 0 to 5940
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   weather_code                 5941 non-null   float64       
 1   temperature_2m_max           5941 non-null   float64       
 2   temperature_2m_min           5941 non-null   float64       
 3   temperature_2m_mean          5941 non-null   float64       
 4   apparent_temperature_max     5941 non-null   float64       
 5   apparent_temperature_min     5941 non-null   float64       
 6   apparent_temperature_mean    5941 non-null   float64       
 7   sunrise                      5941 non-null   int64         
 8   sunset                       5941 non-null   int64         
 9   daylight_duration            5941 non-null   float64       
 10  sunshine_duration            5941 non-null   float64       
 11  precipitation_sum            5941 non-null 

## Flood Dataset

In [12]:
flood_df.head(3)

Unnamed: 0,date,river_discharge
0,2005-01-01 00:00:00+00:00,0.358313
1,2005-01-02 00:00:00+00:00,0.358313
2,2005-01-03 00:00:00+00:00,0.3122


In [13]:
flood_df = (
    flood_df
    .assign(**{
        "datetime": lambda df: pd.to_datetime(
            pd.to_datetime(df["date"]).dt.tz_localize(None).dt.date
        )
    })
    .drop(columns="date")
)

In [14]:
flood_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5941 entries, 0 to 5940
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   river_discharge  5941 non-null   float64       
 1   datetime         5941 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1)
memory usage: 93.0 KB


## Climate Change Dataset

In [15]:
cli_change_df.head()

Unnamed: 0,date,climate_change_temperature_2m_mean,climate_change_temperature_2m_max,climate_change_temperature_2m_min,climate_change_wind_speed_10m_mean,climate_change_wind_speed_10m_max,climate_change_cloud_cover_mean,climate_change_shortwave_radiation_sum,climate_change_relative_humidity_2m_mean,climate_change_relative_humidity_2m_max,climate_change_relative_humidity_2m_min,climate_change_dew_point_2m_mean,climate_change_dew_point_2m_min,climate_change_dew_point_2m_max,climate_change_precipitation_sum,climate_change_rain_sum,climate_change_snowfall_sum,climate_change_pressure_msl_mean,climate_change_soil_moisture_0_to_10cm_mean,climate_change_et0_fao_evapotranspiration_sum
0,2005-01-01 00:00:00+00:00,9.559418,12.070409,6.575862,11.667466,28.228834,,,71.42315,82.39349,62.762245,4.387951,2.55415,6.447038,0.127574,,,1015.54956,,1.436427
1,2005-01-02 00:00:00+00:00,9.612512,10.779707,6.923346,21.014172,23.266617,,,74.50186,80.47813,65.83115,4.551526,2.793407,5.660715,1.530689,,,1009.44135,,0.990955
2,2005-01-03 00:00:00+00:00,3.61561,7.089004,1.420831,15.315918,26.408207,,,57.580574,67.56276,51.900047,-3.412664,-4.796935,-1.25025,13.774413,,,1021.83325,,1.098593
3,2005-01-04 00:00:00+00:00,3.718706,5.198303,1.418316,19.740993,17.782825,,,70.659294,84.647385,58.968933,-1.529647,-3.967449,0.955127,0.127524,,,1025.425,,0.777874
4,2005-01-05 00:00:00+00:00,6.921801,9.3076,4.415801,15.074694,29.294376,,,80.73801,87.732025,75.03782,3.757444,2.720729,4.944419,0.382522,,,1018.5167,,1.082161


In [16]:
cli_change_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5941 entries, 0 to 5940
Data columns (total 20 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   date                                           5941 non-null   object 
 1   climate_change_temperature_2m_mean             5941 non-null   float64
 2   climate_change_temperature_2m_max              5941 non-null   float64
 3   climate_change_temperature_2m_min              5941 non-null   float64
 4   climate_change_wind_speed_10m_mean             5941 non-null   float64
 5   climate_change_wind_speed_10m_max              5941 non-null   float64
 6   climate_change_cloud_cover_mean                0 non-null      float64
 7   climate_change_shortwave_radiation_sum         0 non-null      float64
 8   climate_change_relative_humidity_2m_mean       5941 non-null   float64
 9   climate_change_relative_humidity_2m_max        5941 

In [17]:
cli_change_df = (
    cli_change_df
    .pipe(lower_column_names)
    .assign(
        **{
            "datetime": lambda df: pd.to_datetime(
                pd.to_datetime(df["date"]).dt.tz_localize(None).dt.date
            )
        }
    )
    .drop(columns="date")
    .dropna(axis=1)
)

In [18]:
cli_change_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5941 entries, 0 to 5940
Data columns (total 15 columns):
 #   Column                                         Non-Null Count  Dtype         
---  ------                                         --------------  -----         
 0   climate_change_temperature_2m_mean             5941 non-null   float64       
 1   climate_change_temperature_2m_max              5941 non-null   float64       
 2   climate_change_temperature_2m_min              5941 non-null   float64       
 3   climate_change_wind_speed_10m_mean             5941 non-null   float64       
 4   climate_change_wind_speed_10m_max              5941 non-null   float64       
 5   climate_change_relative_humidity_2m_mean       5941 non-null   float64       
 6   climate_change_relative_humidity_2m_max        5941 non-null   float64       
 7   climate_change_relative_humidity_2m_min        5941 non-null   float64       
 8   climate_change_dew_point_2m_mean               5941 non-nu

## Merge Dfs

In [19]:
final_df = (
    dam_df
    .merge(weather_df, how="left", on="datetime")
    .merge(flood_df, how="left", on="datetime")
    .merge(cli_change_df, how="left", on="datetime")
    .dropna()
    .drop_duplicates()
    .drop(columns=["sunrise", "sunset"]) # constant value columns
    .drop(columns="snowfall_sum") # 96% of values are 0
    .astype({"weather_code": "int64"}) # categorical column
)
# precipitation_hours is cyclic

In [20]:
# ProfileReport(final_df)

In [21]:
final_df.to_csv("../dataset/interim/cleaned_dataset.csv", index=False)