# Creating Uniform Voltage Data
Before we start working on our voltage data, we have to convert it into a uniform format.  

The two main data sources in this project are the data from the currently operating ESMI stations, which is scraped from the Prayas ESMI website using the india_esmi_scraper.py scraper, and the Harvard Dataverse data (link to be added).  
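The two are not provided in the same format, so both must be converted to a common layout before they can be processed together.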

# Aggregating Voltage Data into Usable Metrics
Before we can perform any analysis on correlation between weather factors and power outages in India, we first aggregate the power data into something that can be used.  

The data being processed here is the station data that comes directly from the india_esmi_scraper or from the previous Harvard Dataverse station data.  
ESMI data comes minute-wise, and ERA5 weather data comes hour-wise, so at a minimum the ESMI data has to be converted to hourly data.  
Additionally, the ESMI data is not complete, meaning we have to apply our discretion when choosing the times and stations with which to perform our analysis.  
To give ourselves the maximum flexibility, we will process the ESMI data into four versions:  
- Outage Events: timestamped with outage beginning and duration
- Hourly Data: each complete hour with the percentage of that hour spent at voltage = 0
- Daily Aggregate Outage Events: each complete day with the frequency of power outage events and average duration of said events, aggregated from the first outage events dataset
- Minute-wise Data for Future Interpolation: because the data is incomplete, but in some cases only slightly so, we can set a threshold for the number of missing minutes that we are willing to interpolate, and save that trimmed set of minute-wise data for later interpolation.

In [None]:
import os
import pandas as pd
from tqdm import tqdm

# Input directory holding the station CSV files
folder_path = "C:/Users/user/Desktop/data_check/Original_Study_Stations/"

# Output directory that receives one filtered CSV per input file
result_folder = "C:/Users/user/Desktop/data_check/only_0"

# Make sure the output directory is present before anything is written to it
os.makedirs(result_folder, exist_ok=True)

# Names of all entries in the input directory that end in '.csv'
csv_files = [entry for entry in os.listdir(folder_path) if entry.endswith('.csv')]

# Wrapping the list lets tqdm advance the bar by one after each file is done
with tqdm(csv_files, desc="Processing CSV files") as pbar:
    for file_name in pbar:
        # Load the whole station file
        source_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(source_path)

        # Keep only the rows whose 'voltage' value equals 0
        is_zero_voltage = df['voltage'] == 0
        df_filtered = df[is_zero_voltage]

        # Output name: the input name without its extension, plus "_only0.csv"
        stem, _extension = os.path.splitext(file_name)
        result_file_name = stem + "_only0.csv"
        result_file_path = os.path.join(result_folder, result_file_name)

        # Write the filtered rows without the DataFrame index
        df_filtered.to_csv(result_file_path, index=False)


# Read the original CSV file.
# NOTE(review): presumably this holds one row per station-minute of outage
# (zero voltage) — confirm against the step that produced it.
df = pd.read_csv("D:/ESMI_Dataset/Final_Data/power_output_analysis/India_all_stations_power_outage_data.csv")

# Convert the 'datetime' column to datetime objects
df['datetime'] = pd.to_datetime(df['datetime'])

# Sort by station and time so that the rows of one outage are adjacent
df = df.sort_values(by=['station_name', 'datetime'])

# A row continues the current outage event only when it belongs to the same
# station as the previous row AND is exactly one minute after it; every other
# row (including the very first one) opens a new event.  This is computed for
# all rows at once instead of iterating with df.iterrows(), which is very slow
# on minute-wise data.
station_changed = df['station_name'] != df['station_name'].shift()
not_next_minute = df['datetime'].diff() != pd.Timedelta(minutes=1)
starts_new_event = station_changed | not_next_minute

# Running count of event starts = one integer label per outage event.
# Converted to a plain array so grouping is positional, not index-aligned.
event_id = starts_new_event.cumsum().to_numpy()

# Collapse each event into a single row.
#   duration / number_of_rows : number of rows in the event (the two columns
#                               are identical, as in the original output)
#   start / end               : first and last timestamp of the event
outages_df = (
    df.groupby(event_id)
    .agg(
        station_name=('station_name', 'first'),
        duration=('datetime', 'size'),
        start=('datetime', 'first'),
        end=('datetime', 'last'),
        number_of_rows=('datetime', 'size'),
    )
    .reset_index(drop=True)
)

# Write the DataFrame to a new CSV file
outages_df.to_csv("D:/ESMI_Dataset/Final_Data/power_output_analysis/power_outages.csv", index=False)


# Converting hourly weather data into daily aggregate data

In [None]:
import os
import pandas as pd
import numpy as np

# Define the folder containing the hourly weather variables CSV files
folder_path = 'D:/ESMI_Dataset/EAR5_Data/raw_ESMI_boundary'

# Create a sub-folder called 'daily'
daily_folder = os.path.join(folder_path, 'daily')
os.makedirs(daily_folder, exist_ok=True)


def aggregate_daily_weather(hourly):
    """Aggregate one station's hourly weather rows into one row per day.

    Args:
        hourly: DataFrame with the columns 'datetime', 'station_id',
            'station_lat', 'station_lon', 't2m', 'd2m', 'sp', 'u10', 'v10'
            and 'tp'.  It must contain at least one row.  The input frame
            is not modified.

    Returns:
        DataFrame with the columns 'station_id', 'station_lat',
        'station_lon', 'date', 't2m', 'd2m', 'sp', 'wind_speed' and
        'tp_last_hour', one row per calendar day present in the input.

        * 't2m', 'd2m', 'sp' are daily means.
        * 'wind_speed' is sqrt(mean(u10**2) + mean(v10**2)), i.e. a
          root-mean-square speed rather than the mean of hourly speeds.
        * 'tp_last_hour' is the 'tp' value of the 23:00 record of the day
          (NaN for days without a 23:00 record).
          NOTE(review): presumably chosen because tp is accumulated over
          the day — confirm against the ERA5 product in use.
    """
    hourly = hourly.copy()
    hourly['datetime'] = pd.to_datetime(hourly['datetime'])

    # Calendar day of every row; reused as the grouping key below.  The
    # Series keeps the name 'datetime', which becomes the index name.
    day = hourly['datetime'].dt.date

    # Station metadata is taken from the first row of the file
    station_info = hourly[['station_id', 'station_lat', 'station_lon']].iloc[0]

    # Daily means of temperature, dew point and surface pressure
    daily = hourly.groupby(day)[['t2m', 'd2m', 'sp']].mean()

    # Daily means of the squared wind components, combined into one speed
    mean_squares = (hourly[['u10', 'v10']] ** 2).groupby(day).mean()
    daily['wind_speed'] = np.sqrt(mean_squares['u10'] + mean_squares['v10'])

    # 'tp' at 23:00 of each day.  The Series is renamed explicitly:
    # DataFrame.join only applies rsuffix to overlapping column names, and
    # 'tp' does not overlap, so relying on rsuffix leaves the column named
    # 'tp' and the column selection below fails with a KeyError.
    at_last_hour = hourly['datetime'].dt.hour == 23
    tp_last_hour = (
        hourly.loc[at_last_hour]
        .groupby(day[at_last_hour])['tp']
        .last()
        .rename('tp_last_hour')
    )
    daily = daily.join(tp_last_hour)

    # Attach the station metadata to every daily row
    daily['station_id'] = station_info['station_id']
    daily['station_lat'] = station_info['station_lat']
    daily['station_lon'] = station_info['station_lon']

    # Turn the day index into a 'date' column and put station columns first
    daily = daily.reset_index().rename(columns={'datetime': 'date'})
    cols = ['station_id', 'station_lat', 'station_lon', 'date',
            't2m', 'd2m', 'sp', 'wind_speed', 'tp_last_hour']
    return daily[cols]


# Get a list of all CSV files in the folder
files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Iterate through each CSV file
for file in files:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(os.path.join(folder_path, file))

    # A header-only file has no first row to take station metadata from;
    # skip it instead of aborting the whole batch.
    if df.empty:
        print(f"Skipping empty file: {file}")
        continue

    daily_data = aggregate_daily_weather(df)

    # Save the aggregated daily weather variables to a new CSV file in the
    # 'daily' sub-folder.  splitext removes only the final extension, so file
    # names that themselves contain dots are not truncated.
    output_file = os.path.join(daily_folder, os.path.splitext(file)[0] + '_daily.csv')
    daily_data.to_csv(output_file, index=False)
    print(f"Daily aggregated data saved to: {output_file}")
