In [1]:
import glob
import xarray as xr

In [2]:
stations = glob.glob('data/full_files/*.nc')

In [5]:
import xarray as xr
import pandas as pd
import os
import pytz
from timezonefinder import TimezoneFinder

def utc_to_local(dataset, latitude, longitude):
    tf = TimezoneFinder()
    timezone_str = tf.certain_timezone_at(lat=latitude, lng=longitude)
    if timezone_str is None:
        print(f"Warning: Could not determine the time zone for latitude={latitude}, longitude={longitude}. Using UTC.")
        return dataset
    else:
        timezone = pytz.timezone(timezone_str)
        dataset['time'] = pd.to_datetime(dataset['time'].values).tz_localize(pytz.UTC).tz_convert(timezone)
        return dataset

for station in stations:
    large_ds = xr.open_dataset(station)
    station_name = station.split('/')[-1].split('.')[0]
    
    latitude = float(large_ds.attrs.get('geospatial_lat_min'))
    longitude = float(large_ds.attrs.get('geospatial_lon_min'))
    elevation = large_ds.attrs.get('geospatial_vertical_min')
    
    # Convert time from UTC to local time
    large_ds = utc_to_local(large_ds, latitude, longitude)
    
    dates = large_ds.indexes['time'].to_series().dt.date.unique()

    # Filter dates to include only those after January 1, 2005
    dates = dates[dates >= pd.to_datetime('2005-01-01').date()]

    # Iterate over each unique date
    for date in dates:
        output_file = f"data/daily/{station_name}_{date.strftime('%Y%m%d')}.nc"
        
        if not os.path.exists(output_file):
            try:
                # Select data for the current date
                mini_ds = large_ds.sel(time=str(date))
                
                # Save the mini Dataset to a new NetCDF file
                mini_ds.to_netcdf(output_file)
            except:
                print(f"Skipping ambiguous time for date: {date}")
                continue

Skipping ambiguous time for date: 2006-02-18
Skipping ambiguous time for date: 2006-11-05
Skipping ambiguous time for date: 2007-02-24
Skipping ambiguous time for date: 2007-10-14
Skipping ambiguous time for date: 2008-02-16
Skipping ambiguous time for date: 2008-10-19
Skipping ambiguous time for date: 2009-02-14
Skipping ambiguous time for date: 2009-10-18
Skipping ambiguous time for date: 2010-02-20
Skipping ambiguous time for date: 2010-10-17
Skipping ambiguous time for date: 2011-02-19
Skipping ambiguous time for date: 2011-10-16
Skipping ambiguous time for date: 2012-02-25
Skipping ambiguous time for date: 2012-10-21
Skipping ambiguous time for date: 2013-02-16
Skipping ambiguous time for date: 2013-10-20
Skipping ambiguous time for date: 2014-02-15
Skipping ambiguous time for date: 2014-10-19
Skipping ambiguous time for date: 2015-02-21
Skipping ambiguous time for date: 2015-10-18
Skipping ambiguous time for date: 2016-02-20
Skipping ambiguous time for date: 2016-10-16
Skipping a

In [7]:
import glob

files = glob.glob('data/daily/*nc')

len(files)

159375

In [11]:
import xarray as xr
import os
import shutil

# Directory containing the xarray files
directory = "data/daily"

# Directory to move the files that meet the condition
output_directory = "data/daily_processed"

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# List of variables to check for NaN percentage
variables = ["GHI", "DHI", "BNI", "ghi_extra", "ghi_clear", "bhi_clear", "dhi_clear", "dni_clear", "ghi", "bhi", "dhi", "dni"]

# Threshold for non-NaN percentage
threshold = 0.8

# Iterate over the files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".nc"):  # Assuming the files have a ".nc" extension
        file_path = os.path.join(directory, filename)
        
        # Open the xarray dataset
        ds = xr.open_dataset(file_path)
        
        # Check the percentage of non-NaN values for each variable
        valid_vars = []
        for var in variables:
            if var in ds:
                non_nan_percentage = ds[var].count() / len(ds[var])
                if non_nan_percentage >= threshold:
                    valid_vars.append(var)
        
        # If all specified variables have more than 80% non-NaN values
        if len(valid_vars) == len(variables):
            # Interpolate NaN values for all variables
            ds = ds.interpolate_na(dim="time")
            
            # Save the modified dataset to the output directory
            output_path = os.path.join(output_directory, filename)
            ds.to_netcdf(output_path)
            
        else:
            pass

In [12]:
import glob

files = glob.glob('data/daily_processed/*nc')

len(files)

101375