In [1]:
# Configuration for processing the raw acurite CSV data
source = "acurite"            # source key passed to getCWDataFileList
stateDirs = "rawCSVdirList"   # processing-state key passed to getCWDataFileList

# Columns removed from every raw file right after it is read
dropColumns = [
    'Sensor Type',
    'Accumulated Rain',
    'Wind Speed',
    'Wind Average',
    'Wind Direction',
    'Wired Sensor Temperature',
    'Wired Sensor Humidity',
    'Soil & Liquid Temperature',
    'Water Detected',
    'UV Index',
    'Light Intensity',
    'Measured Light',
    'Lightning Strike Count',
    'Lightning Closest Strike Distance',
    'Feels Like',
    'Wind Chill',
]

# Column used to split the combined data into one frame per sensor
splitCol = "Sensor Name"

# Timestamp column, the strptime format it is written in, and the zone
# the naive timestamps are later localized to
timeCol = "Timestamp"
timestampFormat = '%Y/%m/%d %I:%M %p'
locale = 'America/New_York'

# Rows that agree on both of these columns are treated as duplicates
dupeOnColumns = [splitCol, timeCol]


In [2]:
import pandas as pd
import os
import sys

# Relative path to the directory holding the package & config dirs
pyPath = '../'

# Resolve it once, then register it on sys.path (only if it is not already
# there) so the local python scripts can be imported
pyAPath = os.path.abspath(pyPath)

if pyAPath not in sys.path:
    sys.path.append(pyAPath)

from clowderwoodpy import cw_start, getCWDataFileList


In [4]:
# TODO move functions to clowderwoodpy
class MissingTimeFormat(ValueError):
    """Raised when a time column is named but no source format is supplied."""


def load_csv_files(file_paths,dropCols=None,uniqueVals=None,timeCol=None,timeFormat=None,timeLoc=None):
    """Load multiple CSV files into a list of DataFrames.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the CSV files to read.
    dropCols : list of str, optional
        Columns removed from every frame after the time conversion.
    uniqueVals : str or list of str, optional
        Column name(s) whose distinct values are recorded per file in the
        metadata under the key ``"<column> (unique)"``.
    timeCol : str, optional
        Column converted to datetime with ``timeFormat``.
    timeFormat : str, optional
        strptime format of ``timeCol``; required whenever ``timeCol`` is given.
    timeLoc : str, optional
        Time zone the converted column is localized to (``ambiguous="infer"``).

    Returns
    -------
    (list of DataFrame, dict)
        The successfully processed frames, and a metadata dict keyed by the
        last 20 characters of each path. Each entry holds "full path" and,
        when available, "raw count", "earliest", "latest" and the unique-value
        lists. "earliest"/"latest" are taken before any localization.

    Raises
    ------
    MissingTimeFormat
        If ``timeCol`` is given without ``timeFormat``. This is a caller error,
        so it is raised before any file is read rather than reported per file.

    Notes
    -----
    A file that fails to read or process is reported with a printed message
    and left out of the returned list, so a half-converted frame is never
    returned. Its metadata entry keeps whatever was recorded before the failure.
    """
    if timeCol and not timeFormat:
        raise MissingTimeFormat(f"Can't convert column {timeCol} without source format.")

    # Accept a single column name or a list of names
    if not uniqueVals:
        uniqueCols = []
    elif isinstance(uniqueVals,list):
        uniqueCols = uniqueVals
    else:
        uniqueCols = [uniqueVals]

    dataframes = []
    metaDict = {}
    for file in file_paths:
        # The tail of the path is the metadata key and the value of the "file" column
        fn = file[-20:]
        fileMeta = {"full path": file}
        metaDict[fn] = fileMeta
        try:
            df = pd.read_csv(file)
            df["file"] = fn
            fileMeta["raw count"] = len(df)

            if timeCol:
                df[timeCol] = pd.to_datetime(df[timeCol], format=timeFormat)

                # Earliest is the minimum timestamp, latest is the maximum
                fileMeta["earliest"] = df[timeCol].min().isoformat()
                fileMeta["latest"] = df[timeCol].max().isoformat()

                if timeLoc:
                    df[timeCol] = df[timeCol].dt.tz_localize(timeLoc,ambiguous="infer")

            if dropCols:
                df = df.drop(columns=dropCols)

            for col in uniqueCols:
                fileMeta[col+" (unique)"] = list(df[col].unique())

            # Only frames that made it through every step are returned
            dataframes.append(df)

        except Exception as e:
            print(f"Error loading {file}: {e}")

    return dataframes,metaDict

# TODO perhaps add changing the index here? But not for acurite

def remove_duplicates(dataframes,dupeColumns,sortCols=None):
    """Stack the frames and keep the first row of every duplicate group.

    Rows count as duplicates when they agree on all of ``dupeColumns``. When
    ``sortCols`` is given the surviving rows are sorted by it. The returned
    frame always carries a fresh 0..n-1 index.
    """
    stacked = pd.concat(dataframes, ignore_index=True)
    deduped = stacked.drop_duplicates(subset=dupeColumns, keep='first')

    if sortCols:
        deduped = deduped.sort_values(by=sortCols)

    return deduped.reset_index(drop=True)

def rollingOutlierDetection(rawSeries,label,rollWindow="2h",minDeviation=None,outlierSigma=2,extremeSigma=4):
    """Flag values that sit far from a centered rolling mean.

    Parameters
    ----------
    rawSeries : pandas.Series
        Values to screen. With the default offset-style ``rollWindow`` the
        series needs a datetime-like index.
    label : str
        Column name given to the raw values in the result.
    rollWindow : str or int, default "2h"
        Window passed to ``Series.rolling``; the window is centered on each
        point and includes the point itself.
    minDeviation : number, optional
        When given (``0`` counts as given), two extra columns are added that
        keep only the flags whose absolute distance from the rolling mean
        exceeds this value.
    outlierSigma : number, default 2
        Rolling standard deviations beyond which a value is an 'outlier'.
    extremeSigma : number, default 4
        Rolling standard deviations beyond which a value is an 'extreme'.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, 'rolling mean', 'rolling std', 'outlier',
        'extreme', plus 'outlier > min del' and 'extreme > min del' when
        ``minDeviation`` is supplied.
    """

    # Centered rolling window; min_periods=1 keeps the edges of the series usable
    rolling_window = rawSeries.rolling(window=rollWindow, min_periods=1, center=True)

    # Calculate rolling mean and std deviation
    rolling_mean = rolling_window.mean()
    rolling_std = rolling_window.std()

    # Outliers: more than outlierSigma standard deviations from the rolling mean
    outliers = (rawSeries > rolling_mean + outlierSigma * rolling_std) | (rawSeries < rolling_mean - outlierSigma * rolling_std)

    # Extremes: more than extremeSigma standard deviations from the rolling mean
    extremes  = (rawSeries > rolling_mean + extremeSigma * rolling_std) | (rawSeries < rolling_mean - extremeSigma * rolling_std)

    resultsDict={
        label : rawSeries,
        'rolling mean' : rolling_mean,
        'rolling std' :  rolling_std,
        'outlier'  : outliers,
        'extreme' : extremes,
    }

    # Compare against None so that an explicit threshold of 0 is honored
    if minDeviation is not None:
        exceedMinDel = (rawSeries - rolling_mean).abs() > minDeviation
        resultsDict['outlier > min del'] = outliers & exceedMinDel
        resultsDict['extreme > min del']  = extremes & exceedMinDel

    return pd.DataFrame(resultsDict)


In [5]:
# Initialize clowderwoodpy. Per the banner it prints, this loads a dictionary of
# data locations keyed first by source and then by data processing state.
# NOTE(review): the contents of cwLabelsDict are not shown in this notebook - confirm in clowderwoodpy.
cwDataDict, cwLabelsDict = cw_start(verbose=True)


Welcome to clowderwoodpy v 0.0.1 .

 This initialization has loaded a dictionary which contains the current 
 raw, processed, and summarized data locations. The dictionary is
 organized by sensor or source classes, as the first level key:
	 acurite
	 rainGauge

 Each sensor has directories for different data processing states. These
 directories include the following, with the caution that not all sources
 have all data processing states. These are the second level key:
	 rawCSVdirList
	 summaryDir
	 processedDir
	 rawHTTPdirList

 In preparing the dictionary the paths are converted to absolute 
 paths from the values recorded in ../config/clowderwoodDataFile.json .


In [7]:
print ("Data from",source,",",stateDirs,":")
fileList= getCWDataFileList(cwDataDict,source,stateDirs)
print ("\t",len(fileList),"files.")

# Read every raw file using the column names from the configuration cell,
# so that changing the configuration cannot drift out of step with this call
dfList, rawMetaDict = load_csv_files(fileList,
                                     dropCols=dropColumns,
                                     uniqueVals=splitCol,
                                     timeCol=timeCol,timeFormat=timestampFormat)

# Not setting the locale on the time column here because ambiguous="infer" fails,
# possibly because there are multiple rows with the same timestamp. The locale is
# applied later, when the data is separated into individual per-sensor frames.

cleanDF = remove_duplicates(dfList,dupeOnColumns,sortCols=dupeOnColumns)
rawMetaDF = pd.DataFrame.from_dict(rawMetaDict, orient='index')

# Compare the total rows read with the rows left after removing duplicates
print("Overlap check:\n\tbefore =",rawMetaDF["raw count"].sum(),
      "\n\tafter  =",len(cleanDF))


Data from acurite , rawCSVdirList :
	 101 files.
Overlap check:
	before = 2100940 
	after  = 1800436


In [8]:
# TODO create splitCol_opt_indexTime function and move to clowderwoodpy

# see <https://pandas.pydata.org/docs/user_guide/timeseries.html>

from pandas.api.types import is_datetime64_any_dtype 
import matplotlib.pyplot as plt
# def splitCol_opt_indexTime(dataframe,splitCol,timeCol=None,locale=None):
#   """"""
# Split the combined frame into one time-indexed frame per value of splitCol and
# collect per-split metadata (row counts, date range, sampling interval, gaps).
splitMetaDict = {}
splitDFDict = {}
splitList = list(cleanDF[splitCol].unique())

for s in splitList:
    splitMetaDict[s] = {"notes": ""}

    df = cleanDF.loc[cleanDF[splitCol] == s].copy()
    splitMetaDict[s]["raw count"] = len(df)

    df = df.drop(columns=splitCol)

    # Everything below needs a datetime column, so fail early with a clear message
    if not is_datetime64_any_dtype(df[timeCol]):
        colType = str(df[timeCol].dtype)
        raise AttributeError(f"The '{timeCol}' column is not datetimelike but is {colType}.")

    df = df.sort_values(by=timeCol)

    if locale:
        # Getting timestamp as timezone aware; ambiguous and nonexistent
        # wall-clock times become NaT instead of raising
        df[timeCol] = df[timeCol].dt.tz_localize(locale, ambiguous='NaT', nonexistent='NaT')

        # Drop rows whose timestamp could not be localized
        df = df.dropna(subset=[timeCol])

        delta = splitMetaDict[s]["raw count"] - len(df)
        if delta != 0:
            noteStr = f"{splitCol} '{s}' had {delta} ambiguous timestamps in the\n{timeCol}, which did not survive conversion to {locale}.\n"
            print(noteStr)
            splitMetaDict[s]["notes"] = splitMetaDict[s]["notes"]+noteStr

    # Extract most recent (max) and earliest (min) values
    splitMetaDict[s]["latest entry"] = df[timeCol].max().isoformat()
    splitMetaDict[s]["earliest entry"] = df[timeCol].min().isoformat()

    # Determine usual delta between consecutive timestamps
    tempDF = df[[timeCol]].copy()
    tempDF['interval'] = tempDF[timeCol].diff()
    tempDF = tempDF.dropna(subset=['interval'])
    intervalFreqSeries = tempDF['interval'].value_counts().sort_index()

    if intervalFreqSeries.empty:
        # Fewer than two rows: there is no interval to characterize
        splitMetaDict[s]["usual interval"] = None
        splitMetaDict[s]["gap count"] = 0
        splitMetaDict[s]["gap df"] = tempDF
    else:
        # Most frequent interval; ties go to the shortest one because the
        # counts are sorted by interval before idxmax
        usualInterval = intervalFreqSeries.idxmax()
        splitMetaDict[s]["usual interval"] = usualInterval

        # Characterize the remaining deltas
        splitMetaDict[s]["gap count"] = intervalFreqSeries.drop(usualInterval).sum()
        splitMetaDict[s]["gap df"] = tempDF.loc[tempDF["interval"] != usualInterval]

    # Make the timeCol the index
    df = df.set_index(timeCol)

    splitDFDict[s] = df
    

Sensor Name 'Back Fence' had 48 ambiguous timestamps in the
Timestamp, which did not survive conversion to America/New_York.

Sensor Name 'Back Fence Retired 2' had 12 ambiguous timestamps in the
Timestamp, which did not survive conversion to America/New_York.

Sensor Name 'Garage' had 36 ambiguous timestamps in the
Timestamp, which did not survive conversion to America/New_York.

Sensor Name 'Greenhouse' had 24 ambiguous timestamps in the
Timestamp, which did not survive conversion to America/New_York.

Sensor Name 'In home  (w/display)' had 59 ambiguous timestamps in the
Timestamp, which did not survive conversion to America/New_York.

Sensor Name 'Raingauge' had 24 ambiguous timestamps in the
Timestamp, which did not survive conversion to America/New_York.



In [17]:
# What is the range of dates in 
# 'Raingauge'? First seems to be 2023-10-21 10:30:00-04:00
# 'Back Fence' 2019-11-16 00:00:00-05:00 - 2025-04-02 21:55:00-04:00
# 'Back Fence Retired 2' 2023-10-31 00:00:00-04:00 - 2023-11-05 16:15:00-05:00

# Show the measurement columns kept for this sensor. The frame was sorted by
# timestamp when it was built, and its date range is recorded in
# splitMetaDict['Back Fence'] under "earliest entry" / "latest entry".
splitDFDict['Back Fence'].columns

Index(['Temperature ( F )', 'Humidity ( RH )', 'Dew Point ( F )',
       'Heat Index ( F )', 'Barometric Pressure ( INHG )', 'file'],
      dtype='object')

In [18]:
import pandas as pd

# Temperature thresholds, in the units of the 'Temperature ( F )' column
thresholds = [25, 30, 32, 40, 45, 50, 55, 60, 65, 70]

# Look the sensor frame up once, then keep only readings from April through June
back_fence = splitDFDict['Back Fence']
in_season = (back_fence.index.month >= 4) & (back_fence.index.month <= 6)
filtered_df = back_fence.loc[in_season]

# For each threshold: the last timestamp in each year at or below that temperature
results = {}

for threshold in thresholds:
    # Rows where the temperature is at or below the threshold
    below_threshold = filtered_df[filtered_df['Temperature ( F )'] <= threshold]

    # Latest timestamp per calendar year; the built-in aggregation yields a
    # Series even when no rows qualify for this threshold
    stamps = below_threshold.index.to_series()
    last_dates = stamps.groupby(stamps.index.year).max()

    results[threshold] = last_dates

# One row per year, one column per threshold; NaT where a year never got that cold
results_df = pd.DataFrame(results)

# Display the results
print(results_df)

                                 25                        30  \
Timestamp                                                       
2020                            NaT                       NaT   
2021      2021-04-03 07:15:00-04:00 2021-04-03 08:30:00-04:00   
2022                            NaT                       NaT   
2023                            NaT                       NaT   
2024                            NaT                       NaT   
2025                            NaT                       NaT   

                                 32                        40  \
Timestamp                                                       
2020      2020-05-10 06:15:00-04:00 2020-05-12 07:45:00-04:00   
2021      2021-04-23 07:30:00-04:00 2021-05-14 07:00:00-04:00   
2022      2022-04-20 07:35:00-04:00 2022-04-21 07:30:00-04:00   
2023                            NaT 2023-05-04 06:35:00-04:00   
2024      2024-04-23 07:30:00-04:00 2024-04-25 07:10:00-04:00   
2025                    