In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gudhi as gd
import os
from datetime import datetime
import matplotlib.dates as mdates

In [26]:

def preprocess_snow_data(folder_path):
    all_data = []

    # Get all CSV files in the folder
    file_paths = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

    for file_path in file_paths:
        # Read the first few rows to extract the year
        with open(file_path, 'r') as file:
            first_row = file.readlines()[0]  # Get the first row
            year_month = first_row.split(',')[1].strip()  # Extract the month-year string
            year = year_month.split(' ')[1]  # Split and take the year part

        # Load the actual data, skipping the metadata rows
        df = pd.read_csv(file_path, skiprows=4)

        # Identify date columns by removing metadata-related columns
        meta_cols = ["GHCN ID", "Station Name", "County", "State", "Elevation", "Latitude", "Longitude"]
        date_cols = [col for col in df.columns if col not in meta_cols]

        # Melt the dataframe to long format, keeping only date columns
        df_long = df.melt(id_vars=["County"], value_vars=date_cols, var_name="date", value_name="snow_depth")

        # Add the year to the 'date' column
        df_long['date'] = df_long['date'] + ' ' + year  # Append the extracted year to the date

        # Replace 'M' with NaN and 'T' (trace amounts) with 0.01
        df_long["snow_depth"] = df_long["snow_depth"].replace({"M": None, "T": 0.01}).astype(float)

        # Append to the list
        all_data.append(df_long)

    # Concatenate all files
    clean_data = pd.concat(all_data, ignore_index=True)

    # Convert 'date' to datetime
    clean_data['date'] = pd.to_datetime(clean_data['date'], format='%b %d %Y')

    return clean_data

# Example: preprocess data from a folder
folder_path = '.\data'
cleaned_snow_data = preprocess_snow_data(folder_path)
cleaned_snow_data.head()


Unnamed: 0,County,date,snow_depth
0,MINERAL,2024-12-01,0.0
1,MADISON,2024-12-01,
2,DEERLODGE,2024-12-01,0.0
3,DEERLODGE,2024-12-01,0.0
4,POWDERRIVER,2024-12-01,0.0


In [27]:
def aggregate_snow_data(snow_data, county_areas):
    
    # Step 1: Average sensor readings per county per day
    county_avg_snow = snow_data.groupby(["date", "County"])["snow_depth"].mean().reset_index()

    # Step 2: Merge with county area data
    county_avg_snow = county_avg_snow.merge(county_areas, on="County", how="left")

    # Step 3: Compute the weighted average snow depth for the state
    county_avg_snow["weighted_snow"] = county_avg_snow["snow_depth"] * county_avg_snow["AREA"]
    
    # Compute the state-wide weighted average per day
    state_snow = county_avg_snow.groupby("date").apply(
        lambda x: x["weighted_snow"].sum() / x["AREA"].sum()
    ).reset_index(name="state_avg_snow")
    
    # Ensure the result is in DataFrame format
    return state_snow

# Load county area data
county_areas_df = pd.read_csv("MTcounties.csv")

# Example usage (assuming cleaned data is ready)
aggregated_snow_data = aggregate_snow_data(cleaned_snow_data, county_areas_df)
aggregated_snow_data = aggregated_snow_data.sort_values(by="date")

# Print or work with the DataFrame
print(aggregated_snow_data)


          date  state_avg_snow
0   2024-01-01        0.023049
1   2024-01-02        0.000367
2   2024-01-03        0.001840
3   2024-01-04        0.098346
4   2024-01-05        0.041817
..         ...             ...
118 2024-12-27        0.078915
119 2024-12-28        0.122403
120 2024-12-29        0.162725
121 2024-12-30        1.119695
122 2024-12-31        0.475893

[123 rows x 2 columns]


In [5]:
aggregated_snow_data.head(10)

Unnamed: 0,date,state_avg_snow
0,Dec 1,0.009223
1,Dec 10,0.218799
2,Dec 11,0.273005
3,Dec 12,0.091755
4,Dec 13,0.026163
5,Dec 14,0.048648
6,Dec 15,0.238264
7,Dec 16,0.169388
8,Dec 17,0.355134
9,Dec 18,0.270861


In [8]:
aggregated_snow_data['state_avg_snow'].max()

2.6649999744887705