In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gudhi as gd

from datetime import datetime
import matplotlib.dates as mdates

In [None]:
# List of file paths
file_paths = ["snow_oct.csv", "snow_nov.csv", "snow_dec.csv", "snow_jan.csv"]

def preprocess_snow_data(file_paths):
    all_data = []

    for file_path in file_paths:
        # Load data, skipping metadata rows
        df = pd.read_csv(file_path, skiprows=4)

        # Identify date columns by removing metadata-related columns
        meta_cols = ["GHCN ID", "Station Name", "County", "State", "Elevation", "Latitude", "Longitude"]
        date_cols = [col for col in df.columns if col not in meta_cols]

        # Melt the dataframe to long format, keeping only date columns
        df_long = df.melt(id_vars=["County"], value_vars=date_cols, var_name="date", value_name="snow_depth")

        # Replace 'M' with NaN and 'T' (trace amounts) with 0.01
        df_long["snow_depth"] = df_long["snow_depth"].replace({"M": None, "T": 0.01}).astype(float)

        # Append to the list
        all_data.append(df_long)

    # Concatenate all files
    clean_data = pd.concat(all_data, ignore_index=True)

    return clean_data



# Test preprocessing with the uploaded December data
cleaned_snow_data = preprocess_snow_data(file_paths)
cleaned_snow_data.head(345)




Unnamed: 0,County,date,snow_depth
0,MINERAL,Oct 1,0.0
1,DEERLODGE,Oct 1,
2,DEERLODGE,Oct 1,0.0
3,POWDERRIVER,Oct 1,0.0
4,LEWISANDCLARK,Oct 1,
...,...,...,...
340,VALLEY,Oct 2,0.0
341,VALLEY,Oct 2,
342,VALLEY,Oct 2,0.0
343,VALLEY,Oct 2,0.0


In [29]:
def aggregate_snow_data(snow_data, county_areas):
    
    # Step 1: Average sensor readings per county per day
    county_avg_snow = snow_data.groupby(["date", "County"])["snow_depth"].mean().reset_index()

    # Step 2: Merge with county area data
    county_avg_snow = county_avg_snow.merge(county_areas, on="County", how="left")

    # Step 3: Compute the weighted average snow depth for the state
    county_avg_snow["weighted_snow"] = county_avg_snow["snow_depth"] * county_avg_snow["AREA"]
    
    # Compute the state-wide weighted average per day
    state_snow = county_avg_snow.groupby("date").apply(
        lambda x: x["weighted_snow"].sum() / x["AREA"].sum()
    ).reset_index(name="state_avg_snow")
    
    # Ensure the result is in DataFrame format
    return state_snow

# Load county area data
county_areas_df = pd.read_csv("MTcounties.csv")

# Example usage (assuming cleaned data is ready)
aggregated_snow_data = aggregate_snow_data(cleaned_snow_data, county_areas_df)

# Print or work with the DataFrame
print(aggregated_snow_data)


       date  state_avg_snow
0     Dec 1        0.009223
1    Dec 10        0.218799
2    Dec 11        0.273005
3    Dec 12        0.091755
4    Dec 13        0.026163
..      ...             ...
118   Oct 5        0.000000
119   Oct 6        0.000000
120   Oct 7        0.000000
121   Oct 8        0.000000
122   Oct 9        0.000000

[123 rows x 2 columns]


In [31]:
aggregated_snow_data.head(10)

Unnamed: 0,date,state_avg_snow
0,Dec 1,0.009223
1,Dec 10,0.218799
2,Dec 11,0.273005
3,Dec 12,0.091755
4,Dec 13,0.026163
5,Dec 14,0.048648
6,Dec 15,0.238264
7,Dec 16,0.169388
8,Dec 17,0.355134
9,Dec 18,0.270861
