In [29]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import glob

# DataFrame for a Single File

In [30]:
# Load one cast file: parse the '%'-prefixed metadata header, read the numeric
# rows that follow it, and attach selected metadata as constant columns.
file_path = r"C:\Users\15093\work\classes\Summer 25\Sittin on the dock of the bay\Data\All Data\CC2435009_20250513_181211.csv"

# Number of lines that precede the numeric rows in the file. Used both for
# reading the header and for telling genfromtxt how much to skip.
header_line_count = 29

with open(file_path, 'r', encoding='utf-8') as f:
    # zip() simply stops at end-of-file, so a file shorter than the expected
    # header cannot raise StopIteration the way repeated next(f) calls would.
    header_lines = [line for _, line in zip(range(header_line_count), f)]

# Build {key: value} from lines shaped like "% key,value".
meta = {}
for line in header_lines:
    line = line.strip()
    if not line.startswith('%'):
        continue
    line = line.lstrip('%').strip()

    if ',' not in line:
        continue  # skip empty or malformed lines

    # Split on the first comma only, so any later commas stay in the value.
    key, value = line.split(',', 1)
    meta[key.strip()] = value.strip()

print(meta)

# `or "nan"` covers both a missing key and a key that is present but blank;
# float("") would raise ValueError.
latitude = float(meta.get("Start latitude") or "nan")
longitude = float(meta.get("Start longitude") or "nan")
utc_time = pd.to_datetime(meta.get("Cast time (UTC)") or pd.NaT)
device_id = meta.get("Device", "")
file_name = meta.get("File name", "")
cast_duration = float(meta.get("Cast duration (Seconds)") or "nan")
samples_per_sec = float(meta.get("Samples per second") or "nan")

data = np.genfromtxt(file_path, delimiter=',', skip_header=header_line_count)
if data.ndim == 1:
    # A file with a single data row comes back 1-D; keep it as one row of
    # many columns instead of letting pandas turn it into one column.
    data = data.reshape(1, -1)

# columns of data collected
column_names = ["Pressure (dbar)", "Depth (m)", "Temperature (°C)", "Conductivity (µS/cm)",
                "Specific Conductance (µS/cm)", "Salinity (PSS)",
                "Sound Velocity (m/s)", "Density (kg/m³)"]

# Passing the names at construction makes a column-count mismatch fail with
# the actual vs. expected shape in the error message.
df = pd.DataFrame(data, columns=column_names)

# can change which parameters of the metadata you want
df["Latitude"] = latitude
df["Longitude"] = longitude
df["UTC Time"] = utc_time
df["File Name"] = file_name
#df["Device ID"] = device_id
#df["Cast Duration"] = cast_duration
#df["Samples Per Sec"] = samples_per_sec

df

{'Device': 'CC2435009', 'File name': 'CC2435009_20250513_181211', 'Cast time (UTC)': '2025-05-13 18:12:11', 'Cast time (local)': '2025-05-13 12:12:11', 'Sample type': 'Cast', 'Cast data': 'Processed', 'Location source': 'GPS', 'Default latitude': '32', 'Default altitude': '0', 'Start latitude': '40.2741476', 'Start longitude': '-106.8399001', 'Start altitude': '2287.0380859375', 'Start GPS horizontal error(Meter)': '44.708999633789064', 'Start GPS vertical error(Meter)': '673.39300537109376', 'Start GPS number of satellites': '3', 'End latitude': '', 'End longitude': '', 'End altitude': '', 'End GPS horizontal error(Meter)': '', 'End GPS vertical error(Meter)': '', 'End GPS number of satellites': '', 'Cast duration (Seconds)': '34', 'Samples per second': '5', 'Electronics calibration date': '0001-01-01', 'Conductivity calibration date': '2024-10-09', 'Temperature calibration date': '2024-10-07', 'Pressure calibration date': '2024-09-17'}


Unnamed: 0,Pressure (dbar),Depth (m),Temperature (°C),Conductivity (µS/cm),Specific Conductance (µS/cm),Salinity (PSS),Sound Velocity (m/s),Density (kg/m³),Latitude,Longitude,UTC Time,File Name
0,0.15,0.153151,9.153524,296.339874,433.834991,0.204679,1444.146359,999.934361,40.274148,-106.8399,2025-05-13 18:12:11,CC2435009_20250513_181211
1,0.45,0.458792,9.255136,295.835466,431.811837,0.203774,1444.561198,999.927172,40.274148,-106.8399,2025-05-13 18:12:11,CC2435009_20250513_181211
2,0.75,0.764653,9.132318,295.647002,433.089545,0.2043,1444.069489,999.938574,40.274148,-106.8399,2025-05-13 18:12:11,CC2435009_20250513_181211
3,1.05,1.070511,9.033097,296.121842,435.049797,0.205173,1443.672646,999.948268,40.274148,-106.8399,2025-05-13 18:12:11,CC2435009_20250513_181211
4,1.35,1.376366,8.925369,296.570547,437.092585,0.206079,1443.240202,999.958485,40.274148,-106.8399,2025-05-13 18:12:11,CC2435009_20250513_181211
5,1.65,1.68222,8.913978,296.471006,437.092649,0.206069,1443.198541,999.960762,40.274148,-106.8399,2025-05-13 18:12:11,CC2435009_20250513_181211
6,2.094145,2.13503,8.925132,296.00581,436.263319,0.205674,1443.250564,999.961758,40.274148,-106.8399,2025-05-13 18:12:11,CC2435009_20250513_181211


# Combined DataFrame for All Files

In [31]:
import os
import glob
import pandas as pd
import numpy as np

folder_path = r"C:\Users\15093\work\classes\Summer 25\Sittin on the dock of the bay\Data\All Data"
# glob returns matches in arbitrary order; sort so the combined frame has the
# same row order on every run.
file_paths = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

# Number of lines that precede the numeric rows in each file.
HEADER_LINE_COUNT = 29

# Names for the numeric columns, in file order.
DATA_COLUMNS = ["Pressure (dbar)", "Depth (m)", "Temperature (°C)", "Conductivity (µS/cm)",
                "Specific Conductance (µS/cm)", "Salinity (PSS)",
                "Sound Velocity (m/s)", "Density (kg/m³)"]

# Constant per-file columns appended after the numeric ones.
METADATA_COLUMNS = ["Latitude", "Longitude", "UTC Time", "File Name"]


def parse_cast_metadata(header_lines) -> dict:
    """Turn header lines shaped like ``% key,value`` into a ``{key: value}`` dict.

    Lines that do not start with ``%`` or contain no comma are ignored. Only
    the first comma separates key from value, so later commas stay in the value.
    Keys and values are whitespace-stripped strings.
    """
    meta = {}
    for line in header_lines:
        line = line.strip()
        if not line.startswith('%'):
            continue
        line = line.lstrip('%').strip()
        if ',' not in line:
            continue  # skip empty or malformed lines
        key, value = line.split(',', 1)
        meta[key.strip()] = value.strip()
    return meta


def read_cast_metadata(file_path: str, header_line_count: int = HEADER_LINE_COUNT) -> dict:
    """Read up to ``header_line_count`` leading lines of a file and parse them.

    A file shorter than ``header_line_count`` is read to its end rather than
    raising StopIteration.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # zip() stops at end-of-file instead of raising like next(f) would.
        header_lines = [line for _, line in zip(range(header_line_count), f)]
    return parse_cast_metadata(header_lines)


def read_cast_samples(file_path: str, header_line_count: int = HEADER_LINE_COUNT) -> np.ndarray:
    """Parse the numeric rows after the header into a 2-D float array.

    Raises:
        ValueError: if there are no data rows, or the number of columns does
            not match ``DATA_COLUMNS``.
    """
    data = np.genfromtxt(file_path, delimiter=',', skip_header=header_line_count)
    if data.size == 0:
        raise ValueError("no data rows after the header")
    if data.ndim == 1:
        # A single data row comes back 1-D; keep it as one row of many columns.
        data = data.reshape(1, -1)
    if data.shape[1] != len(DATA_COLUMNS):
        raise ValueError(
            f"expected {len(DATA_COLUMNS)} columns, found {data.shape[1]}"
        )
    return data


def build_cast_frame(samples: np.ndarray, meta: dict) -> pd.DataFrame:
    """Combine one file's numeric rows with its metadata as constant columns."""
    df = pd.DataFrame(samples, columns=DATA_COLUMNS)
    # `or "nan"` / `or pd.NaT` cover both a missing key and a blank value;
    # float("") would raise ValueError.
    df["Latitude"] = float(meta.get("Start latitude") or "nan")
    df["Longitude"] = float(meta.get("Start longitude") or "nan")
    df["UTC Time"] = pd.to_datetime(meta.get("Cast time (UTC)") or pd.NaT)
    df["File Name"] = meta.get("File name", "")
    return df


all_dfs = []

for file_path in file_paths:
    meta = read_cast_metadata(file_path)

    # skip invalid measurement files
    if meta.get("Sample type", "").lower() == "invalid":
        print(f"Skipping file due to invalid sample: {os.path.basename(file_path)}")
        continue

    try:
        samples = read_cast_samples(file_path)
    except Exception as exc:
        # Still best-effort, but say which file was dropped and why instead
        # of skipping it silently.
        print(f"Skipping file that could not be parsed: {os.path.basename(file_path)} ({exc})")
        continue

    all_dfs.append(build_cast_frame(samples, meta))

# concat all dataframes; pd.concat raises on an empty list, so fall back to an
# empty frame with the expected columns when nothing was usable.
if all_dfs:
    big_df = pd.concat(all_dfs, ignore_index=True)
else:
    print(f"No usable cast files found in: {folder_path}")
    big_df = pd.DataFrame(columns=DATA_COLUMNS + METADATA_COLUMNS)


Skipping file due to invalid sample: CC2435009_20250515_205151.csv
Skipping file due to invalid sample: CC2435009_20250516_180225.csv
Skipping file due to invalid sample: CC2435009_20250517_163909.csv
Skipping file due to invalid sample: CC2435009_20250517_171247.csv
Skipping file due to invalid sample: CC2435009_20250517_194351.csv
Skipping file due to invalid sample: CC2435009_20250517_194449.csv
Skipping file due to invalid sample: CC2435009_20250519_171944.csv
Skipping file due to invalid sample: CC2435009_20250519_172106.csv
Skipping file due to invalid sample: CC2435009_20250519_172625.csv
Skipping file due to invalid sample: CC2435009_20250519_172818.csv
Skipping file due to invalid sample: CC2435009_20250519_172856.csv
Skipping file due to invalid sample: CC2435009_20250519_172932.csv


In [32]:
# Display the combined frame; long frames are shortened in the rendered table.
big_df

Unnamed: 0,Pressure (dbar),Depth (m),Temperature (°C),Conductivity (µS/cm),Specific Conductance (µS/cm),Salinity (PSS),Sound Velocity (m/s),Density (kg/m³),Latitude,Longitude,UTC Time,File Name
0,0.15000,0.153151,9.153524,296.339874,433.834991,0.204679,1444.146359,999.934361,40.274148,-106.839900,2025-05-13 18:12:11,CC2435009_20250513_181211
1,0.45000,0.458792,9.255136,295.835466,431.811837,0.203774,1444.561198,999.927172,40.274148,-106.839900,2025-05-13 18:12:11,CC2435009_20250513_181211
2,0.75000,0.764653,9.132318,295.647002,433.089545,0.204300,1444.069489,999.938574,40.274148,-106.839900,2025-05-13 18:12:11,CC2435009_20250513_181211
3,1.05000,1.070511,9.033097,296.121842,435.049797,0.205173,1443.672646,999.948268,40.274148,-106.839900,2025-05-13 18:12:11,CC2435009_20250513_181211
4,1.35000,1.376366,8.925369,296.570547,437.092585,0.206079,1443.240202,999.958485,40.274148,-106.839900,2025-05-13 18:12:11,CC2435009_20250513_181211
...,...,...,...,...,...,...,...,...,...,...,...,...
4096,7.95000,8.106426,9.856007,304.843004,437.289056,0.206849,1447.095628,999.915785,40.283620,-106.853463,2025-05-19 19:10:35,CC2435009_20250519_191035
4097,8.25000,8.412304,9.843826,303.101144,434.942401,0.205699,1447.050447,999.917371,40.283620,-106.853463,2025-05-19 19:10:35,CC2435009_20250519_191035
4098,8.55000,8.718180,9.705838,301.472365,434.325143,0.205300,1446.504031,999.930334,40.283620,-106.853463,2025-05-19 19:10:35,CC2435009_20250519_191035
4099,8.85000,9.024043,8.859839,295.236544,435.968619,0.205457,1443.091548,999.998864,40.283620,-106.853463,2025-05-19 19:10:35,CC2435009_20250519_191035


In [33]:
# Write the combined frame to disk without the integer row index.
big_df.to_csv(
    path_or_buf=r'C:\Users\15093\work\classes\Summer 25\Sittin on the dock of the bay\big_df.csv',
    index=False,
)