# Time Series Analysis

### Loading Libraries

In [149]:
# ZipFiles & IO
import io
import os
import zipfile

#URL
import urllib.request

# Time-Zone
import pytz

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import polars as pl
import polars.selectors as cs

# Data Visualization
import hvplot
import altair as alt
import seaborn as sns
import holoviews as hv
import matplotlib.pyplot as plt

# XGBoost
import xgboost as xgb

# Scikit-Learn
from sklearn import decomposition

# Date & Time
from datetime import datetime
from datetime import timedelta

In [2]:
hvplot.extension('matplotlib')

### Loading Dataset

#### URL Function

In [3]:
def download_and_modify_url(url, local_filename):
    urllib.request.urlretrieve(url, local_filename)
    
    with open(local_filename, 'r') as file:
        lines = file.readlines()
    
    with open(local_filename, 'w') as file:
        for i, line in enumerate(lines):
            if i < 34 or i == 35:
                continue
            file.write(line)

In [4]:
url = 'https://github.com/mattharrison/datasets/raw/master/data/dirtydevil.txt'

In [5]:
local_filename = '/Users/isisromero/desktop/polars/datasets/devilclean.txt'

In [6]:
download_and_modify_url(url, local_filename)

In [7]:
print(f"Saved & Modified File within {local_filename}")

Saved & Modified File within /Users/isisromero/desktop/polars/datasets/devilclean.txt


#### Tweaking Function

In [8]:
def tweak_river(df_):
    return (df_
            .select('agency_cd', 'site_no', 'tz_cd', 
                    pl.col('datetime').str.to_datetime(),
                    cfs=pl.col('144166_00060'),
                    gage_height=pl.col('144167_00065').cast(pl.Float64)
                   )
           )

In [9]:
raw = pl.read_csv('/Users/isisromero/desktop/polars/datasets/devilclean.txt', separator='\t')

In [10]:
dd = tweak_river(raw)

In [11]:
print(dd)

shape: (539_305, 6)
┌───────────┬─────────┬───────┬─────────────────────┬──────┬─────────────┐
│ agency_cd ┆ site_no ┆ tz_cd ┆ datetime            ┆ cfs  ┆ gage_height │
│ ---       ┆ ---     ┆ ---   ┆ ---                 ┆ ---  ┆ ---         │
│ str       ┆ i64     ┆ str   ┆ datetime[μs]        ┆ f64  ┆ f64         │
╞═══════════╪═════════╪═══════╪═════════════════════╪══════╪═════════════╡
│ USGS      ┆ 9333500 ┆ MDT   ┆ 2001-05-07 01:00:00 ┆ 71.0 ┆ null        │
│ USGS      ┆ 9333500 ┆ MDT   ┆ 2001-05-07 01:15:00 ┆ 71.0 ┆ null        │
│ USGS      ┆ 9333500 ┆ MDT   ┆ 2001-05-07 01:30:00 ┆ 71.0 ┆ null        │
│ USGS      ┆ 9333500 ┆ MDT   ┆ 2001-05-07 01:45:00 ┆ 70.0 ┆ null        │
│ USGS      ┆ 9333500 ┆ MDT   ┆ 2001-05-07 02:00:00 ┆ 70.0 ┆ null        │
│ …         ┆ …       ┆ …     ┆ …                   ┆ …    ┆ …           │
│ USGS      ┆ 9333500 ┆ MDT   ┆ 2020-09-28 08:30:00 ┆ 9.53 ┆ 6.16        │
│ USGS      ┆ 9333500 ┆ MDT   ┆ 2020-09-28 08:45:00 ┆ 9.2  ┆ 6.15        │
│ USG

### Dates Convertion

In [12]:
# Reading as a Date
print(pl.read_csv('/Users/isisromero/desktop/polars/datasets/devilclean.txt', separator='\t', try_parse_dates=True))

shape: (539_305, 8)
┌───────────┬─────────┬─────────────┬───────┬─────────────┬─────────────┬─────────────┬────────────┐
│ agency_cd ┆ site_no ┆ datetime    ┆ tz_cd ┆ 144166_0006 ┆ 144166_0006 ┆ 144167_0006 ┆ 144167_000 │
│ ---       ┆ ---     ┆ ---         ┆ ---   ┆ 0           ┆ 0_cd        ┆ 5           ┆ 65_cd      │
│ str       ┆ i64     ┆ datetime[μs ┆ str   ┆ ---         ┆ ---         ┆ ---         ┆ ---        │
│           ┆         ┆ ]           ┆       ┆ f64         ┆ str         ┆ str         ┆ str        │
╞═══════════╪═════════╪═════════════╪═══════╪═════════════╪═════════════╪═════════════╪════════════╡
│ USGS      ┆ 9333500 ┆ 2001-05-07  ┆ MDT   ┆ 71.0        ┆ A:[91]      ┆ null        ┆ null       │
│           ┆         ┆ 01:00:00    ┆       ┆             ┆             ┆             ┆            │
│ USGS      ┆ 9333500 ┆ 2001-05-07  ┆ MDT   ┆ 71.0        ┆ A:[91]      ┆ null        ┆ null       │
│           ┆         ┆ 01:15:00    ┆       ┆             ┆            

In [13]:
# Dates Proper Conversion, as follows:
format = '%Y-%m-%d %H:%M'

In [14]:
# Contrasting All Conversion
print(raw
      .select(original=pl.col('datetime'),
              to_datetime=pl.col('datetime').str.to_datetime(format),
              to_date=pl.col('datetime').str.to_date(format),
              strptime=pl.col('datetime').str.strptime(pl.Datetime, format),
              # Line below fails
              #cast=pl.col('datetime').cast(pl.Datetime)
             )
     )

shape: (539_305, 4)
┌──────────────────┬─────────────────────┬────────────┬─────────────────────┐
│ original         ┆ to_datetime         ┆ to_date    ┆ strptime            │
│ ---              ┆ ---                 ┆ ---        ┆ ---                 │
│ str              ┆ datetime[μs]        ┆ date       ┆ datetime[μs]        │
╞══════════════════╪═════════════════════╪════════════╪═════════════════════╡
│ 2001-05-07 01:00 ┆ 2001-05-07 01:00:00 ┆ 2001-05-07 ┆ 2001-05-07 01:00:00 │
│ 2001-05-07 01:15 ┆ 2001-05-07 01:15:00 ┆ 2001-05-07 ┆ 2001-05-07 01:15:00 │
│ 2001-05-07 01:30 ┆ 2001-05-07 01:30:00 ┆ 2001-05-07 ┆ 2001-05-07 01:30:00 │
│ 2001-05-07 01:45 ┆ 2001-05-07 01:45:00 ┆ 2001-05-07 ┆ 2001-05-07 01:45:00 │
│ 2001-05-07 02:00 ┆ 2001-05-07 02:00:00 ┆ 2001-05-07 ┆ 2001-05-07 02:00:00 │
│ …                ┆ …                   ┆ …          ┆ …                   │
│ 2020-09-28 08:30 ┆ 2020-09-28 08:30:00 ┆ 2020-09-28 ┆ 2020-09-28 08:30:00 │
│ 2020-09-28 08:45 ┆ 2020-09-28 08:45:00 ┆ 2

### Combining Columns to Create

In [15]:
print(raw
      .select(to_datetime=pl.col('datetime').str.to_datetime(format))
      .with_columns(month=pl.col('to_datetime').dt.strftime('%m'),
                    year=pl.col('to_datetime').dt.strftime('%Y'))
     )

shape: (539_305, 3)
┌─────────────────────┬───────┬──────┐
│ to_datetime         ┆ month ┆ year │
│ ---                 ┆ ---   ┆ ---  │
│ datetime[μs]        ┆ str   ┆ str  │
╞═════════════════════╪═══════╪══════╡
│ 2001-05-07 01:00:00 ┆ 05    ┆ 2001 │
│ 2001-05-07 01:15:00 ┆ 05    ┆ 2001 │
│ 2001-05-07 01:30:00 ┆ 05    ┆ 2001 │
│ 2001-05-07 01:45:00 ┆ 05    ┆ 2001 │
│ 2001-05-07 02:00:00 ┆ 05    ┆ 2001 │
│ …                   ┆ …     ┆ …    │
│ 2020-09-28 08:30:00 ┆ 09    ┆ 2020 │
│ 2020-09-28 08:45:00 ┆ 09    ┆ 2020 │
│ 2020-09-28 09:00:00 ┆ 09    ┆ 2020 │
│ 2020-09-28 09:15:00 ┆ 09    ┆ 2020 │
│ 2020-09-28 09:30:00 ┆ 09    ┆ 2020 │
└─────────────────────┴───────┴──────┘


In [16]:
print(raw
      .select(to_datetime=pl.col('datetime').str.to_datetime(format))
      .with_columns(month=pl.col('to_datetime').dt.strftime('%m'),year=pl.col('to_datetime').dt.strftime('%Y'))
      .select(pl.date(pl.col('year'), pl.col('month'), 1))
     )

shape: (539_305, 1)
┌────────────┐
│ date       │
│ ---        │
│ date       │
╞════════════╡
│ 2001-05-01 │
│ 2001-05-01 │
│ 2001-05-01 │
│ 2001-05-01 │
│ 2001-05-01 │
│ …          │
│ 2020-09-01 │
│ 2020-09-01 │
│ 2020-09-01 │
│ 2020-09-01 │
│ 2020-09-01 │
└────────────┘


### Changing Time-Zones

In [17]:
# Inspecting Current Time
print(tweak_river(raw)
      ['datetime']
      .dtype.time_zone
     )

None


In [18]:
# Checking PyZone Attributes
pytz.all_timezones[:5]

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara']

In [19]:
# Dates Proper Conversion, as follows:
format = '%Y-%m-%d %H:%M'

In [20]:
print(raw
      .select(original=pl.col('datetime'), naive=pl.col('datetime').str.to_datetime(format), 
              utc=pl.col('datetime').str.to_datetime(format)
              .dt.replace_time_zone('UTC'), Denver=pl.col('datetime').str.to_datetime(format)
              .dt.replace_time_zone('UTC')
              .dt.convert_time_zone('America/Denver'), Denver2=(pl.col('datetime') + ' ' + (pl.col('tz_cd')
                .str.replace('MST', '-0700').str.replace('MDT', '-0600')))
              .str.to_datetime('%Y-%m-%d %H:%M %z')
              .dt.convert_time_zone('America/Denver'), 
              Denver3=(pl.col('datetime').str.to_datetime(format, time_zone='America/Denver', ambiguous='earliest'))
             )
     )

shape: (539_305, 6)
┌────────────┬─────────────────┬────────────────┬────────────────┬────────────────┬────────────────┐
│ original   ┆ naive           ┆ utc            ┆ Denver         ┆ Denver2        ┆ Denver3        │
│ ---        ┆ ---             ┆ ---            ┆ ---            ┆ ---            ┆ ---            │
│ str        ┆ datetime[μs]    ┆ datetime[μs,   ┆ datetime[μs,   ┆ datetime[μs,   ┆ datetime[μs,   │
│            ┆                 ┆ UTC]           ┆ America/Denver ┆ America/Denver ┆ America/Denver │
│            ┆                 ┆                ┆ ]              ┆ ]              ┆ ]              │
╞════════════╪═════════════════╪════════════════╪════════════════╪════════════════╪════════════════╡
│ 2001-05-07 ┆ 2001-05-07      ┆ 2001-05-07     ┆ 2001-05-06     ┆ 2001-05-07     ┆ 2001-05-07     │
│ 01:00      ┆ 01:00:00        ┆ 01:00:00 UTC   ┆ 19:00:00 MDT   ┆ 01:00:00 MDT   ┆ 01:00:00 MDT   │
│ 2001-05-07 ┆ 2001-05-07      ┆ 2001-05-07     ┆ 2001-05-06     ┆ 2001

In [21]:
# Dates Proper Conversion, as follows:
format = '%Y-%m-%d %H:%M'

In [22]:
print(raw
      .select(
          Denver2=(pl.col('datetime') + ' ' + (pl.col('tz_cd')
                                               .str.replace('MST', '-0700').str.replace('MDT', '-0600')))
          .str.to_datetime('%Y-%m-%d %H:%M %z')
          .dt.convert_time_zone('America/Denver'),            
          Denver3=(pl.col('datetime').str.to_datetime(format, time_zone='America/Denver', ambiguous='latest'))
      )
      .filter(pl.col('Denver3') != pl.col('Denver2'))
     )

shape: (56, 2)
┌──────────────────────────────┬──────────────────────────────┐
│ Denver2                      ┆ Denver3                      │
│ ---                          ┆ ---                          │
│ datetime[μs, America/Denver] ┆ datetime[μs, America/Denver] │
╞══════════════════════════════╪══════════════════════════════╡
│ 2003-10-26 01:00:00 MDT      ┆ 2003-10-26 01:00:00 MST      │
│ 2003-10-26 01:15:00 MDT      ┆ 2003-10-26 01:15:00 MST      │
│ 2003-10-26 01:30:00 MDT      ┆ 2003-10-26 01:30:00 MST      │
│ 2003-10-26 01:45:00 MDT      ┆ 2003-10-26 01:45:00 MST      │
│ 2007-11-04 01:00:00 MDT      ┆ 2007-11-04 01:00:00 MST      │
│ …                            ┆ …                            │
│ 2018-11-04 01:45:00 MDT      ┆ 2018-11-04 01:45:00 MST      │
│ 2019-11-03 01:00:00 MDT      ┆ 2019-11-03 01:00:00 MST      │
│ 2019-11-03 01:15:00 MDT      ┆ 2019-11-03 01:15:00 MST      │
│ 2019-11-03 01:30:00 MDT      ┆ 2019-11-03 01:30:00 MST      │
│ 2019-11-03 01:45:00 MDT

#### Adding Time-Zone within The `tweak` Function

In [23]:
def tweak_river(df_, cfs_col, gage_height_col):
    return (df_
        .select(
            'agency_cd', 'site_no', 
            cfs=pl.col(cfs_col),
            gage_height=pl.col(gage_height_col).cast(pl.Float64),
            datetime=(pl.col('datetime') + ' ' + (pl.col('tz_cd')
                .str.replace('MST', '-0700').str.replace('MDT', '-0600')))
                .str.to_datetime('%Y-%m-%d %H:%M %z')
                .dt.convert_time_zone('America/Denver')
            )
        )

In [24]:
dd = tweak_river(raw, cfs_col='144166_00060', gage_height_col='144167_00065')

### Time Aggregation

In [25]:
print(dd
      .with_columns(year=pl.col('datetime').dt.year())
      .group_by('year')
      .agg(pl.col(pl.Float64).mean())
     )

shape: (20, 3)
┌──────┬────────────┬─────────────┐
│ year ┆ cfs        ┆ gage_height │
│ ---  ┆ ---        ┆ ---         │
│ i32  ┆ f64        ┆ f64         │
╞══════╪════════════╪═════════════╡
│ 2016 ┆ 98.565988  ┆ 5.542945    │
│ 2020 ┆ 58.426221  ┆ 6.470503    │
│ 2011 ┆ 159.9936   ┆ 3.968769    │
│ 2005 ┆ 177.604843 ┆ NaN         │
│ 2010 ┆ 81.934596  ┆ 3.138236    │
│ …    ┆ …          ┆ …           │
│ 2001 ┆ 74.304452  ┆ NaN         │
│ 2004 ┆ 113.173465 ┆ NaN         │
│ 2017 ┆ 82.364265  ┆ 5.581915    │
│ 2018 ┆ 71.074642  ┆ 6.423744    │
│ 2007 ┆ 95.310375  ┆ 3.641389    │
└──────┴────────────┴─────────────┘


In [26]:
print(dd
      .with_columns(year=pl.col('datetime').dt.year())
      .group_by('year', maintain_order=True)
      .agg(pl.col(pl.Float64).mean())
     )

shape: (20, 3)
┌──────┬────────────┬─────────────┐
│ year ┆ cfs        ┆ gage_height │
│ ---  ┆ ---        ┆ ---         │
│ i32  ┆ f64        ┆ f64         │
╞══════╪════════════╪═════════════╡
│ 2001 ┆ 74.304452  ┆ NaN         │
│ 2002 ┆ 116.211979 ┆ NaN         │
│ 2003 ┆ 86.957083  ┆ NaN         │
│ 2004 ┆ 113.173465 ┆ NaN         │
│ 2005 ┆ 177.604843 ┆ NaN         │
│ …    ┆ …          ┆ …           │
│ 2016 ┆ 98.565988  ┆ 5.542945    │
│ 2017 ┆ 82.364265  ┆ 5.581915    │
│ 2018 ┆ 71.074642  ┆ 6.423744    │
│ 2019 ┆ 92.886454  ┆ 6.670158    │
│ 2020 ┆ 58.426221  ┆ 6.470503    │
└──────┴────────────┴─────────────┘


In [27]:
# Running Naïvely
(dd
 .group_by_dynamic(index_column='datetime', every='1y')
 .agg(pl.col(pl.Float64).mean())
)

InvalidOperationError: argument in operation 'group_by_dynamic' is not sorted, please sort the 'expr/series/column' first

In [28]:
print(dd
      .sort('datetime')  
      .group_by_dynamic(index_column='datetime', every='1y')
      .agg(pl.col(pl.Float64).mean())
     )

shape: (20, 3)
┌──────────────────────────────┬────────────┬─────────────┐
│ datetime                     ┆ cfs        ┆ gage_height │
│ ---                          ┆ ---        ┆ ---         │
│ datetime[μs, America/Denver] ┆ f64        ┆ f64         │
╞══════════════════════════════╪════════════╪═════════════╡
│ 2001-01-01 00:00:00 MST      ┆ 74.304452  ┆ null        │
│ 2002-01-01 00:00:00 MST      ┆ 116.211979 ┆ null        │
│ 2003-01-01 00:00:00 MST      ┆ 86.957083  ┆ null        │
│ 2004-01-01 00:00:00 MST      ┆ 113.173465 ┆ null        │
│ 2005-01-01 00:00:00 MST      ┆ 177.604843 ┆ null        │
│ …                            ┆ …          ┆ …           │
│ 2016-01-01 00:00:00 MST      ┆ 98.565988  ┆ 5.542945    │
│ 2017-01-01 00:00:00 MST      ┆ 82.364265  ┆ 5.581915    │
│ 2018-01-01 00:00:00 MST      ┆ 71.074642  ┆ 6.423744    │
│ 2019-01-01 00:00:00 MST      ┆ 92.886454  ┆ 6.670158    │
│ 2020-01-01 00:00:00 MST      ┆ 58.426221  ┆ 6.470503    │
└────────────────────────

In [29]:
print(dd
      .set_sorted('datetime')
      .group_by_dynamic(index_column='datetime', every='1y')
      .agg(pl.col(pl.Float64).mean())
     )

shape: (20, 3)
┌──────────────────────────────┬────────────┬─────────────┐
│ datetime                     ┆ cfs        ┆ gage_height │
│ ---                          ┆ ---        ┆ ---         │
│ datetime[μs, America/Denver] ┆ f64        ┆ f64         │
╞══════════════════════════════╪════════════╪═════════════╡
│ 2001-01-01 00:00:00 MST      ┆ 74.304452  ┆ null        │
│ 2002-01-01 00:00:00 MST      ┆ 116.211979 ┆ null        │
│ 2003-01-01 00:00:00 MST      ┆ 86.957083  ┆ null        │
│ 2004-01-01 00:00:00 MST      ┆ 113.173465 ┆ null        │
│ 2005-01-01 00:00:00 MST      ┆ 177.604843 ┆ null        │
│ …                            ┆ …          ┆ …           │
│ 2016-01-01 00:00:00 MST      ┆ 98.565988  ┆ 5.542945    │
│ 2017-01-01 00:00:00 MST      ┆ 82.364265  ┆ 5.581915    │
│ 2018-01-01 00:00:00 MST      ┆ 71.074642  ┆ 6.423744    │
│ 2019-01-01 00:00:00 MST      ┆ 92.886454  ┆ 6.670158    │
│ 2020-01-01 00:00:00 MST      ┆ 58.426221  ┆ 6.470503    │
└────────────────────────

### Time Intervals

In [30]:
# 2-Month Interval Period
print(dd
      .set_sorted('datetime')  
      .group_by_dynamic(index_column='datetime', every='2mo')
      .agg(pl.col(pl.Float64).mean())
     )

shape: (117, 3)
┌──────────────────────────────┬────────────┬─────────────┐
│ datetime                     ┆ cfs        ┆ gage_height │
│ ---                          ┆ ---        ┆ ---         │
│ datetime[μs, America/Denver] ┆ f64        ┆ f64         │
╞══════════════════════════════╪════════════╪═════════════╡
│ 2001-05-01 00:00:00 MDT      ┆ 41.999693  ┆ null        │
│ 2001-07-01 00:00:00 MDT      ┆ 165.864435 ┆ null        │
│ 2001-09-01 00:00:00 MDT      ┆ 43.783245  ┆ null        │
│ 2001-11-01 00:00:00 MST      ┆ 108.079392 ┆ null        │
│ 2002-01-01 00:00:00 MST      ┆ 132.790323 ┆ null        │
│ …                            ┆ …          ┆ …           │
│ 2020-01-01 00:00:00 MST      ┆ 125.290451 ┆ 6.925117    │
│ 2020-03-01 00:00:00 MST      ┆ 99.053008  ┆ 6.960658    │
│ 2020-05-01 00:00:00 MDT      ┆ 16.793101  ┆ 6.381607    │
│ 2020-07-01 00:00:00 MDT      ┆ 14.582522  ┆ 5.913772    │
│ 2020-09-01 00:00:00 MDT      ┆ 4.900842   ┆ 5.839076    │
└───────────────────────

In [31]:
# Every 3 Hours, 4 Minutes & 5 Seconds
print(dd
      .set_sorted('datetime')
      .group_by_dynamic(index_column='datetime', every='3h4m5s')
      .agg(pl.col(pl.Float64).mean())
     )

shape: (44_796, 3)
┌──────────────────────────────┬───────────┬─────────────┐
│ datetime                     ┆ cfs       ┆ gage_height │
│ ---                          ┆ ---       ┆ ---         │
│ datetime[μs, America/Denver] ┆ f64       ┆ f64         │
╞══════════════════════════════╪═══════════╪═════════════╡
│ 2001-05-06 23:03:20 MDT      ┆ 70.6      ┆ null        │
│ 2001-05-07 02:07:25 MDT      ┆ 67.916667 ┆ null        │
│ 2001-05-07 05:11:30 MDT      ┆ 64.461538 ┆ null        │
│ 2001-05-07 08:15:35 MDT      ┆ 65.333333 ┆ null        │
│ 2001-05-07 11:19:40 MDT      ┆ 73.333333 ┆ null        │
│ …                            ┆ …         ┆ …           │
│ 2020-09-27 20:17:10 MDT      ┆ 9.835833  ┆ 6.169167    │
│ 2020-09-27 23:21:15 MDT      ┆ 9.6675    ┆ 6.164167    │
│ 2020-09-28 02:25:20 MDT      ┆ 9.6675    ┆ 6.164167    │
│ 2020-09-28 05:29:25 MDT      ┆ 9.505385  ┆ 6.159231    │
│ 2020-09-28 08:33:30 MDT      ┆ 9.2       ┆ 6.15        │
└──────────────────────────────┴─────

##### Previos snippet: `check_sorted attributed has been removed`
```sh
print(dd
      .set_sorted('datetime')  
      .group_by_dynamic(index_column='datetime', every='7d', period='5d', start_by='monday', check_sorted=False)
      .agg(pl.col(pl.Float64).mean(),cfs_range=(pl.col('cfs').max() - pl.col('cfs').min()))
     )

In [32]:
print(dd
      .set_sorted('datetime')  
      .group_by_dynamic(index_column='datetime', every='7d', period='5d', start_by='monday')
      .agg(
          pl.col(pl.Float64).mean(),
          cfs_range=(pl.col('cfs').max() - pl.col('cfs').min())
      )
)

shape: (877, 4)
┌──────────────────────────────┬───────────┬─────────────┬───────────┐
│ datetime                     ┆ cfs       ┆ gage_height ┆ cfs_range │
│ ---                          ┆ ---       ┆ ---         ┆ ---       │
│ datetime[μs, America/Denver] ┆ f64       ┆ f64         ┆ f64       │
╞══════════════════════════════╪═══════════╪═════════════╪═══════════╡
│ 2001-05-07 00:00:00 MDT      ┆ 62.186975 ┆ null        ┆ 29.0      │
│ 2001-05-14 00:00:00 MDT      ┆ 50.529167 ┆ null        ┆ 20.0      │
│ 2001-05-21 00:00:00 MDT      ┆ 43.372917 ┆ null        ┆ 17.0      │
│ 2001-05-28 00:00:00 MDT      ┆ 36.9125   ┆ null        ┆ 27.0      │
│ 2001-06-04 00:00:00 MDT      ┆ 20.023438 ┆ null        ┆ 14.0      │
│ …                            ┆ …         ┆ …           ┆ …         │
│ 2020-08-31 00:00:00 MDT      ┆ 0.030521  ┆ 5.480208    ┆ 0.07      │
│ 2020-09-07 00:00:00 MDT      ┆ 0.006221  ┆ 5.48375     ┆ 0.02      │
│ 2020-09-14 00:00:00 MDT      ┆ 5.806146  ┆ 6.028563    ┆ 2.

In [33]:
# Weekend Flow
print(dd
      .set_sorted('datetime')
      .group_by_dynamic(index_column='datetime', every='7d', period='2d', start_by='saturday')
      .agg(pl.col(pl.Float64).mean(), cfs_range=(pl.col('cfs').max() - pl.col('cfs').min()))
     )

shape: (837, 4)
┌──────────────────────────────┬───────────┬─────────────┬───────────┐
│ datetime                     ┆ cfs       ┆ gage_height ┆ cfs_range │
│ ---                          ┆ ---       ┆ ---         ┆ ---       │
│ datetime[μs, America/Denver] ┆ f64       ┆ f64         ┆ f64       │
╞══════════════════════════════╪═══════════╪═════════════╪═══════════╡
│ 2001-05-12 00:00:00 MDT      ┆ 52.479167 ┆ null        ┆ 15.0      │
│ 2001-05-19 00:00:00 MDT      ┆ 49.177083 ┆ null        ┆ 18.0      │
│ 2001-05-26 00:00:00 MDT      ┆ 42.140625 ┆ null        ┆ 12.0      │
│ 2001-06-02 00:00:00 MDT      ┆ 25.578125 ┆ null        ┆ 9.0       │
│ 2001-06-30 00:00:00 MDT      ┆ 18.75     ┆ null        ┆ 6.0       │
│ …                            ┆ …         ┆ …           ┆ …         │
│ 2020-08-29 00:00:00 MDT      ┆ 0.05099   ┆ 5.473958    ┆ 0.06      │
│ 2020-09-05 00:00:00 MDT      ┆ 0.011912  ┆ 5.476875    ┆ 0.03      │
│ 2020-09-12 00:00:00 MDT      ┆ 5.930538  ┆ 6.003281    ┆ 3.

In [34]:
# Average Yearly Flow
print(dd
      .with_columns(year=pl.col('datetime').dt.year())
      .with_columns(year_mean_cfs=pl.col('cfs').mean().over('year'))
      .with_columns(pct_of_avg=(pl.col('cfs') / pl.col('year_mean_cfs')).mul(100).round(2))
     )

shape: (539_305, 8)
┌───────────┬─────────┬──────┬─────────────┬───────────────────┬──────┬───────────────┬────────────┐
│ agency_cd ┆ site_no ┆ cfs  ┆ gage_height ┆ datetime          ┆ year ┆ year_mean_cfs ┆ pct_of_avg │
│ ---       ┆ ---     ┆ ---  ┆ ---         ┆ ---               ┆ ---  ┆ ---           ┆ ---        │
│ str       ┆ i64     ┆ f64  ┆ f64         ┆ datetime[μs,      ┆ i32  ┆ f64           ┆ f64        │
│           ┆         ┆      ┆             ┆ America/Denver]   ┆      ┆               ┆            │
╞═══════════╪═════════╪══════╪═════════════╪═══════════════════╪══════╪═══════════════╪════════════╡
│ USGS      ┆ 9333500 ┆ 71.0 ┆ null        ┆ 2001-05-07        ┆ 2001 ┆ 74.304452     ┆ 95.55      │
│           ┆         ┆      ┆             ┆ 01:00:00 MDT      ┆      ┆               ┆            │
│ USGS      ┆ 9333500 ┆ 71.0 ┆ null        ┆ 2001-05-07        ┆ 2001 ┆ 74.304452     ┆ 95.55      │
│           ┆         ┆      ┆             ┆ 01:15:00 MDT      ┆      ┆

In [59]:
print(dd
      .with_columns(year=pl.col('datetime').dt.year(), quarter=pl.col('datetime').dt.quarter())
      .with_columns(pl.col(['cfs', 'gage_height']).mean().over('year').name.suffix('_mean_year'),
                    pl.col(['cfs', 'gage_height']).mean().over('quarter').name.suffix('_mean_quarter'))
     )

shape: (539_305, 11)
┌───────────┬─────────┬──────┬─────────────┬───┬────────────┬────────────┬────────────┬────────────┐
│ agency_cd ┆ site_no ┆ cfs  ┆ gage_height ┆ … ┆ cfs_mean_y ┆ gage_heigh ┆ cfs_mean_q ┆ gage_heigh │
│ ---       ┆ ---     ┆ ---  ┆ ---         ┆   ┆ ear        ┆ t_mean_yea ┆ uarter     ┆ t_mean_qua │
│ str       ┆ i64     ┆ f64  ┆ f64         ┆   ┆ ---        ┆ r          ┆ ---        ┆ rter       │
│           ┆         ┆      ┆             ┆   ┆ f64        ┆ ---        ┆ f64        ┆ ---        │
│           ┆         ┆      ┆             ┆   ┆            ┆ f64        ┆            ┆ f64        │
╞═══════════╪═════════╪══════╪═════════════╪═══╪════════════╪════════════╪════════════╪════════════╡
│ USGS      ┆ 9333500 ┆ 71.0 ┆ null        ┆ … ┆ 74.304452  ┆ null       ┆ 73.396286  ┆ 4.654955   │
│ USGS      ┆ 9333500 ┆ 71.0 ┆ null        ┆ … ┆ 74.304452  ┆ null       ┆ 73.396286  ┆ 4.654955   │
│ USGS      ┆ 9333500 ┆ 71.0 ┆ null        ┆ … ┆ 74.304452  ┆ null    

### Multiple Groupings with Time Series

In [82]:

url = "https://waterservices.usgs.gov/nwis/iv/?sites=13022500&agencyCd=USGS&startDT=2024-11-29T06:39:33.961-07:00&endDT=2024-12-06T06:39:33.961-07:00&parameterCd=00065&format=rdb"

In [83]:

save_path = "/Users/isisromero/desktop/polars/datasets/"

In [84]:
#
file_name = "nwis.waterservices.usgs.gov.txt"

In [85]:
os.makedirs(save_path, exist_ok=True)

In [86]:
file_path = os.path.join(save_path, file_name)

In [91]:
import requests
import polars as pl
import os

# URL Location
url = "https://waterservices.usgs.gov/nwis/iv/?sites=13022500&agencyCd=USGS&startDT=2024-11-29T06:39:33.961-07:00&endDT=2024-12-06T06:39:33.961-07:00&parameterCd=00065&format=rdb"  

# # Placing Path
save_path = "/Users/isisromero/desktop/polars/datasets/"
file_name = "waterdata.txt"

# Placing File
os.makedirs(save_path, exist_ok=True)

# Full Path
file_path = os.path.join(save_path, file_name)

try:
    print("Downloading data...")
    response = requests.get(url)
    response.raise_for_status()  

    # Saving & Storaging
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f"Data successfully downloaded and saved at: {file_path}")
    
    # Processing File
    print("Processing the file with Polars...")
    snake_raw = pl.read_csv(
        file_path, 
        skip_rows=27, 
        separator='\t', 
        skip_rows_after_header=1
    )
    print("File successfully processed.")
    print(snake_raw.head())  
except requests.exceptions.RequestException as e:
    print(f"Error during data download: {e}")
except Exception as e:
    print(f"Error processing the file with Polars: {e}")

Downloading data...
Data successfully downloaded and saved at: /Users/isisromero/desktop/polars/datasets/waterdata.txt
Processing the file with Polars...
File successfully processed.
shape: (5, 6)
┌──────┬──────────┬──────────────────┬─────┬──────┬─────┐
│ 5s   ┆ 15s      ┆ 20d              ┆ 6s  ┆ 14n  ┆ 10s │
│ ---  ┆ ---      ┆ ---              ┆ --- ┆ ---  ┆ --- │
│ str  ┆ i64      ┆ str              ┆ str ┆ f64  ┆ str │
╞══════╪══════════╪══════════════════╪═════╪══════╪═════╡
│ USGS ┆ 13022500 ┆ 2024-11-29 07:00 ┆ MST ┆ 2.68 ┆ P   │
│ USGS ┆ 13022500 ┆ 2024-11-29 07:15 ┆ MST ┆ 2.67 ┆ P   │
│ USGS ┆ 13022500 ┆ 2024-11-29 07:30 ┆ MST ┆ 2.67 ┆ P   │
│ USGS ┆ 13022500 ┆ 2024-11-29 07:45 ┆ MST ┆ 2.66 ┆ P   │
│ USGS ┆ 13022500 ┆ 2024-11-29 08:00 ┆ MST ┆ 2.66 ┆ P   │
└──────┴──────────┴──────────────────┴─────┴──────┴─────┘


In [99]:
snake = (tweak_river(snake_raw
                     .with_columns(cfs=pl.lit(None).cast(pl.Float64)),cfs_col='cfs', gage_height_col='319803_00065') 
        )

ColumnNotFoundError: agency_cd

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'select' <---
DF ["5s", "15s", "20d", "6s"]; PROJECT */7 COLUMNS; SELECTION: None

In [129]:

file_path = "/Users/isisromero/desktop/polars/datasets/waterdata.txt"

snake_raw = pl.read_csv(
    file_path,
    separator='\t',
    skip_rows=27,  
    truncate_ragged_lines=True  
)


print(snake_raw.head())

shape: (5, 6)
┌──────┬──────────┬──────────────────┬─────┬──────┬─────┐
│ 5s   ┆ 15s      ┆ 20d              ┆ 6s  ┆ 14n  ┆ 10s │
│ ---  ┆ ---      ┆ ---              ┆ --- ┆ ---  ┆ --- │
│ str  ┆ i64      ┆ str              ┆ str ┆ f64  ┆ str │
╞══════╪══════════╪══════════════════╪═════╪══════╪═════╡
│ USGS ┆ 13022500 ┆ 2024-11-29 06:45 ┆ MST ┆ 2.68 ┆ P   │
│ USGS ┆ 13022500 ┆ 2024-11-29 07:00 ┆ MST ┆ 2.68 ┆ P   │
│ USGS ┆ 13022500 ┆ 2024-11-29 07:15 ┆ MST ┆ 2.67 ┆ P   │
│ USGS ┆ 13022500 ┆ 2024-11-29 07:30 ┆ MST ┆ 2.67 ┆ P   │
│ USGS ┆ 13022500 ┆ 2024-11-29 07:45 ┆ MST ┆ 2.66 ┆ P   │
└──────┴──────────┴──────────────────┴─────┴──────┴─────┘


### Placing Names Manually

In [130]:

file_path = "/Users/isisromero/desktop/polars/datasets/waterdata.txt"

raw_data = pl.read_csv(
    file_path,
    separator='\t',
    skip_rows=27,  # Ajusta según la cantidad de filas de comentarios
    truncate_ragged_lines=True
)

common_cols = ['agency_cd', 'site_no', 'datetime', 'cfs', 'gage_height', 'source']

snake = raw_data.rename({old: new for old, new in zip(raw_data.columns, common_cols)})

print(snake.head())

shape: (5, 6)
┌───────────┬──────────┬──────────────────┬─────┬─────────────┬────────┐
│ agency_cd ┆ site_no  ┆ datetime         ┆ cfs ┆ gage_height ┆ source │
│ ---       ┆ ---      ┆ ---              ┆ --- ┆ ---         ┆ ---    │
│ str       ┆ i64      ┆ str              ┆ str ┆ f64         ┆ str    │
╞═══════════╪══════════╪══════════════════╪═════╪═════════════╪════════╡
│ USGS      ┆ 13022500 ┆ 2024-11-29 06:45 ┆ MST ┆ 2.68        ┆ P      │
│ USGS      ┆ 13022500 ┆ 2024-11-29 07:00 ┆ MST ┆ 2.68        ┆ P      │
│ USGS      ┆ 13022500 ┆ 2024-11-29 07:15 ┆ MST ┆ 2.67        ┆ P      │
│ USGS      ┆ 13022500 ┆ 2024-11-29 07:30 ┆ MST ┆ 2.67        ┆ P      │
│ USGS      ┆ 13022500 ┆ 2024-11-29 07:45 ┆ MST ┆ 2.66        ┆ P      │
└───────────┴──────────┴──────────────────┴─────┴─────────────┴────────┘


In [131]:
print(abc.with_columns(['a', 'b', 'c']))

shape: (3, 3)
┌─────┬─────┬─────┐
│ b   ┆ c   ┆ a   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 1   ┆ 4   ┆ 7   │
│ 2   ┆ 5   ┆ 8   │
│ 3   ┆ 6   ┆ 9   │
└─────┴─────┴─────┘


In [132]:
print(abc.select(['a', 'b', 'c']))

shape: (3, 3)
┌─────┬─────┬─────┐
│ a   ┆ b   ┆ c   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 7   ┆ 1   ┆ 4   │
│ 8   ┆ 2   ┆ 5   │
│ 9   ┆ 3   ┆ 6   │
└─────┴─────┴─────┘


In [133]:
print(snake
      .with_columns(source=pl.lit('snake'))
      .select(common_cols)
      .vstack(dd
              .with_columns(source=pl.lit('devil'))
              .select(common_cols)
             )
     )

SchemaError: type Datetime(Microseconds, Some("America/Denver")) is incompatible with expected type String

In [135]:
print("snake dtypes:")
print(snake.schema)

print("\ndd dtypes:")
print(dd.schema)

snake dtypes:
Schema([('agency_cd', String), ('site_no', Int64), ('datetime', String), ('cfs', String), ('gage_height', Float64), ('source', String)])

dd dtypes:
Schema([('agency_cd', String), ('site_no', Int64), ('cfs', Float64), ('gage_height', Float64), ('datetime', String)])


In [136]:
import polars as pl

snake = snake.with_columns(
    pl.col("cfs").cast(pl.Float64)
)

dd = dd.with_columns(
    pl.lit("devil").alias("source")
)

common_cols = ["agency_cd", "site_no", "datetime", "cfs", "gage_height", "source"]

result = snake.select(common_cols).vstack(dd.select(common_cols))

print(result)

InvalidOperationError: conversion from `str` to `f64` failed in column 'cfs' for 97 out of 97 values: ["MST", "MST", … "MST"]

In [137]:
print(snake.select("cfs").unique())

shape: (1, 1)
┌─────┐
│ cfs │
│ --- │
│ str │
╞═════╡
│ MST │
└─────┘


In [138]:
snake = snake.with_columns(
    pl.col("cfs")
    .map_elements(lambda x: None if isinstance(x, str) else x)  
    .cast(pl.Float64)
)



In [152]:
snake = snake.with_columns(
    pl.lit(None).cast(pl.Float64).alias("cfs")  
)

dd = dd.with_columns(
    pl.lit("devil").alias("source")
)

common_cols = ["agency_cd", "site_no", "datetime", "cfs", "gage_height", "source"]

result = snake.select(common_cols).vstack(
    dd.select(common_cols)
)

print(result)

shape: (539_977, 6)
┌───────────┬──────────┬─────────────────────────────────┬──────┬─────────────┬────────┐
│ agency_cd ┆ site_no  ┆ datetime                        ┆ cfs  ┆ gage_height ┆ source │
│ ---       ┆ ---      ┆ ---                             ┆ ---  ┆ ---         ┆ ---    │
│ str       ┆ i64      ┆ str                             ┆ f64  ┆ f64         ┆ str    │
╞═══════════╪══════════╪═════════════════════════════════╪══════╪═════════════╪════════╡
│ USGS      ┆ 13022500 ┆ 2024-11-29 06:45                ┆ null ┆ 2.68        ┆ P      │
│ USGS      ┆ 13022500 ┆ 2024-11-29 07:00                ┆ null ┆ 2.68        ┆ P      │
│ USGS      ┆ 13022500 ┆ 2024-11-29 07:15                ┆ null ┆ 2.67        ┆ P      │
│ USGS      ┆ 13022500 ┆ 2024-11-29 07:30                ┆ null ┆ 2.67        ┆ P      │
│ USGS      ┆ 13022500 ┆ 2024-11-29 07:45                ┆ null ┆ 2.66        ┆ P      │
│ …         ┆ …        ┆ …                               ┆ …    ┆ …           ┆ …      │
│

In [153]:
print(snake
      .with_columns(source=pl.lit('snake'))
      .select(common_cols)
      .vstack(dd
              .with_columns(source=pl.lit('devil'))
              .select(common_cols)
             )
      .sort('datetime')
      .group_by_dynamic(index_column='datetime', every='1mo', by='source')
      .agg(pl.col('gage_height').mean())
     )

  .group_by_dynamic(index_column='datetime', every='1mo', by='source')


InvalidOperationError: unsupported data type: str for `every`, expected UInt64, UInt32, Int64, Int32, Datetime, Date, Duration, or Time

In [156]:
common_cols = ['agency_cd', 'site_no', 'datetime', 'cfs', 'gage_height', 'source']

### Window Function in Polars

In [158]:
print(dd
      .set_sorted('datetime')  
      .group_by_dynamic(index_column='datetime', every='1mo')
      .agg(pl.col('cfs').mean())
      .with_columns(mean_cfs_3mo=pl.col('cfs').mean()
                    .rolling('datetime', period='3mo'))
     )

In [159]:
hvplot.extension('plotly')

In [160]:
(dd
 .set_sorted('datetime')
 .group_by_dynamic(index_column='datetime', every='1mo')
 .agg(pl.col('cfs').mean())
 .with_columns(mean_cfs_3mo=pl.col('cfs').mean()
               .rolling('datetime', period='3mo'))
 .plot(x='datetime', y=['cfs', 'mean_cfs_3mo'])
)

In [161]:
cfs_max = 1_000

In [162]:
(dd
 .set_sorted('datetime')
 .group_by_dynamic(index_column='datetime', every='1mo')
 .agg(pl.col('cfs').mean())
 .with_columns(pl.col('cfs').clip(upper_bound=cfs_max), mean_cfs_3mo=pl.col('cfs').mean()
               .rolling('datetime', period='3mo')
               .clip(upper_bound=cfs_max))
 .plot(x='datetime', y=['cfs', 'mean_cfs_3mo'])
)

### Interpolation

In [163]:
print(dd
      .filter(pl.col('cfs').is_null())
     )

In [164]:
denver = pytz.timezone('America/Denver')

jul_7 = datetime.datetime(2018, 7, 7).replace(tzinfo=denver)

jul_9 = datetime.datetime(2018, 7, 9).replace(tzinfo=denver)

In [165]:
print(dd
      .filter(pl.col('datetime').is_between(jul_7, jul_9))
      .filter(pl.col('cfs').is_null())
     )

In [166]:
(dd
 .filter(pl.col('datetime').is_between(jul_7, jul_9))
 .plot(x='datetime', y=['cfs'])
)

In [167]:
offset = .1

In [168]:
(dd
 .filter(pl.col('datetime').is_between(jul_7, jul_9))
 .with_columns(
     'datetime', 'cfs',
     fill0=pl.col('cfs').fill_null(0).add(offset),
     interpolate=pl.col('cfs').interpolate().add(offset*2),
     forward=pl.col('cfs').fill_null(strategy='forward').add(offset*3),
     backward=pl.col('cfs').fill_null(strategy='backward').add(offset*4)
 )
 .plot(x='datetime', y=['cfs', 'fill0', 'interpolate', 'forward', 'backward'])
)

### Upsampling & Downsampling

In [169]:
print(dd
      .set_sorted('datetime')
      .upsample('datetime', every='5m')
      .interpolate()
     )

In [170]:
print(dd
      .set_sorted('datetime')
      .group_by_dynamic(index_column='datetime', every='5m')
      .agg(pl.col('cfs').mean())
     )

In [171]:
print(dd
      .set_sorted('datetime')
      .group_by_dynamic(index_column='datetime', every='30m')
      .agg(pl.col('cfs').mean())
     )

### Joining Time Series Data

In [172]:
url = 'https://github.com/mattharrison/datasets/raw/master/data/hanksville.csv'

In [173]:
def tweak_temp(df_):
    return (df_
            .select(pl.col('DATE').str.to_datetime()
                    .dt.replace_time_zone('America/Denver'),
                    'PRCP', 'TMIN', 'TMAX', 'TOBS')
           )

In [174]:
raw_temp = pl.read_csv(url)

In [175]:
print(tweak_temp(raw_temp))

In [146]:
dd_daily = (dd
            .set_sorted('datetime')
            .group_by_dynamic(index_column='datetime', every='1d', 
                              check_sorted=False)
            .agg(pl.col('gage_height', 'cfs').mean())
           )

In [176]:
both = (tweak_temp(raw_temp)
        .join(dd_daily, left_on='DATE', right_on='datetime', validate='1:1')
       )

### Visualizing The Merged Data

In [177]:
hvplot.extension('bokeh')

In [178]:
year_agg = (both
            .with_columns(day_of_year=pl.col('DATE')
                          .dt.strftime('%j').cast(pl.Int16),year=pl.col('DATE').dt.year())
            .pivot(index='day_of_year', columns='year', values='TOBS')
            .sort('day_of_year')
           )

In [179]:
p1 = (year_agg
      .with_columns(
          pl.col(['2001', '2002', '2003', '2004', '2005', '2006', 
                  '2007', '2008', '2009', '2010', '2011', '2012', 
                  '2014', '2015', '2016', '2017', '2018', '2019',# '2020'
                 ])
          .rolling_median(7)
      )
      .plot(x='day_of_year',  alpha=.5, line_width=1, 
            color=hv.Palette('Greys'), 
            title='Weekly Temperature (F)', width=1_000, height=500)
     )

In [180]:
p2 = p1 * (year_agg
           .select(pl.col('day_of_year', '2020'))
           .plot(x='day_of_year', y='2020', color='blue', line_width=2, label='2020')
          )

In [181]:
p2 * (both
      .with_columns(day_of_year=pl.col('DATE')
                    .dt.strftime('%j').cast(pl.Int16),
                    median=pl.lit('Median'))
      .pivot(index='day_of_year', columns='median', values='TOBS', aggregate_function='median')
      .sort('day_of_year')
      .plot(x='day_of_year', y='Median', c='r', label='Median')
     )

In [182]:
def plot_year_last_year_median(df, col, upper_limit=None, lower_limit=None, width=1_000, height=500):
    if upper_limit is None:
        upper_limit = df[col].max()
        year_agg = (df
                    .with_columns(day_of_year=pl.col('DATE')
                                  .dt.strftime('%j').cast(pl.Int16), year=pl.col('DATE').dt.year())
                    .pivot(index='day_of_year', columns='year', values=col)
                    .sort('day_of_year')
                   )
        
        p1 = (year_agg
              .with_columns(pl.col(
                  ['2001', '2002', '2003', '2004', '2005', '2006', 
                   '2007', '2008', '2009', '2010', '2011', '2012', 
                   '2014', '2015', '2016', '2017', '2018', '2019',
                  ])
                            .rolling_median(7).clip(upper_bound=upper_limit))
              .plot(x='day_of_year', alpha=1, color=hv.Palette('Greys'), 
                    line_width=.5, width=width, height=height)
             )
        
        p2 = p1*(year_agg
                 .select('day_of_year', pl.col('2020')
                         .rolling_median(7).clip(upper_bound=upper_limit))      
                 .plot(x='day_of_year', y='2020', color='blue', line_width=2, 
                       title=f'Weekly {col}', label='2020')
                )
        
        p3 = p2*(df
                 .with_columns(day_of_year=pl.col('DATE')
                               .dt.strftime('%j').cast(pl.Int16), median=pl.lit('Median'))
                 .pivot(index='day_of_year', columns='median', values=col, aggregate_function='median')
                 .sort('day_of_year')
                 .with_columns(pl.col('Median')
                               .rolling_mean(7).clip(upper_bound=upper_limit))
                 .plot(x='day_of_year', y='Median', color='red', label='Median')
                )
        return p3

In [184]:
plot_year_last_year_median(both, 'cfs', upper_limit=200)