In [50]:
# Take the eight CSV files and turn them into a data frame.

# We are only interested in the first three columns from each CSV file: the date
# and time, the max temperature, and the min temperature.

# Add city and state columns that contain the city and state from the filename
# and allow us to distinguish between rows.


import pandas as pd
from pathlib import Path
path = "data/weather"
files = Path(path).glob('*.csv')

df_parts = []
names = ["date_time", "max_temp", "min_temp"]
for f in files:
    # take city and state from filename
    file_name = str(f).split('/')
    city, state = file_name[-1].split('.')[0].replace("+", " ").split(',')
    
    # read csv and rename columns
    df_part = pd.read_csv(f)
    cols = df_part.columns.to_list()
    cols[0:3] = names
    df_part.columns = cols
    
    # drop other unused colums and add city, state
    (df_parts
     .append(
         df_part
         .drop(df_part.columns[3:], axis=1)
         .assign(city=city, state=state)))

# concat tables to create df
df = pd.concat(df_parts, ignore_index=True, axis=0)

In [61]:
# Does the data for each city and state start and end at (roughly) the same 
# time? How do you know?
df.groupby(["state", "city"]).agg(min=("date_time", "min"), max=("date_time", "max"))

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max
state,city,Unnamed: 2_level_1,Unnamed: 3_level_1
ca,los angeles,2018-12-11 00:00:00,2019-03-11 21:00:00
ca,san francisco,2018-12-11 00:00:00,2019-03-11 21:00:00
il,chicago,2018-12-11 00:00:00,2019-03-11 21:00:00
il,springfield,2018-12-11 00:00:00,2019-03-11 21:00:00
ma,boston,2018-12-11 00:00:00,2019-03-11 21:00:00
ma,springfield,2018-12-11 00:00:00,2019-03-11 21:00:00
ny,albany,2018-12-11 00:00:00,2019-03-11 21:00:00
ny,new york,2018-12-11 00:00:00,2019-03-11 21:00:00


In [60]:
# What is the lowest minimum temperature recorded for each city in the data set?
df.groupby(["state", "city"]).agg(lowest_temp=("min_temp", "min"))

Unnamed: 0_level_0,Unnamed: 1_level_0,lowest_temp
state,city,Unnamed: 2_level_1
ca,los angeles,4
ca,san francisco,3
il,chicago,-28
il,springfield,-25
ma,boston,-14
ma,springfield,-20
ny,albany,-19
ny,new york,-14


In [None]:
# What is the highest maximum temperature recorded in each state in the data set?
df.groupby("state").agg(highest_temp=("max_temp", "max"))

Unnamed: 0_level_0,highest_temp
state,Unnamed: 1_level_1
ca,23
il,16
ma,17
ny,15


In [64]:
# What is the average difference in temperature (i.e., max – min) for each of the
# cities in our data set
(
    df
    .assign(delta=(df["max_temp"] - df["min_temp"]))
    .groupby(["state", "city"])
    .agg(mean_delta=("delta", "mean"))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_delta
state,city,Unnamed: 2_level_1
ca,los angeles,6.417582
ca,san francisco,4.351648
il,chicago,4.340659
il,springfield,6.934066
ma,boston,6.010989
ma,springfield,7.505495
ny,albany,6.318681
ny,new york,5.263736


In [153]:
import polars as pl
from pathlib import Path
path = "data/weather"
files = Path(path).glob('*.csv')

df_parts = []
for file in files:
    file_name = str(file).split('/')
    city, state = file_name[-1].split('.')[0].replace("+", " ").split(',')
    print(city, state)
    df_part = (
            pl
            .scan_csv(file, try_parse_dates=True)
            .select(
                pl.col("date_time"), 
                pl.col("^[a-z,+]+_maxtempC$").alias("max_temp"), 
                pl.col("^[a-z,+]+_mintempC$").alias("min_temp"))
            .with_columns(
                pl.lit(city).alias("city"), 
                pl.lit(state).alias("state"))
        )
    df_parts.append(df_part)

df = pl.concat(df_parts).collect()

san francisco ca
new york ny
springfield ma
boston ma
springfield il
albany ny
los angeles ca
chicago il


In [154]:
# Does the data for each city and state start and end at (roughly) the same 
# time? How do you know?
(
    df
    .group_by(pl.col("state"), pl.col("city"))
    .agg(
        pl.min("date_time").alias("min"), 
        pl.max("date_time").alias("max")
    )
)


state,city,min,max
str,str,datetime[μs],datetime[μs]
"""ma""","""boston""",2018-12-11 00:00:00,2019-03-11 21:00:00
"""ca""","""san francisco""",2018-12-11 00:00:00,2019-03-11 21:00:00
"""il""","""springfield""",2018-12-11 00:00:00,2019-03-11 21:00:00
"""ny""","""albany""",2018-12-11 00:00:00,2019-03-11 21:00:00
"""ma""","""springfield""",2018-12-11 00:00:00,2019-03-11 21:00:00
"""ca""","""los angeles""",2018-12-11 00:00:00,2019-03-11 21:00:00
"""ny""","""new york""",2018-12-11 00:00:00,2019-03-11 21:00:00
"""il""","""chicago""",2018-12-11 00:00:00,2019-03-11 21:00:00


In [155]:
# What is the lowest minimum temperature recorded for each city in the data set?
df.group_by(pl.col("state"), pl.col("city")).agg(pl.min("min_temp").alias("min"))

state,city,min
str,str,i64
"""ny""","""albany""",-19
"""ma""","""boston""",-14
"""il""","""chicago""",-28
"""ca""","""los angeles""",4
"""ny""","""new york""",-14
"""il""","""springfield""",-25
"""ca""","""san francisco""",3
"""ma""","""springfield""",-20


In [156]:
# What is the highest maximum temperature recorded in each state in the data set?
df.group_by(pl.col("state"), pl.col("city")).agg(pl.max("max_temp").alias("max"))

state,city,max
str,str,i64
"""ca""","""san francisco""",15
"""ny""","""new york""",15
"""il""","""springfield""",16
"""ma""","""boston""",17
"""ca""","""los angeles""",23
"""ny""","""albany""",13
"""il""","""chicago""",9
"""ma""","""springfield""",15


In [160]:
# What is the average difference in temperature (i.e., max – min) for each of the
# cities in our data set
(
    df
    .with_columns((pl.col("max_temp") - pl.col("min_temp")).alias("delta"))
    .group_by(pl.col("state"), pl.col("city"))
    .agg(pl.mean("delta").alias("mean"))
)

state,city,mean
str,str,f64
"""ca""","""san francisco""",4.351648
"""il""","""springfield""",6.934066
"""ny""","""albany""",6.318681
"""il""","""chicago""",4.340659
"""ma""","""springfield""",7.505495
"""ca""","""los angeles""",6.417582
"""ny""","""new york""",5.263736
"""ma""","""boston""",6.010989


In [225]:
import pyarrow as pa
import pyarrow.csv

from pathlib import Path
path = "data/weather"
files = list(Path(path).glob('*.csv'))

df_parts = []
for file in files:
    # take city and state from filename
    file_name = str(file).split('/')
    city, state = file_name[-1].split('.')[0].replace("+", " ").split(',')

    table = pyarrow.csv.read_csv(file)
    df_parts.append(
        table
        .drop_columns(table.column_names[3:])
        .rename_columns(["date_time", "max_temp", "min_temp"])
        .append_column("city", [[pa.scalar(city)]*table.num_rows])
        .append_column("state", [[pa.scalar(state)]*table.num_rows])
    )
df = pa.concat_tables(df_parts)