In [28]:
%%timeit
# Take the eight CSV files and turn them into a data frame.

# We are only interested in the first three columns from each CSV file: the date
# and time, the max temperature, and the min temperature.

# Add city and state columns that contain the city and state from the filename
# and allow us to distinguish between rows.

# NOTE(review): IPython's %%timeit runs the cell body in a throwaway scope, so
# the `df` built here is presumably not bound in the notebook namespace -- run
# this cell once without the magic before the query cells that read `df`.

import pandas as pd
from pathlib import Path
path = "data/weather"
# sorted() pins the file order, so the row order of `df` does not depend on the
# order in which the file system happens to list the directory.
files = sorted(Path(path).glob('*.csv'))

df_parts = []
names = ["date_time", "max_temp", "min_temp"]
for f in files:
    # Take city and state from a filename shaped like "<city>,<state>.csv",
    # where "+" stands in for a space. Path.stem strips the directory and the
    # extension on every platform, unlike splitting str(f) on '/'.
    city, state = f.stem.replace("+", " ").split(',')

    # Read only the first three columns and name them directly; header=0 makes
    # pandas discard the file's own header row in favour of `names`.
    df_part = pd.read_csv(f, usecols=[0, 1, 2], names=names, header=0)

    # Tag every row with the city and state it came from.
    df_parts.append(df_part.assign(city=city, state=state))

# Concatenate the per-file tables into one frame with a fresh 0..n-1 index.
df = pd.concat(df_parts, ignore_index=True, axis=0)

11.5 ms ± 201 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [29]:
%%timeit
# Does the data for each city and state start and end at (roughly) the same
# time? How do you know?
# Answer: compare the earliest and latest date_time of every (state, city) pair.
df.groupby(["state", "city"])["date_time"].agg(["min", "max"])

1.9 ms ± 21.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [30]:
%%timeit
# What is the lowest minimum temperature recorded for each city in the data set?
# Cities are keyed by (state, city) so equal names in different states stay apart.
df.groupby(["state", "city"])["min_temp"].min().to_frame("lowest_temp")

1.11 ms ± 19.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [31]:
%%timeit
# What is the highest maximum temperature recorded in each state in the data set?
# One row per state, holding the largest max_temp seen in any of its cities.
df.groupby("state")["max_temp"].max().to_frame("highest_temp")

833 μs ± 12.7 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [32]:
%%timeit
# What is the average difference in temperature (i.e., max – min) for each of the
# cities in our data set?
(
    df
    # Row-wise spread between the recorded maximum and minimum.
    .assign(delta=lambda frame: frame["max_temp"] - frame["min_temp"])
    .groupby(["state", "city"])["delta"]
    .mean()
    .to_frame("mean_delta")
)

1.53 ms ± 15.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [35]:
%%timeit
# Build the same weather table with polars: one lazy scan per CSV file, tagged
# with the city and state taken from the filename, concatenated and collected.

# NOTE(review): IPython's %%timeit runs the cell body in a throwaway scope, so
# the `df` built here is presumably not bound in the notebook namespace -- run
# this cell once without the magic before the query cells that read `df`.

import polars as pl
from pathlib import Path
path = "data/weather"
# sorted() pins the file order, so the row order of `df` does not depend on the
# order in which the file system happens to list the directory.
files = sorted(Path(path).glob('*.csv'))

df_parts = []
for file in files:
    # Take city and state from a filename shaped like "<city>,<state>.csv",
    # where "+" stands in for a space. Path.stem strips the directory and the
    # extension on every platform, unlike splitting str(file) on '/'.
    city, state = file.stem.replace("+", " ").split(',')
    df_part = (
        pl
        .scan_csv(file, try_parse_dates=True)
        .select(
            pl.col("date_time"),
            # The temperature columns carry a per-file prefix made of lowercase
            # letters, commas and plus signs; the regexes match on the suffix.
            pl.col("^[a-z,+]+_maxtempC$").alias("max_temp"),
            pl.col("^[a-z,+]+_mintempC$").alias("min_temp"))
        .with_columns(
            pl.lit(city).alias("city"),
            pl.lit(state).alias("state"))
    )
    df_parts.append(df_part)

# Nothing is read until collect() runs the concatenated lazy plan.
df = pl.concat(df_parts).collect()

31.5 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [36]:
%%timeit
# Does the data for each city and state start and end at (roughly) the same
# time? How do you know?
# Answer: compare the earliest and latest date_time of every (state, city) pair.
df.group_by("state", "city").agg(
    min=pl.col("date_time").min(),
    max=pl.col("date_time").max(),
)


855 μs ± 12.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [37]:
%%timeit
# What is the lowest minimum temperature recorded for each city in the data set?
# Cities are keyed by (state, city) so equal names in different states stay apart.
df.group_by("state", "city").agg(pl.col("min_temp").min().alias("min"))

826 μs ± 11.9 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [38]:
%%timeit
# What is the highest maximum temperature recorded in each state in the data set?
# The question is per state, so group on "state" alone; also grouping on "city"
# would return one row per city rather than one per state.
df.group_by("state").agg(pl.max("max_temp").alias("max"))

825 μs ± 10.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [39]:
%%timeit
# What is the average difference in temperature (i.e., max – min) for each of the
# cities in our data set?
# The row-wise spread is averaged directly inside the aggregation.
df.group_by("state", "city").agg(
    (pl.col("max_temp") - pl.col("min_temp")).mean().alias("mean")
)

1.42 ms ± 13.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [41]:
%%timeit
# Build the same weather table with pyarrow: read each CSV, keep its first
# three columns, tag the rows with city and state, and concatenate the tables.

# NOTE(review): IPython's %%timeit runs the cell body in a throwaway scope, so
# the `df` built here is presumably not bound in the notebook namespace -- run
# this cell once without the magic before the query cells that read `df`.

import pyarrow as pa
import pyarrow.csv

from pathlib import Path
path = "data/weather"
# sorted() pins the file order, so the row order of `df` does not depend on the
# order in which the file system happens to list the directory.
files = sorted(Path(path).glob('*.csv'))

df_parts = []
for file in files:
    # Take city and state from a filename shaped like "<city>,<state>.csv",
    # where "+" stands in for a space. Path.stem strips the directory and the
    # extension on every platform, unlike splitting str(file) on '/'.
    city, state = file.stem.replace("+", " ").split(',')

    table = pyarrow.csv.read_csv(file)
    df_parts.append(
        table
        .drop_columns(table.column_names[3:])
        .rename_columns(["date_time", "max_temp", "min_temp"])
        # pa.repeat builds the constant string column natively instead of
        # materialising num_rows Python-level scalars first.
        .append_column("city", pa.repeat(city, table.num_rows))
        .append_column("state", pa.repeat(state, table.num_rows))
    )
df = pa.concat_tables(df_parts)

6.7 ms ± 72 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# import pyarrow.parquet as pq
# pq.write_table(table, 'example.parquet')

In [42]:
%%timeit
# Does the data for each city and state start and end at (roughly) the same
# time? How do you know?
# Answer: compare the earliest and latest date_time of every (state, city) pair.
df.group_by(["state", "city"]).aggregate(
    [("date_time", stat) for stat in ("min", "max")]
)

206 μs ± 2.34 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
%%timeit
# What is the lowest minimum temperature recorded for each city in the data set?
# A city name alone does not identify a city: the same name in two different
# states would be merged into one group. Keying on state and city keeps them
# apart and matches the pandas and polars versions of this query.
df.group_by(["state", "city"]).aggregate([("min_temp", "min")])

190 μs ± 910 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [None]:
%%timeit
# What is the highest maximum temperature recorded in each state in the data set?
# One row per state, holding the largest max_temp seen in any of its cities.
df.group_by("state").aggregate([("max_temp", "max")])

191 μs ± 3.78 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [45]:
%%timeit
# What is the average difference in temperature (i.e., max – min) for each of the
# cities in our data set?
import pyarrow.compute as pc
(
    df
    # The subtraction is computed exactly once. Passing only the column name
    # lets Arrow take the field type from the result instead of hard-coding
    # int64, and removes this cell's reliance on `pa` being in scope.
    .append_column("delta", pc.subtract(df["max_temp"], df["min_temp"]))
    .group_by(["state","city"])
    .aggregate([("delta", "mean")])
)

216 μs ± 1.37 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


SAT scores, revisited

In [78]:
import pandas as pd
# Read in the scores file (sat-scores.csv). This time, you want the following
# columns, with the income-related column names renamed to something shorter.
# Each key is a column name in the file; each value is the name used from here on.
cols = {
    "Year": "Year",
    "State.Code": "State.Code",
    "Total.Math": "Total.Math",
    "Family Income.Less than 20k.Math": "income<20k",
    "Family Income.Between 20-40k.Math": "20k<income<40k",
    "Family Income.Between 40-60k.Math": "40k<income<60k",
    "Family Income.Between 60-80k.Math": "60k<income<80k",
    "Family Income.Between 80-100k.Math": "80k<income<100k",
    "Family Income.More than 100k.Math": "income>100k",
}
columns = list(cols)
new_names = list(cols.values())

df = (
    pd.read_csv("data/sat-scores.csv", usecols=columns)
    .rename(columns=cols)
    # Put the columns in the poorest-to-richest order listed above.
    .loc[:, new_names]
)

In [47]:
# Find the average SAT math score for each income level, grouped and then
# sorted by year.
# Only the income-bracket columns are averaged; the state code and the overall
# math total are dropped first.
df1 = (
    df
    .drop(columns=["State.Code", "Total.Math"])
    .groupby("Year")
    .mean()
    .sort_index()
)


In [107]:
# For each year in the data set, determine how much better each income group
# did, on average, than the next-poorer group of students. Do you see
# any income group that did worse, in any year, than the next-poorer students?

# Shifting the columns one step to the left lines every bracket up with the
# next-richer one, so (df2 - df1) is "richer bracket minus poorer bracket".
df2 = df1.shift(-1, axis=1)
new_names = ["(20, 40) vs (0, 20)", "(40, 60) vs (20, 40)", 
             "(60, 80) vs (40, 60)", "(80, 100) vs (60, 80)", 
             "(100, ) vs (80, 100)"]
# The richest bracket has no richer neighbour, so the last column is empty by
# construction. Drop it by position: dropna(axis=1) would also discard any
# bracket with a single missing year and then break the rename below.
diff = (df2 - df1).iloc[:, :-1]
diff.columns = new_names
# True marks a year in which a bracket scored below the next-poorer one.
diff < 0

Unnamed: 0_level_0,"(20, 40) vs (0, 20)","(40, 60) vs (20, 40)","(60, 80) vs (40, 60)","(80, 100) vs (60, 80)","(100, ) vs (80, 100)"
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2005,False,False,False,False,False
2006,False,False,False,False,False
2007,False,False,False,False,False
2008,False,False,False,False,False
2009,False,False,False,False,False
2010,False,False,False,False,False
2011,False,False,False,False,False
2012,False,False,False,False,False
2013,False,False,False,False,False
2014,False,False,False,False,False


In [51]:
# Which income bracket, on average, had the greatest advantage over the next-
# poorer income bracket?
# Average each comparison over the years, then take the label of the largest.
diff.mean().idxmax()


'(20, 40) vs (0, 20)'

In [124]:
# Which five states have the greatest gap in SAT math scores between the richest
# and poorest students?
(
    df
    # Per-row gap between the top and bottom income brackets.
    .assign(delta=lambda scores: scores["income>100k"] - scores["income<20k"])
    .groupby("State.Code")["delta"]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)

State.Code
ND    341.909091
WY    246.454545
DC    208.818182
SD    157.000000
MS    140.000000
Name: delta, dtype: float64

In [135]:
import polars as pl
# Read in the scores file (sat-scores.csv). This time, you want the following
# columns, with the income-related column names renamed to something shorter.
# Each key is a column name in the file; each value is the name used from here on.
cols = {
    "Year": "Year",
    "State.Code": "State.Code",
    "Total.Math": "Total.Math",
    "Family Income.Less than 20k.Math": "income<20k",
    "Family Income.Between 20-40k.Math": "20k<income<40k",
    "Family Income.Between 40-60k.Math": "40k<income<60k",
    "Family Income.Between 60-80k.Math": "60k<income<80k",
    "Family Income.Between 80-100k.Math": "80k<income<100k",
    "Family Income.More than 100k.Math": "income>100k",
}
columns = list(cols)
new_names = list(cols.values())

df = (
    pl.read_csv("data/sat-scores.csv", columns=columns)
    .rename(cols)
    # Put the columns in the poorest-to-richest order listed above.
    .select(new_names)
)

In [143]:
# Find the average SAT math score for each income level, grouped and then
# sorted by year.
# Only the income-bracket columns are averaged; the state code and the overall
# math total are dropped first.
df1 = (
    df
    .drop(["State.Code", "Total.Math"])
    .group_by("Year")
    .agg(pl.all().mean())
    .sort("Year")
)

In [186]:
# For each year in the data set, determine how much better each income group
# did, on average, than the next-poorer group of students. Do you see
# any income group that did worse, in any year, than the next-poorer students?

import polars.selectors as cs
new_names = ["(20, 40) vs (0, 20)", "(40, 60) vs (20, 40)", 
             "(60, 80) vs (40, 60)", "(80, 100) vs (60, 80)", 
             "(100, ) vs (80, 100)"]

# Transpose so each income bracket is a row (one column per year); shift(-1)
# then lines every bracket up with the next-richer one.
tmp = df1.drop("Year").transpose()
bracket_gaps = tmp.shift(-1) - tmp
# Back to one row per year. The last column belongs to the richest bracket,
# which has no richer neighbour, so it is dropped.
diff = bracket_gaps.transpose().drop(cs.by_index(-1))
diff.columns = new_names
# true marks a year in which a bracket scored below the next-poorer one.
diff < 0

"(20, 40) vs (0, 20)","(40, 60) vs (20, 40)","(60, 80) vs (40, 60)","(80, 100) vs (60, 80)","(100, ) vs (80, 100)"
bool,bool,bool,bool,bool
false,false,false,false,false
false,false,false,false,false
false,false,false,false,false
false,false,false,false,false
false,false,false,false,false
…,…,…,…,…
false,false,false,false,false
false,false,false,false,false
false,false,false,false,false
false,false,false,false,false


In [None]:
# Which income bracket, on average, had the greatest advantage over the next-
# poorer income bracket?
# Average each comparison over the years, turn the single result row into one
# row per comparison, and keep the largest.
(
    diff
    .mean()
    .transpose(include_header=True)
    .sort("column_0", descending=True)
    .head(1)
)

column,column_0
str,f64
"""(20, 40) vs (0, 20)""",38.517944


In [221]:
# Which five states have the greatest gap in SAT math scores between the richest
# and poorest students?
# The per-row gap between the top and bottom brackets is averaged per state.
(
    df
    .group_by("State.Code")
    .agg(delta=(pl.col("income>100k") - pl.col("income<20k")).mean())
    .sort("delta", descending=True)
    .head(5)
)

State.Code,delta
str,f64
"""ND""",341.909091
"""WY""",246.454545
"""DC""",208.818182
"""SD""",157.0
"""MS""",140.0
