CHAPTER 5

In [1]:
import pandas as pd

In [70]:
path = "data/nyc-parking-violations-2020.csv"
# Load only the six columns used by the exercises below.
wanted_columns = [
    "Plate ID",
    "Registration State",
    "Vehicle Make",
    "Vehicle Color",
    "Violation Time",
    "Street Name",
]
df = pd.read_csv(path, usecols=wanted_columns)

In [72]:
%%timeit
total = len(df.index)
without_any_nans = len(df.dropna().index)
(total - without_any_nans) * 100

1.72 s ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [74]:
%%timeit
# loss with subset features
subset = ["Plate ID", "Registration State", "Vehicle Make", "Street Name"]
(len(df) - len(df.dropna(subset=subset))) * 100

subset = ["Plate ID", "Registration State", "Street Name"]
(len(df.index) - len(df.dropna(subset=subset).index)) * 100

2.87 s ± 15.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [76]:
%%timeit
# How many rows would you eliminate if you required at least three non-null 
# values from the four columns Plate ID, Registration State, Vehicle Make, and
# Street Name

rows_with_at_least_3_non_nans = len(
    df[
        (
            df['Plate ID'].notnull().astype(int) + 
            df['Registration State'].notnull().astype(int) + 
            df['Vehicle Make'].notnull().astype(int) + 
            df['Street Name'].notnull().astype(int)
        ) >= 3
    ]
)
total - rows_with_at_least_3_non_nans

1.32 s ± 13.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [77]:
%%timeit
# Which of the columns you've imported has the greatest number of NaN values
df.isnull().astype(int).sum()

1.41 s ± 15.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [78]:
%%timeit
# Null data is bad, but there is plenty of bad non-null data, too. For example,
# many cars with BLANKPLATE as a plate ID were ticketed. Turn these into NaN 
# values, and rerun the previous query.

df['Plate ID'] = df['Plate ID'].replace('BLANKPLATE', pd.NA)
df.isnull().astype(int).sum()

1.92 s ± 16.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [79]:
import polars as pl

In [80]:
path = "data/nyc-parking-violations-2020.csv"
# Load only the six columns used by the exercises below.
wanted_columns = [
    "Plate ID",
    "Registration State",
    "Vehicle Make",
    "Vehicle Color",
    "Violation Time",
    "Street Name",
]
df = pl.read_csv(path, columns=wanted_columns)

In [81]:
%%timeit
total = len(df)
without_any_nans = len(df.drop_nulls())
(total - without_any_nans) * 100

22.8 ms ± 709 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [83]:
%%timeit
# loss with subset features
subset = ["Plate ID", "Registration State", "Vehicle Make", "Street Name"]
(len(df) - len(df.drop_nulls(subset=subset))) * 100

subset = ["Plate ID", "Registration State", "Street Name"]
(len(df) - len(df.drop_nulls(subset=subset))) * 100

44 ms ± 2.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [85]:
%%timeit
# How many rows would you eliminate if you required at least three non-null 
# values from the four columns Plate ID, Registration State, Vehicle Make, and
# Street Name

rows_with_at_least_3_non_nans = len(
    df.filter(
        (
            df['Plate ID'].is_not_null().cast(int) + 
            df['Registration State'].is_not_null().cast(int) + 
            df['Vehicle Make'].is_not_null().cast(int) + 
            df['Street Name'].is_not_null().cast(int)
        ) >= 3)
)
len(df) - rows_with_at_least_3_non_nans

192 ms ± 11.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [86]:
%%timeit
# Which of the columns you've imported has the greatest number of NaN values
# null_count() yields the number of nulls in each column; the largest entry
# identifies the column asked about.
df.null_count()

846 ns ± 9.27 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [87]:
%%timeit
# Null data is bad, but there is plenty of bad non-null data, too. For example,
# many cars with BLANKPLATE as a plate ID were ticketed. Turn these into NaN 
# values, and rerun the previous query.

(
    df
    .with_columns(df['Plate ID'].replace('BLANKPLATE', None).alias('Plate ID'))
    .null_count()
)

40.6 ms ± 87.9 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


2. The goal of this exercise is to find the average age of celebrities who died February–July 2016

In [66]:
import pandas as pd

path = "data/celebrity_deaths_2016.csv"
# Only the date of death and the age are needed; the date is parsed on load.
df = pd.read_csv(
    path,
    usecols=["dateofdeath", "age"],
    parse_dates=["dateofdeath"],
)

In [67]:
%%timeit
# add new column with month
df["month"] = df["dateofdeath"].map(lambda x: x.month)

2.51 ms ± 24.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [68]:
# change index to month
# set_index discards the previous index and moves `month` out of the columns.
df = df.set_index("month")

In [69]:
# sort df by index
df = df.sort_index()

In [70]:
%%time
# clean all nonintegers in age column
df["age"] = df["age"].replace(r"[a-zA-Z/ ._-]*", "", regex=True).str.slice(0, 2)

df["age"] = pd.to_numeric(df["age"], errors='coerce')
df.dropna(inplace=True)

CPU times: user 9.28 ms, sys: 1.12 ms, total: 10.4 ms
Wall time: 9.91 ms


In [72]:
%%timeit
# age to int
# Item assignment updates the existing `df` object, so the conversion is
# visible to later cells even though this cell runs under %%timeit.
df["age"] = df["age"].astype(int)

25.9 μs ± 562 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [73]:
%%timeit
# find avg age from feb to july
# `df` was indexed by month and sorted in the cells above, so the label slice
# 2:7 selects months 2 through 7 with both ends included.
df.loc[2:7, "age"].mean()

25.7 μs ± 156 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [74]:
%%timeit
import datetime as dt
# mean age value in period [2016-02-15, 2016-07-15]
df.reset_index(inplace=True, drop=False)
df.set_index(keys=["dateofdeath"], drop=True, inplace=True)

start = df.index.searchsorted(dt.datetime(2016, 2, 15))
end = df.index.searchsorted(dt.datetime(2016, 7, 15))
df[start:end]['age'].mean()

153 μs ± 2.03 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [79]:
%%timeit
#top 5 cause of death
df = pd.read_csv(path, usecols=["dateofdeath", "age", "causeofdeath"], parse_dates=["dateofdeath"])
df["causeofdeath"].value_counts()[:5]

4.21 ms ± 91 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [78]:
%%timeit
#top 5 cause of death with unknown
df["causeofdeath"].replace(pd.NA, "unknown").value_counts()[:5]

444 μs ± 7.47 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [80]:
import polars as pl

In [81]:
path = "data/celebrity_deaths_2016.csv"

# Column types passed to the reader; `age` stays a string because it is
# cleaned in a later cell.
celebrity_schema = pl.Schema(
    {
        "dateofdeath": pl.Date,
        "name": pl.String,
        "age": pl.String,
        "bio": pl.String,
        "causeofdeath": pl.String,
    }
)

df = pl.read_csv(
    path,
    columns=["dateofdeath", "age"],
    schema=celebrity_schema,
    try_parse_dates=True,
    ignore_errors=True,
)

In [85]:
%%time
# add new column with month
df = df.with_columns(pl.col("dateofdeath").dt.month().alias("month"))

CPU times: user 1.59 ms, sys: 2.33 ms, total: 3.92 ms
Wall time: 950 μs


In [86]:
%%timeit
# sort by month
df.sort(by=["month"])

173 μs ± 1.35 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [88]:
%%time
# clean all nonintegers in age column
df = (
        df
        .with_columns(pl.col("age").replace(r"^[0-9]*", None).str.slice(0,2)
                      .cast(int, wrap_numerical=True)).drop_nulls()
    )


CPU times: user 4.17 ms, sys: 25.4 ms, total: 29.6 ms
Wall time: 23.9 ms


In [89]:
%%timeit
# find avg age from feb to july
df.filter((pl.col("month")>=2) & (pl.col("month") <= 7)).select("age").mean()

395 μs ± 8.12 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [90]:
%%timeit
import datetime as dt
# mean age value in period [2016-02-15, 2016-07-15]
start = dt.datetime(2016, 2, 15)
end = dt.datetime(2016, 7, 15)
df.filter((pl.col("dateofdeath")>=start) & (pl.col("dateofdeath") <= end)).select("age").mean()


423 μs ± 6.14 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [95]:
%%timeit
#top 5 cause of death
df = pl.read_csv(path, columns=["causeofdeath"])
df["causeofdeath"].drop_nulls().value_counts(sort=True)[:5]

564 μs ± 38.9 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [96]:
%%timeit
#top 5 cause of death with unknown
# Nulls are labelled "unknown" first so that they are counted like any other
# cause; sort=True orders the counts so the slice takes the five largest.
df["causeofdeath"].fill_null("unknown").value_counts(sort=True)[:5]

259 μs ± 2.71 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


3. Fill in missing data from the famous Titanic data set

In [107]:
# Load the Titanic spreadsheet with pandas (`pd` comes from an earlier import cell).
path = "data/titanic3.xls"
df = pd.read_excel(path)

# Optional one-off export of the same data to CSV, left disabled.
# df.to_csv('data/titanic3.csv', index=False) # for arrow

In [108]:
%%timeit
# which columns contain null values
# Per-column count of missing entries; any non-zero count marks a column
# that contains nulls.
df.isna().sum()

214 μs ± 2.01 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [109]:
# %%time
# For each column containing null values, decide whether you will fill it with a
# value—and if so, with what value, calculated or otherwise

# age: fill gaps by interpolating between neighbouring rows.
df["age"] = df["age"].interpolate()

# fare: fill only the missing fares with the mean of fares below 400, truncated
# to an integer. Assigning through a boolean row mask (df[mask] = value) would
# overwrite every column of those rows, so the fill targets the `fare` column.
typical_fare = int(df.loc[df["fare"] < 400, "fare"].mean())
df["fare"] = df["fare"].fillna(typical_fare)

# embarked: drop the rows where it is missing.
df = df.dropna(subset=["embarked"])

# home.dest: fill with the most frequent destination.
df["home.dest"] = df["home.dest"].fillna(df["home.dest"].mode()[0])

In [125]:
%%timeit
# Create a series (most_common_destinations) in which the index contains the
# unique values from the embarked column and the values are the most common
# destination for each value of embarked.

most_common_destinations = pd.Series()

for name in df['embarked'].dropna().unique():
    most_common_destinations.loc[name] = \
        df[df['embarked']==name]['home.dest'].value_counts().index[0]
    

3.4 ms ± 28.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [126]:
%%timeit
# Replace NaN values in the home.dest column with values from embarked
# fillna with a Series aligns on the index, so each missing destination takes
# the `embarked` value of its own row.
df['home.dest'] = df['home.dest'].fillna(df['embarked'])

82.4 μs ± 915 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [127]:
%%timeit
# Use the most_common_destinations series to replace values in home.dest with
# the most common values for each embarkation point
# NOTE(review): the Series is presumably treated as a mapping from its index
# (embarked values) to its values (destinations) — confirm against the pandas
# `replace` documentation.
df['home.dest'] = df['home.dest'].replace(most_common_destinations)

169 μs ± 1.94 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [129]:
import polars as pl

In [159]:
# Reload the Titanic spreadsheet as a polars DataFrame for the polars variant
# of the exercise.
df = pl.read_excel("data/titanic3.xls")

In [162]:
%%time
# Create a series (most_common_destinations) in which the index contains the
# unique values from the embarked column and the values are the most common
# destination for each value of embarked.
d1 = df.group_by("embarked", "home.dest").len().drop_nulls()
d2 = d1.select(pl.col("embarked"), pl.col("len")).group_by("embarked").max()
most_common_destinations = d1.join(d2, on=["len", "embarked"]).select("embarked", "home.dest")

CPU times: user 5.06 ms, sys: 10.8 ms, total: 15.9 ms
Wall time: 2.99 ms


In [167]:
%%time
# Replace NaN values in the home.dest column with values from embarked
df = df.with_columns(pl.col("home.dest").fill_null(pl.col("embarked")).alias("home.dest"))


CPU times: user 1.44 ms, sys: 2.1 ms, total: 3.54 ms
Wall time: 887 μs


In [175]:
%%timeit
# Use the most_common_destinations series to replace values in home.dest with
# the most common values for each embarkation point
(
    df
    .join(most_common_destinations, 
          left_on="home.dest", right_on="embarked", how="left")
    .with_columns(
        pl.col("home.dest_right").fill_null(pl.col("home.dest"))
        .alias("home.dest")
    )
    .drop(["home.dest_right"])
)


352 μs ± 9.96 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


4. Inconsistent data

In [185]:
import pandas as pd

# Load only the five columns used in this section.
wanted_columns = [
    "Plate ID",
    "Registration State",
    "Vehicle Make",
    "Vehicle Color",
    "Street Name",
]
df = pd.read_csv('data/nyc-parking-violations-2020.csv', usecols=wanted_columns)

In [177]:
%%timeit
# distinct vehicle colors
len(df["Vehicle Color"].drop_duplicates())

217 ms ± 2.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [178]:
%%timeit
# top 30 colors
df["Vehicle Color"].value_counts()[:30]

371 ms ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [179]:
# dictionary
# Each normalized color name is listed with the abbreviations that map to it;
# `colormap` is the flattened abbreviation -> name lookup.
_abbreviations_by_color = {
    'WHITE': ('WH', 'WHT', 'WHI', 'WT', 'WT.'),
    'GRAY': ('GY', 'GR', 'GRY'),
    'BLACK': ('BK', 'BLK', 'BK.'),
    'BLUE': ('BL',),
    'RED': ('RD',),
    'TAN': ('TN',),
    'BROWN': ('BR',),
    'YELLO': ('YW',),
    'ORANG': ('OR',),
}
colormap = {
    abbreviation: color
    for color, abbreviations in _abbreviations_by_color.items()
    for abbreviation in abbreviations
}

In [180]:
%%timeit
# Replace the existing (old) colors with your translations. How many colors are
# there now
df["Vehicle Color"] = df["Vehicle Color"].replace(colormap)
len(df["Vehicle Color"].drop_duplicates())

3.53 s ± 52.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Show the first 50 rows.
df.head(50)

In [187]:
%%timeit
# Run value_counts on the Vehicle Make column, and look at some vehicle
# names. Write a function that, given a value, cleans up the data: putting 
# the name in all caps, removing punctuation, and standardizing whatever names 
# you can. Then use the apply method to fix the column. How many distinct 
# vehicle makes are there when you’re done?
# The 30 most frequent makes; under %%timeit this expression is evaluated (and
# timed) but its value is not displayed.
df['Vehicle Make'].value_counts()[:30]
# print(len(df["Vehicle Make"].drop_duplicates()))

# simple udf
import re
from typing import Optional

def clean(value: object) -> Optional[str]:
    """Normalize a vehicle-make value.

    Args:
        value: The raw cell value. Anything that is not a ``str`` (for
            example a missing value) is treated as absent.

    Returns:
        The upper-cased text with punctuation removed, surrounding whitespace
        stripped and internal whitespace runs collapsed to a single space, or
        ``None`` when ``value`` is not a string.
    """
    if not isinstance(value, str):
        return None
    # `\w` also matches "_", so the underscore is removed explicitly.
    without_punctuation = re.sub(r'[^\w\s]|_', '', value.upper())
    # split()/join trims the ends and collapses repeated whitespace.
    return ' '.join(without_punctuation.split())

# Run `clean` on every value; item assignment stores the cleaned column back
# on the shared `df`.
df['Vehicle Make'] = df['Vehicle Make'].apply(clean)

# print(len(df["Vehicle Make"].dropna().drop_duplicates()))

4.93 s ± 45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [188]:
import polars as pl

In [189]:
# Load only the five columns used in this section.
wanted_columns = [
    "Plate ID",
    "Registration State",
    "Vehicle Make",
    "Vehicle Color",
    "Street Name",
]
df = pl.read_csv('data/nyc-parking-violations-2020.csv', columns=wanted_columns)

In [193]:
%%timeit
# distinct vehicle colors
# Number of distinct values in the color column.
df["Vehicle Color"].n_unique()

108 ms ± 937 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [196]:
%%timeit
# top 30 colors
df["Vehicle Color"].value_counts(sort=True)[:30]

222 ms ± 4.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [197]:
# dictionary
# Each normalized color name is listed with the abbreviations that map to it;
# `colormap` is the flattened abbreviation -> name lookup.
_abbreviations_by_color = {
    'WHITE': ('WH', 'WHT', 'WHI', 'WT', 'WT.'),
    'GRAY': ('GY', 'GR', 'GRY'),
    'BLACK': ('BK', 'BLK', 'BK.'),
    'BLUE': ('BL',),
    'RED': ('RD',),
    'TAN': ('TN',),
    'BROWN': ('BR',),
    'YELLO': ('YW',),
    'ORANG': ('OR',),
}
colormap = {
    abbreviation: color
    for color, abbreviations in _abbreviations_by_color.items()
    for abbreviation in abbreviations
}

In [201]:
%%timeit
# Replace the existing (old) colors with your translations. How many colors are
# there now
(
    df
    .with_columns(
        pl.col("Vehicle Color").replace(colormap).alias("Vehicle Color"))
    .n_unique()
)

1.14 s ± 51.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [207]:
%%timeit
# Run value_counts on the Vehicle Make column, and look at some vehicle
# names. Write a function that, given a value, cleans up the data: putting 
# the name in all caps, removing punctuation, and standardizing whatever names 
# you can. Then use the apply method to fix the column. How many distinct 
# vehicle makes are there when you’re done?
# The 30 most frequent makes; under %%timeit this expression is evaluated (and
# timed) but its value is not displayed.
df['Vehicle Make'].value_counts(sort=True)[:30]
# print(df["Vehicle Make"].n_unique())

# simple udf
import re
from typing import Optional

def clean(value: object) -> Optional[str]:
    """Normalize a vehicle-make value.

    Args:
        value: The raw cell value. Anything that is not a ``str`` (for
            example a missing value) is treated as absent.

    Returns:
        The upper-cased text with punctuation removed, surrounding whitespace
        stripped and internal whitespace runs collapsed to a single space, or
        ``None`` when ``value`` is not a string.
    """
    if not isinstance(value, str):
        return None
    # `\w` also matches "_", so the underscore is removed explicitly.
    without_punctuation = re.sub(r'[^\w\s]|_', '', value.upper())
    # split()/join trims the ends and collapses repeated whitespace.
    return ' '.join(without_punctuation.split())

# Run `clean` on every make, drop the nulls, and count the distinct values
# that remain.
cleaned_make = pl.col('Vehicle Make').map_elements(clean, return_dtype=pl.String)
df.select(cleaned_make.alias('Vehicle Make')).drop_nulls().n_unique()

5.77 s ± 35.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
