# Debugging Pandas

#### Loading Libraries

In [1]:
# Standard Library
from io import StringIO
# Data Manipulation
import pandas as pd
# Numerical Computing
import numpy as np
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#### Loading Data

In [2]:
# Location of the tab-separated river export used throughout this section.
url = ('https://github.com/mattharrison/datasets/raw/master'
       '/data/dirtydevil.txt')
# Skip file lines 0-33 and line 35, so line 34 is read as the header row.
# NOTE(review): presumably the skipped lines are a comment preamble and a
# format-description row -- confirm against the raw file.
df = pd.read_csv(
    url,
    sep='\t',
    skiprows=lambda row_num: row_num < 34 or row_num == 35,
)

  df = pd.read_csv(url, skiprows=lambda num: num <34 or num == 35,


In [3]:
def to_denver_time(df_, time_col, tz_col):
    """Convert a column of naive time strings to America/Denver timestamps.

    Rows are grouped by the timezone code in ``tz_col`` (with 'MDT'
    rewritten to 'MST7MDT'); each group is parsed, localized to its own
    code, and then converted to America/Denver.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame holding both columns.
    time_col : str
        Name of the column with the date/time values to parse.
    tz_col : str
        Name of the column with the timezone code of each row.

    Returns
    -------
    pandas.Series
        Timezone-aware timestamps aligned with the index of ``df_``.
    """
    def _localize(times):
        # Inside transform the group Series is named after its group key,
        # i.e. the timezone code shared by these rows.
        parsed = pd.to_datetime(times)
        localized = parsed.dt.tz_localize(times.name, ambiguous=True)
        return localized.dt.tz_convert('America/Denver')

    tz_codes = df_[tz_col].replace('MDT', 'MST7MDT')
    with_codes = df_.assign(**{tz_col: tz_codes})
    return with_codes.groupby(tz_col)[time_col].transform(_localize)

In [4]:
def tweak_river(df_):
    """Return the river frame with Denver-local timestamps and readable names.

    The ``datetime`` column is replaced by the result of ``to_denver_time``
    (using ``tz_cd`` as the timezone code), and the two measurement columns
    are renamed to ``cfs`` and ``gage_height``.
    """
    friendly_names = {'144166_00060': 'cfs',
                      '144167_00065': 'gage_height'}
    denver_times = to_denver_time(df_, 'datetime', 'tz_cd')
    return df_.assign(datetime=denver_times).rename(columns=friendly_names)

In [5]:
# Build the cleaned river frame and show it as the cell result.
dd = tweak_river(df)
dd

Unnamed: 0,agency_cd,site_no,datetime,tz_cd,cfs,144166_00060_cd,gage_height,144167_00065_cd
0,USGS,9333500,2001-05-07 01:00:00-06:00,MDT,71.00,A:[91],,
1,USGS,9333500,2001-05-07 01:15:00-06:00,MDT,71.00,A:[91],,
2,USGS,9333500,2001-05-07 01:30:00-06:00,MDT,71.00,A:[91],,
3,USGS,9333500,2001-05-07 01:45:00-06:00,MDT,70.00,A:[91],,
4,USGS,9333500,2001-05-07 02:00:00-06:00,MDT,70.00,A:[91],,
...,...,...,...,...,...,...,...,...
539300,USGS,9333500,2020-09-28 08:30:00-06:00,MDT,9.53,P,6.16,P
539301,USGS,9333500,2020-09-28 08:45:00-06:00,MDT,9.20,P,6.15,P
539302,USGS,9333500,2020-09-28 09:00:00-06:00,MDT,9.20,P,6.15,P
539303,USGS,9333500,2020-09-28 09:15:00-06:00,MDT,9.20,P,6.15,P


In [6]:
# Round-trip dd through JSON and check whether it comes back unchanged.
# read_json expects a path or file-like object; passing the JSON text
# directly is deprecated since pandas 2.1, so wrap it in StringIO.
dd2 = pd.read_json(StringIO(dd.to_json()))
dd.equals(dd2)

  dd2 = pd.read_json(dd.to_json())


False

In [7]:
# Count, per column, the cells where dd and dd2 compare as not equal.
# .ne treats NaN as different from NaN, so columns with missing values
# report their missing cells here as well.
(dd
  .ne(dd2)
  .sum()
)

agency_cd               0
site_no                 0
datetime           539305
tz_cd                   0
cfs                 48048
144166_00060_cd     46181
gage_height        125656
144167_00065_cd    105928
dtype: int64

In [8]:
# Same comparison as a percentage: the mean of the boolean mask is the
# fraction of differing cells per column, scaled by 100.
(dd
  .ne(dd2)
  .mean()
  .mul(100)
)

agency_cd            0.000000
site_no              0.000000
datetime           100.000000
tz_cd                0.000000
cfs                  8.909244
144166_00060_cd      8.563058
gage_height         23.299617
144167_00065_cd     19.641576
dtype: float64

In [9]:
# Expected to fail: the assertion message names the first mismatch found
# (here the dtype of the datetime column).
pd.testing.assert_frame_equal(dd, dd2)

AssertionError: Attributes of DataFrame.iloc[:, 2] (column name="datetime") are different

Attribute "dtype" are different
[left]:  datetime64[ns, America/Denver]
[right]: datetime64[ns]

In [10]:
# Restore the timezone on dd2's datetime column before comparing; with the
# default (inexact) float comparison no assertion error is raised.
pd.testing.assert_frame_equal(
    dd,
    dd2.assign(
        datetime=lambda df_: df_.datetime
                                .dt.tz_localize('UTC')
                                .dt.tz_convert('America/Denver')
    ),
)

  pd.testing.assert_frame_equal(dd,


In [11]:
# .equals still reports False after the timezone fix, so another column
# must differ as well.
dd.equals(
    dd2.assign(
        datetime=lambda df_: df_.datetime
                                .dt.tz_localize('UTC')
                                .dt.tz_convert('America/Denver')
    )
)

False

In [12]:
# Repeat the comparison with exact float matching to surface the column
# whose values differ only slightly.
pd.testing.assert_frame_equal(
    dd,
    dd2.assign(
        datetime=lambda df_: df_.datetime
                                .dt.tz_localize('UTC')
                                .dt.tz_convert('America/Denver')
    ),
    check_exact=True,
)

AssertionError: DataFrame.iloc[:, 4] (column name="cfs") are different

DataFrame.iloc[:, 4] (column name="cfs") values are different (0.34619 %)
[index]: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
[left]:  [71.0, 71.0, 71.0, 70.0, 70.0, 69.0, 70.0, 70.0, 70.0, 70.0, 69.0, 67.0, 67.0, 66.0, 66.0, 65.0, 66.0, 66.0, 66.0, 65.0, 64.0, 64.0, 64.0, 64.0, 64.0, 65.0, 64.0, 64.0, 64.0, 64.0, 63.0, 64.0, 63.0, 63.0, 65.0, 65.0, 66.0, 66.0, 66.0, 67.0, 68.0, 68.0, 69.0, 69.0, 70.0, 71.0, 72.0, 75.0, 76.0, 75.0, 75.0, 75.0, 76.0, 77.0, 78.0, 78.0, 78.0, 78.0, 78.0, 77.0, 78.0, 78.0, 77.0, 75.0, 73.0, 73.0, 73.0, 75.0, 76.0, 76.0, 76.0, 75.0, 75.0, 75.0, 75.0, 75.0, 75.0, 76.0, 76.0, 75.0, 75.0, 76.0, 76.0, 76.0, 75.0, 75.0, 73.0, 73.0, 73.0, 73.0, 75.0, 75.0, 75.0, 75.0, 75.0, 76.0, 75.0, 73.0, 75.0, 75.0, ...]
[right]: [71.0, 71.0, 71.0, 70.0, 70.0, 69.0, 70.0, 70.0, 70.0, 70.0, 69.0, 67.0, 67.0, 66.0, 66.0, 65.0, 66.0, 66.0, 66.0, 65.0, 64.0, 64.0, 64.0, 64.0, 64.0, 65.0, 64.0, 64.0, 64.0, 64.0, 63.0, 64.0, 63.0, 63.0, 65.0, 65.0, 66.0, 66.0, 66.0, 67.0, 68.0, 68.0, 69.0, 69.0, 70.0, 71.0, 72.0, 75.0, 76.0, 75.0, 75.0, 75.0, 76.0, 77.0, 78.0, 78.0, 78.0, 78.0, 78.0, 77.0, 78.0, 78.0, 77.0, 75.0, 73.0, 73.0, 73.0, 75.0, 76.0, 76.0, 76.0, 75.0, 75.0, 75.0, 75.0, 75.0, 75.0, 76.0, 76.0, 75.0, 75.0, 76.0, 76.0, 76.0, 75.0, 75.0, 73.0, 73.0, 73.0, 73.0, 75.0, 75.0, 75.0, 75.0, 75.0, 76.0, 75.0, 73.0, 75.0, 75.0, ...]

In [13]:
# .ne Method
# Select the rows of dd whose cfs value is not equal to the cfs value at
# the same index label in dd2.
dd[dd.cfs.ne(dd2.cfs)]

Unnamed: 0,agency_cd,site_no,datetime,tz_cd,cfs,144166_00060_cd,gage_height,144167_00065_cd
96246,USGS,9333500,2007-07-03 19:45:00-06:00,MDT,1.70,A:[91],,
96247,USGS,9333500,2007-07-03 20:00:00-06:00,MDT,1.70,A:[91],,
96248,USGS,9333500,2007-07-03 20:15:00-06:00,MDT,1.70,A:[91],,
96249,USGS,9333500,2007-07-03 20:30:00-06:00,MDT,1.70,A:[91],,
96250,USGS,9333500,2007-07-03 20:45:00-06:00,MDT,1.70,A:[91],,
...,...,...,...,...,...,...,...,...
538678,USGS,9333500,2020-09-21 21:00:00-06:00,MDT,6.56,P,6.06,P
538728,USGS,9333500,2020-09-22 09:30:00-06:00,MDT,6.56,P,6.06,P
538735,USGS,9333500,2020-09-22 11:15:00-06:00,MDT,6.56,P,6.06,P
538739,USGS,9333500,2020-09-22 12:15:00-06:00,MDT,6.56,P,6.06,P


In [14]:
# Compare one differing row side by side: the values differ only in the
# last floating-point digit.
dd.loc[96246].cfs, dd2.loc[96246].cfs

(1.7, 1.7000000000000002)

In [15]:
# Addressing Issues and Re-checking
# Apply both fixes -- restore the timezone and round to two decimals --
# and compare again.
dd.round(2).equals(
    dd2
    .assign(
        datetime=lambda df_: df_.datetime
                                .dt.tz_localize('UTC')
                                .dt.tz_convert('America/Denver')
    )
    .round(2)
)

True

In [16]:
# Diagnose Function
def cmp_dfs(df1, df2, round_amt=3):
    """Print a column-by-column report of how two DataFrames differ.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare. Columns are taken from ``df1``; a column that
        ``df2`` lacks is reported once and then skipped.
    round_amt : int, default 3
        Decimal places used to decide whether a float64 column differs
        only by rounding noise.

    Returns
    -------
    None
        The report is printed; 'Same' is printed only when the column
        sets, the shapes and every shared column match.
    """
    diff_cols = set(df1.columns) ^ set(df2.columns)
    if diff_cols:
        print(f'Different columns {diff_cols}')
    shapes_differ = df1.shape != df2.shape
    if shapes_differ:
        print(f'Different shapes {df1.shape} {df2.shape}')
    # A column or shape mismatch already means the frames are not the same.
    bad = bool(diff_cols) or shapes_differ
    for col in df1.columns:
        if col not in df2.columns:
            # Already reported in diff_cols; indexing df2 would raise KeyError.
            continue
        s1 = df1[col]
        s2 = df2[col]
        if s1.equals(s2):
            continue
        bad = True
        if s1.dtype != s2.dtype:
            print(f'{col} types differ {s1.dtype} vs {s2.dtype}')
        # .ne also flags NaN-vs-NaN cells; dropna() below removes those
        # from the values that get printed.
        differs = s1.ne(s2)
        if s1.dtype == float and \
                s1.round(round_amt).equals(s2.round(round_amt)):
            print(f'{col} has rounding differences '
                  f'{df1[differs][col].dropna().iloc[0]} '
                  f'vs {df2[differs][col].dropna().iloc[0]}')
        else:
            # Covers non-float columns and float columns whose values
            # still differ after rounding.
            print(f'{col} differs {df1[differs][col].dropna()}')
    if not bad:
        print('Same')

In [17]:
# Print the diagnostic report for the original frame vs. its JSON round trip.
cmp_dfs(dd, dd2)

datetime types differ datetime64[ns, America/Denver] vs datetime64[ns]
datetime differs 0        2001-05-07 01:00:00-06:00
1        2001-05-07 01:15:00-06:00
2        2001-05-07 01:30:00-06:00
3        2001-05-07 01:45:00-06:00
4        2001-05-07 02:00:00-06:00
                    ...           
539300   2020-09-28 08:30:00-06:00
539301   2020-09-28 08:45:00-06:00
539302   2020-09-28 09:00:00-06:00
539303   2020-09-28 09:15:00-06:00
539304   2020-09-28 09:30:00-06:00
Name: datetime, Length: 539305, dtype: datetime64[ns, America/Denver]
cfs has rounding differences1.7 vs 1.7000000000000002
gage_height has rounding differences3.28 vs 3.2800000000000002


#### Debugging Chains

In [18]:
# Read the zipped vehicles CSV straight from the URL into `autos`.
autos = pd.read_csv('https://github.com/mattharrison/datasets/raw/'
    'master/data/vehicles.csv.zip')

  autos = pd.read_csv('https://github.com/mattharrison/datasets/raw/'


In [19]:
def to_tz(df_, time_col, tz_offset, tz_name):
    """Convert a column of naive time strings to timestamps in ``tz_name``.

    Rows are grouped by the timezone code in ``tz_offset``; each group is
    parsed, localized to its own code, then converted to ``tz_name``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame holding both columns.
    time_col : str
        Name of the column with the date/time values to parse.
    tz_offset : str
        Name of the column holding each row's timezone code.
    tz_name : str
        Target timezone passed to ``tz_convert``.

    Returns
    -------
    pandas.Series
        Timezone-aware timestamps aligned with the index of ``df_``.
    """
    def _convert(times):
        # Inside transform the group Series is named after its group key,
        # i.e. the timezone code shared by these rows.
        parsed = pd.to_datetime(times)
        localized = parsed.dt.tz_localize(times.name, ambiguous=True)
        return localized.dt.tz_convert(tz_name)

    times_by_zone = df_.groupby(tz_offset)[time_col]
    return times_by_zone.transform(_convert)

In [20]:
def tweak_autos(autos):
    """Return a cleaned, downcast copy of the vehicles data.

    Keeps the columns named in ``cols``, derives ``automatic``, ``speeds``,
    ``offset``, ``str_date`` and ``ffs``, converts ``createdOn`` to
    America/New_York timestamps via ``to_tz``, casts the numeric columns to
    smaller integer types, and drops the raw ``trany`` and ``eng_dscr``
    text columns.

    Parameters
    ----------
    autos : pandas.DataFrame
        Raw vehicles frame; must contain every column listed in ``cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``autos`` itself is not modified.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders',
            'displ', 'drive', 'eng_dscr', 'fuelCost08',
            'make', 'model', 'trany', 'range', 'createdOn',
            'year']
    return (autos
     [cols]
     .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
             displ=autos.displ.fillna(0).astype('float16'),
             drive=autos.drive.fillna('Other').astype('category'),
             automatic=autos.trany.str.contains('Auto'),
             # (\d+) captures the whole run of digits. The earlier (\d)+
             # kept only the last digit of the run, so a multi-digit
             # speed count such as '10-spd' was stored as 0.
             # Rows with no digits fall back to 20.
             speeds=autos.trany.str.extract(r'(\d+)').fillna('20')
                    .astype('int8'),
             # Three-letter zone code after the HH:MM part; 'EDT' is
             # rewritten to 'EST5EDT' before it is used for localizing.
             offset=autos.createdOn
                    .str.extract(r'\d\d:\d\d ([A-Z]{3}?)')
                    .replace('EDT', 'EST5EDT'),
             # Characters 4-18 plus the last four characters of createdOn.
             str_date=(autos.createdOn.str.slice(4,19) + ' ' +
                       autos.createdOn.str.slice(-4)),
             # Callable so it sees the str_date/offset columns added above.
             createdOn=lambda df_: to_tz(df_, 'str_date',
                       'offset', 'America/New_York'),
             ffs=autos.eng_dscr.str.contains('FFS')
            )
     .astype({'highway08': 'int8', 'city08': 'int16',
              'comb08': 'int16', 'fuelCost08': 'int16',
              'range': 'int16',  'year': 'int16',
              'make': 'category'})
     .drop(columns=['trany', 'eng_dscr'])
    )

In [21]:
# Run the full chain and display the cleaned frame.
tweak_autos(autos)

Unnamed: 0,city08,comb08,highway08,cylinders,displ,drive,fuelCost08,make,model,range,createdOn,year,automatic,speeds,offset,str_date,ffs
0,19,21,25,4,2.000000,Rear-Wheel Drive,2000,Alfa Romeo,Spider Veloce 2000,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,True
1,9,11,14,12,4.898438,Rear-Wheel Drive,3850,Ferrari,Testarossa,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,False
2,23,27,33,4,2.199219,Front-Wheel Drive,1550,Dodge,Charger,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,True
3,10,11,12,8,5.199219,Rear-Wheel Drive,3850,Dodge,B150/B250 Wagon 2WD,0,2013-01-01 00:00:00-05:00,1985,True,3,EST,Jan 01 00:00:00 2013,
4,17,19,23,4,2.199219,4-Wheel or All-Wheel Drive,2700,Subaru,Legacy AWD Turbo,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41139,19,22,26,4,2.199219,Front-Wheel Drive,1900,Subaru,Legacy,0,2013-01-01 00:00:00-05:00,1993,True,4,EST,Jan 01 00:00:00 2013,True
41140,20,23,28,4,2.199219,Front-Wheel Drive,1850,Subaru,Legacy,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True
41141,18,21,24,4,2.199219,4-Wheel or All-Wheel Drive,2000,Subaru,Legacy AWD,0,2013-01-01 00:00:00-05:00,1993,True,4,EST,Jan 01 00:00:00 2013,True
41142,18,21,24,4,2.199219,4-Wheel or All-Wheel Drive,2000,Subaru,Legacy AWD,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True


In [22]:
def tweak_autos(autos):
    """Debugging variant: return only the column subset of ``autos``.

    Every later step of the chain is commented out so the intermediate
    result after the column selection can be inspected on its own.
    """
    keep = [
        'city08', 'comb08', 'highway08', 'cylinders', 'displ', 'drive',
        'eng_dscr', 'fuelCost08', 'make', 'model', 'trany', 'range',
        'createdOn', 'year',
    ]
    subset = autos[keep]
    # Disabled while debugging -- re-enable one step at a time:
    # .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
    #         displ=autos.displ.fillna(0).astype('float16'),
    #         drive=autos.drive.fillna('Other').astype('category'),
    #         automatic=autos.trany.str.contains('Auto'),
    #         speeds=autos.trany.str.extract(r'(\d)+').fillna('20')
    #                .astype('int8'),
    #         offset=autos.createdOn
    #                .str.extract(r'\d\d:\d\d ([A-Z]{3}?)')
    #                .replace('EDT', 'EST5EDT'),
    #         str_date=(autos.createdOn.str.slice(4,19) + ' ' +
    #                   autos.createdOn.str.slice(-4)),
    #         createdOn=lambda df_: to_tz(df_, 'str_date',
    #                   'offset', 'America/New_York'),
    #         ffs=autos.eng_dscr.str.contains('FFS')
    #        )
    # .astype({'highway08': 'int8', 'city08': 'int16',
    #          'comb08': 'int16', 'fuelCost08': 'int16',
    #          'range': 'int16',  'year': 'int16',
    #          'make': 'category'})
    # .drop(columns=['trany', 'eng_dscr'])
    return subset

In [23]:
# With every later step commented out, this shows only the selected raw columns.
tweak_autos(autos)

Unnamed: 0,city08,comb08,highway08,cylinders,displ,drive,eng_dscr,fuelCost08,make,model,trany,range,createdOn,year
0,19,21,25,4.0,2.0,Rear-Wheel Drive,(FFS),2000,Alfa Romeo,Spider Veloce 2000,Manual 5-spd,0,Tue Jan 01 00:00:00 EST 2013,1985
1,9,11,14,12.0,4.9,Rear-Wheel Drive,(GUZZLER),3850,Ferrari,Testarossa,Manual 5-spd,0,Tue Jan 01 00:00:00 EST 2013,1985
2,23,27,33,4.0,2.2,Front-Wheel Drive,(FFS),1550,Dodge,Charger,Manual 5-spd,0,Tue Jan 01 00:00:00 EST 2013,1985
3,10,11,12,8.0,5.2,Rear-Wheel Drive,,3850,Dodge,B150/B250 Wagon 2WD,Automatic 3-spd,0,Tue Jan 01 00:00:00 EST 2013,1985
4,17,19,23,4.0,2.2,4-Wheel or All-Wheel Drive,"(FFS,TRBO)",2700,Subaru,Legacy AWD Turbo,Manual 5-spd,0,Tue Jan 01 00:00:00 EST 2013,1993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41139,19,22,26,4.0,2.2,Front-Wheel Drive,(FFS),1900,Subaru,Legacy,Automatic 4-spd,0,Tue Jan 01 00:00:00 EST 2013,1993
41140,20,23,28,4.0,2.2,Front-Wheel Drive,(FFS),1850,Subaru,Legacy,Manual 5-spd,0,Tue Jan 01 00:00:00 EST 2013,1993
41141,18,21,24,4.0,2.2,4-Wheel or All-Wheel Drive,(FFS),2000,Subaru,Legacy AWD,Automatic 4-spd,0,Tue Jan 01 00:00:00 EST 2013,1993
41142,18,21,24,4.0,2.2,4-Wheel or All-Wheel Drive,(FFS),2000,Subaru,Legacy AWD,Manual 5-spd,0,Tue Jan 01 00:00:00 EST 2013,1993


In [25]:
def tweak_autos(autos):
    """Debugging variant: run the chain up to and including ``.assign``.

    Selects the columns in ``cols`` and adds/overwrites the derived
    columns; the ``.astype`` and ``.drop`` steps stay commented out so the
    intermediate frame (still containing ``trany`` and ``eng_dscr``) can
    be inspected.

    Parameters
    ----------
    autos : pandas.DataFrame
        Raw vehicles frame; must contain every column listed in ``cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``autos`` itself is not modified.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders',
            'displ', 'drive', 'eng_dscr', 'fuelCost08',
            'make', 'model', 'trany', 'range', 'createdOn',
            'year']
    return (autos
     [cols]
     .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
             displ=autos.displ.fillna(0).astype('float16'),
             drive=autos.drive.fillna('Other').astype('category'),
             automatic=autos.trany.str.contains('Auto'),
             # (\d+) captures the whole run of digits. The earlier (\d)+
             # kept only the last digit of the run, so a multi-digit
             # speed count such as '10-spd' was stored as 0.
             speeds=autos.trany.str.extract(r'(\d+)').fillna('20')
                    .astype('int8'),
             offset=autos.createdOn
                    .str.extract(r'\d\d:\d\d ([A-Z]{3}?)')
                    .replace('EDT', 'EST5EDT'),
             str_date=(autos.createdOn.str.slice(4,19) + ' ' +
                       autos.createdOn.str.slice(-4)),
             # Callable so it sees the str_date/offset columns added above.
             createdOn=lambda df_: to_tz(df_, 'str_date',
                       'offset', 'America/New_York'),
             ffs=autos.eng_dscr.str.contains('FFS')
            )
    # .astype({'highway08': 'int8', 'city08': 'int16',
    #          'comb08': 'int16', 'fuelCost08': 'int16',
    #          'range': 'int16',  'year': 'int16',
    #          'make': 'category'})
    # .drop(columns=['trany', 'eng_dscr'])
    )

In [26]:
# Shows the frame after .assign; trany and eng_dscr are still present
# because the .astype/.drop steps are commented out.
tweak_autos(autos)

Unnamed: 0,city08,comb08,highway08,cylinders,displ,drive,eng_dscr,fuelCost08,make,model,trany,range,createdOn,year,automatic,speeds,offset,str_date,ffs
0,19,21,25,4,2.000000,Rear-Wheel Drive,(FFS),2000,Alfa Romeo,Spider Veloce 2000,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,True
1,9,11,14,12,4.898438,Rear-Wheel Drive,(GUZZLER),3850,Ferrari,Testarossa,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,False
2,23,27,33,4,2.199219,Front-Wheel Drive,(FFS),1550,Dodge,Charger,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,True
3,10,11,12,8,5.199219,Rear-Wheel Drive,,3850,Dodge,B150/B250 Wagon 2WD,Automatic 3-spd,0,2013-01-01 00:00:00-05:00,1985,True,3,EST,Jan 01 00:00:00 2013,
4,17,19,23,4,2.199219,4-Wheel or All-Wheel Drive,"(FFS,TRBO)",2700,Subaru,Legacy AWD Turbo,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41139,19,22,26,4,2.199219,Front-Wheel Drive,(FFS),1900,Subaru,Legacy,Automatic 4-spd,0,2013-01-01 00:00:00-05:00,1993,True,4,EST,Jan 01 00:00:00 2013,True
41140,20,23,28,4,2.199219,Front-Wheel Drive,(FFS),1850,Subaru,Legacy,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True
41141,18,21,24,4,2.199219,4-Wheel or All-Wheel Drive,(FFS),2000,Subaru,Legacy AWD,Automatic 4-spd,0,2013-01-01 00:00:00-05:00,1993,True,4,EST,Jan 01 00:00:00 2013,True
41142,18,21,24,4,2.199219,4-Wheel or All-Wheel Drive,(FFS),2000,Subaru,Legacy AWD,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True


#### Debugging Chains Part II

In [30]:
from IPython.display import display, HTML
def show(df_, rows=20, cols=30, title=None):
    """Display ``df_`` mid-chain and return it unchanged.

    Intended for use with ``.pipe(show, ...)`` so an intermediate frame can
    be looked at without breaking the chain.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame to display.
    rows : int, default 20
        Value applied to the ``display.min_rows`` option while displaying.
    cols : int, default 30
        Value applied to the ``display.max_columns`` option while displaying.
    title : str, optional
        When truthy, rendered as an ``<h2>`` heading above the frame.

    Returns
    -------
    The same ``df_`` object that was passed in.
    """
    if title:
        heading = HTML(f'<h2>{title}</h2>')
        display(heading)
    display_opts = ('display.min_rows', rows,
                    'display.max_columns', cols)
    # option_context restores the previous settings when the block exits.
    with pd.option_context(*display_opts):
        display(df_)
    return df_

In [34]:
def shape(df_):
    """Print the shape of ``df_`` and return it unchanged, for ``.pipe``."""
    dims = df_.shape
    print(dims)
    return df_

#### Debugging Chains Part III

In [35]:
def get_var(df, var_name):
    """Store ``df`` as a global named ``var_name`` and return it unchanged.

    Used with ``.pipe(get_var, 'name')`` to keep a reference to an
    intermediate frame of a chain for later inspection.
    """
    # globals() is this module's namespace (the notebook's, when the
    # function is defined in a cell).
    globals().update({var_name: df})
    return df

In [36]:
def tweak_autos(autos):
    """Return the cleaned vehicles data, saving the post-assign frame.

    Same chain as the full cleaning function, with one extra step: after
    ``.assign`` the intermediate frame is stored in the global
    ``new_cols`` through ``get_var`` so it can be inspected afterwards.

    Parameters
    ----------
    autos : pandas.DataFrame
        Raw vehicles frame; must contain every column listed in ``cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``autos`` itself is not modified.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders',
            'displ', 'drive', 'eng_dscr', 'fuelCost08',
            'make', 'model', 'trany', 'range', 'createdOn',
            'year']
    return (autos
     [cols]
     .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
             displ=autos.displ.fillna(0).astype('float16'),
             drive=autos.drive.fillna('Other').astype('category'),
             automatic=autos.trany.str.contains('Auto'),
             # (\d+) captures the whole run of digits. The earlier (\d)+
             # kept only the last digit of the run, so a multi-digit
             # speed count such as '10-spd' was stored as 0.
             speeds=autos.trany.str.extract(r'(\d+)').fillna('20')
                    .astype('int8'),
             offset=autos.createdOn
                    .str.extract(r'\d\d:\d\d ([A-Z]{3}?)')
                    .replace('EDT', 'EST5EDT'),
             str_date=(autos.createdOn.str.slice(4,19) + ' ' +
                       autos.createdOn.str.slice(-4)),
             # Callable so it sees the str_date/offset columns added above.
             createdOn=lambda df_: to_tz(df_, 'str_date',
                       'offset', 'America/New_York'),
             ffs=autos.eng_dscr.str.contains('FFS')
            )
     # Side effect: binds the intermediate frame to the global `new_cols`.
     .pipe(get_var, 'new_cols')
     .astype({'highway08': 'int8', 'city08': 'int16',
              'comb08': 'int16', 'fuelCost08': 'int16',
              'range': 'int16',  'year': 'int16',
              'make': 'category'})
     .drop(columns=['trany', 'eng_dscr'])
    )

In [37]:
# Running the chain also binds the global `new_cols` through get_var.
res = tweak_autos(autos)

In [38]:
# Intermediate frame captured after .assign (trany and eng_dscr still present).
new_cols

Unnamed: 0,city08,comb08,highway08,cylinders,displ,drive,eng_dscr,fuelCost08,make,model,trany,range,createdOn,year,automatic,speeds,offset,str_date,ffs
0,19,21,25,4,2.000000,Rear-Wheel Drive,(FFS),2000,Alfa Romeo,Spider Veloce 2000,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,True
1,9,11,14,12,4.898438,Rear-Wheel Drive,(GUZZLER),3850,Ferrari,Testarossa,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,False
2,23,27,33,4,2.199219,Front-Wheel Drive,(FFS),1550,Dodge,Charger,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,True
3,10,11,12,8,5.199219,Rear-Wheel Drive,,3850,Dodge,B150/B250 Wagon 2WD,Automatic 3-spd,0,2013-01-01 00:00:00-05:00,1985,True,3,EST,Jan 01 00:00:00 2013,
4,17,19,23,4,2.199219,4-Wheel or All-Wheel Drive,"(FFS,TRBO)",2700,Subaru,Legacy AWD Turbo,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41139,19,22,26,4,2.199219,Front-Wheel Drive,(FFS),1900,Subaru,Legacy,Automatic 4-spd,0,2013-01-01 00:00:00-05:00,1993,True,4,EST,Jan 01 00:00:00 2013,True
41140,20,23,28,4,2.199219,Front-Wheel Drive,(FFS),1850,Subaru,Legacy,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True
41141,18,21,24,4,2.199219,4-Wheel or All-Wheel Drive,(FFS),2000,Subaru,Legacy AWD,Automatic 4-spd,0,2013-01-01 00:00:00-05:00,1993,True,4,EST,Jan 01 00:00:00 2013,True
41142,18,21,24,4,2.199219,4-Wheel or All-Wheel Drive,(FFS),2000,Subaru,Legacy AWD,Manual 5-spd,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True


#### Debugging Chains Part IV

In [39]:
# %% Debug Procedure
def err(*args):
    """Always raise ZeroDivisionError, whatever is passed in.

    Dropped into a chain with ``.pipe(err)`` to stop execution at that
    step so the failure can be examined with a post-mortem debugger.
    """
    return 1 / 0

In [40]:
def tweak_autos(autos):
    """Cleaning chain with a deliberate failure point after ``.assign``.

    Same chain as the full cleaning function, with ``.pipe(err)`` inserted
    after ``.assign`` so that whatever ``err`` does (raise, or open a
    debugger) happens with the intermediate frame in hand.

    Parameters
    ----------
    autos : pandas.DataFrame
        Raw vehicles frame; must contain every column listed in ``cols``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, if ``err`` returns the frame it was given.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders',
            'displ', 'drive', 'eng_dscr', 'fuelCost08',
            'make', 'model', 'trany', 'range', 'createdOn',
            'year']
    return (autos
     [cols]
     .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
             displ=autos.displ.fillna(0).astype('float16'),
             drive=autos.drive.fillna('Other').astype('category'),
             automatic=autos.trany.str.contains('Auto'),
             # (\d+) captures the whole run of digits. The earlier (\d)+
             # kept only the last digit of the run, so a multi-digit
             # speed count such as '10-spd' was stored as 0.
             speeds=autos.trany.str.extract(r'(\d+)').fillna('20')
                    .astype('int8'),
             offset=autos.createdOn
                    .str.extract(r'\d\d:\d\d ([A-Z]{3}?)')
                    .replace('EDT', 'EST5EDT'),
             str_date=(autos.createdOn.str.slice(4,19) + ' ' +
                       autos.createdOn.str.slice(-4)),
             # Callable so it sees the str_date/offset columns added above.
             createdOn=lambda df_: to_tz(df_, 'str_date',
                       'offset', 'America/New_York'),
             ffs=autos.eng_dscr.str.contains('FFS')
            )
     # Debug hook: receives the intermediate frame as its first argument.
     .pipe(err)
     .astype({'highway08': 'int8', 'city08': 'int16',
              'comb08': 'int16', 'fuelCost08': 'int16',
              'range': 'int16',  'year': 'int16',
              'make': 'category'})
     .drop(columns=['trany', 'eng_dscr'])
    )

In [41]:
# Expected to fail: the .pipe(err) step raises ZeroDivisionError mid-chain.
res = tweak_autos(autos)

ZeroDivisionError: division by zero

In [42]:
from IPython.core.debugger import set_trace
def err(*args):
    """Open the IPython debugger, then hand back the piped object.

    Meant for ``.pipe(err)``: ``pipe`` passes the frame as the first
    positional argument. Returning it lets the rest of the chain run once
    the debugger session is continued; returning None (the previous
    behavior) made the next chained method fail on None.

    Returns
    -------
    The first positional argument, or None when called without arguments.
    """
    set_trace()
    return args[0] if args else None

#### Debugging Apply (and Friends)

In [44]:
class DebugException(Exception):
    """Raised by debug_var to stop execution once a value has been captured."""

In [45]:
def debug_var(thing, *, name='debug_item', raise_ex=True):
    """Capture ``thing`` in a global variable for later inspection.

    Parameters
    ----------
    thing : object
        The value to capture (whatever pandas passes to the callback).
    name : str, default 'debug_item'
        Name of the global the value is stored under.
    raise_ex : bool, default True
        When true, raise ``DebugException`` right after storing the value
        so the surrounding pandas operation stops at the first call.

    Returns
    -------
    ``thing`` unchanged, when ``raise_ex`` is false.

    Raises
    ------
    DebugException
        When ``raise_ex`` is true.
    """
    globals().update({name: thing})
    if not raise_ex:
        return thing
    raise DebugException

In [46]:
def tweak_autos(autos):
    """Return a cleaned, downcast copy of the vehicles data.

    Keeps the columns named in ``cols``, derives ``automatic``, ``speeds``,
    ``offset``, ``str_date`` and ``ffs``, converts ``createdOn`` to
    America/New_York timestamps via ``to_tz``, casts the numeric columns to
    smaller integer types, and drops the raw ``trany`` and ``eng_dscr``
    text columns.

    Parameters
    ----------
    autos : pandas.DataFrame
        Raw vehicles frame; must contain every column listed in ``cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``autos`` itself is not modified.
    """
    cols = ['city08', 'comb08', 'highway08', 'cylinders',
            'displ', 'drive', 'eng_dscr', 'fuelCost08',
            'make', 'model', 'trany', 'range', 'createdOn',
            'year']
    return (autos
     [cols]
     .assign(cylinders=autos.cylinders.fillna(0).astype('int8'),
             displ=autos.displ.fillna(0).astype('float16'),
             drive=autos.drive.fillna('Other').astype('category'),
             automatic=autos.trany.str.contains('Auto'),
             # (\d+) captures the whole run of digits. The earlier (\d)+
             # kept only the last digit of the run, so a multi-digit
             # speed count such as '10-spd' was stored as 0.
             # Rows with no digits fall back to 20.
             speeds=autos.trany.str.extract(r'(\d+)').fillna('20')
                    .astype('int8'),
             # Three-letter zone code after the HH:MM part; 'EDT' is
             # rewritten to 'EST5EDT' before it is used for localizing.
             offset=autos.createdOn
                    .str.extract(r'\d\d:\d\d ([A-Z]{3}?)')
                    .replace('EDT', 'EST5EDT'),
             # Characters 4-18 plus the last four characters of createdOn.
             str_date=(autos.createdOn.str.slice(4,19) + ' ' +
                       autos.createdOn.str.slice(-4)),
             # Callable so it sees the str_date/offset columns added above.
             createdOn=lambda df_: to_tz(df_, 'str_date',
                       'offset', 'America/New_York'),
             ffs=autos.eng_dscr.str.contains('FFS')
            )
     .astype({'highway08': 'int8', 'city08': 'int16',
              'comb08': 'int16', 'fuelCost08': 'int16',
              'range': 'int16',  'year': 'int16',
              'make': 'category'})
     .drop(columns=['trany', 'eng_dscr'])
    )

In [47]:
# Column-wise apply: debug_var stores the first value it is called with in
# the global `this`, then raises DebugException to stop the apply.
autos2 = tweak_autos(autos)
autos2.apply(debug_var, name='this')

DebugException: 

In [48]:
# What apply passed to the callback by default: a whole column as a Series.
this

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int16

In [49]:
# Row-wise apply: the captured value goes to the default global `debug_item`.
autos2.apply(debug_var, axis=1)

DebugException: 

In [50]:
# With axis=1 the callback received a single row as a Series.
debug_item

city08                               19
comb08                               21
highway08                            25
cylinders                             4
displ                               2.0
drive                  Rear-Wheel Drive
fuelCost08                         2000
make                         Alfa Romeo
model                Spider Veloce 2000
range                                 0
createdOn     2013-01-01 00:00:00-05:00
year                               1985
automatic                         False
speeds                                5
offset                              EST
str_date           Jan 01 00:00:00 2013
ffs                                True
Name: 0, dtype: object

In [51]:
# Passing a callable to .assign: debug_var captures whatever .assign hands
# to it, then raises DebugException.
(autos2
 .assign(new_col=debug_var)
)

DebugException: 

In [52]:
# A callable given to .assign received the whole DataFrame.
debug_item

Unnamed: 0,city08,comb08,highway08,cylinders,displ,drive,fuelCost08,make,model,range,createdOn,year,automatic,speeds,offset,str_date,ffs
0,19,21,25,4,2.000000,Rear-Wheel Drive,2000,Alfa Romeo,Spider Veloce 2000,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,True
1,9,11,14,12,4.898438,Rear-Wheel Drive,3850,Ferrari,Testarossa,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,False
2,23,27,33,4,2.199219,Front-Wheel Drive,1550,Dodge,Charger,0,2013-01-01 00:00:00-05:00,1985,False,5,EST,Jan 01 00:00:00 2013,True
3,10,11,12,8,5.199219,Rear-Wheel Drive,3850,Dodge,B150/B250 Wagon 2WD,0,2013-01-01 00:00:00-05:00,1985,True,3,EST,Jan 01 00:00:00 2013,
4,17,19,23,4,2.199219,4-Wheel or All-Wheel Drive,2700,Subaru,Legacy AWD Turbo,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41139,19,22,26,4,2.199219,Front-Wheel Drive,1900,Subaru,Legacy,0,2013-01-01 00:00:00-05:00,1993,True,4,EST,Jan 01 00:00:00 2013,True
41140,20,23,28,4,2.199219,Front-Wheel Drive,1850,Subaru,Legacy,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True
41141,18,21,24,4,2.199219,4-Wheel or All-Wheel Drive,2000,Subaru,Legacy AWD,0,2013-01-01 00:00:00-05:00,1993,True,4,EST,Jan 01 00:00:00 2013,True
41142,18,21,24,4,2.199219,4-Wheel or All-Wheel Drive,2000,Subaru,Legacy AWD,0,2013-01-01 00:00:00-05:00,1993,False,5,EST,Jan 01 00:00:00 2013,True


In [53]:
# Groupby aggregation: debug_var captures what .agg passes for city08,
# then raises DebugException.
(autos2.groupby('make').agg({'city08': debug_var}))

  (autos2.groupby('make').agg({'city08': debug_var}))


DebugException: 

In [54]:
# The aggregation callback received one group's city08 values as a Series.
debug_item

358      16
369      13
19314    13
19316    18
20288    13
20289    18
Name: city08, dtype: int16

#### Memory Usage

In [55]:
# memory_usage='deep' asks pandas to measure the actual size of object
# columns rather than estimating it.
dd.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539305 entries, 0 to 539304
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype                         
---  ------           --------------   -----                         
 0   agency_cd        539305 non-null  object                        
 1   site_no          539305 non-null  int64                         
 2   datetime         539305 non-null  datetime64[ns, America/Denver]
 3   tz_cd            539305 non-null  object                        
 4   cfs              493124 non-null  float64                       
 5   144166_00060_cd  493124 non-null  object                        
 6   gage_height      433377 non-null  float64                       
 7   144167_00065_cd  433377 non-null  object                        
dtypes: datetime64[ns, America/Denver](1), float64(2), int64(1), object(4)
memory usage: 135.1 MB


In [57]:
# Load the memory_profiler extension; the %%memit magic used below needs it.
%load_ext memory_profiler

In [58]:
%%memit     
# Report peak memory while rebuilding the cleaned river frame.
dd = tweak_river(df)     

peak memory: 643.69 MiB, increment: 237.03 MiB


#### Timing Information

In [59]:
%%time     
# Time a single run of the river cleaning function.
dd = tweak_river(df)    

CPU times: user 181 ms, sys: 17.4 ms, total: 199 ms
Wall time: 199 ms


In [60]:
%%timeit     
# Time repeated runs of the river cleaning function (mean and std. dev.).
dd = tweak_river(df)     

157 ms ± 1.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
