# Series Deep Dive

### Loading Libraries

In [1]:
# Numerical Computing
import math
import numpy as np

# Data Manipulation
import pandas as pd

# PyArrow
import pyarrow as pa

### Loading Data

In [2]:
# Remote location of the zipped vehicles CSV.
url = (
    'https://github.com/mattharrison/datasets/raw/master/data/'
    'vehicles.csv.zip'
)

In [3]:
# Read the CSV with the PyArrow parser and keep the columns in
# Arrow-backed dtypes.
df = pd.read_csv(
    url,
    engine='pyarrow',
    dtype_backend='pyarrow',
)

In [4]:
city_mpg = df.city08

In [5]:
highway_mpg = df.highway08

In [6]:
city_mpg

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64[pyarrow]

In [7]:
highway_mpg

0        25
1        14
2        33
3        12
4        23
         ..
41139    26
41140    28
41141    24
41142    24
41143    21
Name: highway08, Length: 41144, dtype: int64[pyarrow]

### Series Attributes

In [8]:
# Count how many names (attributes and methods) `dir` lists for the Series
len(dir(city_mpg))

391

# Operators & Dunder Methods

### Dunder Methods

In [9]:
2 + 4

6

In [10]:
(city_mpg + highway_mpg) / 2

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: double[pyarrow]

### Index Alignment

In [11]:
# Two small Series whose indexes share only the (duplicated) label 2.
s1 = pd.Series(data=[10, 20, 30], index=[1, 2, 2])
s2 = pd.Series(data=[35, 44, 53], index=[2, 2, 4], name='s2')

In [12]:
s1

1    10
2    20
2    30
dtype: int64

In [13]:
s2

2    35
2    44
4    53
Name: s2, dtype: int64

In [14]:
s1 + s2

1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64

### Broadcasting

In [15]:
s2 + 5

2    40
2    49
4    58
Name: s2, dtype: int64

### Operator Methods

In [16]:
s1 + s2

1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64

In [17]:
s1.add(s2)

1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64

In [18]:
s1.add(s2, fill_value=0)

1    10.0
2    55.0
2    64.0
2    65.0
2    74.0
4    53.0
dtype: float64

### Chaining

In [19]:
((city_mpg +
  highway_mpg)
 / 2
)

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: double[pyarrow]

In [20]:
(city_mpg
 .add(highway_mpg)
 .div(2)
)

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: double[pyarrow]

# Aggregate Methods

### Aggregations

In [21]:
city_mpg.mean()

18.369045304297103

In [22]:
city_mpg.is_unique

False

In [23]:
city_mpg.is_monotonic_increasing

False

In [24]:
city_mpg.quantile()

17.0

In [25]:
city_mpg.quantile(.9)

24.0

In [26]:
city_mpg.quantile([.1, .5, .9])

0.1    13.0
0.5    17.0
0.9    24.0
Name: city08, dtype: double[pyarrow]

### Count & Mean of Attribute

In [27]:
(city_mpg
 .gt(20)
 .sum()
)

10272

In [28]:
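# Kept for reference: the same chain without an integer cast.
# NOTE(review): presumably disabled because the boolean result of `.gt`
# cannot be multiplied directly with the pyarrow backend - confirm.
# (city_mpg
#  .gt(20)
#  .mul(100)
#  .mean()
# )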

In [29]:
# Percentage of vehicles whose city MPG exceeds 20: cast the boolean mask
# to Arrow integers (True/False -> 1/0), scale to 0/100, then average.
(city_mpg
 .gt(20)
 .astype('int64[pyarrow]')
 .mul(100)
 .mean()
)

24.965973167412017

### `.agg` & Aggregation Strings

In [30]:
city_mpg.agg('mean')

18.369045304297103

In [31]:
def second_to_last(s):
    """Return the element one position before the end of ``s``.

    ``s`` is indexed positionally through ``.iloc``, so its index labels
    play no role in the lookup.
    """
    penultimate = -2
    return s.iloc[penultimate]

In [32]:
# Use string aliases for the built-in reductions. Passing `np.var` or the
# builtin `max` to `.agg` is deprecated in pandas (2.1+), and `np.var`
# defaults to ddof=0 while `Series.var` uses ddof=1; the aliases pin the
# Series methods. Custom callables such as `second_to_last` are unaffected.
city_mpg.agg(['mean', 'var', 'max', second_to_last])

mean               18.369045
var                62.501517
max               150.000000
second_to_last     18.000000
Name: city08, dtype: float64

# Conversion Methods

### Type Conversion

In [33]:
city_mpg.astype('int16[pyarrow]')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int16[pyarrow]

In [34]:
city_mpg.astype('int8[pyarrow]')

ArrowInvalid: Integer value 132 not in range: -128 to 127

In [None]:
np.iinfo('int64')

In [35]:
np.iinfo('int8')

iinfo(min=-128, max=127, dtype=int8)

In [36]:
# np.finfo('int32')

### Memory Usage

In [37]:
city_mpg.nbytes

329152

In [38]:
city_mpg.astype('Int16').nbytes

123432

In [39]:
make = df.make

In [40]:
make.nbytes

425635

In [41]:
make.memory_usage()

425767

In [42]:
make.memory_usage(deep=True)

425767

In [43]:
make.astype(str).memory_usage()

590343

In [44]:
make.astype(str).memory_usage(deep=True)

590343

### String & Category Types

In [45]:
(make
 .astype('category')
 .memory_usage(deep=True)
)

84533

In [46]:
(city_mpg
 .astype('category')
 .cat.as_ordered()
)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64[pyarrow]): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]

### Ordered Categories

In [47]:
values = pd.Series(sorted(set(city_mpg)))

In [48]:
city_type = pd.CategoricalDtype(
    categories=values,
    ordered=True)

In [49]:
city_mpg.astype(city_type)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]

In [50]:
city_mpg.astype('category').cat.as_ordered()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64[pyarrow]): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]

### Converting to Other Types

In [51]:
city_mpg.to_frame()

Unnamed: 0,city08
0,19
1,9
2,23
3,10
4,17
...,...
41139,19
41140,20
41141,18
41142,18


# Manipulation Methods

### `.apply` & `.where`

In [52]:
def gt20(val):
    """Report whether ``val`` is strictly greater than 20."""
    threshold = 20
    return val > threshold

In [53]:
%%timeit
city_mpg.apply(gt20)

2.13 ms ± 19.1 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [54]:
%%timeit
city_mpg.gt(20)

22.1 μs ± 99.5 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [55]:
make = df.make
make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: string[pyarrow]

In [56]:
make.value_counts()

make
Chevrolet                           4003
Ford                                3371
Dodge                               2583
GMC                                 2494
Toyota                              2071
                                    ... 
Grumman Allied Industries              1
Environmental Rsch and Devp Corp       1
General Motors                         1
Goldacre                               1
Isis Imports Ltd                       1
Name: count, Length: 136, dtype: int64[pyarrow]

In [57]:
top5 = make.value_counts().index[:5]

top5

Index(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'], dtype='string[pyarrow]', name='make')

In [58]:
def generalize_top5(val):
    """Return ``val`` if it is in ``top5``, otherwise the label 'Other'.

    Membership is tested against the module-level ``top5`` collection.
    """
    return val if val in top5 else 'Other'

In [59]:
make.apply(generalize_top5)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: str

In [60]:
make.where(make.isin(top5), other='Other')

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: string[pyarrow]

In [61]:
%%timeit
make.apply(generalize_top5)

12.5 ms ± 70.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [62]:
%%timeit
make.where(make.isin(top5), 'Other')

637 μs ± 3.93 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [63]:
make.mask(~make.isin(top5), other='Other')

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: string[pyarrow]

### Apply with NumPy Functions

In [64]:
%%timeit
city_mpg.apply(np.log)

163 μs ± 1.26 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [65]:
%%timeit
city_mpg.apply(math.log)

2.42 ms ± 9.42 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### `if`/`else` with Pandas

In [66]:
# Count the makes once, then slice the first 5 and first 10 index labels.
vc = make.value_counts()
top5, top10 = vc.index[:5], vc.index[:10]

In [67]:
def generalize(val):
    """Bucket ``val`` into itself, 'Top10', or 'Other'.

    A value found in the module-level ``top5`` is returned unchanged; a
    value found in ``top10`` (but not ``top5``) becomes 'Top10'; anything
    else becomes 'Other'.
    """
    if val in top5:
        return val
    if val in top10:
        return 'Top10'
    return 'Other'

In [68]:
make.apply(generalize)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: str

In [69]:
# Vectorised counterpart of `generalize`: each (condition, replacement)
# pair is one branch. The last pair uses an all-True mask so that it acts
# as the default for rows no earlier condition matched
# (NOTE(review): relies on the first matching condition winning - see the
# `Series.case_when` docs).
(make
 .case_when(caselist=[(make.isin(top5), make),
                     (make.isin(top10), 'Top10'),
                     (pd.Series(True, index=make.index), 'Other')])
)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [70]:
# Same three-way bucketing with two chained `.where` calls. Both masks are
# computed from the original `make`, so the second call still sees the
# real make names even though the first has already relabelled some rows.
(make
 .where(make.isin(top5), 'Top10')   # not in top5 -> provisionally 'Top10'
 .where(make.isin(top10), 'Other')  # original make not in top10 -> 'Other'
)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: string[pyarrow]

### Missing Data

In [71]:
cyl = df.cylinders

In [72]:
(cyl
 .isna()
 .sum()
)

np.int64(206)

In [73]:
missing = cyl.isna()

In [74]:
make.loc[missing]

7138     Nissan
7139     Toyota
8143     Toyota
8144       Ford
8146       Ford
          ...  
34563     Tesla
34564     Tesla
34565     Tesla
34566     Tesla
34567     Tesla
Name: make, Length: 206, dtype: string[pyarrow]

### Filling In Missing Data

In [75]:
cyl[cyl.isna()]

7138     <NA>
7139     <NA>
8143     <NA>
8144     <NA>
8146     <NA>
         ... 
34563    <NA>
34564    <NA>
34565    <NA>
34566    <NA>
34567    <NA>
Name: cylinders, Length: 206, dtype: int64[pyarrow]

In [76]:
cyl.fillna(0).loc[7136:7141]

7136    6
7137    6
7138    0
7139    0
7140    6
7141    6
Name: cylinders, dtype: int64[pyarrow]

### Interpolating Data

In [77]:
temp = pd.Series([32, 40, None, 42, 39, 32],
                dtype='float[pyarrow]')

temp

0    32.0
1    40.0
2    <NA>
3    42.0
4    39.0
5    32.0
dtype: float[pyarrow]

In [78]:
temp.interpolate()

0    32.0
1    40.0
2    41.0
3    42.0
4    39.0
5    32.0
dtype: float[pyarrow]

### Clipping Data

In [79]:
city_mpg.loc[:446]

0      19
1       9
2      23
3      10
4      17
       ..
442    15
443    15
444    15
445    15
446    31
Name: city08, Length: 447, dtype: int64[pyarrow]

In [80]:
(city_mpg
 .loc[:446]
 .clip(lower=city_mpg.quantile(.05),
       upper=city_mpg.quantile(.95))
)

0      19
1      11
2      23
3      11
4      17
       ..
442    15
443    15
444    15
445    15
446    27
Name: city08, Length: 447, dtype: int64[pyarrow]

### Sorting Values

In [81]:
city_mpg.sort_values()

7901       6
21060      6
34557      6
35887      6
37161      6
        ... 
34563    138
34564    140
31256    150
32599    150
33423    150
Name: city08, Length: 41144, dtype: int64[pyarrow]

In [82]:
(city_mpg.sort_values() + highway_mpg) / 2

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: double[pyarrow]

### Sorting The Index

In [83]:
city_mpg.sort_values().sort_index()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64[pyarrow]

### Dropping Duplicates

In [84]:
city_mpg.drop_duplicates()

0         19
1          9
2         23
3         10
4         17
        ... 
34364    127
34409    114
34564    140
34565    115
34566    104
Name: city08, Length: 105, dtype: int64[pyarrow]

### Ranking Data

In [85]:
city_mpg.rank()

0        27060.5
1          235.5
2        35830.0
3          607.5
4        19484.0
          ...   
41139    27060.5
41140    29719.5
41141    23528.0
41142    23528.0
41143    15479.0
Name: city08, Length: 41144, dtype: double[pyarrow]

In [86]:
city_mpg.rank(method='min')

0        25555
1          136
2        35119
3          336
4        17467
         ...  
41139    25555
41140    28567
41141    21502
41142    21502
41143    13492
Name: city08, Length: 41144, dtype: uint64[pyarrow]

In [87]:
city_mpg.rank(method='dense')

0        14
1         4
2        18
3         5
4        12
         ..
41139    14
41140    15
41141    13
41142    13
41143    11
Name: city08, Length: 41144, dtype: uint64[pyarrow]

### Replacing Data

In [88]:
make.replace('Subaru', 'スバル')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4               スバル
            ...    
41139           スバル
41140           スバル
41141           スバル
41142           スバル
41143           スバル
Name: make, Length: 41144, dtype: string[pyarrow]

In [89]:
make.replace(r'(Fer)ra(r.*)',
   value=r'\2-other-\1', regex=True)

0          Alfa Romeo
1        ri-other-Fer
2               Dodge
3               Dodge
4              Subaru
             ...     
41139          Subaru
41140          Subaru
41141          Subaru
41142          Subaru
41143          Subaru
Name: make, Length: 41144, dtype: string[pyarrow]

In [None]:
### Binning Data