# Series Deep Dive

### Import Libraries

In [69]:
# Filesystem Paths
from pathlib import Path

# Numerical Computing
import numpy as np
# Data Manipulation
import pandas as pd
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#### Loading Data

In [70]:
# Resolve the dataset relative to the current user's home directory instead of
# a hardcoded, account-specific absolute path, so the notebook is not tied to
# one machine. pd.read_csv accepts a pathlib.Path directly.
file_path = Path.home() / 'desktop' / 'effective_pandas' / 'data' / 'vehicles.csv'

In [71]:
df = pd.read_csv(file_path, low_memory=False)

In [72]:
city_mpg = df.city08

In [73]:
highway_mpg = df.highway08

In [74]:
city_mpg

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [75]:
highway_mpg

0        25
1        14
2        33
3        12
4        23
         ..
41139    26
41140    28
41141    24
41142    24
41143    21
Name: highway08, Length: 41144, dtype: int64

#### Series Attributes

In [76]:
len(dir(city_mpg))

418

## Operators (& Dunder Methods)

#### Dunder Methods

In [77]:
2 + 4

6

In [78]:
(city_mpg + highway_mpg) / 2

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64

#### Index Alignment

In [79]:
s1 = pd.Series([10, 20, 30], index=[1, 2, 2])

In [80]:
s2 = pd.Series([35, 44, 53], index=[2, 2, 4], name='s2')

In [81]:
s1

1    10
2    20
2    30
dtype: int64

In [82]:
s2

2    35
2    44
4    53
Name: s2, dtype: int64

In [83]:
s1 + s2

1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64

#### Iterations

In [84]:
s1 + s2

1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64

In [85]:
s1.add(s2)

1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64

In [86]:
# Using fill_value
s1.add(s2, fill_value=0)

1    10.0
2    55.0
2    64.0
2    65.0
2    74.0
4    53.0
dtype: float64

#### Chaining

In [87]:
((city_mpg +
 highway_mpg)
 / 2
)

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64

In [88]:
# Chaining Example
(city_mpg
    .add(highway_mpg)
 .div(2)
)

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64

## Aggregate Methods

#### Aggregation

In [89]:
# Mean
city_mpg.mean()

18.369045304297103

In [90]:
city_mpg.is_unique

False

In [91]:
city_mpg.is_monotonic_increasing

False

In [92]:
city_mpg.quantile()

17.0

In [93]:
city_mpg.quantile(.9)

24.0

In [94]:
city_mpg.quantile([.1, .5, .9])

0.1    13.0
0.5    17.0
0.9    24.0
Name: city08, dtype: float64

#### Count & Mean of an Attribute

In [95]:
(city_mpg
 .gt(20)
 .sum()    
)

10272

In [96]:
(city_mpg
 .gt(20)
 .mul(100)
 .mean()
)

24.965973167412017

In [97]:
city_mpg.agg('mean')

18.369045304297103

In [98]:
def second_to_last(s):
    """Return the element one position before the end of ``s``.

    The lookup is positional (``.iloc``), so the index labels of ``s`` do not
    matter. Used below as a custom reducer handed to ``Series.agg``.
    """
    penultimate = s.iloc[-2]
    return penultimate

In [99]:
# Mix built-in reducers (named by string) with a custom function.
# 'var' is passed by name rather than as np.var: handing NumPy callables to
# .agg is deprecated in recent pandas and emits a FutureWarning, while the
# string keeps the current Series.var behaviour and the same 'var' label.
city_mpg.agg(['mean', 'var', 'max', second_to_last])

  city_mpg.agg(['mean', np.var, "max", second_to_last])


mean               18.369045
var                62.503036
max               150.000000
second_to_last     18.000000
Name: city08, dtype: float64

## Conversion Methods

#### Automatic Conversion

In [100]:
city_mpg.convert_dtypes()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: Int64

In [101]:
city_mpg.astype('Int16')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: Int16

In [102]:
# Intentionally left disabled: city08 peaks at 150 (see the 'max' aggregate
# above), which lies outside the int8 range of -128..127 reported by
# np.iinfo('int8') below, so an 8-bit cast cannot represent every value.
# city_mpg.astype('Int8')

In [103]:
np.iinfo('int64')

iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)

In [104]:
np.iinfo('int8')

iinfo(min=-128, max=127, dtype=int8)

In [105]:
np.finfo('float16')

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

In [106]:
np.finfo('float64')

finfo(resolution=1e-15, min=-1.7976931348623157e+308, max=1.7976931348623157e+308, dtype=float64)

#### Memory Usage

In [107]:
city_mpg.nbytes

329152

In [108]:
city_mpg.astype('Int16').nbytes

123432

In [109]:
make = df.make

In [110]:
make.nbytes

329152

In [111]:
make.memory_usage()

329280

In [112]:
make.memory_usage(deep=True)

2606395

In [113]:
(make
 .astype('category')
 .memory_usage(deep=True)
)

95888

#### Strings & Category Types

In [114]:
city_mpg.astype(str)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: object

In [115]:
city_mpg.astype('category')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6, 7, 8, 9, ..., 137, 138, 140, 150]

#### Ordered Categories

In [116]:
values = pd.Series(sorted(set(city_mpg)))

In [117]:
city_type = pd.CategoricalDtype(categories=values,
                                ordered=True)

In [118]:
city_mpg.astype(city_type)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]

#### Converting to Other Types

In [119]:
city_mpg.to_frame()

Unnamed: 0,city08
0,19
1,9
2,23
3,10
4,17
...,...
41139,19
41140,20
41141,18
41142,18


## Manipulation Methods

#### .apply & .where

In [120]:
def gt20(val):
    """Report whether ``val`` is strictly greater than 20.

    Written as a scalar predicate so it can be passed to ``Series.apply`` and
    timed against the vectorised ``Series.gt(20)``.
    """
    exceeds_threshold = val > 20
    return exceeds_threshold

In [121]:
%%timeit
city_mpg.apply(gt20)

5.52 ms ± 373 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [122]:
%%timeit
city_mpg.gt(20)

78.9 µs ± 1.11 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [123]:
make = df.make

In [124]:
make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [125]:
make.value_counts()

make
Chevrolet                           4003
Ford                                3371
Dodge                               2583
GMC                                 2494
Toyota                              2071
                                    ... 
E. P. Dutton, Inc.                     1
Fisker                                 1
Panoz Auto-Development                 1
Environmental Rsch and Devp Corp       1
Grumman Allied Industries              1
Name: count, Length: 136, dtype: int64

In [126]:
top5 = make.value_counts().index[:5]

In [127]:
def generalize_top5(val):
    """Keep ``val`` when it is one of the ``top5`` makes, else return 'Other'.

    ``top5`` is read from the notebook namespace (the five most frequent
    makes computed in the preceding cell).
    """
    return val if val in top5 else 'Other'

In [128]:
make.apply(generalize_top5)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [129]:
# More Idiomatic approach
make.where(make.isin(top5), other='Other')

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [130]:
%%timeit
make.apply(generalize_top5)

39.6 ms ± 1.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [131]:
%%timeit
make.where(make.isin(top5), 'Other')

2.06 ms ± 19.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [132]:
# Mask complement version
make.mask(~make.isin(top5), other='Other')

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

#### If Else with Pandas

In [133]:
vc = make.value_counts()

In [134]:
top5 = vc.index[:5]
top10 = vc.index[:10]

In [135]:
def generalize(val):
    """Bucket a make into itself, 'Top10', or 'Other'.

    ``top5`` and ``top10`` are read from the notebook namespace. The checks
    run in order, so a make in both collections is returned unchanged.
    """
    # Most frequent makes keep their own name.
    if val in top5:
        return val
    # Remaining makes from the top ten share one label.
    if val in top10:
        return 'Top10'
    # Everything else is lumped together.
    return 'Other'

In [136]:
make.apply(generalize)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [137]:
(make
 .where(make.isin(top5), 'Top10')
 .where(make.isin(top10), 'Other')
)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [138]:
# Numpy Approach
np.select([make.isin(top5), make.isin(top10)],
          [make, 'Top10'], 'Other')

array(['Other', 'Other', 'Dodge', ..., 'Other', 'Other', 'Other'],
      dtype=object)

In [139]:
pd.Series(np.select([make.isin(top5), make.isin(top10)],
                   [make, 'Top10'], 'Other'), index=make.index)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Length: 41144, dtype: object

#### Missing Data

In [140]:
cyl = df.cylinders

In [141]:
(cyl
 .isna()
 .sum()
)

206

In [142]:
missing = cyl.isna()

In [143]:
make.loc[missing]

7138     Nissan
7139     Toyota
8143     Toyota
8144       Ford
8146       Ford
          ...  
34563     Tesla
34564     Tesla
34565     Tesla
34566     Tesla
34567     Tesla
Name: make, Length: 206, dtype: object

#### Filling in Missing Data

In [144]:
cyl[cyl.isna()]

7138    NaN
7139    NaN
8143    NaN
8144    NaN
8146    NaN
         ..
34563   NaN
34564   NaN
34565   NaN
34566   NaN
34567   NaN
Name: cylinders, Length: 206, dtype: float64

In [145]:
cyl.fillna(0).loc[7136:7141]

7136    6.0
7137    6.0
7138    0.0
7139    0.0
7140    6.0
7141    6.0
Name: cylinders, dtype: float64

#### Interpolating Data

In [146]:
temp = pd.Series([32, 40, None, 42, 39, 32])
temp

0    32.0
1    40.0
2     NaN
3    42.0
4    39.0
5    32.0
dtype: float64

In [147]:
temp.interpolate()

0    32.0
1    40.0
2    41.0
3    42.0
4    39.0
5    32.0
dtype: float64

#### Clipping Data

In [148]:
city_mpg.loc[:446]

0      19
1       9
2      23
3      10
4      17
       ..
442    15
443    15
444    15
445    15
446    31
Name: city08, Length: 447, dtype: int64

In [149]:
(city_mpg
 .loc[:446]
 .clip(lower=city_mpg.quantile(.05),
      upper=city_mpg.quantile(.95))
)

0      19
1      11
2      23
3      11
4      17
       ..
442    15
443    15
444    15
445    15
446    27
Name: city08, Length: 447, dtype: int64

In [150]:
# Reference only (not executed): this appears to paraphrase the logic behind
# Series.clip — values above `upper` are replaced by `upper`, then values
# below `lower` are replaced by `lower`, each via .where on a boolean subset.
# if upper is not None:
#     subset = self.to_numpy() <= upper
#     result = result.where(subset, upper)
# if lower is not None:
#     subset = self.to_numpy() >= lower
#     result = result.where(subset, lower)

#### Sorting values

In [151]:
city_mpg.sort_values()

7901       6
34557      6
37161      6
35887      6
21060      6
        ... 
25615    138
34564    140
33423    150
31256    150
32599    150
Name: city08, Length: 41144, dtype: int64

In [152]:
(city_mpg.sort_values() + highway_mpg) / 2

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64

#### Sorting the Index

In [153]:
city_mpg.sort_values().sort_index()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

#### Dropping Duplicates

In [154]:
city_mpg.drop_duplicates()

0         19
1          9
2         23
3         10
4         17
        ... 
34364    127
34409    114
34564    140
34565    115
34566    104
Name: city08, Length: 105, dtype: int64

#### Ranking Data

In [155]:
city_mpg.rank()

0        27060.5
1          235.5
2        35830.0
3          607.5
4        19484.0
          ...   
41139    27060.5
41140    29719.5
41141    23528.0
41142    23528.0
41143    15479.0
Name: city08, Length: 41144, dtype: float64

In [156]:
city_mpg.rank(method='min')

0        25555.0
1          136.0
2        35119.0
3          336.0
4        17467.0
          ...   
41139    25555.0
41140    28567.0
41141    21502.0
41142    21502.0
41143    13492.0
Name: city08, Length: 41144, dtype: float64

In [157]:
city_mpg.rank(method='dense')

0        14.0
1         4.0
2        18.0
3         5.0
4        12.0
         ... 
41139    14.0
41140    15.0
41141    13.0
41142    13.0
41143    11.0
Name: city08, Length: 41144, dtype: float64

#### Replacing Data

In [158]:
make.replace('Subaru', 'スバル')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4               スバル
            ...    
41139           スバル
41140           スバル
41141           スバル
41142           スバル
41143           スバル
Name: make, Length: 41144, dtype: object

In [159]:
make.replace(r'(Fer)ra(r.*)',
value=r'\2-other-\1', regex=True)

0          Alfa Romeo
1        ri-other-Fer
2               Dodge
3               Dodge
4              Subaru
             ...     
41139          Subaru
41140          Subaru
41141          Subaru
41142          Subaru
41143          Subaru
Name: make, Length: 41144, dtype: object

#### Binning Data

In [160]:
pd.cut(city_mpg, 10)

0        (5.856, 20.4]
1        (5.856, 20.4]
2         (20.4, 34.8]
3        (5.856, 20.4]
4        (5.856, 20.4]
             ...      
41139    (5.856, 20.4]
41140    (5.856, 20.4]
41141    (5.856, 20.4]
41142    (5.856, 20.4]
41143    (5.856, 20.4]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.856, 20.4] < (20.4, 34.8] < (34.8, 49.2] < (49.2, 63.6] ... (92.4, 106.8] < (106.8, 121.2] < (121.2, 135.6] < (135.6, 150.0]]

In [161]:
pd.cut(city_mpg, [0, 10, 20, 40, 70, 150])

0        (10, 20]
1         (0, 10]
2        (20, 40]
3         (0, 10]
4        (10, 20]
           ...   
41139    (10, 20]
41140    (10, 20]
41141    (10, 20]
41142    (10, 20]
41143    (10, 20]
Name: city08, Length: 41144, dtype: category
Categories (5, interval[int64, right]): [(0, 10] < (10, 20] < (20, 40] < (40, 70] < (70, 150]]

In [162]:
pd.cut(city_mpg, 10)

0        (5.856, 20.4]
1        (5.856, 20.4]
2         (20.4, 34.8]
3        (5.856, 20.4]
4        (5.856, 20.4]
             ...      
41139    (5.856, 20.4]
41140    (5.856, 20.4]
41141    (5.856, 20.4]
41142    (5.856, 20.4]
41143    (5.856, 20.4]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.856, 20.4] < (20.4, 34.8] < (34.8, 49.2] < (49.2, 63.6] ... (92.4, 106.8] < (106.8, 121.2] < (121.2, 135.6] < (135.6, 150.0]]

In [163]:
pd.cut(city_mpg, 10, labels=list(range(1, 11)))

0        1
1        1
2        2
3        1
4        1
        ..
41139    1
41140    1
41141    1
41142    1
41143    1
Name: city08, Length: 41144, dtype: category
Categories (10, int64): [1 < 2 < 3 < 4 ... 7 < 8 < 9 < 10]

## Indexing Operations

#### Prepping The Data & Renaming The index

In [165]:
# Relabel the index of city_mpg with the make of each row: the dict maps each
# original index label to its make, so Series.rename treats it as an index
# mapping rather than a new Series name.
city2 = city_mpg.rename(make.to_dict())
# Display the relabelled Series (the original `city2city2` was a typo that
# raises NameError on a fresh run).
city2

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
              ..
Subaru        19
Subaru        20
Subaru        18
Subaru        18
Subaru        16
Name: city08, Length: 41144, dtype: int64

In [166]:
city2.index

Index(['Alfa Romeo', 'Ferrari', 'Dodge', 'Dodge', 'Subaru', 'Subaru', 'Subaru',
       'Toyota', 'Toyota', 'Toyota',
       ...
       'Saab', 'Saturn', 'Saturn', 'Saturn', 'Saturn', 'Subaru', 'Subaru',
       'Subaru', 'Subaru', 'Subaru'],
      dtype='object', length=41144)

In [168]:
city2 = city_mpg.rename(make)
city2

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
              ..
Subaru        19
Subaru        20
Subaru        18
Subaru        18
Subaru        16
Name: city08, Length: 41144, dtype: int64

In [169]:
city2.rename('citympg')

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
              ..
Subaru        19
Subaru        20
Subaru        18
Subaru        18
Subaru        16
Name: citympg, Length: 41144, dtype: int64

#### Resetting The index

In [170]:
city2.reset_index()

Unnamed: 0,index,city08
0,Alfa Romeo,19
1,Ferrari,9
2,Dodge,23
3,Dodge,10
4,Subaru,17
...,...,...
41139,Subaru,19
41140,Subaru,20
41141,Subaru,18
41142,Subaru,18


In [171]:
city2.reset_index(drop=True)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

#### The .loc Attribute

In [172]:
city2.loc['Subaru']

Subaru    17
Subaru    21
Subaru    22
Subaru    19
Subaru    20
          ..
Subaru    19
Subaru    20
Subaru    18
Subaru    18
Subaru    16
Name: city08, Length: 885, dtype: int64

In [173]:
city2.loc['Fisker']

20

In [174]:
city2.loc[['Fisker']]

Fisker    20
Name: city08, dtype: int64

In [176]:
city2.loc[['Ferrari', 'Lamborghini']]

Ferrari         9
Ferrari        12
Ferrari        11
Ferrari        10
Ferrari        11
               ..
Lamborghini     6
Lamborghini     8
Lamborghini     8
Lamborghini     8
Lamborghini     8
Name: city08, Length: 357, dtype: int64

In [177]:
city2.sort_index().loc['Ferrari': 'Lamborghini']

Ferrari        10
Ferrari        13
Ferrari        13
Ferrari         9
Ferrari        10
               ..
Lamborghini    12
Lamborghini     9
Lamborghini     8
Lamborghini    13
Lamborghini     8
Name: city08, Length: 11210, dtype: int64

In [179]:
city2.sort_index().loc["F":"J"]

Federal Coach    15
Federal Coach    13
Federal Coach    13
Federal Coach    14
Federal Coach    13
                 ..
Isuzu            15
Isuzu            15
Isuzu            15
Isuzu            27
Isuzu            18
Name: city08, Length: 9040, dtype: int64

In [180]:
idx = pd.Index(['Dodge'])

In [181]:
city2.loc[idx]

Dodge    23
Dodge    10
Dodge    12
Dodge    11
Dodge    11
         ..
Dodge    18
Dodge    17
Dodge    14
Dodge    14
Dodge    11
Name: city08, Length: 2583, dtype: int64

In [182]:
idx = pd.Index(['Dodge', 'Dodge'])

In [184]:
city2.loc[idx]

Dodge    23
Dodge    10
Dodge    12
Dodge    11
Dodge    11
         ..
Dodge    18
Dodge    17
Dodge    14
Dodge    14
Dodge    11
Name: city08, Length: 5166, dtype: int64

In [186]:
mask = city2 > 50
mask

Alfa Romeo    False
Ferrari       False
Dodge         False
Dodge         False
Subaru        False
              ...  
Subaru        False
Subaru        False
Subaru        False
Subaru        False
Subaru        False
Name: city08, Length: 41144, dtype: bool

In [187]:
city2.loc[mask]

Nissan     81
Toyota     81
Toyota     81
Ford       74
Nissan     84
         ... 
Tesla     140
Tesla     115
Tesla     104
Tesla      98
Toyota     55
Name: city08, Length: 236, dtype: int64

In [188]:
cost = pd.Series([1.00, 2.25, 3.99, .99, 2.79],
                 index = ['Gum', 'Cookie', 'Melon', 'Roll', 'Carrots'])

In [189]:
inflation = 1.10

In [190]:
(cost
 .mul(inflation)
 .loc[lambda s_: s_ >3]
)

Melon      4.389
Carrots    3.069
dtype: float64

In [191]:
cost = pd.Series([1.00, 2.25, 3.99, .99, 2.79],
                 index=['Gum', 'Cookie', 'Melon', 'Roll', 'Carrots'])

In [192]:
inflation = 1.10

In [193]:
mask = cost > 3

In [194]:
(cost
 .mul(inflation)
 .loc[mask]
)

Melon    4.389
dtype: float64

#### The .iloc Attribute

In [195]:
city2.iloc[0]

19

In [196]:
city2.iloc[-1]

16

In [197]:
city2.iloc[[0, 1, -1]]

Alfa Romeo    19
Ferrari        9
Subaru        16
Name: city08, dtype: int64

In [198]:
city2.iloc[0:5]

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
Name: city08, dtype: int64

In [199]:
city2.iloc[-8:]

Saturn    21
Saturn    24
Saturn    21
Subaru    19
Subaru    20
Subaru    18
Subaru    18
Subaru    16
Name: city08, dtype: int64

In [201]:
mask = city2 > 50

In [204]:
city2.iloc[mask.to_numpy()]

Nissan     81
Toyota     81
Toyota     81
Ford       74
Nissan     84
         ... 
Tesla     140
Tesla     115
Tesla     104
Tesla      98
Toyota     55
Name: city08, Length: 236, dtype: int64

In [205]:
city2.iloc[list(mask)]

Nissan     81
Toyota     81
Toyota     81
Ford       74
Nissan     84
         ... 
Tesla     140
Tesla     115
Tesla     104
Tesla      98
Toyota     55
Name: city08, Length: 236, dtype: int64

#### Heads & Tails

In [206]:
city2.head(3)

Alfa Romeo    19
Ferrari        9
Dodge         23
Name: city08, dtype: int64

In [207]:
city2.tail(3)

Subaru    18
Subaru    18
Subaru    16
Name: city08, dtype: int64

#### Sampling

In [208]:
city2.sample(6, random_state=42)

Volvo         16
Mitsubishi    19
Buick         27
Jeep          15
Land Rover    13
Saab          17
Name: city08, dtype: int64

#### Filtering Index Values

In [209]:
city2.filter(like='rd')

Ford    18
Ford    16
Ford    17
Ford    17
Ford    15
        ..
Ford    26
Ford    19
Ford    21
Ford    18
Ford    19
Name: city08, Length: 3371, dtype: int64

In [210]:
city2.filter(regex='(Ford)|(Subaru)')

Subaru    17
Subaru    21
Subaru    22
Ford      18
Ford      16
          ..
Subaru    19
Subaru    20
Subaru    18
Subaru    18
Subaru    16
Name: city08, Length: 4256, dtype: int64

#### Reindexing

In [212]:
# Left disabled. NOTE(review): city2's index repeats labels (see city2.index
# above), and reindex presumably rejects an index with duplicates — confirm
# before re-enabling. The closing bracket was also missing.
# city2.reindex(['Missing', 'Ford'])

In [215]:
city_mpg.reindex([0,0, 10, 20, 2_000_000])

0          19.0
0          19.0
10         23.0
20         14.0
2000000     NaN
Name: city08, dtype: float64

In [216]:
s1 = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
s2 = pd.Series([15, 25, 35], index=['b', 'c', 'd'])

In [217]:
s2

b    15
c    25
d    35
dtype: int64

In [219]:
s2.reindex(s1.index)

a     NaN
b    15.0
c    25.0
dtype: float64

## String Manipulation

#### Strings & Objects

In [221]:
make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

#### Categorical Strings

In [222]:
make.astype('category')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo', 'Wallace Environmental', 'Yugo', 'smart']

#### The .str Accessor

In [223]:
'Ford'.lower()

'ford'

In [224]:
make.str.lower()

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: object

In [225]:
'Alfa Romeo'.find('A')

0

In [226]:
make.str.find('A')

0        0
1       -1
2       -1
3       -1
4       -1
        ..
41139   -1
41140   -1
41141   -1
41142   -1
41143   -1
Name: make, Length: 41144, dtype: int64

#### Searching

In [227]:
make.str.extract(r'([^a-z A-Z])')

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
41139,
41140,
41141,
41142,


In [228]:
(make
 .str.extract(r'([^a-z A-Z])', expand=False)
 .value_counts()
)

make
-    1727
.      46
,       9
Name: count, dtype: int64

In [229]:
# For Non-Numeric Characters
# (col
#  .str.extract(r'([^0-9])', expand=False)
#  .value_counts()
# )

#### Splitting

In [230]:
age = pd.Series(['0-10', '11-15', '11-15', '61-65', '46-50'])
age

0     0-10
1    11-15
2    11-15
3    61-65
4    46-50
dtype: object

In [231]:
age.str.split('-')

0     [0, 10]
1    [11, 15]
2    [11, 15]
3    [61, 65]
4    [46, 50]
dtype: object

In [232]:
(age
 .str.split('-', expand=True)
 .iloc[:,0]
 .astype(int)
)

0     0
1    11
2    11
3    61
4    46
Name: 0, dtype: int64

In [234]:
(age
 .str.slice(-2)
 .astype(int)
)

0    10
1    15
2    15
3    65
4    50
dtype: int64

In [235]:
(age
 .str[-2:]
 .astype(int)
)

0    10
1    15
2    15
3    65
4    50
dtype: int64

In [236]:
(age
 .str.split('-', expand=True)
 .astype(int)
 .mean(axis='columns')
)

0     5.0
1    13.0
2    13.0
3    63.0
4    48.0
dtype: float64

In [237]:
import random 

def between(row):
    """Draw a random integer from the inclusive range stored in ``row``.

    ``row.values`` is unpacked positionally into ``random.randint``, so the
    row is expected to hold exactly the lower and upper bound, in that order.
    """
    bounds = row.values
    return random.randint(*bounds)

In [239]:
(age
 .str.split('-', expand=True)
 .astype(int)
 .apply(between, axis='columns')
)

0     8
1    11
2    13
3    61
4    50
dtype: int64

#### Optimizing .apply with Cython

In [240]:
%load_ext Cython

In [244]:
%%cython
import random
def between_cy(row):
    # Same logic as the pure-Python `between`, compiled via the %%cython
    # cell magic: unpack the row's values as the inclusive bounds for randint.
    bounds = row.values
    return random.randint(*bounds)

Content of stderr:
                module = PyImport_ImportModuleLevelObject(
                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [249]:
%%cython
import random
# C-typed variant: takes the two bounds as separate `int` arguments instead of
# a pandas row and declares an `int` return. `cpdef` keeps it callable from
# the notebook (it is invoked from a Python lambda in the next cell).
cpdef int between_cy3(int x, int y):
    # The draw itself still goes through the `random` module imported at the
    # top of this cell; only the argument and return types are declared.
    return random.randint(x, y)

Content of stderr:
                module = PyImport_ImportModuleLevelObject(
                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [250]:
(age
 .str.split('-', expand=True)
 .astype(int)
 .apply(lambda row: between_cy3(row[0], row[1]), axis=1)
)

0     2
1    15
2    11
3    63
4    49
dtype: int64

In [253]:
%prun -l 10 (age.str.split('-', expand=True).astype(int).apply(lambda row: between_%%cy3(row[0], row[1]), axis=1))

 

         1183 function calls (1167 primitive calls) in 0.005 seconds

   Ordered by: internal time
   List reduced from 263 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.003    0.003    0.003    0.003 base.py:649(_simple_new)
        1    0.000    0.000    0.005    0.005 {built-in method builtins.exec}
      241    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
        8    0.000    0.000    0.000    0.000 {pandas._libs.lib.maybe_convert_objects}
      2/1    0.000    0.000    0.003    0.003 series.py:371(__init__)
        9    0.000    0.000    0.000    0.000 numeric.py:274(full)
        4    0.000    0.000    0.000    0.000 construction.py:519(sanitize_array)
       10    0.000    0.000    0.000    0.000 series.py:1016(__getitem__)
        4    0.000    0.000    0.000    0.000 generic.py:6147(__finalize__)
        1    0.000    0.000    0.000    0.000 accessor.py:254(_wrap_result)

In [258]:
# %%cython
# import numpy as np
# import random
# cpdef np.ndarray[int] apply_between_cy4(np.ndarray[int] x, np.ndarray[int] y):
#     cdef np.ndarray[int] res = np.empty(len(x), dtype='int32')
#     for i in range(len(x)):
#         res[i] = random.randint(x[i], y[i])
#     return res
# Disabled sketch of an array-at-a-time variant: takes the lower and upper
# bounds as two arrays and fills a result array element by element.
# Fixed inside the comments: 'int31' is not a NumPy dtype (now 'int32'), and
# the result buffer is declared as np.ndarray[int] to match the parameters.
# NOTE(review): typed ndarray declarations presumably also need
# `cimport numpy as np`; confirm before re-enabling.

#### Replacing Text

In [259]:
make.str.replace('A', 'Å')

0        Ålfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [260]:
make.replace('A', 'Å')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [261]:
make.replace({'Audi': 'Åudi', 'Acura': 'Åcura',
    'Ashton Martin': 'Åshton Martin',
    'Alfa Romeo': 'Ålfa Romeo'})

0        Ålfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [262]:
make.replace('A', 'Å', regex=True)

0        Ålfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

## Date & Time Manipulation

#### Loading UTC Time Data

In [263]:
col = pd.Series(['2015-03-08 08:00:00+00:00',
    '2015-03-08 08:30:00+00:00',
    '2015-03-08 09:00:00+00:00',
    '2015-03-08 09:30:00+00:00',
    '2015-11-01 06:30:00+00:00',
    '2015-11-01 07:00:00+00:00',
    '2015-11-01 07:30:00+00:00',
    '2015-11-01 08:00:00+00:00',
    '2015-11-01 08:30:00+00:00',
    '2015-11-01 08:00:00+00:00',
    '2015-11-01 08:30:00+00:00',
    '2015-11-01 09:00:00+00:00',
    '2015-11-01 09:30:00+00:00',
    '2015-11-01 10:00:00+00:00'])

In [264]:
utc_s = pd.to_datetime(col, utc=True)
utc_s

0    2015-03-08 08:00:00+00:00
1    2015-03-08 08:30:00+00:00
2    2015-03-08 09:00:00+00:00
3    2015-03-08 09:30:00+00:00
4    2015-11-01 06:30:00+00:00
5    2015-11-01 07:00:00+00:00
6    2015-11-01 07:30:00+00:00
7    2015-11-01 08:00:00+00:00
8    2015-11-01 08:30:00+00:00
9    2015-11-01 08:00:00+00:00
10   2015-11-01 08:30:00+00:00
11   2015-11-01 09:00:00+00:00
12   2015-11-01 09:30:00+00:00
13   2015-11-01 10:00:00+00:00
dtype: datetime64[ns, UTC]

In [265]:
utc_s.dt.tz_convert('America/Denver')

0    2015-03-08 01:00:00-07:00
1    2015-03-08 01:30:00-07:00
2    2015-03-08 03:00:00-06:00
3    2015-03-08 03:30:00-06:00
4    2015-11-01 00:30:00-06:00
5    2015-11-01 01:00:00-06:00
6    2015-11-01 01:30:00-06:00
7    2015-11-01 01:00:00-07:00
8    2015-11-01 01:30:00-07:00
9    2015-11-01 01:00:00-07:00
10   2015-11-01 01:30:00-07:00
11   2015-11-01 02:00:00-07:00
12   2015-11-01 02:30:00-07:00
13   2015-11-01 03:00:00-07:00
dtype: datetime64[ns, America/Denver]

In [266]:
s = pd.Series(['2015-03-08 01:00:00-07:00',
 '2015-03-08 01:30:00-07:00',
 '2015-03-08 03:00:00-06:00',
 '2015-03-08 03:30:00-06:00',
 '2015-11-01 00:30:00-06:00',
 '2015-11-01 01:00:00-06:00',
 '2015-11-01 01:30:00-06:00',
 '2015-11-01 01:00:00-07:00',
 '2015-11-01 01:30:00-07:00',
 '2015-11-01 01:00:00-07:00',
 '2015-11-01 01:30:00-07:00',
 '2015-11-01 02:00:00-07:00',
 '2015-11-01 02:30:00-07:00',
 '2015-11-01 03:00:00-07:00'])

In [267]:
pd.to_datetime(s, utc=True).dt.tz_convert('America/Denver')

0    2015-03-08 01:00:00-07:00
1    2015-03-08 01:30:00-07:00
2    2015-03-08 03:00:00-06:00
3    2015-03-08 03:30:00-06:00
4    2015-11-01 00:30:00-06:00
5    2015-11-01 01:00:00-06:00
6    2015-11-01 01:30:00-06:00
7    2015-11-01 01:00:00-07:00
8    2015-11-01 01:30:00-07:00
9    2015-11-01 01:00:00-07:00
10   2015-11-01 01:30:00-07:00
11   2015-11-01 02:00:00-07:00
12   2015-11-01 02:30:00-07:00
13   2015-11-01 03:00:00-07:00
dtype: datetime64[ns, America/Denver]

#### Loading Local Time Data

In [268]:
time = pd.Series(['2015-03-08 01:00:00',
 '2015-03-08 01:30:00',
 '2015-03-08 02:00:00',
 '2015-03-08 02:30:00',
 '2015-03-08 03:00:00',
 '2015-03-08 02:00:00',
 '2015-03-08 02:30:00',
 '2015-03-08 03:00:00',
 '2015-03-08 03:30:00',
 '2015-11-01 00:30:00',
 '2015-11-01 01:00:00',
 '2015-11-01 01:30:00',
 '2015-11-01 02:00:00',
 '2015-11-01 02:30:00',
 '2015-11-01 01:00:00',
 '2015-11-01 01:30:00',
 '2015-11-01 02:00:00',
 '2015-11-01 02:30:00',
 '2015-11-01 03:00:00'])

In [269]:
offset = pd.Series([-7, -7, -7, -7, -7, -6, -6,
  -6, -6, -6, -6, -6, -6, -6, -7, -7, -7, -7, -7])

In [270]:
# Group the naive timestamps by their offset and localize each group using the
# group key (`s.name`), then convert everything to Denver time.
# NOTE(review): at this point `offset` holds plain integers (-7 / -6), so the
# key handed to tz_localize is an integer. The displayed times carry stray
# :07 / :06 seconds, which suggests the integers are not being treated as
# hour offsets — confirm. The following cell retries with '-07:00' /
# '-06:00' strings.
(pd.to_datetime(time)
    .groupby(offset)
    .transform(lambda s: s.dt.tz_localize(s.name)
                          .dt.tz_convert('America/Denver'))
)

0    2015-03-07 18:00:07-07:00
1    2015-03-07 18:30:07-07:00
2    2015-03-07 19:00:07-07:00
3    2015-03-07 19:30:07-07:00
4    2015-03-07 20:00:07-07:00
5    2015-03-07 19:00:06-07:00
6    2015-03-07 19:30:06-07:00
7    2015-03-07 20:00:06-07:00
8    2015-03-07 20:30:06-07:00
9    2015-10-31 18:30:06-06:00
10   2015-10-31 19:00:06-06:00
11   2015-10-31 19:30:06-06:00
12   2015-10-31 20:00:06-06:00
13   2015-10-31 20:30:06-06:00
14   2015-10-31 19:00:07-06:00
15   2015-10-31 19:30:07-06:00
16   2015-10-31 20:00:07-06:00
17   2015-10-31 20:30:07-06:00
18   2015-10-31 21:00:07-06:00
dtype: datetime64[ns, America/Denver]

In [271]:
# Re-express the integer offsets as '-07:00' / '-06:00' strings so that each
# group's key is a fixed-offset string when passed to tz_localize.
# Note this rebinds `offset`, so the integer version is gone after this cell.
offset = offset.replace({-7:'-07:00', -6:'-06:00'})
# Localize each offset group with its own key (s.name), then convert the
# result to the America/Denver zone.
local = (pd.to_datetime(time)
    .groupby(offset)
    .transform(lambda s: s.dt.tz_localize(s.name)
                          .dt.tz_convert('America/Denver'))
)

#### Converting Local Time to UTC

In [272]:
local.dt.tz_convert('UTC')

0    2015-03-08 08:00:00+00:00
1    2015-03-08 08:30:00+00:00
2    2015-03-08 09:00:00+00:00
3    2015-03-08 09:30:00+00:00
4    2015-03-08 10:00:00+00:00
5    2015-03-08 08:00:00+00:00
6    2015-03-08 08:30:00+00:00
7    2015-03-08 09:00:00+00:00
8    2015-03-08 09:30:00+00:00
9    2015-11-01 06:30:00+00:00
10   2015-11-01 07:00:00+00:00
11   2015-11-01 07:30:00+00:00
12   2015-11-01 08:00:00+00:00
13   2015-11-01 08:30:00+00:00
14   2015-11-01 08:00:00+00:00
15   2015-11-01 08:30:00+00:00
16   2015-11-01 09:00:00+00:00
17   2015-11-01 09:30:00+00:00
18   2015-11-01 10:00:00+00:00
dtype: datetime64[ns, UTC]

#### Converting to Epochs

In [273]:
# Seconds since the Unix epoch.
# astype('int64') exposes the underlying nanosecond counts (the Series is
# datetime64[ns, ...]) without the deprecated Series.view, and an integer
# divisor keeps the result int64 instead of passing through float64.
secs = local.astype('int64').floordiv(10**9)
secs

0     1425801600
1     1425803400
2     1425805200
3     1425807000
4     1425808800
5     1425801600
6     1425803400
7     1425805200
8     1425807000
9     1446359400
10    1446361200
11    1446363000
12    1446364800
13    1446366600
14    1446364800
15    1446366600
16    1446368400
17    1446370200
18    1446372000
dtype: int64

In [274]:
# Turn the epoch seconds back into timezone-aware UTC timestamps in one call.
pd.to_datetime(secs, unit='s', utc=True)

0    2015-03-08 08:00:00+00:00
1    2015-03-08 08:30:00+00:00
2    2015-03-08 09:00:00+00:00
3    2015-03-08 09:30:00+00:00
4    2015-03-08 10:00:00+00:00
5    2015-03-08 08:00:00+00:00
6    2015-03-08 08:30:00+00:00
7    2015-03-08 09:00:00+00:00
8    2015-03-08 09:30:00+00:00
9    2015-11-01 06:30:00+00:00
10   2015-11-01 07:00:00+00:00
11   2015-11-01 07:30:00+00:00
12   2015-11-01 08:00:00+00:00
13   2015-11-01 08:30:00+00:00
14   2015-11-01 08:00:00+00:00
15   2015-11-01 08:30:00+00:00
16   2015-11-01 09:00:00+00:00
17   2015-11-01 09:30:00+00:00
18   2015-11-01 10:00:00+00:00
dtype: datetime64[ns, UTC]

#### Manipulating Dates

In [None]:
# Points at the same vehicles.csv that is loaded in the "Loading Data" section.
# NOTE(review): absolute, user-specific location — it will not resolve on
# another machine; consider deriving it from a shared data directory.
path = '/Users/isisromero/desktop/effective_pandas/data/vehicles.csv'