<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/effective_pandas_ch_05_thru_ch_XX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 5: Series Deep Dive

In [2]:
# libraries needed
import numpy as np
import pandas as pd

In [3]:
# load the vehicles dataset into a DataFrame
# the file is a zip-compressed CSV; read_csv infers the compression from the '.zip' extension
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'
df = pd.read_csv(url)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# pull the city and highway mpg columns out of df, each as its own Series
city_mpg = df['city08']
highway_mpg = df['highway08']

# show both Series with blank lines in between
print(city_mpg, '\n', highway_mpg, sep='\n')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64


0        25
1        14
2        33
3        12
4        23
         ..
41139    26
41140    28
41141    24
41142    24
41143    21
Name: highway08, Length: 41144, dtype: int64


In [5]:
# three ways to measure each Series: .count() (non-missing values),
# .size and len() (all entries)

# city_mpg
print(city_mpg.count(), city_mpg.size, len(city_mpg), sep='\n')

print('\n')

# highway_mpg
print(highway_mpg.count(), highway_mpg.size, len(highway_mpg), sep='\n')

41144
41144
41144


41144
41144
41144


In [6]:
# how many attributes (properties and methods) each Series exposes
print(len(dir(city_mpg)), len(dir(highway_mpg)), sep='\n')

419
419


# Chapter 6: Operators (& Dunder Methods)

In [7]:
# find the row-by-row average of city and highway mpg using plain operators
# (the result is float64 because of the division)
print((city_mpg + highway_mpg) / 2)

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64


In [8]:
# arithmetic between Series aligns on index labels, not on position:
#   - a label present in only one Series (1 and 4 here) produces NaN
#   - a duplicated label (2 here) is paired with every match in the other Series
s1 = pd.Series([10, 20, 30], index=[1, 2, 2])
s2 = pd.Series([35, 44, 53], index=[2, 2, 4])

print(s1 + s2)

1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64


In [9]:
# the + operator and the .add() method produce the same result;
# print both, separated by blank lines
print(s1 + s2, '\n', s1.add(s2), sep='\n')

1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64


1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64


In [10]:
# benefit of the .add() method over +: fill_value substitutes for a label that is
# missing from one of the Series, so labels 1 and 4 keep their values instead of becoming NaN
print(s1.add(s2, fill_value = 0))

1    10.0
2    55.0
2    64.0
2    65.0
2    74.0
4    53.0
dtype: float64


In [11]:
# benefits of methods: can chain together to make more readable
# same calculation as (city_mpg + highway_mpg) / 2, written as one step per line
print(
    city_mpg
      .add(highway_mpg)   # city + highway
      .div(2)             # divide the sum by 2
)

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64


**see p. 41 for list of methods**

# Chapter 7: Aggregate Methods

In [12]:
# average of city and highway mpg; .mean() collapses each Series to a single number
print('city_mpg average: ', city_mpg.mean())
print('highway_mpg average: ', highway_mpg.mean())

city_mpg average:  18.369045304297103
highway_mpg average:  24.504666537040638


In [13]:
# check if values are unique
# city_mpg contains repeated mpg values, so this prints False
print(city_mpg.is_unique)

# a Series with no repeated values prints True
print(pd.Series(data = [0, 1, 2, 3]).is_unique)

False
True


In [14]:
# check if values are monotonically increasing
# the check is non-strict: the repeated 2 at the end still counts as increasing
print(pd.Series(data = [0, 1, 2, 2]).is_monotonic_increasing)

True


In [15]:
# find various quantiles
# the median is the 50th percentile, so the first two lines print the same value
print('city_mpg median: ', city_mpg.median())
print('city_mpg 50th percentile: ', city_mpg.quantile(0.50))
print('city_mpg 10th percentile: ', city_mpg.quantile(0.10))

print('\n')
print('city_mpg various quantiles: ')
# passing a list returns a Series indexed by the requested quantiles
print(city_mpg.quantile([0.10, 0.50, 0.90]))

city_mpg median:  17.0
city_mpg 50th percentile:  17.0
city_mpg 10th percentile:  13.0


city_mpg various quantiles: 
0.1    13.0
0.5    17.0
0.9    24.0
Name: city08, dtype: float64


In [16]:
# count of number of mpg's that are greater than 20
print(
    (
        city_mpg
          .gt(20)   # boolean vector at this stage
          .sum()    # sum up booleans; get count
    )
)

# equivalent to:
print('\n')
# filter with a boolean mask, then count the rows that remain
# (bare last expression, so it is shown as the cell's output rather than printed)
(
    city_mpg[city_mpg > 20].count()
)

10272




10272

In [17]:
# percent of cars with city mpg above 20 (assuming 1 city_mpg per car)
(
    city_mpg
      .gt(20)  # boolean TRUE and FALSE
      .mean()  # take mean to get proportion of TRUE
) * 100        # scale the proportion (0-1) to a percentage

24.965973167412017

In [18]:
# same calculation as the cell directly above, but left as a proportion (0-1)
# instead of being multiplied by 100 to give a percentage
len(city_mpg[city_mpg > 20]) / len(city_mpg)

0.24965973167412017

In [19]:
# more generic way to aggregate a Series: pass a list of aggregation names to .agg()
# string names dispatch to the matching Series methods ('var' is Series.var, the sample
# variance); passing NumPy callables such as np.var / np.max instead relies on pandas
# silently swapping in its own methods, which newer pandas versions warn about
# note: the result rows are labelled 'mean', 'var', 'max'
city_mpg.agg(['mean', 'var', 'max'])

mean     18.369045
var      62.503036
amax    150.000000
Name: city08, dtype: float64

# Chapter 8: Conversion Methods

In [20]:
# view series; note the int64 dtype before any conversion
city_mpg

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [21]:
# convert_dtypes() returns a copy that uses pandas' nullable extension dtypes
# (the ones that support pd.NA) where possible; here NumPy int64 becomes
# nullable Int64 (capital I), the values themselves are unchanged
city_mpg.convert_dtypes()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: Int64

In [22]:
# confirm that a single column pulled from df is a pandas Series
type(city_mpg)

pandas.core.series.Series

In [23]:
# min/max representable value for each signed integer width, widest first
print(*map(np.iinfo, ('int64', 'int32', 'int16', 'int8')), sep='\n')

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------



In [24]:
# min/max representable value for each unsigned integer width, widest first
print(*map(np.iinfo, ('uint64', 'uint32', 'uint16', 'uint8')), sep='\n')

Machine parameters for uint64
---------------------------------------------------------------
min = 0
max = 18446744073709551615
---------------------------------------------------------------

Machine parameters for uint32
---------------------------------------------------------------
min = 0
max = 4294967295
---------------------------------------------------------------

Machine parameters for uint16
---------------------------------------------------------------
min = 0
max = 65535
---------------------------------------------------------------

Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------



In [25]:
# precision and range information for each float width, widest first
print(*map(np.finfo, ('float64', 'float32', 'float16')), sep='\n')

Machine parameters for float64
---------------------------------------------------------------
precision =  15   resolution = 1.0000000000000001e-15
machep =    -52   eps =        2.2204460492503131e-16
negep =     -53   epsneg =     1.1102230246251565e-16
minexp =  -1022   tiny =       2.2250738585072014e-308
maxexp =   1024   max =        1.7976931348623157e+308
nexp =       11   min =        -max
---------------------------------------------------------------

Machine parameters for float32
---------------------------------------------------------------
precision =   6   resolution = 1.0000000e-06
machep =    -23   eps =        1.1920929e-07
negep =     -24   epsneg =     5.9604645e-08
minexp =   -126   tiny =       1.1754944e-38
maxexp =    128   max =        3.4028235e+38
nexp =        8   min =        -max
---------------------------------------------------------------

Machine parameters for float16
---------------------------------------------------------------
precision =   3 

In [26]:
# city_mpg is currently int64
print(city_mpg)

# check memory usage of the underlying data:
# 41144 values x 8 bytes per int64 value = 329152 bytes
print('\n')
print('nbytes: ', city_mpg.nbytes)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64


nbytes:  329152


In [27]:
# cast as int32: 4 bytes per value instead of 8, so memory use is halved
# astype() returns a new Series; city_mpg itself is not modified
print(city_mpg.astype('int32').nbytes)

164576


In [28]:
# the astype() call in the previous cell did not change city_mpg: it is still int64
city_mpg

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [29]:
# smallest and largest city mpg; string names are used instead of np.min / np.max because
# newer pandas versions warn about NumPy callables being silently replaced by pandas methods
# (the result rows are labelled 'min' and 'max')
print('city_mpg min and max: ', city_mpg.agg(['min', 'max']))
#city_mpg ranges from 6 to 150

city_mpg min and max:  amin      6
amax    150
Name: city08, dtype: int64


In [30]:
# convert to uint8
city_mpg.astype('uint8').agg([np.min, np.max])
  # why does this happen? why not 6 and 150
  # uint8 holds 0 to 255, so the observed range of 6 to 150 should fit without wrapping
  # NOTE(review): the recorded output (dtype int8, max of -106) looks like it came from an
  # int8 cast rather than uint8 -- int8 tops out at 127, so 150 wraps around to
  # 150 - 256 = -106; re-run this cell to confirm

amin      6
amax   -106
Name: city08, dtype: int8

In [31]:
# see notes about converting str to category (ordered) for memory saving; can still use str methods

# Chapter 9: Manipulation Methods

In [32]:
# pull the manufacturer column out of df as a Series named make, then show it
make = df['make']
print(make)

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object


In [33]:
# get counts for each make, ordered from most to least frequent
print(make.value_counts())

Chevrolet                      4003
Ford                           3371
Dodge                          2583
GMC                            2494
Toyota                         2071
                               ... 
Volga Associated Automobile       1
Panos                             1
Mahindra                          1
Excalibur Autos                   1
London Coach Co Inc               1
Name: make, Length: 136, dtype: int64


In [34]:
# for make, if make is in top 5 (Chevrolet, Ford, Dodge, GMC, Toyota), keep make as is; otherwise 'Other'

# top 5
# value_counts() sorts by count in descending order, so the first 5 index labels are the 5 most common makes
top5 = make.value_counts().index[:5]   # .index pulls out index; [:5] pulls out top 5, which is 0 thru 4
print(top5)

Index(['Chevrolet', 'Ford', 'Dodge', 'GMC', 'Toyota'], dtype='object')


In [35]:
# if make is in top5, leave as is; otherwise, replace with 'Other'
# .where() keeps values where the condition is True and substitutes `other` where it is False
make_bucketed = make.where(make.isin(top5), other = 'Other')
print(make_bucketed)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object


In [36]:
# check value_counts(): only the top-5 makes plus 'Other' should remain
make_bucketed.value_counts()

Other        26622
Chevrolet     4003
Ford          3371
Dodge         2583
GMC           2494
Toyota        2071
Name: make, dtype: int64

In [37]:
# see mask for inverse/reverse of .where

In [38]:
# for make series: if it's in the top 5, keep make as is; if it's ranked 6-10, say 'Top10';
# otherwise 'Other'
# both conditions are evaluated against the original make values (not the intermediate
# result), so rows relabelled 'Top10' by the first step survive the second step
make_by_frequency = make.value_counts().index   # makes ordered most to least frequent; computed once
(
    make
      .where(cond = make.isin(make_by_frequency[:5]), other = 'Top10')    # if in top5, keep as is, otherwise 'Top10'
      .where(cond = make.isin(make_by_frequency[:10]), other = 'Other')   # if in top10, keep as is, otherwise 'Other'
)

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
41139    Other
41140    Other
41141    Other
41142    Other
41143    Other
Name: make, Length: 41144, dtype: object

In [39]:
# count how many missing values there are in the cylinders column
cyl = df['cylinders']

# total number of entries in cyl
print(len(cyl))

# number of missing values: isna() flags each NaN, sum() counts the True flags
print(cyl.isna().sum())

41144
206


In [40]:
# entries in cyl that are missing
cyl.loc[cyl.isna()]   # not the last expression in the cell, so its result is not displayed

# equivalent to above
cyl[cyl.isna()]  # boolean mask inside square brackets []

7138    NaN
7139    NaN
8143    NaN
8144    NaN
8146    NaN
         ..
34563   NaN
34564   NaN
34565   NaN
34566   NaN
34567   NaN
Name: cylinders, Length: 206, dtype: float64

In [41]:
# based on subject matter expert, missing values (NaN) are for electric vehicles, which have 0 cylinders
# so replace NaN with 0
# fillna() returns a new Series and leaves cyl unchanged, so keep the result under its own name
cyl_filled = cyl.fillna(0)

# select any entries that are still missing after filling
print(cyl_filled[cyl_filled.isna()]) # should return empty

Series([], Name: cylinders, dtype: float64)


In [42]:
# interpolate (as opposed to extrapolate): fill a gap using its neighbours
# the NaN at position 2 sits between 40 and 42, so it is filled with 41
temp = pd.Series([32, 40, np.nan, 42, 39, 32])

# original Series, blank lines, then the interpolated Series
print(temp, '\n', temp.interpolate(), sep='\n')

0    32.0
1    40.0
2     NaN
3    42.0
4    39.0
5    32.0
dtype: float64


0    32.0
1    40.0
2    41.0
3    42.0
4    39.0
5    32.0
dtype: float64


In [43]:
# before clipping: range and length of city_mpg, for comparison with the clipped version
print(city_mpg.agg(['min', 'max']))
print('\n')
print(len(city_mpg))


min      6
max    150
Name: city08, dtype: int64


41144


In [44]:
# after clipping: limit values to the 5th-95th percentile range
# compute the clipped Series once and reuse it for both checks
city_mpg_clipped = city_mpg.clip(lower = city_mpg.quantile(0.05), upper = city_mpg.quantile(0.95))

print(city_mpg_clipped.agg(['min', 'max']))   # new range
print('\n')
print(len(city_mpg_clipped))                  # length is unchanged: values are capped, not dropped

min    11.0
max    27.0
Name: city08, dtype: float64


41144


In [45]:
# clipping replaces values outside of lower and upper with lower and upper values, respectively

In [46]:
# sorting values; the original index labels travel with their values
city_mpg.sort_values() # by default, it's ascending, like SQL

7901       6
34557      6
37161      6
21060      6
35887      6
        ... 
34563    138
34564    140
32599    150
31256    150
33423    150
Name: city08, Length: 41144, dtype: int64

In [47]:
# sort index: sorting by index after sorting by value puts the rows back in their original order
city_mpg.sort_values().sort_index()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [48]:
# above is same as city_mpg (the sort calls returned new Series; city_mpg itself was not reordered)
city_mpg

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [49]:
# small Series for the drop_duplicates examples: the value 20 appears at index 1 and index 3
duplicated_series = pd.Series([40, 20, 30, 20, 10])
print(duplicated_series)

0    40
1    20
2    30
3    20
4    10
dtype: int64


In [50]:
# default behavior: keep the first occurrence of a duplicated value (index 1, value 20)
# and drop the later ones (index 3)
duplicated_series.drop_duplicates()

0    40
1    20
2    30
4    10
dtype: int64

In [51]:
# keep the last occurrence of a duplicated value (index 3, value 20) and drop the earlier ones (index 1)
duplicated_series.drop_duplicates(keep = 'last')

0    40
2    30
3    20
4    10
dtype: int64

In [52]:
# drop all duplicates: with keep = False, every value that appears more than once is removed entirely
duplicated_series.drop_duplicates(keep = False)

0    40
2    30
4    10
dtype: int64

In [53]:
# view city_mpg values for comparison with their ranks in the next cell
city_mpg

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [54]:
# replace each value with its rank (1 = smallest); tied values share the average of
# the ranks they occupy, which is why ranks ending in .5 appear
city_mpg.rank()

0        27060.5
1          235.5
2        35830.0
3          607.5
4        19484.0
          ...   
41139    27060.5
41140    29719.5
41141    23528.0
41142    23528.0
41143    15479.0
Name: city08, Length: 41144, dtype: float64

In [55]:
# what does ranking do? .rank()
# see fig. 9.5 on p. 74; instead of values, returns a rank, 1 through n where n is len(series)

In [56]:
# small Series with ties (two 30s, two 20s) for the ranking and replace examples
series_to_rank = pd.Series([30, 20, 30, 20, 10])
print(series_to_rank)

0    30
1    20
2    30
3    20
4    10
dtype: int64


In [57]:
# default ranking: tied values get the average of the ranks they span
# (the two 20s occupy ranks 2 and 3 -> 2.5; the two 30s occupy ranks 4 and 5 -> 4.5)
print(series_to_rank.rank())

0    4.5
1    2.5
2    4.5
3    2.5
4    1.0
dtype: float64


In [58]:
# dense ranking: tied values share a rank and no rank numbers are skipped (1, 2, 3)
print(series_to_rank.rank(method = 'dense'))

0    3.0
1    2.0
2    3.0
3    2.0
4    1.0
dtype: float64


In [59]:
# replace values; see fig 9.6 on p. 75
# show the Series before replacing anything
print(series_to_rank)

0    30
1    20
2    30
3    20
4    10
dtype: int64


In [60]:
# the dict maps old value -> new value; values not listed in the dict (20) are left as they are
print(series_to_rank.replace(to_replace = {30:3000, 10:1000}))

0    3000
1      20
2    3000
3      20
4    1000
dtype: int64


In [61]:
# the data to be binned in the following cells
print(city_mpg)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64


In [62]:
# 10 equal-width bins: every bin spans the same range of mpg values, so counts per bin vary widely
(
pd.cut(x = city_mpg, bins = 10)      # cut city_mpg series into 10 bins of equal width
  .value_counts()                    # number of entries in each bin
  .sort_index()                      # order the bins from lowest to highest interval
)

(5.856, 20.4]     30872
(20.4, 34.8]       9667
(34.8, 49.2]        367
(49.2, 63.6]         54
(63.6, 78.0]         11
(78.0, 92.4]         48
(92.4, 106.8]        32
(106.8, 121.2]       26
(121.2, 135.6]       55
(135.6, 150.0]       12
Name: city08, dtype: int64

In [63]:
# 10 quantile-based bins (deciles): bin edges are chosen so each bin holds roughly the same
# number of entries; the counts are not exactly equal, presumably because many cars share
# the same mpg value
(
    pd.qcut(x = city_mpg, q = 10)
      .value_counts()   # number of entries in each bin
      .sort_index()     # order the bins from lowest to highest interval
)

(5.999, 13.0]    6019
(13.0, 14.0]     2969
(14.0, 15.0]     4503
(15.0, 16.0]     3975
(16.0, 17.0]     4035
(17.0, 18.0]     4053
(18.0, 20.0]     5318
(20.0, 21.0]     2532
(21.0, 24.0]     4036
(24.0, 150.0]    3704
Name: city08, dtype: int64

In [64]:
# label the bins: each entry is tagged with its decile number instead of an interval
pd.qcut(x = city_mpg, q = 10, labels = list(range(1, 11)))  # 1 is lowest decile, 10 is top decile

0        7
1        1
2        9
3        1
4        5
        ..
41139    7
41140    7
41141    6
41142    6
41143    4
Name: city08, Length: 41144, dtype: category
Categories (10, int64): [1 < 2 < 3 < 4 ... 7 < 8 < 9 < 10]

In [65]:
# without labels, each entry is tagged with the interval (bin) it falls into
pd.qcut(x = city_mpg, q = 10)

0         (18.0, 20.0]
1        (5.999, 13.0]
2         (21.0, 24.0]
3        (5.999, 13.0]
4         (16.0, 17.0]
             ...      
41139     (18.0, 20.0]
41140     (18.0, 20.0]
41141     (17.0, 18.0]
41142     (17.0, 18.0]
41143     (15.0, 16.0]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.999, 13.0] < (13.0, 14.0] < (14.0, 15.0] <
                                            (15.0, 16.0] ... (18.0, 20.0] < (20.0, 21.0] <
                                            (21.0, 24.0] < (24.0, 150.0]]

# Chapter 10: Indexing Operations

In [66]:
# view city_mpg and its default integer index before changing the index in the next cells
city_mpg

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [69]:
# index is 0, 1, 2, 3, etc...
# change to make
print(make) # make is a series
print('\n')
# passing a Series to .rename() uses it as a mapping from old index label to new index label;
# the result is a new Series, city_mpg itself keeps its integer index
city_mpg.rename(make) # change index in city_mpg to make values

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object




Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
              ..
Subaru        19
Subaru        20
Subaru        18
Subaru        18
Subaru        16
Name: city08, Length: 41144, dtype: int64

In [71]:
# to reset the index back to the default 0, 1, 2, ...
city_mpg.rename(make).reset_index(drop = True) # drop = True keeps it as a Series; otherwise it's a Pandas Dataframe

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64