<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/effective_pandas_ch_05_thru_ch_XX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 5: Series Deep Dive

In [62]:
# libraries needed
import numpy as np
import pandas as pd

In [63]:
# load data
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip'
df = pd.read_csv(url)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# one column as a series; city vs.highway
city_mpg = df.city08
highway_mpg = df.highway08
print(city_mpg)
print('\n')
print(highway_mpg)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64


0        25
1        14
2        33
3        12
4        23
         ..
41139    26
41140    28
41141    24
41142    24
41143    21
Name: highway08, Length: 41144, dtype: int64


In [None]:
# basic info about city_mpg series
print(city_mpg.count())
print(city_mpg.size)
print(len(city_mpg))

print('\n')

# basic info about highway_mpg series
print(highway_mpg.count())
print(highway_mpg.size)
print(len(highway_mpg))

41144
41144
41144


41144
41144
41144


In [None]:
# number of properties and methods for each series:
print(len(dir(city_mpg)))
print(len(dir(highway_mpg)))

419
419


# Chapter 6: Operators (& Dunder Methods)

In [None]:
# find average of city and highway mpg
print((city_mpg + highway_mpg) / 2)

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64


In [None]:
# be careful about math operations with series when indices aren't aligned, i.e., unique and common across series
s1 = pd.Series(
    data = [10, 20, 30], 
    index = [1, 2, 2]
)

s2 = pd.Series(
    data = [35, 44, 53],
    index = [2, 2, 4]
)

print(s1 + s2)

1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64


In [None]:
# two ways to add
print(s1 + s2)

print('\n')

print(s1.add(s2))

1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64


1     NaN
2    55.0
2    64.0
2    65.0
2    74.0
4     NaN
dtype: float64


In [None]:
# benefits of .add() methods
print(s1.add(s2, fill_value = 0))

1    10.0
2    55.0
2    64.0
2    65.0
2    74.0
4    53.0
dtype: float64


In [None]:
# benefits of methods: can chain together to make more readable
print(
    city_mpg
      .add(highway_mpg)
      .div(2)
)

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64


**see p. 41 for list of methods**

# Chapter 7: Aggregate Methods

In [None]:
# average of city and highway mpg
print('city_mpg average: ', city_mpg.mean())
print('highway_mpg average: ', highway_mpg.mean())

city_mpg average:  18.369045304297103
highway_mpg average:  24.504666537040638


In [None]:
# check if values are unique
print(city_mpg.is_unique)

print(pd.Series(data = [0, 1, 2, 3]).is_unique)

False
True


In [None]:
# check if values are monotonically increasing
print(pd.Series(data = [0, 1, 2, 2]).is_monotonic_increasing)

True


In [None]:
# find various quantiles
print('city_mpg median: ', city_mpg.median())
print('city_mpg 50th percentile: ', city_mpg.quantile(0.50))
print('city_mpg 10th percentile: ', city_mpg.quantile(0.10))

print('\n')
print('city_mpg various quantiles: ')
print(city_mpg.quantile([0.10, 0.50, 0.90]))

city_mpg median:  17.0
city_mpg 50th percentile:  17.0
city_mpg 10th percentile:  13.0


city_mpg various quantiles: 
0.1    13.0
0.5    17.0
0.9    24.0
Name: city08, dtype: float64


In [None]:
# count of number of mpg's that are greater than 20
print(
    (
        city_mpg
          .gt(20)   # boolean vector at this stage
          .sum()    # sum up booleans; get count
    )
)

# equivalent to:
print('\n')
(
    city_mpg[city_mpg > 20].count()
)

10272




10272

In [None]:
# percent of cars (assuming 1 city_mpg per car)
(
    city_mpg
      .gt(20)  # boolean TRUE and FALSE
      .mean()  # take mean to get proportion of TRUE
) * 100

24.965973167412017

In [None]:
# cell direciton above is equivalent to
len(city_mpg[city_mpg > 20]) / len(city_mpg)

0.24965973167412017

In [None]:
# more generic way to aggregate a Series
city_mpg.agg(['mean', np.var, np.max])

mean     18.369045
var      62.503036
amax    150.000000
Name: city08, dtype: float64