In [2]:
import pandas as pd
url = 'https://github.com/mattharrison/datasets/raw/master/data/vehicles.csv.zip' 
df = pd.read_csv(url)
city_mpg = df.city08
highway_mpg = df.highway08

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
city_mpg

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

In [4]:
highway_mpg

0        25
1        14
2        33
3        12
4        23
         ..
41139    26
41140    28
41141    24
41142    24
41143    21
Name: highway08, Length: 41144, dtype: int64

In [5]:
len(dir(city_mpg))

419

In [6]:
(city_mpg + highway_mpg )/2

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64

In [7]:
city_mpg.mean()

18.369045304297103

In [8]:
city_mpg.is_monotonic_increasing

False

In [9]:
city_mpg.quantile(0.9)
city_mpg.quantile([.1, .5, .9])
city_mpg.gt(20).sum()
city_mpg.eq(20).mul(100).mean()

5.60470542484931

In [10]:
city_mpg.agg('mean')

18.369045304297103

In [11]:
import numpy as np
def second_to_last(x):
    return x.iloc[-2]

city_mpg.agg(['mean', np.var, max, second_to_last])

mean               18.369045
var                62.503036
max               150.000000
second_to_last     18.000000
Name: city08, dtype: float64

In [12]:
stock_df = pd.read_csv('/home/jose/VSCodeProjects/pandas/National_Stock_Exchange_of_India_Ltd.csv')
stock_df['Symbol'].count()
stock_df['Symbol'].size
stock_df['30 d % chng'].agg(['count', 'size', 'nunique', 'mean', 'max'])

count      50.0000
size       50.0000
nunique    50.0000
mean       -5.9968
max         6.3600
Name: 30 d % chng, dtype: float64

# Conversion methods

## Automatic conversion

In [13]:
city_mpg.convert_dtypes()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: Int64

In [14]:
city_mpg.astype('Int16')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: Int16

**The max value for Int8 is 150 and we have values over that so this will not work.**

In [15]:
city_mpg.astype('Int8')

TypeError: cannot safely cast non-equivalent int64 to int8

Here is how we can see max value types.

In [None]:
np.iinfo('int64')

iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)

In [None]:
np.iinfo('uint8')

iinfo(min=0, max=255, dtype=uint8)

In [None]:
np.finfo('float16')

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

In [None]:
np.finfo('float64')

finfo(resolution=1e-15, min=-1.7976931348623157e+308, max=1.7976931348623157e+308, dtype=float64)

## Memory usage

In [None]:
city_mpg.nbytes

329152

In [None]:
city_mpg.astype('Int16').nbytes

123432

In [None]:
make = df.make
make.nbytes

329152

In [None]:
make.memory_usage()

329280

## String and category types

In [None]:
city_mpg.astype(str)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: object

In [None]:
city_mpg.astype('category')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6, 7, 8, 9, ..., 137, 138, 140, 150]

## Ordered Categories

In [None]:
values = pd.Series(sorted(set(city_mpg)))
city_type = pd.CategoricalDtype(categories=values, ordered=True)
city_mpg.astype(city_type)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]

## Converting to other types
We can convert Series objects back into DataFrames.

In [None]:
city_mpg.to_frame()

Unnamed: 0,city08
0,19
1,9
2,23
3,10
4,17
...,...
41139,19
41140,20
41141,18
41142,18


### Exercises
1. Convert a numeric column to a smaller type.

In [22]:
workouts_df = pd.read_csv('/home/jose/VSCodeProjects/pandas/workouts.csv')
workouts_df['Calories Burned'].fillna(0).astype('int32')

0       19
1        2
2      100
3      151
4      100
      ... 
749    105
750    293
751     47
752     30
753    202
Name: Calories Burned, Length: 754, dtype: int32

2. Calculate the memory savings by converting to smaller numeric types.

In [None]:
calories = workouts_df['Calories Burned']
calories.nbytes

6032

In [None]:
calories.memory_usage()

6160

In [None]:
workouts_df['Calories Burned'].fillna(0).astype('int32').memory_usage()

3144

3. Convert a string column into a categorical type.

In [None]:
workouts_df['Instructor Name'].fillna('Unknown').astype('str')

0           Olivia Amato
1            Cody Rigsby
2              Jess Sims
3       Matty Maggiacomo
4             Ben Alldis
             ...        
749            Rad Lopez
750         Camila Ramon
751       Alex Toussaint
752       Daniel McKenna
753    Callie Gullickson
Name: Instructor Name, Length: 754, dtype: object

In [None]:
workouts_df['Instructor Name'].fillna('Unknown').astype('category')

0           Olivia Amato
1            Cody Rigsby
2              Jess Sims
3       Matty Maggiacomo
4             Ben Alldis
             ...        
749            Rad Lopez
750         Camila Ramon
751       Alex Toussaint
752       Daniel McKenna
753    Callie Gullickson
Name: Instructor Name, Length: 754, dtype: category
Categories (43, object): ['Aditi Shah', 'Adrian Williams', 'Alex Toussaint', 'Ally Love', ..., 'Sam Yo', 'Selena Samuela', 'Tunde Oyeneyin', 'Unknown']

4. Calculate the memory savings by converting to a categorical type.

In [None]:
workouts_df['Instructor Name'].fillna('Unknown').astype('str').memory_usage()

6160

In [None]:
workouts_df['Instructor Name'].fillna('Unknown').astype('category').memory_usage()

2298

# Chapter 8: Conversion Methods
## Automatic conversion

In [None]:
city_mpg.convert_dtypes()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: Int64

In [None]:
city_mpg.astype('Int16')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: Int16

Converting to Int8 not possible because we have values over 150.

In [None]:
city_mpg.astype('Int8')

TypeError: cannot safely cast non-equivalent int64 to int8

### Limits on integers and floats

In [None]:
np.iinfo('int64')

iinfo(min=-9223372036854775808, max=9223372036854775807, dtype=int64)

In [None]:
np.iinfo('uint8')

iinfo(min=0, max=255, dtype=uint8)

In [None]:
np.finfo('float16')

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

In [None]:
np.finfo('float64')

finfo(resolution=1e-15, min=-1.7976931348623157e+308, max=1.7976931348623157e+308, dtype=float64)

## Memory Usage

In [None]:
city_mpg.nbytes

329152

In [None]:
city_mpg.astype('Int16').nbytes

123432

In [None]:
make = df.make
make.nbytes

329152

nbytes only shows how much memory the Pandas object takes up. To get amount of memory including the strings, we use .memory_usage.

In [None]:
make.memory_usage()

329280

In [None]:
make.memory_usage(deep=True)

2606395

## String and Category Types

In [None]:
city_mpg.astype(str)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: object

Converting to categorical data has large memory savings because pandas stores Python strings when you have string data.

In [None]:
city_mpg.astype('category')

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6, 7, 8, 9, ..., 137, 138, 140, 150]

## Ordered categories

To create ordered categories, define your own CategoricalDtype:


In [None]:
values = pd.Series(sorted(set(city_mpg)))
city_type = pd.CategoricalDtype(categories=values, ordered=True)
city_mpg.astype(city_type)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: category
Categories (105, int64): [6 < 7 < 8 < 9 ... 137 < 138 < 140 < 150]

## Converting to Other Types
Stay away from using .to_numpy and .to_list. It can slow down code significantly.

We can convert a single column series into a dataframe

In [None]:
city_mpg.to_frame()

Unnamed: 0,city08
0,19
1,9
2,23
3,10
4,17
...,...
41139,19
41140,20
41141,18
41142,18


## Exercises

1. Convert a numeric column to a smaller type.

In [None]:
workouts_df.loc[workouts_df['Length (minutes)'] == 'None', 'Length (minutes)'] = 0
workouts_df['Length (minutes)'].astype('uint8')

0      15
1      10
2      20
3      30
4      20
       ..
749    10
750    20
751     5
752     5
753    20
Name: Length (minutes), Length: 754, dtype: uint8

2. Calculate the memory savings by converting to smaller numeric types

In [None]:
workouts_df['Length (minutes)'].memory_usage()

6160

In [None]:
workouts_df.loc[workouts_df['Length (minutes)'] == 'None', 'Length (minutes)'] = 0
workouts_df['Length (minutes)'].astype('uint8').memory_usage()

882

3. Convert a string column into a categorical type.

In [None]:
workouts_df['Instructor Name'].astype('category')

0           Olivia Amato
1            Cody Rigsby
2              Jess Sims
3       Matty Maggiacomo
4             Ben Alldis
             ...        
749            Rad Lopez
750         Camila Ramon
751       Alex Toussaint
752       Daniel McKenna
753    Callie Gullickson
Name: Instructor Name, Length: 754, dtype: category
Categories (42, object): ['Aditi Shah', 'Adrian Williams', 'Alex Toussaint', 'Ally Love', ..., 'Ross Rayburn', 'Sam Yo', 'Selena Samuela', 'Tunde Oyeneyin']

4. Calculate the memory savings by converting to a categorical type.

In [None]:
workouts_df['Instructor Name'].astype('category').memory_usage()

2290

In [None]:
workouts_df['Instructor Name'].memory_usage(deep=True)

51802

# Chapter 9 Manipulation Methods

## 9.1 .apply and .where

*Note: using apply breakd out the dataframe into individual values in the series which makes it slow.*

In [None]:
def gt20(val):
    return val > 20

%timeit city_mpg.apply(gt20)

18.7 ms ± 200 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%timeit city_mpg.gt(20)

146 µs ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


Replace makes in dataset that arent in top 5.

In [None]:
make = df.make
make.value_counts()

Chevrolet                      4003
Ford                           3371
Dodge                          2583
GMC                            2494
Toyota                         2071
                               ... 
Volga Associated Automobile       1
Panos                             1
Mahindra                          1
Excalibur Autos                   1
London Coach Co Inc               1
Name: make, Length: 136, dtype: int64

In [None]:
top5 = make.value_counts().index[:5]
def generalize_make(val):
    if val in top5:
        return val
    return 'Other'
%timeit make.apply(generalize_make)

48.5 ms ± 316 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


.where uses a boolean array to mark where a condition is true

In [None]:
%timeit make.where(make.isin(top5), 'Other')

1.96 ms ± 27.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


The complement of the where method is the mask method. This takes the condition wherever it is False and keeps the original values.
*Note: it is better to use .where and ignore mask since it is the complement.*

In [None]:
%timeit make.mask(~make.isin(top5), 'Other')

2.14 ms ± 36.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 9.3 Missing Data

In [None]:
cyl = df.cylinders
cyl.isna().sum()

206

In [None]:
missing = cyl.isna()

In [None]:
make.loc[missing]

7138     Nissan
7139     Toyota
8143     Toyota
8144       Ford
8146       Ford
          ...  
34563     Tesla
34564     Tesla
34565     Tesla
34566     Tesla
34567     Tesla
Name: make, Length: 206, dtype: object

## 9.4 Filling in missing data

In [None]:
cyl[cyl.isna()]

7138    NaN
7139    NaN
8143    NaN
8144    NaN
8146    NaN
         ..
34563   NaN
34564   NaN
34565   NaN
34566   NaN
34567   NaN
Name: cylinders, Length: 206, dtype: float64

In [None]:
cyl.fillna(0).loc[7136:7141]

7136    6.0
7137    6.0
7138    0.0
7139    0.0
7140    6.0
7141    6.0
Name: cylinders, dtype: float64

## 9.5 Interpolating Data

In [None]:
temp = pd.Series([32, 40, None, 42, 39, 32])
temp

0    32.0
1    40.0
2     NaN
3    42.0
4    39.0
5    32.0
dtype: float64

In [None]:
temp.interpolate()

0    32.0
1    40.0
2    41.0
3    42.0
4    39.0
5    32.0
dtype: float64

## 9.6 Clipping Data

In [None]:
city_mpg.loc[:446]

0      19
1       9
2      23
3      10
4      17
       ..
442    15
443    15
444    15
445    15
446    31
Name: city08, Length: 447, dtype: int64

In [None]:
city_mpg.loc[:446].clip(lower=city_mpg.quantile(0.05), upper=city_mpg.quantile(0.95))

0      19.0
1      11.0
2      23.0
3      11.0
4      17.0
       ... 
442    15.0
443    15.0
444    15.0
445    15.0
446    27.0
Name: city08, Length: 447, dtype: float64

## 9.7 Sorting values

In [None]:
city_mpg.sort_values()

7901       6
34557      6
37161      6
21060      6
35887      6
        ... 
34563    138
34564    140
32599    150
31256    150
33423    150
Name: city08, Length: 41144, dtype: int64

In [None]:
(city_mpg.sort_values() + highway_mpg) / 2

0        22.0
1        11.5
2        28.0
3        11.0
4        20.0
         ... 
41139    22.5
41140    24.0
41141    21.0
41142    21.0
41143    18.5
Length: 41144, dtype: float64

## 9.8 Sorting the index

In [None]:
city_mpg.sort_values().sort_index()

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

## 9.9 Dropping Duplicates

In [None]:
city_mpg.drop_duplicates()

0         19
1          9
2         23
3         10
4         17
        ... 
34364    127
34409    114
34564    140
34565    115
34566    104
Name: city08, Length: 105, dtype: int64

## 9.10 Ranking Data

In [None]:
city_mpg.rank()

0        27060.5
1          235.5
2        35830.0
3          607.5
4        19484.0
          ...   
41139    27060.5
41140    29719.5
41141    23528.0
41142    23528.0
41143    15479.0
Name: city08, Length: 41144, dtype: float64

In [None]:
city_mpg.rank(method='min')

0        25555.0
1          136.0
2        35119.0
3          336.0
4        17467.0
          ...   
41139    25555.0
41140    28567.0
41141    21502.0
41142    21502.0
41143    13492.0
Name: city08, Length: 41144, dtype: float64

In [None]:
city_mpg.rank(method='dense')

0        14.0
1         4.0
2        18.0
3         5.0
4        12.0
         ... 
41139    14.0
41140    15.0
41141    13.0
41142    13.0
41143    11.0
Name: city08, Length: 41144, dtype: float64

## 9.11 Replacing Data

In [None]:
make = df.make
make.replace('Subaru', 'スバル')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4               スバル
            ...    
41139           スバル
41140           スバル
41141           スバル
41142           スバル
41143           スバル
Name: make, Length: 41144, dtype: object

In [None]:
make.replace(r'(Fer)ra(r.*)' , value=r'\2-other-\1', regex=True)

0          Alfa Romeo
1        ri-other-Fer
2               Dodge
3               Dodge
4              Subaru
             ...     
41139          Subaru
41140          Subaru
41141          Subaru
41142          Subaru
41143          Subaru
Name: make, Length: 41144, dtype: object

## 9.12 Binning Data

We can create bin values of equal width.

In [None]:
pd.cut(city_mpg, 10)

0        (5.856, 20.4]
1        (5.856, 20.4]
2         (20.4, 34.8]
3        (5.856, 20.4]
4        (5.856, 20.4]
             ...      
41139    (5.856, 20.4]
41140    (5.856, 20.4]
41141    (5.856, 20.4]
41142    (5.856, 20.4]
41143    (5.856, 20.4]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.856, 20.4] < (20.4, 34.8] < (34.8, 49.2] < (49.2, 63.6] ... (92.4, 106.8] < (106.8, 121.2] < (121.2, 135.6] < (135.6, 150.0]]

If you have specific sizes for bin edges, you can specify those. In the following example five
bins are created (so you need to provide six edges)

In [None]:
pd.cut(city_mpg, [0, 10, 20, 40, 70, 150])

0        (10, 20]
1         (0, 10]
2        (20, 40]
3         (0, 10]
4        (10, 20]
           ...   
41139    (10, 20]
41140    (10, 20]
41141    (10, 20]
41142    (10, 20]
41143    (10, 20]
Name: city08, Length: 41144, dtype: category
Categories (5, interval[int64, right]): [(0, 10] < (10, 20] < (20, 40] < (40, 70] < (70, 100]]

We can also bin data with quantities. The example below creates 10 bins with approximately the same number of entries in each bin.

In [None]:
pd.qcut(city_mpg, 10)

0         (18.0, 20.0]
1        (5.999, 13.0]
2         (21.0, 24.0]
3        (5.999, 13.0]
4         (16.0, 17.0]
             ...      
41139     (18.0, 20.0]
41140     (18.0, 20.0]
41141     (17.0, 18.0]
41142     (17.0, 18.0]
41143     (15.0, 16.0]
Name: city08, Length: 41144, dtype: category
Categories (10, interval[float64, right]): [(5.999, 13.0] < (13.0, 14.0] < (14.0, 15.0] < (15.0, 16.0] ... (18.0, 20.0] < (20.0, 21.0] < (21.0, 24.0] < (24.0, 150.0]]

We can also label the categorical intervals.

In [None]:
pd.qcut(city_mpg, 10, labels=list(range(1, 11)))

0        7
1        1
2        9
3        1
4        5
        ..
41139    7
41140    7
41141    6
41142    6
41143    4
Name: city08, Length: 41144, dtype: category
Categories (10, int64): [1 < 2 < 3 < 4 ... 7 < 8 < 9 < 10]

## 9.14 Exercises
1. Create a series from a numeric column that has the value of 'high' if it is equal to or above
the mean and 'low' if it is below the mean using .apply.

In [None]:
workouts_df['Total Output'].apply(lambda x: 'Low' if x < workouts_df['Total Output'].mean() else 'High')

0      High
1      High
2      High
3      High
4      High
       ... 
749    High
750    High
751     Low
752    High
753    High
Name: Total Output, Length: 754, dtype: object

2. Create a series from a numeric column that has the value of 'high' if it is equal to or above the mean and 'low' if it is below the mean using np.select.

In [None]:
condlist = [workouts_df['Total Output'] < workouts_df['Total Output'].mean(), workouts_df['Total Output'] > workouts_df['Total Output'].mean()]
choicelist = ['Low', 'High']
np.select(condlist, choicelist)

array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', 'Low',
       'Low', 'High', '0', 'Low', 'Low', '0', '0', 'High', 'Low', '0',
       '0', '0', 'Low', 'Low', '0', 'Low', 'Low', 'Low', '0', '0', '0',
       'High', 'Low', 'Low', '0', '0', '0', '0', 'High', 'Low', '0',
       'High', 'Low', '0', '0', '0', 'Low', 'Low', 'Low', '0', 'Low',
       'High', 'Low', '0', '0', '0', '0', 'High', 'Low', '0', '0', '0',
       'High', 'Low', '0', 'Low', 'High', 'Low', '0', '0', '0', 'Low',
       'High', 'Low', '0', 'High', 'Low', '0', 'High', 'Low', '0', 'Low',
       'Low', 'High', 'Low', 'Low', '0', '0', '0', 'Low', 'High', 'Low',
       '0', '0', '0', 'High', 'Low', 'Low', '0', '0', '0', '0', 'High',
       'Low', '0', 'High', 'Low', '0', '0', '0', 'High', 'Low', '0',
       'High', 'Low', '0', '0', '0', '0', '0', '0', '0', '0', '0', 'High',
       'Low', '0', 'High', 'Low', '0', 'Low', 'Low', 'High', 'Low', '0',
       '0', 'High', 'Low', '0', 'High', 'Low', '0', 'High', 'Low', '0'

3. Time the differences between the previous two solutions to see which is faster.

In [None]:
%timeit np.select(condlist, choicelist)

23.9 µs ± 129 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [None]:
%timeit workouts_df['Total Output'].apply(lambda x: 'Low' if x < workouts_df['Total Output'].mean() else 'High')

15.7 ms ± 151 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


4. Replace the missing values of a numeric series with the median value.


In [None]:
workouts_df['Total Output'].replace(to_replace=np.nan, value=workouts_df['Total Output'].median())

0      145.0
1      145.0
2      145.0
3      145.0
4      145.0
       ...  
749    145.0
750    220.0
751     19.0
752    145.0
753    145.0
Name: Total Output, Length: 754, dtype: float64

5. Clip the values of a numeric series to between to 10th and 90th percentiles.

In [None]:
workouts_df['Calories Burned'].clip(lower=workouts_df['Calories Burned'].quantile(0.05), upper=workouts_df['Calories Burned'].quantile(0.95))

0       28.0
1       28.0
2      100.0
3      151.0
4      100.0
       ...  
749    105.0
750    293.0
751     47.0
752     30.0
753    202.0
Name: Calories Burned, Length: 754, dtype: float64

In [None]:
workouts_df['Calories Burned']

0       19.0
1        2.0
2      100.0
3      151.0
4      100.0
       ...  
749    105.0
750    293.0
751     47.0
752     30.0
753    202.0
Name: Calories Burned, Length: 754, dtype: float64

6. Using a categorical column, replace any value that is not in the top 5 most frequent values
with 'Other'.

In [None]:
workouts_df['Instructor Name'].astype('category').replace(to_replace=workouts_df['Instructor Name'].value_counts().index.tolist()[:5], value='Other')

0                  Other
1            Cody Rigsby
2              Jess Sims
3                  Other
4                  Other
             ...        
749                Other
750         Camila Ramon
751       Alex Toussaint
752       Daniel McKenna
753    Callie Gullickson
Name: Instructor Name, Length: 754, dtype: category
Categories (38, object): ['Aditi Shah', 'Adrian Williams', 'Alex Toussaint', 'Ally Love', ..., 'Ross Rayburn', 'Sam Yo', 'Selena Samuela', 'Tunde Oyeneyin']

7. Using a categorical column, replace any value that is not in the top 10 most frequent values
with 'Other'.

In [None]:
workouts_df['Instructor Name'].astype('category').replace(to_replace=workouts_df['Instructor Name'].value_counts().index.tolist()[:10], value='Other')

0                  Other
1            Cody Rigsby
2              Jess Sims
3                  Other
4                  Other
             ...        
749                Other
750         Camila Ramon
751       Alex Toussaint
752       Daniel McKenna
753    Callie Gullickson
Name: Instructor Name, Length: 754, dtype: category
Categories (33, object): ['Aditi Shah', 'Adrian Williams', 'Alex Toussaint', 'Andy Speer', ..., 'Rebecca Kennedy', 'Robin Arzon', 'Ross Rayburn', 'Tunde Oyeneyin']

8. Make a function that takes a categorical series and a number (n) and returns a replace series that replaces any value that is not in the top n most frequent values with 'Other'.


In [None]:
def replace_not_top_n(series: pd.Series, n: int) -> pd.Series:
    return series.astype('category').replace(to_replace=series.value_counts().index.tolist()[:n], value='Other')

replace_not_top_n(workouts_df['Instructor Name'], 10)

0                  Other
1            Cody Rigsby
2              Jess Sims
3                  Other
4                  Other
             ...        
749                Other
750         Camila Ramon
751       Alex Toussaint
752       Daniel McKenna
753    Callie Gullickson
Name: Instructor Name, Length: 754, dtype: category
Categories (33, object): ['Aditi Shah', 'Adrian Williams', 'Alex Toussaint', 'Andy Speer', ..., 'Rebecca Kennedy', 'Robin Arzon', 'Ross Rayburn', 'Tunde Oyeneyin']

9. Using a numeric column, bin it into 10 groups that have the same width.

In [None]:
pd.cut(workouts_df['Calories Burned'], 10)

0      (-0.989, 98.9]
1      (-0.989, 98.9]
2       (98.9, 197.8]
3       (98.9, 197.8]
4       (98.9, 197.8]
            ...      
749     (98.9, 197.8]
750    (197.8, 296.7]
751    (-0.989, 98.9]
752    (-0.989, 98.9]
753    (197.8, 296.7]
Name: Calories Burned, Length: 754, dtype: category
Categories (10, interval[float64, right]): [(-0.989, 98.9] < (98.9, 197.8] < (197.8, 296.7] < (296.7, 395.6] ... (593.4, 692.3] < (692.3, 791.2] < (791.2, 890.1] < (890.1, 989.0]]

10. Using a numeric column, bin it into 10 groups that have equal sized bins.

In [None]:
pd.qcut(workouts_df['Calories Burned'], 10)

0      (-0.001, 34.0]
1      (-0.001, 34.0]
2       (81.0, 122.0]
3      (122.0, 190.0]
4       (81.0, 122.0]
            ...      
749     (81.0, 122.0]
750    (247.2, 324.8]
751      (34.0, 53.0]
752    (-0.001, 34.0]
753    (190.0, 247.2]
Name: Calories Burned, Length: 754, dtype: category
Categories (10, interval[float64, right]): [(-0.001, 34.0] < (34.0, 53.0] < (53.0, 60.0] < (60.0, 81.0] ... (190.0, 247.2] < (247.2, 324.8] < (324.8, 399.4] < (399.4, 989.0]]

# Chapter 10: Indexing Operations

We can pass in a dictionary to map the previous index label to the new label:

In [None]:
city2 = city_mpg.rename(make.to_dict())
city2

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
              ..
Subaru        19
Subaru        20
Subaru        18
Subaru        18
Subaru        16
Name: city08, Length: 41144, dtype: int64

We can view the index

In [None]:
city2.index

Index(['Alfa Romeo', 'Ferrari', 'Dodge', 'Dodge', 'Subaru', 'Subaru', 'Subaru',
       'Toyota', 'Toyota', 'Toyota',
       ...
       'Saab', 'Saturn', 'Saturn', 'Saturn', 'Saturn', 'Subaru', 'Subaru',
       'Subaru', 'Subaru', 'Subaru'],
      dtype='object', length=41144)

We can also pass in a Series.

In [None]:
city2 = city_mpg.rename(make)
city2

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
              ..
Subaru        19
Subaru        20
Subaru        18
Subaru        18
Subaru        16
Name: city08, Length: 41144, dtype: int64

Careful though, passing in a scalar value into .rename will change the .name attribute of the series.

In [None]:
city2.rename('citympg')

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
              ..
Subaru        19
Subaru        20
Subaru        18
Subaru        18
Subaru        16
Name: citympg, Length: 41144, dtype: int64

## 10.2 Resetting the Index

This sets the index to increasing and moved the current index to its own column.

In [None]:
city2.reset_index()

Unnamed: 0,index,city08
0,Alfa Romeo,19
1,Ferrari,9
2,Dodge,23
3,Dodge,10
4,Subaru,17
...,...,...
41139,Subaru,19
41140,Subaru,20
41141,Subaru,18
41142,Subaru,18


In [None]:
city2.reset_index(drop=True)

0        19
1         9
2        23
3        10
4        17
         ..
41139    19
41140    20
41141    18
41142    18
41143    16
Name: city08, Length: 41144, dtype: int64

## 10.3 The .loc Attribute

.loc can be used to pull out data using indexing.
The following can be passed into .loc:
* A scalar value of one of the index labels
* A list of index labels
* A slice of labels (closed intervals so it includes the stop values)
* An index
* A boolean array (same index labels as the series but with True or False values)
* A function that accepts a series and returns one of the above

One caveat we need to watch out for with .loc is that it can return a scalar value if only one entry is found. Otherwise, it returns a Series.

In [None]:
city2.loc['Subaru']

Subaru    17
Subaru    21
Subaru    22
Subaru    19
Subaru    20
          ..
Subaru    19
Subaru    20
Subaru    18
Subaru    18
Subaru    16
Name: city08, Length: 885, dtype: int64

In [None]:
city2.loc[['Fisker']]

Fisker    20
Name: city08, dtype: int64

In [None]:
city2.loc[['Ferrari', 'Lamborghini']]

Ferrari         9
Ferrari        12
Ferrari        11
Ferrari        10
Ferrari        11
               ..
Lamborghini     6
Lamborghini     8
Lamborghini     8
Lamborghini     8
Lamborghini     8
Name: city08, Length: 357, dtype: int64

To mitigate this, we can pass in a list rather than a scalar to gaurantee that a series will be returned.

In [None]:
city2.loc[['Fisker']]

Fisker    20
Name: city08, dtype: int64

In [None]:
city2.loc[['Fisker', 'Ferrari', 'Lamborghini']]

Fisker         20
Ferrari         9
Ferrari        12
Ferrari        11
Ferrari        10
               ..
Lamborghini     6
Lamborghini     8
Lamborghini     8
Lamborghini     8
Lamborghini     8
Name: city08, Length: 358, dtype: int64

We can slice with string values using string values but we need to sort first. Otherwise, it will throw a keyerror.

In [None]:
city2.loc['Ferrari': 'Lamborghini']

KeyError: "Cannot get left slice bound for non-unique label: 'Ferrari'"

In [None]:
city2.sort_index().loc['Ferrari': 'Lamborghini']

Ferrari        10
Ferrari        13
Ferrari        13
Ferrari         9
Ferrari        10
               ..
Lamborghini    12
Lamborghini     9
Lamborghini     8
Lamborghini    13
Lamborghini     8
Name: city08, Length: 11210, dtype: int64

We can also slice based on strings that are not labels. We can can slice on all labels that start with F and go up to index labels G, H, I and up to J.

In [None]:
city2.sort_index().loc['F':'J']

Federal Coach    15
Federal Coach    13
Federal Coach    13
Federal Coach    14
Federal Coach    13
                 ..
Isuzu            15
Isuzu            15
Isuzu            15
Isuzu            27
Isuzu            18
Name: city08, Length: 9040, dtype: int64

We can create an Index to align a series to a new index.

In [None]:
idx = pd.Index(['Dodge'])
city2.loc[idx]

Dodge    23
Dodge    10
Dodge    12
Dodge    11
Dodge    11
         ..
Dodge    18
Dodge    17
Dodge    14
Dodge    14
Dodge    11
Name: city08, Length: 2583, dtype: int64

If we duplicate 'Dodge'in the Index, then we get twice as many values.

In [None]:
idx = pd.Index(['Dodge', 'Dodge'])
city2.loc[idx]

Dodge    23
Dodge    10
Dodge    12
Dodge    11
Dodge    11
         ..
Dodge    18
Dodge    17
Dodge    14
Dodge    14
Dodge    11
Name: city08, Length: 5166, dtype: int64

We can also pass in a boolean array to .loc and it will return only the values where the boolean array was true.

In [None]:
mask = city2 > 50
mask

Alfa Romeo    False
Ferrari       False
Dodge         False
Dodge         False
Subaru        False
              ...  
Subaru        False
Subaru        False
Subaru        False
Subaru        False
Subaru        False
Name: city08, Length: 41144, dtype: bool

In [None]:
city2.loc[mask]

Nissan     81
Toyota     81
Toyota     81
Ford       74
Nissan     84
         ... 
Tesla     140
Tesla     115
Tesla     104
Tesla      98
Toyota     55
Name: city08, Length: 236, dtype: int64

We can pass in functions to .loc which is useful when chaining operations.

In [None]:
cost = pd.Series([1.00, 2.25, 3.99, .99, 2.79],
        index=['Gum', 'Cookie', 'Melon', 'Roll', 'Carrots'])
inflation = 1.10
(cost.mul(inflation).loc[lambda s_: s_ > 3])

Melon      4.389
Carrots    3.069
dtype: float64

Below, we get a different answer because we calculate the boolean array before taking into account the inflation.

In [None]:
code = pd.Series([1.00, 2.25, 3.99, .99, 2.79],
        index=['Gum', 'Cookie', 'Melon', 'Roll', 'Carrots'])
inflation = 1.10
mask = cost > 3
(cost.mul(inflation).loc[mask])

Melon    4.389
dtype: float64

## 10.4 The .iloc Attribute

iloc is used when pulling out items by the index position. It supports indexing with the following:
* A scalar index position (an integer)
* A list of index positions
* A slice of positions (half open interval so it does not include stop value)
* A NumPy array (or Python list) of boolean values
* A function that accepts a series and returns one of the above

In [None]:
city2.iloc[0]

19

In [None]:
city2.iloc[-1]

16

We can also return a series object with a list of indices.

In [None]:
city2.iloc[[0, 1, -1]]

Alfa Romeo    19
Ferrari        9
Subaru        16
Name: city08, dtype: int64

iloc also works with slices. They work the same way as they do in Python lists and follow the half-open interval. They include the first index and go up to but do not include the last index.

In [None]:
city2.iloc[0:5]

Alfa Romeo    19
Ferrari        9
Dodge         23
Dodge         10
Subaru        17
Name: city08, dtype: int64

We can get the last 8 values using negative indexing.

In [None]:
city2.iloc[-8:]

Saturn    21
Saturn    24
Saturn    21
Subaru    19
Subaru    20
Subaru    18
Subaru    18
Subaru    16
Name: city08, dtype: int64

We can also pass in a list of booleans but if we pass in a pandas series with booleans it will fail.

In [None]:
mask = city2 > 50
city2.iloc[mask]

ValueError: iLocation based boolean indexing cannot use an indexable as a mask

We can convert the mask to a NumPy array or python list and iloc will work.

In [None]:
city2.iloc[mask.to_numpy()]

Nissan     81
Toyota     81
Toyota     81
Ford       74
Nissan     84
         ... 
Tesla     140
Tesla     115
Tesla     104
Tesla      98
Toyota     55
Name: city08, Length: 236, dtype: int64

In [None]:
city2.iloc[list(mask)]

Nissan     81
Toyota     81
Toyota     81
Ford       74
Nissan     84
         ... 
Tesla     140
Tesla     115
Tesla     104
Tesla      98
Toyota     55
Name: city08, Length: 236, dtype: int64

## 10.5 Heads and Tails

The .head and .tail methods are useful for pulling out values at the start or end of the series.

In [None]:
city2.head(3)

Alfa Romeo    19
Ferrari        9
Dodge         23
Name: city08, dtype: int64

In [None]:
city2.tail(3)

Subaru    18
Subaru    18
Subaru    16
Name: city08, dtype: int64

## 10.6 Sampling

We can pull out random samples of the data. Below we get 6 random samples of the data and pass in 32 as the state. This state allows us to regenerate the same random sample.

In [None]:
city2.sample(6, random_state=32)

Buick            15
Mercedes-Benz    22
Volkswagen       21
Buick            21
Mercedes-Benz    20
Mazda            28
Name: city08, dtype: int64

## 10.7 Filtering Index Values

The .filter method filters index labels by exact match, substring or regex. Exact match with items fails with duplicate indices.

In [None]:
city2.filter(items=['Ford', 'Subary'])

ValueError: cannot reindex from a duplicate axis

We can do substring matches

In [None]:
city2.filter(like='rd')

Ford    18
Ford    16
Ford    17
Ford    17
Ford    15
        ..
Ford    26
Ford    19
Ford    21
Ford    18
Ford    19
Name: city08, Length: 3371, dtype: int64

We can also specify a regular expression to match against the index values.

In [None]:
city2.filter(regex='(Ford)|(Subary)')

Ford    18
Ford    16
Ford    17
Ford    17
Ford    15
        ..
Ford    26
Ford    19
Ford    21
Ford    18
Ford    19
Name: city08, Length: 3371, dtype: int64

## 10.8 Reindexing

In [108]:
city2.reindex(['Missing', 'Ford'])

ValueError: cannot reindex from a duplicate axis

Reindexing allows pulling out values by index label.

In [16]:
city_mpg.reindex([0 ,0 , 10 , 20 , 2_000_000 ])

0          19.0
0          19.0
10         23.0
20         14.0
2000000     NaN
Name: city08, dtype: float64

In [18]:
s1 = pd . Series ([10 ,20 ,30] , index =['a', 'b', 'c'])
s2 = pd . Series ([15 ,25 ,35] , index =['b', 'c', 'd'])

In [19]:
s2

b    15
c    25
d    35
dtype: int64

In [20]:
s2.reindex(s1.index)

a     NaN
b    15.0
c    25.0
dtype: float64

In [24]:
workouts = workouts_df.rename(workouts_df['Workout Timestamp'].to_dict())

RangeIndex(start=0, stop=754, step=1)