## Pandas Sort: Your Guide to Sorting Data in Python

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Columns to load from the vehicles CSV (used as `usecols` when reading the file).
# The pasted REPL continuation prompts ("... ") were removed so the cell is
# valid Python on its own, not only under IPython's prompt stripping.
column_subset = [
    "id",
    "make",
    "model",
    "year",
    "cylinders",
    "fuelType",
    "trany",
    "mpgData",
    "city08",
    "highway08",
]

In [3]:
# Read only the first 100 rows and only the columns listed in `column_subset`.
# The pasted REPL continuation prompts ("... ") were removed so the cell is
# valid Python on its own.
df = pd.read_csv(
    "data/vehicles.csv",
    usecols=column_subset,
    nrows=100,
)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,city08,cylinders,fuelType,highway08,id,make,model,mpgData,trany,year
0,19,4,Regular,25,1,Alfa Romeo,Spider Veloce 2000,Y,Manual 5-spd,1985
1,9,12,Regular,14,10,Ferrari,Testarossa,N,Manual 5-spd,1985
2,23,4,Regular,33,100,Dodge,Charger,Y,Manual 5-spd,1985
3,10,8,Regular,12,1000,Dodge,B150/B250 Wagon 2WD,N,Automatic 3-spd,1985
4,17,4,Premium,23,10000,Subaru,Legacy AWD Turbo,N,Manual 5-spd,1993


In [9]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,city08,cylinders,highway08,id,year
count,100.0,100.0,100.0,100.0,100.0
mean,16.79,5.94,23.47,8929.75,1992.04
std,3.373381,1.644212,4.560381,3034.851599,2.612789
min,9.0,4.0,10.0,1.0,1985.0
25%,15.0,4.0,22.0,10012.75,1993.0
50%,17.0,6.0,24.0,10037.5,1993.0
75%,18.0,6.0,26.0,10062.25,1993.0
max,23.0,12.0,33.0,10087.0,1993.0


In [10]:
# Inspect column dtypes and memory usage before any dtype conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   city08     100 non-null    int64 
 1   cylinders  100 non-null    int64 
 2   fuelType   100 non-null    object
 3   highway08  100 non-null    int64 
 4   id         100 non-null    int64 
 5   make       100 non-null    object
 6   model      100 non-null    object
 7   mpgData    100 non-null    object
 8   trany      100 non-null    object
 9   year       100 non-null    int64 
dtypes: int64(5), object(5)
memory usage: 7.9+ KB


In [13]:
# Downcast the small-valued integer columns to unsigned types to reduce
# the frame's memory footprint.
df = df.astype(
    dtype=dict(
        city08='uint8',
        cylinders='uint8',
        highway08='uint8',
        id='uint16',
    )
)

In [14]:
# Re-inspect dtypes and memory usage after the integer downcast.
# The stray positional `0` was dropped: it was bound to `verbose`, where it is
# neither truthy nor `False`, so pandas fell through to its default behaviour.
# Calling `info()` with no arguments states that intent directly.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   city08     100 non-null    uint8 
 1   cylinders  100 non-null    uint8 
 2   fuelType   100 non-null    object
 3   highway08  100 non-null    uint8 
 4   id         100 non-null    uint16
 5   make       100 non-null    object
 6   model      100 non-null    object
 7   mpgData    100 non-null    object
 8   trany      100 non-null    object
 9   year       100 non-null    int64 
dtypes: int64(1), object(5), uint16(1), uint8(3)
memory usage: 5.3+ KB


In [20]:
# List the distinct fuel types present in the loaded rows.
df['fuelType'].unique()

array(['Regular', 'Premium'], dtype=object)

In [22]:
# Shrink `year` to an unsigned 16-bit integer and store the two
# low-cardinality string columns as categoricals.
df = df.astype(
    dtype=dict(
        year='uint16',
        mpgData='category',
        fuelType='category',
    )
)

In [23]:
# Re-check dtypes and memory usage after the category / uint16 conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   city08     100 non-null    uint8   
 1   cylinders  100 non-null    uint8   
 2   fuelType   100 non-null    category
 3   highway08  100 non-null    uint8   
 4   id         100 non-null    uint16  
 5   make       100 non-null    object  
 6   model      100 non-null    object  
 7   mpgData    100 non-null    category
 8   trany      100 non-null    object  
 9   year       100 non-null    uint16  
dtypes: category(2), object(3), uint16(2), uint8(3)
memory usage: 3.6+ KB


In [24]:
# Order rows by city fuel economy, highest values first.
df.sort_values(by='city08', ascending=False)

Unnamed: 0,city08,cylinders,fuelType,highway08,id,make,model,mpgData,trany,year
9,23,4,Regular,30,10005,Toyota,Corolla,Y,Automatic 4-spd,1993
2,23,4,Regular,33,100,Dodge,Charger,Y,Manual 5-spd,1985
7,23,4,Regular,26,10003,Toyota,Corolla,Y,Automatic 3-spd,1993
8,23,4,Regular,31,10004,Toyota,Corolla,Y,Manual 5-spd,1993
76,23,4,Regular,31,10066,Mazda,626,Y,Manual 5-spd,1993
...,...,...,...,...,...,...,...,...,...,...
58,10,8,Regular,11,1005,Dodge,B350 Wagon 2WD,N,Automatic 3-spd,1985
80,9,8,Regular,10,1007,Dodge,B350 Wagon 2WD,N,Automatic 3-spd,1985
1,9,12,Regular,14,10,Ferrari,Testarossa,N,Manual 5-spd,1985
47,9,8,Regular,11,1004,Dodge,B150/B250 Wagon 2WD,N,Automatic 3-spd,1985


In [37]:
# Group the make/model pairs by manufacturer, then pull out one group.
cars = df.loc[:, ['make', 'model']].groupby(by='make')
cars.get_group('Audi')

Unnamed: 0,make,model
18,Audi,100
19,Audi,100


In [33]:
# Display the GroupBy object itself; its repr shows only the type and address,
# not the grouped rows.
# NOTE(review): this cell's execution count (33) is lower than that of the
# cell defining `cars` (37) — confirm it still works under Restart & Run All.
cars

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000013B8EB00400>

In [38]:
# Build a copy of the frame indexed by a (make, model) MultiIndex.
# The pasted REPL continuation prompt ("... ") was removed so the cell is
# valid Python on its own.
assigned_index_df = df.set_index(["make", "model"])

In [39]:
# Show the frame now indexed by make and model.
assigned_index_df

Unnamed: 0_level_0,Unnamed: 1_level_0,city08,cylinders,fuelType,highway08,id,mpgData,trany,year
make,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alfa Romeo,Spider Veloce 2000,19,4,Regular,25,1,Y,Manual 5-spd,1985
Ferrari,Testarossa,9,12,Regular,14,10,N,Manual 5-spd,1985
Dodge,Charger,23,4,Regular,33,100,Y,Manual 5-spd,1985
Dodge,B150/B250 Wagon 2WD,10,8,Regular,12,1000,N,Automatic 3-spd,1985
Subaru,Legacy AWD Turbo,17,4,Premium,23,10000,N,Manual 5-spd,1993
...,...,...,...,...,...,...,...,...,...
Pontiac,Grand Prix,17,6,Regular,25,10083,Y,Automatic 3-spd,1993
Pontiac,Grand Prix,17,6,Regular,27,10084,N,Automatic 4-spd,1993
Pontiac,Grand Prix,15,6,Regular,24,10085,N,Automatic 4-spd,1993
Pontiac,Grand Prix,15,6,Regular,24,10086,N,Manual 5-spd,1993


In [43]:
# Sort the column labels in reverse alphabetical order (rows are unchanged).
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,year,trany,mpgData,model,make,id,highway08,fuelType,cylinders,city08
0,1985,Manual 5-spd,Y,Spider Veloce 2000,Alfa Romeo,1,25,Regular,4,19
1,1985,Manual 5-spd,N,Testarossa,Ferrari,10,14,Regular,12,9
2,1985,Manual 5-spd,Y,Charger,Dodge,100,33,Regular,4,23
3,1985,Automatic 3-spd,N,B150/B250 Wagon 2WD,Dodge,1000,12,Regular,8,10
4,1993,Manual 5-spd,N,Legacy AWD Turbo,Subaru,10000,23,Premium,4,17
...,...,...,...,...,...,...,...,...,...,...
95,1993,Automatic 3-spd,Y,Grand Prix,Pontiac,10083,25,Regular,6,17
96,1993,Automatic 4-spd,N,Grand Prix,Pontiac,10084,27,Regular,6,17
97,1993,Automatic 4-spd,N,Grand Prix,Pontiac,10085,24,Regular,6,15
98,1993,Manual 5-spd,N,Grand Prix,Pontiac,10086,24,Regular,6,15


In [44]:
# Sort the rows by index label, largest label first.
df.sort_index(axis='index', ascending=False)

Unnamed: 0,city08,cylinders,fuelType,highway08,id,make,model,mpgData,trany,year
99,9,8,Premium,13,10087,Rolls-Royce,Brooklands/Brklnds L,N,Automatic 4-spd,1993
98,15,6,Regular,24,10086,Pontiac,Grand Prix,N,Manual 5-spd,1993
97,15,6,Regular,24,10085,Pontiac,Grand Prix,N,Automatic 4-spd,1993
96,17,6,Regular,27,10084,Pontiac,Grand Prix,N,Automatic 4-spd,1993
95,17,6,Regular,25,10083,Pontiac,Grand Prix,Y,Automatic 3-spd,1993
...,...,...,...,...,...,...,...,...,...,...
4,17,4,Premium,23,10000,Subaru,Legacy AWD Turbo,N,Manual 5-spd,1993
3,10,8,Regular,12,1000,Dodge,B150/B250 Wagon 2WD,N,Automatic 3-spd,1985
2,23,4,Regular,33,100,Dodge,Charger,Y,Manual 5-spd,1985
1,9,12,Regular,14,10,Ferrari,Testarossa,N,Manual 5-spd,1985


In [45]:
# Build a column with missing values to sort: 'Y' becomes True and every
# other value becomes NaN.
test_col = df['mpgData'].apply(
    lambda flag: np.nan if flag != 'Y' else True
)

In [46]:
# Show the derived True/NaN column.
test_col

0     True
1      NaN
2     True
3      NaN
4      NaN
      ... 
95    True
96     NaN
97     NaN
98     NaN
99     NaN
Name: mpgData, Length: 100, dtype: object

In [50]:
# Sort ascending and place the missing values at the top of the result.
test_col.sort_values(ascending=True, na_position='first')

1      NaN
3      NaN
4      NaN
5      NaN
11     NaN
      ... 
32    True
33    True
37    True
85    True
95    True
Name: mpgData, Length: 100, dtype: object