# How do I make my pandas DataFrame smaller and faster?


In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype

In [4]:
# Load the drinks-by-country dataset. The short link is requested over HTTPS
# so the download is not sent over an unencrypted connection.
drinks = pd.read_csv("https://bit.ly/drinksbycountry")

In [7]:
# memory_usage='deep' asks pandas to inspect the object (string) columns so
# the reported total reflects the memory their values actually use.
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [9]:
# Per-column memory footprint in bytes; deep=True also measures the contents
# of the object columns.
drinks.memory_usage(deep=True)

Index                             132
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [10]:
# Total footprint of the DataFrame in bytes (the Index entry is included).
drinks.memory_usage(deep=True).sum()

31228

In [11]:
# List the distinct continent names to see how few values the column holds.
sorted(drinks['continent'].unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [13]:
# Before any conversion the column is stored with dtype object.
drinks['continent'].head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: object

In [14]:
# Store continent as a categorical: each row then holds a small integer code
# that points into a lookup table of the distinct names.
drinks['continent'] = drinks['continent'].astype('category')

In [17]:
# Confirm which columns changed dtype after the conversion.
drinks.dtypes

country                           object
beer_servings                      int64
spirit_servings                    int64
wine_servings                      int64
total_litres_of_pure_alcohol     float64
continent                       category
dtype: object

In [18]:
# Peek at the integer codes stored per row; each code is the position of
# that row's continent in the category list.
drinks.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [21]:
# Re-measure per-column memory; continent is the column to compare against
# the earlier figures.
drinks.memory_usage(deep=True)

Index                             132
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         756
dtype: int64

In [22]:
# Overall total in bytes after converting continent.
drinks.memory_usage(deep=True).sum()

19652

In [23]:
# Try the same conversion on country. Every country name is distinct, so the
# category lookup table is as long as the column itself and the conversion
# does not save memory here.
drinks['country'] = drinks['country'].astype('category')

In [24]:
# Check what converting country did to the per-column footprint.
drinks.memory_usage(deep=True)

Index                             132
country                         17142
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                         756
dtype: int64

In [25]:
# Number of distinct countries. dropna=False keeps a missing value in the
# tally, which is what counting the entries of unique() does.
drinks['country'].nunique(dropna=False)

193

In [26]:
# Number of distinct continents. dropna=False keeps a missing value in the
# tally, which is what counting the entries of unique() does.
drinks['continent'].nunique(dropna=False)

6

In [35]:
# Small example frame: four items, each with a textual quality rating.
df = pd.DataFrame(
    data={
        "ID": [100, 101, 102, 103],
        "Quality": ['good', 'very good', 'good', 'excellent'],
    }
)

In [36]:
# Show the example frame.
df

Unnamed: 0,ID,Quality
0,100,good
1,101,very good
2,102,good
3,103,excellent


In [37]:
# Quality holds plain strings at this point, so the sort is alphabetical
# ('excellent' < 'good' < 'very good') rather than by how good the rating is.
df.sort_values('Quality')

Unnamed: 0,ID,Quality
3,103,excellent
0,100,good
2,102,good
1,101,very good


In [39]:
# Declare the quality levels from worst to best. ordered=True gives the
# categories a logical order ('good' < 'very good' < 'excellent') that
# sorting and comparisons use instead of alphabetical order.
# CategoricalDtype comes from the notebook's import cell.
cat_types = CategoricalDtype(
    categories=['good', 'very good', 'excellent'],
    ordered=True,
)
df['Quality'] = df['Quality'].astype(cat_types)

In [40]:
# The frame displays the same as before: the conversion changes the dtype of
# Quality, not the values shown.
df

Unnamed: 0,ID,Quality
0,100,good
1,101,very good
2,102,good
3,103,excellent


In [43]:
# Sorting now follows the declared category order, best rating first.
df['Quality'].sort_values(ascending=False)

3    excellent
1    very good
0         good
2         good
Name: Quality, dtype: category
Categories (3, object): ['good' < 'very good' < 'excellent']

In [48]:
# Show the frame ordered from best to worst rating with a fresh 0..n-1 index.
# reset_index must NOT be called with inplace=True here: it would modify the
# temporary frame returned by sort_values and return None, so the cell would
# display nothing and change nothing. Without inplace the sorted frame is
# returned and rendered; df itself is left untouched. The previous row labels
# are kept in an 'index' column (pass drop=True to discard them).
df.sort_values("Quality", ascending=False).reset_index()

In [52]:
# Ordered categories support comparisons: keep the rows rated above 'good'.
above_good = df['Quality'] > 'good'
df.loc[above_good]

Unnamed: 0,ID,Quality
1,101,very good
3,103,excellent
