In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def get_dataset(size):
    """Create fake dataset"""
    data = pd.DataFrame()
    data['model'] = np.random.choice(
    ['m1', 'm2', 'm3', 'm4'], size)
    data['fuel'] = np.random.choice(['petrol', 'diesel', 'gas'], size)
    data['production_date'] = np.random.randint(1990, 2024, size)
    data['transmission'] = np.random.choice(
    ['mechanical', 'automatic', 'robotic'], size)
    data['engine_power'] = np.random.randint(129, 609, size)
    data['price'] = np.random.uniform(60000., 12460000, size)
    data['count'] = np.random.randint(1, 30, size)
    return data

**Изучаем эффективное выделение памяти в Pandas**
1. int8: -128 to 127
2. int16: -32768 to 32767
3. int32: -2147483648 to 2147483647
4. int64: -9223372036854775808 to 9223372036854775807

In [3]:
SIZE = 1_000_000
df = get_dataset(SIZE)

In [4]:
df.head()

Unnamed: 0,model,fuel,production_date,transmission,engine_power,price,count
0,m4,gas,2001,automatic,216,5127199.0,1
1,m2,diesel,2008,robotic,266,6275512.0,17
2,m1,gas,2020,automatic,573,5018483.0,14
3,m1,gas,2003,automatic,510,618208.7,19
4,m2,gas,1993,robotic,518,2326860.0,20


In [5]:
df_start = df.copy()

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   model            1000000 non-null  object 
 1   fuel             1000000 non-null  object 
 2   production_date  1000000 non-null  int32  
 3   transmission     1000000 non-null  object 
 4   engine_power     1000000 non-null  int32  
 5   price            1000000 non-null  float64
 6   count            1000000 non-null  int32  
dtypes: float64(1), int32(3), object(3)
memory usage: 42.0+ MB


In [10]:
df.memory_usage()

Index                  132
model              8000000
fuel               8000000
production_date    4000000
transmission       8000000
engine_power       4000000
price              8000000
count              4000000
dtype: int64

In [11]:
df.memory_usage().sum()

44000132

**production_date**

**Изучаем эффективное выделение памяти в Pandas**
1. int8: -128 to 127
2. int16: -32768 to 32767
3. int32: -2147483648 to 2147483647
4. int64: -9223372036854775808 to 9223372036854775807

In [12]:
print(df['production_date'].min(), df['production_date'].max())

1990 2023


In [13]:
print(df['engine_power'].min(), df['engine_power'].max())

129 608


In [14]:
int_large_cols = ['production_date', 'engine_power']
df[int_large_cols] = df[int_large_cols].astype('int16')

In [15]:
df.memory_usage()

Index                  132
model              8000000
fuel               8000000
production_date    2000000
transmission       8000000
engine_power       2000000
price              8000000
count              4000000
dtype: int64

In [16]:
df.memory_usage().sum()

40000132

**count**

In [17]:
print(df['count'].min(), df['count'].max())

1 29


In [18]:
df['count'] = df['count'].astype('int8')

In [19]:
df.memory_usage()

Index                  132
model              8000000
fuel               8000000
production_date    2000000
transmission       8000000
engine_power       2000000
price              8000000
count              1000000
dtype: int64

In [20]:
df.memory_usage().sum()

37000132

**category**

In [29]:

cat_cols = df.select_dtypes('object').columns
df[cat_cols] = df[cat_cols].astype('category')


In [32]:
df.memory_usage()

Index                  132
model              1000204
fuel               1000132
production_date    2000000
transmission       1000132
engine_power       2000000
price              8000000
count              1000000
dtype: int64

In [33]:
df.memory_usage().sum()

16000600

**result**

In [34]:
df_end = df.copy()

In [35]:
print(df_start.memory_usage().sum(), " ", df_end.memory_usage().sum())

44000132   16000600
