## Python Built-In `sys.getsizeof`

In [1]:
import sys
x = [1, 2, 3, 4, 5]
# sys.getsizeof is shallow: for a list it counts the list object itself,
# not the integer objects the list refers to.
list_size_bytes = sys.getsizeof(x)
print(f"Size of list: {list_size_bytes} bytes")

Size of list: 104 bytes


## Memory Profiler

In [2]:
from memory_profiler import memory_usage

def my_function(repeat=10**7):
    """Build and return a large string so the memory profiler has something to measure.

    Args:
        repeat: How many times the base phrase is repeated. Defaults to
            10**7, the size this function was originally hard-coded to, so
            calling it with no arguments behaves exactly as before.

    Returns:
        The phrase 'Towards Data Science' concatenated ``repeat`` times.
    """
    return 'Towards Data Science' * repeat

# Profile a call to my_function, then report the spread between the highest
# and lowest memory readings that were collected.
mem_usage = memory_usage((my_function,))
mem_spread = max(mem_usage) - min(mem_usage)
print(f"Memory usage: {mem_spread} MB")

Memory usage: 164.0078125 MB


## Use Generators Instead of Lists

In [3]:
def process_with_list(size):
    """Sum the integers 0..size-1 after materializing them all in a list.

    The full list is built before summing, so every element is held in
    memory at the same time.

    Args:
        size: Number of consecutive integers, starting at 0, to sum.

    Returns:
        The sum of ``range(size)``.
    """
    numbers = [value for value in range(size)]
    return sum(numbers)

In [4]:
def process_with_generator(size):
    """Sum the integers 0..size-1 by streaming them through a generator.

    The generator expression yields one value at a time to ``sum``, so no
    intermediate list is ever built.

    Args:
        size: Number of consecutive integers, starting at 0, to sum.

    Returns:
        The sum of ``range(size)``.
    """
    lazy_numbers = (value for value in range(size))
    return sum(lazy_numbers)

In [5]:
# Sanity check: both implementations must return the same total.
size = 10_000_000
print(process_with_list(size), process_with_generator(size), sep="\n")

49999995000000
49999995000000


In [6]:
import time
from memory_profiler import memory_usage

# Compare the list and generator implementations on memory and elapsed time.
#
# Timing uses time.perf_counter(), a monotonic high-resolution clock meant for
# measuring intervals; time.time() follows the wall clock and can jump if the
# system clock is adjusted. Note that each timed region wraps the whole
# memory_usage(...) call, so the reported time includes the profiler's own
# overhead, not just the function being profiled.

# List version implementation
print("Metrics for the list version")
start_time = time.perf_counter()
mem_usage = memory_usage((process_with_list, (size,)), interval=0.1)
end_time = time.perf_counter()

print("Using lists:")
# Reported figure is the spread between the highest and lowest readings.
print(f"Memory usage: {max(mem_usage) - min(mem_usage):.2f} MB")
# The trailing "\n" leaves a blank line before the generator section.
print(f"Execution time: {end_time - start_time:.2f} seconds\n")
# Generator version implementation
print("Metrics for the generator version")
start_time = time.perf_counter()
mem_usage = memory_usage((process_with_generator, (size,)), interval=0.1)
end_time = time.perf_counter()

print("Using generators:")
print(f"Memory usage: {max(mem_usage) - min(mem_usage):.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

Metrics for the list version
Using lists:
Memory usage: 382.09 MB
Execution time: 1.34 seconds

Metrics for the generator version
Using generators:
Memory usage: 0.00 MB
Execution time: 1.39 seconds


## Efficient Data Structures

In [7]:
import pandas as pd

In [8]:
def process_with_dataframe(size):
    """Sum the integers 0..size-1 held in a one-column DataFrame.

    Args:
        size: Number of consecutive integers, starting at 0, to store
            in the 'numbers' column.

    Returns:
        The sum of the 'numbers' column.
    """
    frame = pd.DataFrame({'numbers': range(size)})
    return frame['numbers'].sum()

In [9]:
def process_with_series(size):
    """Sum the integers 0..size-1 held in a bare pandas Series.

    Args:
        size: Number of consecutive integers, starting at 0, to store.

    Returns:
        The sum of the Series.
    """
    numbers = pd.Series(range(size))
    return numbers.sum()

In [10]:
size = 10**7

# Timing uses time.perf_counter(), a monotonic high-resolution clock meant for
# measuring intervals; time.time() follows the wall clock and can jump if the
# system clock is adjusted. Each timed region wraps the whole memory_usage(...)
# call, so the reported time includes the profiler's own overhead.

# Metrics for DataFrame version
start_time = time.perf_counter()
mem_usage_dataframe = memory_usage((process_with_dataframe, (size,)), interval=0.1)
end_time = time.perf_counter()
dataframe_time = end_time - start_time
# Reported figure is the spread between the highest and lowest readings.
dataframe_memory = max(mem_usage_dataframe) - min(mem_usage_dataframe)

print("Using DataFrame:")
print(f"Memory usage: {dataframe_memory:.2f} MB")
print(f"Execution time: {dataframe_time:.2f} seconds")

# Metrics for Series version
start_time = time.perf_counter()
mem_usage_series = memory_usage((process_with_series, (size,)), interval=0.1)
end_time = time.perf_counter()
series_time = end_time - start_time
series_memory = max(mem_usage_series) - min(mem_usage_series)

print("Using Series:")
print(f"Memory usage: {series_memory:.2f} MB")
print(f"Execution time: {series_time:.2f} seconds")

Using DataFrame:
Memory usage: 139.56 MB
Execution time: 1.23 seconds
Using Series:
Memory usage: 76.30 MB
Execution time: 2.28 seconds


## Using the Categorical Type in Pandas

In [11]:
import pandas as pd
import numpy as np
import sys

# Demo frame: a string column drawn from four labels plus an integer column.
# Both columns take their length from `size`; previously 'B' used a literal
# 10**7, which raises a length-mismatch error as soon as `size` is changed.
# No random seed is set in this cell, so the labels in 'A' vary between runs.
df = pd.DataFrame({
    'A': np.random.choice(['foo', 'bar', 'baz', 'qux'], size=size),
    'B': range(size)
})

df

Unnamed: 0,A,B
0,foo,0
1,foo,1
2,bar,2
3,bar,3
4,foo,4
...,...,...
9999995,foo,9999995
9999996,qux,9999996
9999997,bar,9999997
9999998,bar,9999998


In [12]:
# Size of the frame in bytes while column A is still in its original string
# form; divided by 2**20 (= 1024 * 1024) for display.
mem_before = sys.getsizeof(df)
print(f"Memory usage for string type: {mem_before / 2**20:.2f} MB")

Memory usage for string type: 648.50 MB


In [13]:
# Convert column A to categorical type.
# This overwrites the column on the shared `df`, so re-running the earlier
# "string type" measurement after this cell would measure the categorical
# column instead of the original strings.
df['A'] = df['A'].astype('category')

# Measure the frame again with the same sys.getsizeof call used before the
# conversion, so the two figures are directly comparable.
mem_after = sys.getsizeof(df)
print(f"Memory usage for categorical: {mem_after / (1024 * 1024):.2f} MB")

Memory usage for categorical: 85.83 MB


In [14]:
# Show the category labels of the now-categorical column A.
df['A'].cat.categories

Index(['bar', 'baz', 'foo', 'qux'], dtype='object')

In [15]:
# Integer codes for the first ten rows of column A.
df['A'].cat.codes.head(10)

0    2
1    2
2    0
3    0
4    2
5    1
6    2
7    1
8    0
9    3
dtype: int8

In [16]:
# Same ten codes as above, printed under a header naming the two columns.
print(f"Row Num | Code\n{df['A'].cat.codes.head(10)}")

Row Num | Code
0    2
1    2
2    0
3    0
4    2
5    1
6    2
7    1
8    0
9    3
dtype: int8
