In [1]:
import pandas as pd
import numpy as np

# 1 Memory Optimization

# Reduce Memory Usage

In [6]:
# Two columns of random integers; the recorded min/max below are 1 and 99.
df = pd.DataFrame(
    np.random.randint(low=1, high=100, size=(10**7, 2)),
    columns=["A", "B"],
)

In [7]:
# Dtype and memory footprint of column A before downcasting.
df["A"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 10000000 entries, 0 to 9999999
Series name: A
Non-Null Count     Dtype
--------------     -----
10000000 non-null  int32
dtypes: int32(1)
memory usage: 38.1 MB


In [8]:
# Store column A as 8-bit integers instead of the default integer dtype.
df["A"] = df["A"].astype("int8")

In [9]:
# Dtype and memory footprint of column A after the int8 downcast.
df["A"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 10000000 entries, 0 to 9999999
Series name: A
Non-Null Count     Dtype
--------------     -----
10000000 non-null  int8 
dtypes: int8(1)
memory usage: 9.5 MB


In [10]:
# Value range of the int8 column, shown as a (min, max) tuple.
(df["A"].min(), df["A"].max())

(1, 99)

#  Handling Missing Values

In [11]:
# Three columns of random integers; col1 is then cast to float64
# (NaN values are written into it in the next cell).
df = pd.DataFrame(
    np.random.randint(low=1, high=1000000, size=(10**6, 3)),
    columns=["col1", "col2", "col3"],
)

df["col1"] = df["col1"].astype("float64")

In [12]:
# Overwrite every 2nd, 3rd and 5th row of col1 with NaN.
for step in (2, 3, 5):
    df.loc[::step, "col1"] = np.nan

In [13]:
# Non-null count and memory footprint of col1 while it is still dense float64.
df["col1"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1000000 entries, 0 to 999999
Series name: col1
Non-Null Count   Dtype  
--------------   -----  
266666 non-null  float64
dtypes: float64(1)
memory usage: 7.6 MB


In [14]:
# Convert col1 to a sparse float32 column (NaN is the fill value for float dtypes).
df["col1"] = df["col1"].astype(pd.SparseDtype("float32"))

In [15]:
# Non-null count and memory footprint of col1 after the sparse conversion.
df["col1"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1000000 entries, 0 to 999999
Series name: col1
Non-Null Count   Dtype               
--------------   -----               
266666 non-null  Sparse[float32, nan]
dtypes: Sparse[float32, nan](1)
memory usage: 2.0 MB


# Save Memory with Python Generators

In [16]:
from sys import getsizeof

In [17]:
# Build a list holding every integer from 0 to 10**7 - 1; all elements exist in memory at once.
my_list = [i for i in range(10**7)]

In [18]:
# Size in bytes of the list object as reported by sys.getsizeof
# (the list's own storage only; the referenced int objects are not counted).
getsizeof(my_list)

89095160

In [21]:
# Total of all values held in the list.
sum(my_list)

49999995000000

In [20]:
# Generator expression: same values as the list above, but written with
# parentheses instead of brackets so the values are produced on demand.
my_gen = (value for value in range(10**7))
getsizeof(my_gen)

112

In [22]:
# Consumes the generator while adding up its values. The generator is
# exhausted afterwards, so running sum(my_gen) a second time returns 0.
sum(my_gen)

49999995000000

# Define the Correct DataType for Categorical Columns

In [23]:
# Load the Titanic training CSV and preview the first five rows.
df = pd.read_csv("titanic_train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
# Dtype pandas assigned to the Gender column when reading the CSV.
df["Gender"].dtype

dtype('O')

In [25]:
# memory_usage="deep" requests a deep memory measurement of the object column.
df["Gender"].info(memory_usage="deep")

<class 'pandas.core.series.Series'>
RangeIndex: 1500 entries, 0 to 1499
Series name: Gender
Non-Null Count  Dtype 
--------------  ----- 
1500 non-null   object
dtypes: object(1)
memory usage: 90.5 KB


In [26]:
# Re-encode the Gender column as a pandas categorical.
df["Gender"] = df["Gender"].astype("category")

In [27]:
# Same deep memory report, now for the categorical version of the column.
df["Gender"].info(memory_usage="deep")

<class 'pandas.core.series.Series'>
RangeIndex: 1500 entries, 0 to 1499
Series name: Gender
Non-Null Count  Dtype   
--------------  -----   
1500 non-null   category
dtypes: category(1)
memory usage: 1.8 KB


# Read only required columns from a CSV

In [29]:
# Baseline: time a read of the CSV with every column.
# Note: a name assigned inside %timeit is not kept in the notebook namespace.
%timeit data = pd.read_csv("titanic_train.csv")

5.98 ms ± 881 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
# `%timeit` executes its statement in a private scope, so the `data` assigned
# in the timing cell is not available here. Load the full CSV explicitly so
# this cell also works after Restart Kernel -> Run All.
data = pd.read_csv("titanic_train.csv")
data.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1500 non-null   int64  
 1   Survived     1500 non-null   int64  
 2   Pclass       1500 non-null   int64  
 3   Name         1500 non-null   object 
 4   Gender       1500 non-null   object 
 5   Age          1195 non-null   float64
 6   SibSp        1500 non-null   int64  
 7   Parch        1500 non-null   int64  
 8   Ticket       1500 non-null   object 
 9   Fare         1500 non-null   float64
 10  Cabin        341 non-null    object 
 11  Embarked     1497 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 530.3 KB


In [32]:
# Columns to load; read_csv is told to return only these via `usecols`.
col_list = [
    "PassengerId",
    "Survived",
    "Pclass",
    "Embarked",
]

data = pd.read_csv("titanic_train.csv", usecols=col_list)
data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  1500 non-null   int64 
 1   Survived     1500 non-null   int64 
 2   Pclass       1500 non-null   int64 
 3   Embarked     1497 non-null   object
dtypes: int64(3), object(1)
memory usage: 120.2 KB


#  Alter the data-type of columns 
By default, Pandas assigns the widest (most memory-hungry) dtype to a numeric column, e.g. `int64` for integers and `float64` for floats.


int8: 8-bit integer covering the range [-2⁷, 2⁷ - 1], i.e. -128 to 127.

int16: 16-bit integer covering the range [-2¹⁵, 2¹⁵ - 1].

int32: 32-bit integer covering the range [-2³¹, 2³¹ - 1].

int64: 64-bit integer covering the range [-2⁶³, 2⁶³ - 1].

In [33]:
# Reload the full CSV (all columns, dtypes inferred by pandas) and report deep memory usage.
data = pd.read_csv("titanic_train.csv")
data.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1500 non-null   int64  
 1   Survived     1500 non-null   int64  
 2   Pclass       1500 non-null   int64  
 3   Name         1500 non-null   object 
 4   Gender       1500 non-null   object 
 5   Age          1195 non-null   float64
 6   SibSp        1500 non-null   int64  
 7   Parch        1500 non-null   int64  
 8   Ticket       1500 non-null   object 
 9   Fare         1500 non-null   float64
 10  Cabin        341 non-null    object 
 11  Embarked     1497 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 530.3 KB


In [35]:
# Compare the Survived column's memory footprint before and after casting it to int8.
print("Memory usage before changing the datatype:", data["Survived"].memory_usage())

data["Survived"] = data["Survived"].astype("int8")

print("Memory usage after changing the datatype:", data["Survived"].memory_usage())

Memory usage before changing the datatype: 12128
Memory usage after changing the datatype: 1628


<img src="img/img1.png">

In [38]:
# Compare the Age column's memory footprint before and after converting it to sparse float32.
print("Memory usage before changing the datatype:", data["Age"].memory_usage())

data["Age"] = data["Age"].astype("Sparse[float32]")

print("Memory usage after changing the datatype:", data["Age"].memory_usage())

Memory usage before changing the datatype: 12128
Memory usage after changing the datatype: 9688


# Altering datatype of columns with NaN values

<img src="img/img3.png">

In [41]:
# Summarise the Fare column: row count, number of missing values and dtype.
# NOTE(review): the third label says "Rating" but the value printed is the
# dtype of the Fare column -- the label looks like a leftover and should be confirmed.
print("Number of records    :", data.shape[0])
print("Number of NaN values :", data.Fare.isna().sum())
print("Data type of Rating  :", data.Fare.dtype)

Number of records    : 1500
Number of NaN values : 0
Data type of Rating  : float64


# 2 Run-time Optimization

# The Best Way to Use Apply() in Pandas

Swifter: [https://github.com/jmcarpenter2/swifter](https://github.com/jmcarpenter2/swifter)

Pandarallel: [https://github.com/nalepae/pandarallel](https://github.com/nalepae/pandarallel)

Parallel Pandas: [https://pypi.org/project/parallel-pandas/](https://pypi.org/project/parallel-pandas/)

Mapply: [https://pypi.org/project/mapply/](https://pypi.org/project/mapply/)

In [42]:
!pip install pandarallel
!pip install mapply
!pip install pandas-parallel
!pip install swifter

Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py): started
  Building wheel for pandarallel (setup.py): finished with status 'done'
  Created wheel for pandarallel: filename=pandarallel-1.6.5-py3-none-any.whl size=16678 sha256=d2b426977a95e0b5a204898b250b391938098b71c18a535c96f34fc506ec7252
  Stored in directory: c:\users\marcio rodrigues\appdata\local\pip\cache\wheels\39\fe\8d\f7912d85b21ea72c9a8a3fefa6141fab92e575a67ab17c4474
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.6.5
Collecting mapply
  Downloading mapply-0.1.22-py3-none-any.whl.metadata (4.0 kB)
Collecting pathos>=0.2.0 (from mapply)
  Downloading pathos-0.3.1-py3-none-any.whl.metadata (11 kB)
Collecting ppft>=1.7.6.7 (from pathos>=0.2.0->mapply)


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.2.2 requires pyqt5<5.13, which is not installed.
spyder 5.2.2 requires pyqtwebengine<5.13, which is not installed.


Collecting pandas-parallel
  Downloading pandas-parallel-0.1.7.tar.gz (3.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pandas-parallel
  Building wheel for pandas-parallel (setup.py): started
  Building wheel for pandas-parallel (setup.py): finished with status 'done'
  Created wheel for pandas-parallel: filename=pandas_parallel-0.1.7-py3-none-any.whl size=5189 sha256=7505267a1c52feac8a50a5e2ea649aeb3d2e7d2436dd1d0ef89972918496d7f2
  Stored in directory: c:\users\marcio rodrigues\appdata\local\pip\cache\wheels\52\5a\6f\1e9df17fb398cb15349aadc61ae90e587fd93ff8c7214938b6
Successfully built pandas-parallel
Installing collected packages: pandas-parallel
Successfully installed pandas-parallel-0.1.7
Collecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
     ---------------------------------------- 1.2/1.2 MB 4.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing m

In [45]:
import mapply
import pandas as pd
import swifter
import numpy as np
from time import perf_counter
from pandarallel import pandarallel
# from parallel_pandas import ParallelPandas

In [46]:
# Benchmark frame: 10**7 rows and four columns (A-D) of random integers.
df = pd.DataFrame(
    np.random.randint(low=1, high=10**6, size=(10**7, 4)),
    columns=["A", "B", "C", "D"],
)
df.head()

Unnamed: 0,A,B,C,D
0,249987,607866,172763,470351
1,162991,380975,758718,979068
2,870688,908779,709176,718373
3,599044,44969,786505,460742
4,660002,374291,216405,319224


In [47]:
def sum_row(row):
    """Return the total of all values in ``row`` using the built-in ``sum``.

    Used in this notebook as the per-row callback handed to the different
    ``apply`` implementations being timed.
    """
    row_total = sum(row)
    return row_total

**Pandas Apply**

In [48]:
# Baseline: plain pandas row-wise apply, timed with perf_counter (seconds).
start = perf_counter()
a = df.apply(sum_row, axis="columns")
elapsed = perf_counter() - start
print(elapsed)

57.45892660000027


**Pandarallel**

In [49]:
# Initialise pandarallel with its progress bar enabled; must run before `parallel_apply` is used.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [50]:
# Same row-wise sum, executed through pandarallel's parallel_apply.
start = perf_counter()
a = df.parallel_apply(sum_row, axis = 1)
elapsed = perf_counter() - start
print(elapsed)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1666667), Label(value='0 / 1666667…

21.73913040000025


**Mapply Apply**

In [51]:
# Configure mapply before calling `df.mapply`.
# NOTE(review): n_workers=-1 presumably means "use all available workers" and
# the chunk settings control how rows are split between them -- confirm
# against the mapply documentation.
mapply.init(
    n_workers=-1,
    chunk_size=100,
    max_chunks_per_worker=8,
    progressbar=True
)

In [52]:
# Same row-wise sum, executed through mapply.
start = perf_counter()
a = df.mapply(sum_row, axis = 1)
elapsed = perf_counter() - start
print(elapsed)

  0%|                                                                                           | 0/56 [00:00<…

21.645263699999305


**Swifter Apply**

In [53]:
# Same row-wise sum, executed through the swifter accessor.
start = perf_counter()
a = df.swifter.apply(sum_row, axis = 1)
elapsed = perf_counter() - start
print(elapsed)

Dask Apply:   0%|          | 0/24 [00:00<?, ?it/s]

32.132466099999874


# Don't Create Conditional Columns in Pandas with Apply

In [55]:
def assign_class(num):
    """Label a number: "Class A" when it is strictly greater than 0.5, otherwise "Class B"."""
    return "Class A" if num > 0.5 else "Class B"

In [56]:
# Single column of 10**7 random floats, rounded to two decimals.
df = pd.DataFrame(
    np.round(np.random.random(size=(10**7, 1)), 2),
    columns=["col1"],
)

In [57]:
# Slow variant: call the Python function assign_class once per element via Series.apply.
%timeit a = df.col1.apply(assign_class)

1.82 s ± 86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [58]:
# Vectorised variant: the same > 0.5 condition and labels, evaluated on the whole column by np.where.
%timeit a = np.where(df["col1"]>0.5, "Class A", "Class B")

185 ms ± 3.66 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Speed-up Parquet I/O of Pandas by 5x

In [59]:
%%time 

# Read the Parquet file through pandas; no engine is specified, so pandas picks its default.
df = pd.read_parquet("employee_dataset.parquet")

Wall time: 16.8 s


In [60]:
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2023.10.1-cp39-cp39-win_amd64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.7.0-cp39-none-win_amd64.whl.metadata (4.1 kB)
Downloading fastparquet-2023.10.1-cp39-cp39-win_amd64.whl (667 kB)
   ---------------------------------------- 667.6/667.6 kB 1.9 MB/s eta 0:00:00
Downloading cramjam-2.7.0-cp39-none-win_amd64.whl (1.3 MB)
   ---------------------------------------- 1.3/1.3 MB 5.3 MB/s eta 0:00:00
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.7.0 fastparquet-2023.10.1


In [61]:
from fastparquet import ParquetFile

In [63]:
%%time

# Read the same file with fastparquet directly: open it as a ParquetFile, then convert to a DataFrame.
pf = ParquetFile("employee_dataset.parquet")
df = pf.to_pandas()

Wall time: 9.36 s


# Never Iterate Over A DataFrame

In [73]:
# Time access to a single column (10000 loops per run, 7 runs).
%timeit -n10000 -r7 df["Name"]

2.71 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [74]:
# Time access to a single row by position (10000 loops per run, 7 runs).
%timeit -n10000 -r7 df.iloc[0]

51.4 µs ± 1.27 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [75]:
# Show the first row; it is returned as a Series indexed by the column names.
df.iloc[0]

Name                           Johnny Maynard
Company_Name          White, Mcclain and Cobb
Employee_Job_Title            Equities trader
Employee_City                New Cindychester
Employee_Country                     Mongolia
Employee_Salary                        764340
Employment_Status                   Full Time
Employee_Rating                           3.3
Credits                                     1
Name: 0, dtype: object