## Read csv

In [1]:
import pandas as pd
import gc



In [2]:
df = pd.read_csv('../data/fines.csv')

In [3]:
%%timeit
# loop
res = []
for i in range(0, len(df)):
    ser = df.iloc[i]
    res.append(ser.Fines/ser.Refund*ser.Year)

117 ms ± 5.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [4]:
%%timeit
# iterrows
res = []
for _, ser in df.iterrows():
    res.append(ser.Fines/ser.Refund*ser.Year)

74 ms ± 1.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%%timeit
df.apply(lambda ser: ser.Fines/ser.Refund*ser.Year, axis=1)

19.6 ms ± 1.19 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
%%timeit
df['Fines']/df['Refund']*df['Year']

189 µs ± 5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [7]:
%%timeit
df['Fines'].values/df['Refund'].values*df['Year'].values

11.5 µs ± 156 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Indexing

In [8]:
%%timeit
df[df.CarNumber == 'O136HO197RUS']

339 µs ± 7.42 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
df.set_index('CarNumber', inplace=True)

In [10]:
%%timeit
df.loc['O136HO197RUS']

196 µs ± 5.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Downcasting

In [11]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to Z364C8197RUS
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Refund         930 non-null    int64  
 1   Fines          930 non-null    float64
 2   Mark           930 non-null    object 
 3   Model          919 non-null    object 
 4   Year           930 non-null    int64  
 5   Millennium     5 non-null      float64
 6   Ths_fine       5 non-null      float64
 7   Cents          5 non-null      float64
 8   Is_new_car     5 non-null      object 
 9   Is_rarity_car  5 non-null      object 
dtypes: float64(4), int64(2), object(4)
memory usage: 292.5 KB


In [12]:
optimized = df.copy()

In [13]:
optimized['Fines'] = optimized.Fines.astype('float32')

In [14]:
tmp_colums = optimized.dtypes[optimized.dtypes == 'int64'].index
optimized.loc[:, tmp_colums] = optimized[tmp_colums].astype('int8')

In [15]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to Z364C8197RUS
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Refund         930 non-null    int8   
 1   Fines          930 non-null    float32
 2   Mark           930 non-null    object 
 3   Model          919 non-null    object 
 4   Year           930 non-null    int8   
 5   Millennium     5 non-null      float64
 6   Ths_fine       5 non-null      float64
 7   Cents          5 non-null      float64
 8   Is_new_car     5 non-null      object 
 9   Is_rarity_car  5 non-null      object 
dtypes: float32(1), float64(3), int8(2), object(4)
memory usage: 276.2 KB


## Categories

In [16]:
tmp_colums = optimized.dtypes[optimized.dtypes == 'object'].index
optimized.loc[:, tmp_colums] = optimized[tmp_colums].astype('category')

In [17]:
optimized

Unnamed: 0_level_0,Refund,Fines,Mark,Model,Year,Millennium,Ths_fine,Cents,Is_new_car,Is_rarity_car
CarNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Y163O8161RUS,2,3200.0,Ford,Focus,-59,,,,,
E432XX77RUS,1,6500.0,Toyota,Camry,-53,,,,,
7184TT36RUS,1,2100.0,Ford,Focus,-64,,,,,
X582HE161RUS,2,2000.0,Ford,Focus,-33,,,,,
92918M178RUS,1,5700.0,Ford,Focus,-34,,,,,
...,...,...,...,...,...,...,...,...,...,...
K364C8197RUS,2,4500.0,Ford,Focus,-37,2.0,4.0,0.0,True,False
M364C8197RUS,2,4500.0,Ford,Focus,-37,2.0,4.0,0.0,True,False
X364C8197RUS,2,4500.0,Ford,Focus,-37,2.0,4.0,0.0,True,False
Y364C8197RUS,2,4500.0,Ford,Focus,-37,2.0,4.0,0.0,True,False


In [18]:
optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 930 entries, Y163O8161RUS to Z364C8197RUS
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Refund         930 non-null    int8    
 1   Fines          930 non-null    float32 
 2   Mark           930 non-null    category
 3   Model          919 non-null    category
 4   Year           930 non-null    int8    
 5   Millennium     5 non-null      float64 
 6   Ths_fine       5 non-null      float64 
 7   Cents          5 non-null      float64 
 8   Is_new_car     5 non-null      category
 9   Is_rarity_car  5 non-null      category
dtypes: category(4), float32(1), float64(3), int8(2)
memory usage: 111.4 KB


## Memory clean

In [19]:
%reset_selective df
gc.collect()

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


20