In [1]:
import pandas as pd
import numpy as np
import time

np.random.seed(0)
data = np.random.randint(1, 100, size=(1000000, 1))
df = pd.DataFrame(data, columns=['Values'])

start_time = time.time()
sum_for_loop = 0
for value in df['Values']:
    sum_for_loop += value
time_for_loop = time.time() - start_time

start_time = time.time()
sum_method = df['Values'].sum()
time_sum_method = time.time() - start_time

print("Sum using for loop:", sum_for_loop)
print("Time taken using for loop:", time_for_loop, "seconds")
print("Sum using sum method:", sum_method)
print("Time taken using sum method:", time_sum_method, "seconds")

Sum using for loop: 49988718
Time taken using for loop: 0.42987799644470215 seconds
Sum using sum method: 49988718
Time taken using sum method: 0.0 seconds


In [2]:
def custom_function(x):
    return x * 2 + 3

start_time = time.time()
df['Apply_Result'] = df['Values'].apply(custom_function)
time_apply = time.time() - start_time

start_time = time.time()
df['Vectorized_Result'] = custom_function(df['Values'])
time_vectorized = time.time() - start_time

print("Time taken using apply:", time_apply, "seconds")
print("Time taken using vectorized operations:", time_vectorized, "seconds")

Time taken using apply: 0.5824587345123291 seconds
Time taken using vectorized operations: 0.017866134643554688 seconds


In [3]:
data = {
    'int_col': np.random.randint(0, 100, size=100000),
    'float_col': np.random.random(size=100000) * 100,
    'category_col': np.random.choice(['A', 'B', 'C'], size=100000),
    'object_col': np.random.choice(['foo', 'bar', 'baz'], size=100000)
}
df = pd.DataFrame(data)

print("Memory usage before optimization:")
print(df.info(memory_usage='deep'))

df['int_col'] = df['int_col'].astype('int16')
df['float_col'] = df['float_col'].astype('float32')
df['category_col'] = df['category_col'].astype('category')
df['object_col'] = df['object_col'].astype('category')

print("\nMemory usage after optimization:")
print(df.info(memory_usage='deep'))

Memory usage before optimization:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   int_col       100000 non-null  int32  
 1   float_col     100000 non-null  float64
 2   category_col  100000 non-null  object 
 3   object_col    100000 non-null  object 
dtypes: float64(1), int32(1), object(2)
memory usage: 12.4 MB
None

Memory usage after optimization:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   int_col       100000 non-null  int16   
 1   float_col     100000 non-null  float32 
 2   category_col  100000 non-null  category
 3   object_col    100000 non-null  category
dtypes: category(2), float32(1), int16(1)
memory usage: 781.9 KB
None
