# chapter 3: using data structures effectively

In [29]:
import pandas as pd
import numpy as np

## lists

In [1]:
# Ten-element list (0..9) used as the "small" case in the timings below.
small_list = [*range(10)]

In [2]:
%%timeit
# Time fetching the last element of the 10-item small_list by negative index.
last_element = small_list[-1]

40.8 ns ± 4.77 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [3]:
# 10_000-element list (0..9999) used as the "large" case in the timings below.
large_list = [*range(10_000)]

In [4]:
%%timeit
# Same index access as for small_list, now on the 10_000-item list.
last_element = large_list[-1]

39.1 ns ± 0.415 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [5]:
%%timeit
# 4200 is not among small_list's values (0..9), so this times an
# unsuccessful membership test on the small list.
4200 in small_list

113 ns ± 0.398 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [6]:
%%timeit
# 4200 is present in large_list (values 0..9999), so this times a
# successful membership test on the large list.
4200 in large_list

38.7 μs ± 2.54 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


## sets

In [10]:
%%timeit
# The set is built inside the timed body, so this measurement covers the
# list-to-set conversion plus a single membership test.
large_set = set(large_list)
4200 in large_set

185 μs ± 3.52 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [12]:
# Converting a list to a set and then performing a lookup takes more than three times as long as a plain list lookup, because of the time needed to build the set.
# But once converted, a set lookup can be up to 1,000 times faster than a list lookup.


In [14]:
# Build the set once, outside any timed cell, so the next timing measures
# only the membership test.
large_set = set(large_list)

In [15]:
%%timeit
# Membership test on the prebuilt set; 2534 is one of its values (0..9999).
2534 in large_set

41.4 ns ± 3.16 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


## numpy arrays

In [17]:
# Fixed seed so the generated data is identical on every Restart & Run All.
np.random.seed(42)
# 100_000 random integers drawn from the half-open range [1, 100_000).
random_arr = np.random.randint(1, 100_000, 100_000)
# The same values as a Python list, used for the built-in sum() timing.
# NOTE(review): list(ndarray) keeps NumPy integer scalars as elements;
# random_arr.tolist() would yield native ints -- consider it if a
# pure-Python baseline is wanted (recorded timings would need re-running).
random_lists = list(random_arr)

In [18]:
%%timeit -r 7 -n 100
# Built-in sum() over the list copy of the data (7 runs of 100 loops each).
sum(random_lists)

4.37 ms ± 715 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
%%timeit -r 7 -n 100
# np.sum over the array holding the same values, with the same run/loop
# counts as the list timing so the two results are comparable.
np.sum(random_arr)

49.4 μs ± 13.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
# Using a NumPy array is roughly 100x faster here, which is an enormous performance boost.


In [21]:
# 100_000 random integers in [1, 100_000); inspected below for size and dtype.
random_array = np.random.randint(low=1, high=100_000, size=100_000)

In [22]:
random_array.nbytes

800000

In [23]:
random_array.dtype

dtype('int64')

In [24]:
# Convert to 32-bit integers. The values were drawn from [1, 100_000), so
# they all fit in int32.
random_array_32 = random_array.astype(np.int32)

In [25]:
random_array_32.dtype

dtype('int32')

In [26]:
random_array_32.nbytes

400000

In [27]:
# Choose the element type at creation time: three values stored as int16.
small_arr = np.array((1, 3, 5), dtype="int16")

## pandas dataframes

In [32]:
# Build a Series from a year-label -> value mapping; the keys become the index.
usa_data = pd.Series(
    {"2000": 13.33, "2001": 14.02, "2002": 14.02, "2003": 14.25}
)

In [33]:
usa_data

2000    13.33
2001    14.02
2002    14.02
2003    14.25
dtype: float64

In [35]:
# Second Series, sharing the same year labels as usa_data.
kenya_data = pd.Series(
    {"2000": 9.02, "2001": 9.01, "2002": 8.84, "2003": 8.84}
)

# Combine both Series into one frame: one column per Series, rows aligned
# on the shared year index.
df = pd.DataFrame({"USA": usa_data, "Kenya": kenya_data})

In [36]:
df

Unnamed: 0,USA,Kenya
2000,13.33,9.02
2001,14.02,9.01
2002,14.02,8.84
2003,14.25,8.84


In [38]:
%%timeit
# Vectorized division of the whole "Kenya" column. The column assignment is
# part of the timed body, so the measurement includes writing into df.
df["Kenya_fraction"] = df["Kenya"] / 100

151 μs ± 25.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [39]:
%%timeit
# Same column computed element by element through apply with a Python lambda.
# NOTE(review): df has only four rows, so fixed per-call overhead likely
# dominates and the two timings look alike -- confirm on a larger frame.
df["Kenya_fraction"] = df["Kenya"].apply(lambda x: x / 100)

148 μs ± 44.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
