In [1]:
import pandas as pd
import sqlite3

In [2]:
# Connect to the SQLite database file 'logs.db'.
# PARSE_DECLTYPES tells sqlite3 to run its registered converters based on each
# column's declared type -- presumably so timestamp columns come back as
# datetime objects; confirm against the `logs` table schema.
conn = sqlite3.connect('logs.db', detect_types=sqlite3.PARSE_DECLTYPES)

In [3]:
# Pull every row and column of the `logs` table into a DataFrame.
df = pd.read_sql(sql='SELECT * FROM logs', con=conn)

In [4]:
import lzma

In [5]:
# Read the xz-compressed text file line by line and keep each line with its
# surrounding whitespace (including the trailing newline) stripped.
with lzma.open('ips.txt.xz', mode='rt') as fp:
    ips = list(map(str.strip, fp))

In [6]:
# Sanity check: how many entries were read from ips.txt.xz.
len(ips)

100000

In [7]:
# Baseline: Series.isin on the 'origin' column against the plain Python list.
%timeit df['origin'].isin(ips)

96.2 ms ± 4.92 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# Confirm the container type: `ips` is a list, so `in` has to scan it linearly.
type(ips)

list

In [9]:
# Grab one sample 'origin' value (row label 3923) to use in the
# single-lookup timings.
ip = df.loc[3923, 'origin']

In [10]:
# Time a single membership test against the list.
%timeit ip in ips

6.48 ms ± 656 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
# Build a set from the list so that a single membership test is a hash lookup
# instead of a scan over every element.
ips_set = {*ips}

In [12]:
# Time the same single membership test against the set.
%timeit ip in ips_set

183 ns ± 0.963 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [14]:
# List lookup: 6.48 ms = 6,480,000 ns; set lookup: 183 ns.
6480000 / 183  # single lookup in the set is ~35,410x faster than in the list

35409.83606557377

In [15]:
# Same Series.isin call, but passing the set instead of the list.
# NOTE(review): this comes out slower than the list version -- presumably
# because pandas first converts the set into an array; confirm against the
# pandas implementation.
%timeit df['origin'].isin(ips_set)

190 ms ± 21.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# isin(list): 96.2 ms; isin(set): 190 ms.
96.2 / 190  # ratio ~0.5, i.e. isin with the set is about 2x slower than with the list

0.5063157894736843

In [17]:
# Element-by-element membership test against the set, driven by Series.apply
# with a small lambda.
%timeit df['origin'].apply(lambda z: z in ips_set)

8.17 ms ± 121 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
# isin(list) baseline: 96.2 ms; apply(lambda ... in ips_set): 8.17 ms.
96.2 / 8.17  # speed up by ~11.8x over the isin(list) baseline

11.774785801713588

In [19]:
# Bind the set's own membership method so it can be handed to apply() directly,
# without the extra Python-level function call that the lambda wrapper adds.
is_bad = ips_set.__contains__

In [20]:
# Same per-element check as the lambda version, using the bound method instead.
%timeit df['origin'].apply(is_bad)

5.71 ms ± 54.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
# isin(list) baseline: 96.2 ms; apply(ips_set.__contains__): 5.71 ms.
96.2 / 5.71  # speed up by ~16.8x over the isin(list) baseline

16.847635726795097