# List vs Deque

In [2]:
from collections import deque

In [3]:
# Start from an empty deque and an empty list so the same operations can be
# tried on both containers side by side.
dq = deque()
ls = list()

In [4]:
# Appending on the right uses the same method name on both containers.
dq.append(1)
ls.append(1)

# Prepending differs: deque offers appendleft(), a list needs insert() at index 0.
dq.appendleft(1)
ls.insert(0, 1)

### Append/Prepend

In [5]:
%%timeit
# Timed: build a fresh list with 100 right-side appends.
ls = []
for i in xrange(100):
    ls.append(1)

100000 loops, best of 3: 7.7 µs per loop


In [6]:
%%timeit
# Timed: build a fresh deque with 100 right-side appends.
dq = deque()
for i in xrange(100):
    dq.append(1)

100000 loops, best of 3: 7.68 µs per loop


In [7]:
%%timeit
# Timed: build a fresh list with 100 inserts at the front (index 0).
ls = []
for i in xrange(100):
    ls.insert(0, 1)

10000 loops, best of 3: 19 µs per loop


In [8]:
%%timeit
# Timed: build a fresh deque with 100 left-side appends.
dq = deque()
for i in xrange(100):
    dq.appendleft(1)

100000 loops, best of 3: 7.3 µs per loop


### Access

In [9]:
# 100-element fixtures shared by the access benchmarks in this section.
# NOTE(review): the xrange/print-statement usage in this notebook suggests
# Python 2, where range() returns a list; on Python 3 `ls` would be a range
# object instead - confirm the target interpreter.
ls = range(100)
dq = deque(range(100))

#### Random Access

In [10]:
%%timeit
# Timed: index into the middle of the 100-element deque.
dq[50]

10000000 loops, best of 3: 38.3 ns per loop


In [11]:
%%timeit
# Timed: index into the middle of the 100-element list.
ls[50]

10000000 loops, best of 3: 28.6 ns per loop


#### Sequential Access

In [12]:
%%timeit
# Timed: one full front-to-back pass over the list, doing nothing per item.
for x in ls:
    pass

The slowest run took 13.39 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 1 µs per loop


In [13]:
%%timeit
# Timed: one full front-to-back pass over the deque, doing nothing per item.
for x in dq:
    pass

The slowest run took 9.87 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 932 ns per loop


#### Slices

In [14]:
%%timeit
# Timed: take a 10-element slice directly from the list.
ls[5:15]

10000000 loops, best of 3: 131 ns per loop


In [15]:
%%timeit
# Timed: copy the whole deque into a list, then slice the copy; the measured
# cost therefore includes the full copy, not just the 10-element slice.
list(dq)[5:15] # deques don't support slicing directly

The slowest run took 4.71 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 810 ns per loop


# Dictionary vs defaultdict

In [16]:
from collections import defaultdict
from pprint import pprint

In [17]:
# A plain dict and a defaultdict behave the same when a key is assigned explicitly.
d = {}
dd = defaultdict(int)

d["key"] = 1
dd["key"] = 1

print d
print dd

{'key': 1}
defaultdict(<type 'int'>, {'key': 1})


In [18]:
# Fresh, empty containers: neither has "key" yet.
d = {}
dd = defaultdict(int)

# Reading the missing key: the defaultdict lookup prints 0, while the plain
# dict lookup raises KeyError (this error is the point of the demonstration).
print dd["key"]
print d["key"]

0


KeyError: 'key'

In [19]:
#%%timeit
# (timing magic above is left disabled; the cell ends by displaying `d`)
# Split 0..29 into "even"/"odd" lists with a plain dict: each bucket's list
# has to be created by hand before the first append.
d = {}

for x in range(30):
    bucket = "even" if (x % 2 == 0) else "odd"
    if bucket not in d:
        d[bucket] = []
    d[bucket].append(x)

d

{'even': [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28],
 'odd': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]}

In [20]:
#%%timeit
# (timing magic above is left disabled; the cell ends by displaying `dd`)
# Same even/odd split with defaultdict(list): no existence check is written
# before appending to a bucket.
dd = defaultdict(list)

for x in range(30):
    bucket = "even" if (x % 2 == 0) else "odd"
    dd[bucket].append(x)

dd

defaultdict(list,
            {'even': [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28],
             'odd': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]})

In [21]:
import requests
# Download a password word list and split the response body on newlines.
# NOTE(review): the request has no timeout and the HTTP status is not checked.
# NOTE(review): splitting on "\n" appears to leave one empty string in `words`
# (the length histogram printed later contains a single length-0 entry) -
# confirm, and consider filtering empty lines out.
words = requests.get("https://raw.githubusercontent.com/berzerk0/Probable-Wordlists/master/Real-Passwords/Top3575-probable.txt").text.split("\n")

### Counting

You can count with a plain dictionary by passing a default value as the second argument to the `.get` method.
This is a good way to prepare data for use in a histogram.

In [22]:
# Two histograms kept in plain dicts:
#   by_len:  word length -> number of words
#   dist_ch: number of distinct characters -> number of words
by_len = {}
dist_ch = {}
for word in words:
    # .get(key, 0) yields 0 for a length not seen yet, so no membership test is needed.
    by_len[len(word)] = by_len.get(len(word), 0) + 1
    
    chs = len(set(word))  # count of distinct characters in this word
    dist_ch[chs] = dist_ch.get(chs, 0) + 1

pprint(by_len)
pprint(dist_ch)

{0: 1,
 1: 10,
 2: 2,
 3: 5,
 4: 61,
 5: 191,
 6: 1191,
 7: 736,
 8: 972,
 9: 277,
 10: 102,
 11: 18,
 12: 7,
 13: 2}
{0: 1,
 1: 84,
 2: 51,
 3: 70,
 4: 270,
 5: 786,
 6: 1121,
 7: 750,
 8: 323,
 9: 93,
 10: 22,
 12: 4}


### Grouping

In [24]:
%%timeit
# Timed: group the words by their number of distinct characters using dict.setdefault.
d = {}
for word in words:
    chs = len(set(word))
    # setdefault returns the list already stored under chs, or stores the
    # given empty list first and returns that.
    d.setdefault(chs, []).append(word)
    
d

100 loops, best of 3: 3.32 ms per loop


In [25]:
%%timeit
# Timed: the same grouping with defaultdict(list), which creates the list on first access.
dd = defaultdict(list)
for word in words:
    chs = len(set(word))
    dd[chs].append(word)

# Convert back to a plain dict (this conversion is part of the timed work).
dict(dd)

100 loops, best of 3: 2.99 ms per loop


### More counting

In [26]:
from collections import Counter

In [27]:
# Count how many words share exactly the same set of distinct characters.
c = Counter()
for word in words:
    # A sorted tuple is hashable and order-independent, so it can serve as the key.
    # NOTE: this rebinds `dist_ch`, which an earlier cell used for a histogram dict.
    dist_ch = tuple(sorted(set(word)))
    c[dist_ch] += 1

# The five most frequent character sets with their counts.
c.most_common(5)

[((u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9'), 14),
 ((u'1', u'2', u'3'), 8),
 ((u'1',), 8),
 ((u'a',), 7),
 ((u'1', u'2', u'3', u'4'), 7)]

In [28]:
# Count words by their number of distinct characters, this time with a Counter:
# a missing key starts at 0, so `+= 1` needs no setup.
c = Counter()
for word in words:
    chs = len(set(word))
    c[chs] += 1

# The five most frequent distinct-character counts.
c.most_common(5)

[(6, 1121), (5, 786), (7, 750), (8, 323), (4, 270)]

In [29]:
import csv
# Download the sample sales CSV and materialise every row as a dict
# (one key per CSV column).
# NOTE(review): plain-http URL, no timeout, and the HTTP status is not checked.
dataset = requests.get("http://samplecsvs.s3.amazonaws.com/SalesJan2009.csv")
r = csv.DictReader(dataset.iter_lines())
rows = list(r)

In [30]:
# Number of rows parsed from the CSV.
len(rows)

998

In [31]:
# Peek at the first row to see the available columns; every value is a string.
rows[0]

{'Account_Created': '1/2/09 6:00',
 'City': 'Basildon',
 'Country': 'United Kingdom',
 'Last_Login': '1/2/09 6:08',
 'Latitude': '51.5',
 'Longitude': '-1.1166667',
 'Name': 'carolina',
 'Payment_Type': 'Mastercard',
 'Price': '1200',
 'Product': 'Product1',
 'State': 'England',
 'Transaction_date': '1/2/09 6:17'}

In [32]:
# Tally the number of transactions per country.
countries = Counter()
for row in rows:
    countries[row["Country"]] += 1
# The three countries with the most transactions.
countries.most_common(3)

[('United States', 463), ('United Kingdom', 100), ('Canada', 76)]

In [49]:
def parse_price(rev):
    """Convert a price string such as ``"1200"`` or ``"13,000"`` to an int.

    Every character that is not a digit is discarded and the remaining digits
    are parsed as a single base-10 integer.  Note that this also discards a
    sign or a decimal point, so ``"12.50"`` becomes ``1250``.

    Args:
        rev: The price as text.

    Returns:
        The integer formed by the digits of ``rev``.

    Raises:
        ValueError: If ``rev`` contains no digits at all.
    """
    # Join the digits explicitly instead of relying on filter() handing back
    # a string: that only holds on Python 2.  On Python 3 filter() returns an
    # iterator, which int() rejects with a TypeError.
    digits = "".join(ch for ch in rev if ch.isdigit())
    return int(digits)

In [50]:
# Collect the parsed price of every transaction under its country.
revenue_by_country = defaultdict(list)
for row in rows:
    revenue_by_country[row["Country"]].append(parse_price(row["Price"]))
    
# Build (total, country) pairs so sorted() orders by total revenue first
# (ascending) and by country name on ties.
sorted((sum(rev), c) for c, rev in revenue_by_country.items())

[(1200, 'Argentina'),
 (1200, 'Bahrain'),
 (1200, 'Bermuda'),
 (1200, 'Bulgaria'),
 (1200, 'Cayman Isls'),
 (1200, 'China'),
 (1200, 'Costa Rica'),
 (1200, 'Dominican Republic'),
 (1200, 'Greece'),
 (1200, 'Guatemala'),
 (1200, 'Hong Kong'),
 (1200, 'Iceland'),
 (1200, 'Israel'),
 (1200, 'Jersey'),
 (1200, 'Kuwait'),
 (1200, 'Latvia'),
 (1200, 'Luxembourg'),
 (1200, 'Malaysia'),
 (1200, 'Moldova'),
 (1200, 'Romania'),
 (1200, 'South Korea'),
 (1200, 'Ukraine'),
 (2400, 'Finland'),
 (2400, 'India'),
 (2400, 'Japan'),
 (2400, 'Monaco'),
 (2400, 'Philippines'),
 (2400, 'Poland'),
 (2400, 'The Bahamas'),
 (3600, 'Hungary'),
 (3600, 'Mauritius'),
 (3600, 'Russia'),
 (4800, 'Malta'),
 (4800, 'Thailand'),
 (6000, 'Czech Republic'),
 (7200, 'New Zealand'),
 (7200, 'Turkey'),
 (10800, 'Austria'),
 (12000, 'Belgium'),
 (12000, 'United Arab Emirates'),
 (12300, 'Brazil'),
 (12300, 'South Africa'),
 (16800, 'Spain'),
 (18000, 'Denmark'),
 (21600, 'Norway'),
 (22800, 'Sweden'),
 (37800, 'Italy'),
 

In [51]:
def avg(prices):
    """Return the arithmetic mean of ``prices``.

    ``prices`` must be a sized collection of numbers, because both ``sum``
    and ``len`` are applied to it.  An empty collection raises
    ZeroDivisionError.  The division is a plain ``/``, so whether the result
    is truncated follows the interpreter's rules for the operand types.
    """
    total = sum(prices)
    count = len(prices)
    return total / count

# Collect the parsed price of every transaction under its country.
group = defaultdict(list)
for row in rows:
    group[row["Country"]].append(parse_price(row["Price"]))

# country -> (average price, total revenue, number of transactions)
price_info = {country: (avg(prices), sum(prices), len(prices)) for country, prices in group.items()}
pprint(price_info)

{'Argentina': (1200, 1200, 1),
 'Australia': (1705, 64800, 38),
 'Austria': (1542, 10800, 7),
 'Bahrain': (1200, 1200, 1),
 'Belgium': (1500, 12000, 8),
 'Bermuda': (1200, 1200, 1),
 'Brazil': (2460, 12300, 5),
 'Bulgaria': (1200, 1200, 1),
 'Canada': (1642, 124800, 76),
 'Cayman Isls': (1200, 1200, 1),
 'China': (1200, 1200, 1),
 'Costa Rica': (1200, 1200, 1),
 'Czech Republic': (2000, 6000, 3),
 'Denmark': (1200, 18000, 15),
 'Dominican Republic': (1200, 1200, 1),
 'Finland': (1200, 2400, 2),
 'France': (1966, 53100, 27),
 'Germany': (1680, 42000, 25),
 'Greece': (1200, 1200, 1),
 'Guatemala': (1200, 1200, 1),
 'Hong Kong': (1200, 1200, 1),
 'Hungary': (1200, 3600, 3),
 'Iceland': (1200, 1200, 1),
 'India': (1200, 2400, 2),
 'Ireland': (1426, 69900, 49),
 'Israel': (1200, 1200, 1),
 'Italy': (2520, 37800, 15),
 'Japan': (1200, 2400, 2),
 'Jersey': (1200, 1200, 1),
 'Kuwait': (1200, 1200, 1),
 'Latvia': (1200, 1200, 1),
 'Luxembourg': (1200, 1200, 1),
 'Malaysia': (1200, 1200, 1),
 'M

In [48]:
from operator import itemgetter

# itemgetter(1) ranks entries by the whole (average, total, count) tuple, so
# the average decides first and ties fall back to total, then count.
print "best average price"
pprint(max(price_info.items(), key=itemgetter(1)))

# pair[1][2] is the third element of the value tuple, i.e. the transaction count.
print "most transactions"
pprint(max(price_info.items(), key=lambda pair: pair[1][2]))

best average price
('Russia', (3600, 3600, 1))
most transactions
('United States', (1619, 750000, 463))


In [36]:
from collections import namedtuple
# Field names are taken from the first row: iterating the dict yields its
# keys (the CSV column names), joined into the space-separated form
# namedtuple accepts. Field order therefore follows the dict's iteration order.
Record = namedtuple('Record', " ".join(rows[0]))

In [52]:
# Unpack the first row's key/value pairs as keyword arguments to build one Record.
Record(**rows[0])

Record(City='Basildon', Product='Product1', Name='carolina', Country='United Kingdom', Price='1200', Longitude='-1.1166667', State='England', Transaction_date='1/2/09 6:17', Last_Login='1/2/09 6:08', Payment_Type='Mastercard', Latitude='51.5', Account_Created='1/2/09 6:00')

In [38]:
# Convert every row dict into a Record so columns can be read as attributes.
table = [Record(**row) for row in rows]

In [53]:
# A Counter also works as an accumulator: add each record's parsed price to
# the running total of its payment type.
c = Counter()
for rec in table:
    c[rec.Payment_Type] += parse_price(rec.Price)
# With no argument, most_common() lists every payment type, largest total first.
c.most_common()

[('Visa', 849350),
 ('Mastercard', 458450),
 ('Amex', 188900),
 ('Diners', 133800)]

In [64]:
class GroupBy(defaultdict):
    """Mapping of group key -> rows of that group, with per-group aggregates.

    Every stored value must provide ``select(column)`` returning an iterable
    of that column's values; the aggregation methods rely on it.
    """

    def agg(self, by, func):
        """Apply ``func`` to the ``by`` column of each group.

        Args:
            by: Column name handed to each group's ``select``.
            func: Callable that receives the iterable of column values.

        Returns:
            A plain dict mapping each group key to ``func``'s result.
        """
        results = {}
        for key, members in self.items():
            results[key] = func(members.select(by))
        return results

    def sum(self, by):
        """Total of column ``by`` within each group."""
        # Inside a method body the bare name resolves to the builtin, not to
        # this method, because class scope is not searched from functions.
        return self.agg(by, sum)

    def max(self, by):
        """Largest value of column ``by`` within each group."""
        return self.agg(by, max)

    def min(self, by):
        """Smallest value of column ``by`` within each group."""
        return self.agg(by, min)

    def avg(self, by):
        """Mean of column ``by`` within each group (plain ``/`` division)."""
        def mean(values):
            # Materialise first: the values may be a one-shot iterator and
            # are needed for both the total and the count.
            items = list(values)
            return sum(items) / len(items)
        return self.agg(by, mean)


    
class Table(list):
    """A list of records with small column-oriented helpers.

    Records are read with ``getattr(record, column)``; ``apply`` additionally
    needs each record to offer ``_replace`` (as namedtuple instances do).
    """

    def group(self, by):
        """Partition the records by the value of column ``by``.

        Returns a GroupBy whose values are Table instances, one per distinct
        value of the column.
        """
        buckets = GroupBy(Table)
        for record in self:
            key = getattr(record, by)
            buckets[key].append(record)
        return buckets

    def select(self, col):
        """Lazily yield the value of column ``col`` for every record, in order."""
        return (getattr(record, col) for record in self)

    def apply(self, col, func):
        """Return a new Table in which column ``col`` is replaced by ``func(value)``.

        The records of this table are left untouched; each output record is
        produced with ``_replace``.
        """
        updated = []
        for record in self:
            new_value = func(getattr(record, col))
            updated.append(record._replace(**{col: new_value}))
        return Table(updated)

In [54]:
# Wrap the records in a Table, convert the Price column to ints, group by
# country and total each country's prices.
t = Table(table)
t.apply("Price", parse_price).group("Country").sum("Price")

{'Argentina': 1200,
 'Australia': 64800,
 'Austria': 10800,
 'Bahrain': 1200,
 'Belgium': 12000,
 'Bermuda': 1200,
 'Brazil': 12300,
 'Bulgaria': 1200,
 'Canada': 124800,
 'Cayman Isls': 1200,
 'China': 1200,
 'Costa Rica': 1200,
 'Czech Republic': 6000,
 'Denmark': 18000,
 'Dominican Republic': 1200,
 'Finland': 2400,
 'France': 53100,
 'Germany': 42000,
 'Greece': 1200,
 'Guatemala': 1200,
 'Hong Kong': 1200,
 'Hungary': 3600,
 'Iceland': 1200,
 'India': 2400,
 'Ireland': 69900,
 'Israel': 1200,
 'Italy': 37800,
 'Japan': 2400,
 'Jersey': 1200,
 'Kuwait': 1200,
 'Latvia': 1200,
 'Luxembourg': 1200,
 'Malaysia': 1200,
 'Malta': 4800,
 'Mauritius': 3600,
 'Moldova': 1200,
 'Monaco': 2400,
 'Netherlands': 44700,
 'New Zealand': 7200,
 'Norway': 21600,
 'Philippines': 2400,
 'Poland': 2400,
 'Romania': 1200,
 'Russia': 3600,
 'South Africa': 12300,
 'South Korea': 1200,
 'Spain': 16800,
 'Sweden': 22800,
 'Switzerland': 76800,
 'Thailand': 4800,
 'The Bahamas': 2400,
 'Turkey': 7200,
 'U

In [55]:
# Same pipeline, but calling agg() directly with the builtin max to get the
# highest price seen in each country.
t = Table(table).apply("Price", parse_price)
t.group("Country").agg("Price", max)

{'Argentina': 1200,
 'Australia': 3600,
 'Austria': 3600,
 'Bahrain': 1200,
 'Belgium': 3600,
 'Bermuda': 1200,
 'Brazil': 7500,
 'Bulgaria': 1200,
 'Canada': 3600,
 'Cayman Isls': 1200,
 'China': 1200,
 'Costa Rica': 1200,
 'Czech Republic': 3600,
 'Denmark': 1200,
 'Dominican Republic': 1200,
 'Finland': 1200,
 'France': 7500,
 'Germany': 3600,
 'Greece': 1200,
 'Guatemala': 1200,
 'Hong Kong': 1200,
 'Hungary': 1200,
 'Iceland': 1200,
 'India': 1200,
 'Ireland': 7500,
 'Israel': 1200,
 'Italy': 7500,
 'Japan': 1200,
 'Jersey': 1200,
 'Kuwait': 1200,
 'Latvia': 1200,
 'Luxembourg': 1200,
 'Malaysia': 1200,
 'Malta': 3600,
 'Mauritius': 3600,
 'Moldova': 1200,
 'Monaco': 1200,
 'Netherlands': 7500,
 'New Zealand': 1200,
 'Norway': 3600,
 'Philippines': 1200,
 'Poland': 1200,
 'Romania': 1200,
 'Russia': 3600,
 'South Africa': 7500,
 'South Korea': 1200,
 'Spain': 3600,
 'Sweden': 3600,
 'Switzerland': 3600,
 'Thailand': 3600,
 'The Bahamas': 1200,
 'Turkey': 1200,
 'Ukraine': 1200,
 '

In [65]:
# Price statistics per product, one aggregate per line: max, min, avg, sum.
# NOTE(review): the recorded output lists both 'Product3' and 'Product3 '
# (trailing space) as separate groups - the Product column looks un-stripped;
# consider normalising it before grouping.
t = Table(table).apply("Price", parse_price)
print t.group("Product").max("Price")
print t.group("Product").min("Price")
print t.group("Product").avg("Price")
print t.group("Product").sum("Price")

{'Product3': 7500, 'Product2': 3600, 'Product1': 13000, 'Product3 ': 7500}
{'Product3': 7500, 'Product2': 3600, 'Product1': 250, 'Product3 ': 7500}
{'Product3': 7500, 'Product2': 3600, 'Product1': 1214, 'Product3 ': 7500}
{'Product3': 105000, 'Product2': 489600, 'Product1': 1028400, 'Product3 ': 7500}
