# List vs Deque

In [3]:
from collections import deque

In [30]:
dq = deque()
ls = list()

In [8]:
dq.append(1)
ls.append(1)

dq.appendleft(1)
ls.insert(0, 1)

### Append/Prepend

In [34]:
%%timeit
ls = []
for i in xrange(100):
    ls.append(1)

100000 loops, best of 3: 7.37 µs per loop


In [36]:
%%timeit
dq = deque()
for i in xrange(100):
    dq.append(1)

The slowest run took 4.26 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 7.33 µs per loop


In [31]:
%%timeit
ls = []
for i in xrange(100):
    ls.insert(0, 1)

10000 loops, best of 3: 18.9 µs per loop


In [41]:
%%timeit
dq = deque()
for i in xrange(100):
    dq.appendleft(1)

The slowest run took 5.42 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 7.74 µs per loop


### Access

In [45]:
ls = range(100)
dq = deque(range(100))

#### Random Access

In [52]:
%%timeit
dq[50]

The slowest run took 56.59 times longer than the fastest. This could mean that an intermediate result is being cached.
10000000 loops, best of 3: 39.3 ns per loop


In [48]:
%%timeit
ls[50]

10000000 loops, best of 3: 28.3 ns per loop


#### Sequential Access

In [55]:
%%timeit
for x in ls:
    pass

The slowest run took 5.09 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 983 ns per loop


In [56]:
%%timeit
for x in dq:
    pass

The slowest run took 17.51 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 948 ns per loop


#### Slices

In [57]:
%%timeit
ls[5:15]

The slowest run took 14.84 times longer than the fastest. This could mean that an intermediate result is being cached.
10000000 loops, best of 3: 129 ns per loop


In [59]:
%%timeit
list(dq)[5:15] # deques don't support slicing directly

The slowest run took 5.08 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 798 ns per loop


# Dictionary vs defaultdict

In [73]:
from collections import defaultdict
from pprint import pprint

In [79]:
d = {}
dd = defaultdict(int)

d["key"] = 1
dd["key"] = 1

print d
print dd

{'key': 1}
defaultdict(<type 'int'>, {'key': 1})


In [80]:
d = {}
dd = defaultdict(int)

print dd["key"]
print d["key"]

0


KeyError: 'key'

In [89]:
#%%timeit
d = {}

for x in range(30):
    bucket = "even" if (x % 2 == 0) else "odd"
    if bucket not in d:
        d[bucket] = []
    d[bucket].append(x)

d

{'even': [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28],
 'odd': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]}

In [90]:
#%%timeit
dd = defaultdict(list)

for x in range(30):
    bucket = "even" if (x % 2 == 0) else "odd"
    dd[bucket].append(x)

dd

defaultdict(list,
            {'even': [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28],
             'odd': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]})

In [109]:
import requests
words = requests.get("https://raw.githubusercontent.com/berzerk0/Probable-Wordlists/master/Real-Passwords/Top95Thousand-probable.txt").text.split("\n")

### Counting

You can count with vanilla dictionaries by exploiting the default portion of the `.get` method.
This is a good way to prepare data for use in a histogram.

In [110]:
by_len = {}
dist_ch = {}
for word in words:
    by_len[len(word)] = by_len.get(len(word), 0) + 1
    
    chs = len(set(word))
    dist_ch[chs] = dist_ch.get(chs, 0) + 1

pprint(by_len)
pprint(dist_ch)

{0: 2,
 1: 57,
 2: 295,
 3: 540,
 4: 2417,
 5: 3492,
 6: 17508,
 7: 13382,
 8: 34001,
 9: 12680,
 10: 6757,
 11: 2302,
 12: 1042,
 13: 334,
 14: 124,
 15: 27,
 16: 13,
 17: 1,
 18: 3,
 19: 1,
 20: 6,
 26: 1}
{0: 2,
 1: 401,
 2: 1378,
 3: 3142,
 4: 6998,
 5: 14199,
 6: 23367,
 7: 23065,
 8: 15668,
 9: 4953,
 10: 1435,
 11: 245,
 12: 98,
 13: 12,
 14: 6,
 15: 3,
 16: 7,
 19: 1,
 20: 4,
 26: 1}


### Grouping

In [111]:
#%%timeit
d = {}
for word in words:
    chs = len(set(word))
    d.setdefault(chs, []).append(word)
    
d

{0: [u'', u''],
 1: [u'1',
  u'111111',
  u'11111111',
  u'000000',
  u'aaaaaa',
  u'7777777',
  u'666666',
  u'88888888',
  u'555555',
  u'888888',
  u'xxxxxx',
  u'999999',
  u'333333',
  u'222222',
  u'1111',
  u'777777',
  u'aaaaaaaa',
  u'00000000',
  u'11111',
  u'444444',
  u'1111111',
  u'0000',
  u'bbbbbb',
  u'999999999',
  u'99999999',
  u'zzzzzz',
  u'pppppp',
  u'mmmmmm',
  u'77777777',
  u'55555',
  u'qqqqqq',
  u'22222222',
  u'gggggg',
  u'a',
  u'2222',
  u'xxxxxxxx',
  u'wwwwww',
  u'dddddd',
  u'1111111111',
  u'ssssss',
  u'oooooo',
  u'aaaa',
  u'55555555',
  u'qqqqqqqq',
  u'111',
  u'pppppppp',
  u'ffffff',
  u'7777',
  u'44444444',
  u'cccccc',
  u'4444',
  u'zzzzzzzz',
  u'tttttt',
  u'llllll',
  u'kkkkkk',
  u'jjjjjj',
  u'hhhhhh',
  u'aaa',
  u'2',
  u'xxxxx',
  u's',
  u'mmmmmmmm',
  u'm',
  u'0000000000',
  u'00000',
  u'eeeeee',
  u'33333333',
  u'vvvvvv',
  u'gggggggg',
  u'e',
  u'5555',
  u'xxxxxxx',
  u'rrrrrr',
  u'aaaaaaaaaa',
  u'7',
  u'66666666',


In [100]:
#%%timeit
dd = defaultdict(list)
for word in words:
    chs = len(set(word))
    dd[chs].append(word)

dict(dd)

100 loops, best of 3: 2.95 ms per loop


### More counting

In [101]:
from collections import Counter

In [113]:
c = Counter()
for word in words:
    dist_ch = tuple(sorted(set(word)))
    c[dist_ch] += 1

c.most_common(5)

[((u'1', u'2', u'3'), 94),
 ((u'1', u'2', u'3', u'4', u'5', u'6', u'7', u'8', u'9'), 93),
 ((u'0', u'1', u'2'), 76),
 ((u'0', u'1', u'2', u'9'), 59),
 ((u'1', u'2', u'3', u'4', u'5', u'6'), 58)]

In [115]:
c = Counter()
for word in words:
    chs = len(set(word))
    c[chs] += 1

c.most_common(5)

[(6, 23367), (7, 23065), (8, 15668), (5, 14199), (4, 6998)]

In [132]:
import csv
dataset = requests.get("http://samplecsvs.s3.amazonaws.com/SalesJan2009.csv")
r = csv.DictReader(dataset.iter_lines())
rows = list(r)

In [133]:
len(rows)

998

In [134]:
rows[0]

{'Account_Created': '1/2/09 6:00',
 'City': 'Basildon',
 'Country': 'United Kingdom',
 'Last_Login': '1/2/09 6:08',
 'Latitude': '51.5',
 'Longitude': '-1.1166667',
 'Name': 'carolina',
 'Payment_Type': 'Mastercard',
 'Price': '1200',
 'Product': 'Product1',
 'State': 'England',
 'Transaction_date': '1/2/09 6:17'}

In [135]:
countries = Counter()
for row in rows:
    countries[row["Country"]] += 1
countries.most_common(3)

[('United States', 463), ('United Kingdom', 100), ('Canada', 76)]

In [146]:
def parse_revenue(rev):
    return int(filter(lambda c: c.isdigit(), rev))

In [148]:
revenue_by_country = defaultdict(list)
for row in rows:
    revenue_by_country[row["Country"]].append(parse_revenue(row["Price"]))
    
sorted((sum(rev), c) for c, rev in revenue_by_country.items())

[(1200, 'Argentina'),
 (1200, 'Bahrain'),
 (1200, 'Bermuda'),
 (1200, 'Bulgaria'),
 (1200, 'Cayman Isls'),
 (1200, 'China'),
 (1200, 'Costa Rica'),
 (1200, 'Dominican Republic'),
 (1200, 'Greece'),
 (1200, 'Guatemala'),
 (1200, 'Hong Kong'),
 (1200, 'Iceland'),
 (1200, 'Israel'),
 (1200, 'Jersey'),
 (1200, 'Kuwait'),
 (1200, 'Latvia'),
 (1200, 'Luxembourg'),
 (1200, 'Malaysia'),
 (1200, 'Moldova'),
 (1200, 'Romania'),
 (1200, 'South Korea'),
 (1200, 'Ukraine'),
 (2400, 'Finland'),
 (2400, 'India'),
 (2400, 'Japan'),
 (2400, 'Monaco'),
 (2400, 'Philippines'),
 (2400, 'Poland'),
 (2400, 'The Bahamas'),
 (3600, 'Hungary'),
 (3600, 'Mauritius'),
 (3600, 'Russia'),
 (4800, 'Malta'),
 (4800, 'Thailand'),
 (6000, 'Czech Republic'),
 (7200, 'New Zealand'),
 (7200, 'Turkey'),
 (10800, 'Austria'),
 (12000, 'Belgium'),
 (12000, 'United Arab Emirates'),
 (12300, 'Brazil'),
 (12300, 'South Africa'),
 (16800, 'Spain'),
 (18000, 'Denmark'),
 (21600, 'Norway'),
 (22800, 'Sweden'),
 (37800, 'Italy'),
 

In [165]:
def avg(prices):
    return sum(prices) / len(prices)

group = defaultdict(list)
for row in rows:
    group[row["Country"]].append(parse_revenue(row["Price"]))

price_info = {country: (avg(prices), sum(prices), len(prices)) for country, prices in group.items()}
pprint(price_info)

from operator import itemgetter

print "best average price"
pprint(max(price_info.items(), key=itemgetter(1)))

print "most transactions"
pprint(max(price_info.items(), key=lambda pair: pair[1][2]))

{'Argentina': (1200, 1200, 1),
 'Australia': (1705, 64800, 38),
 'Austria': (1542, 10800, 7),
 'Bahrain': (1200, 1200, 1),
 'Belgium': (1500, 12000, 8),
 'Bermuda': (1200, 1200, 1),
 'Brazil': (2460, 12300, 5),
 'Bulgaria': (1200, 1200, 1),
 'Canada': (1642, 124800, 76),
 'Cayman Isls': (1200, 1200, 1),
 'China': (1200, 1200, 1),
 'Costa Rica': (1200, 1200, 1),
 'Czech Republic': (2000, 6000, 3),
 'Denmark': (1200, 18000, 15),
 'Dominican Republic': (1200, 1200, 1),
 'Finland': (1200, 2400, 2),
 'France': (1966, 53100, 27),
 'Germany': (1680, 42000, 25),
 'Greece': (1200, 1200, 1),
 'Guatemala': (1200, 1200, 1),
 'Hong Kong': (1200, 1200, 1),
 'Hungary': (1200, 3600, 3),
 'Iceland': (1200, 1200, 1),
 'India': (1200, 2400, 2),
 'Ireland': (1426, 69900, 49),
 'Israel': (1200, 1200, 1),
 'Italy': (2520, 37800, 15),
 'Japan': (1200, 2400, 2),
 'Jersey': (1200, 1200, 1),
 'Kuwait': (1200, 1200, 1),
 'Latvia': (1200, 1200, 1),
 'Luxembourg': (1200, 1200, 1),
 'Malaysia': (1200, 1200, 1),
 'M

In [168]:
from collections import namedtuple
Record = namedtuple('Record', " ".join(rows[0]))

In [171]:
rec = Record(**rows[0])
rec

Record(City='Basildon', Product='Product1', Name='carolina', Country='United Kingdom', Price='1200', Longitude='-1.1166667', State='England', Transaction_date='1/2/09 6:17', Last_Login='1/2/09 6:08', Payment_Type='Mastercard', Latitude='51.5', Account_Created='1/2/09 6:00')

In [172]:
table = [Record(**row) for row in rows]

In [178]:
c = Counter()
for rec in table:
    c[rec.Payment_Type] += parse_revenue(rec.Price)
c.most_common()

[('Visa', 849350),
 ('Mastercard', 458450),
 ('Amex', 188900),
 ('Diners', 133800)]

In [199]:
class GroupBy(defaultdict):
    
    def sum(self, by):
        return {k: sum(rows.select(by)) for k, rows in self.items()}

    
class Table(list):
    
    def group(self, by):
        grouping = GroupBy(Table)
        for row in self:
            grouping[getattr(row, by)].append(row)
        return grouping
    
    def select(self, col):
        for row in self:
            yield getattr(row, col)
            
    def apply(self, col, func):
        return Table([row._replace(**{col: func(getattr(row, col))}) for row in self])

In [201]:
t = Table(table)
t.apply("Price", parse_revenue).group("Country").sum("Price")

{'Argentina': 1200,
 'Australia': 64800,
 'Austria': 10800,
 'Bahrain': 1200,
 'Belgium': 12000,
 'Bermuda': 1200,
 'Brazil': 12300,
 'Bulgaria': 1200,
 'Canada': 124800,
 'Cayman Isls': 1200,
 'China': 1200,
 'Costa Rica': 1200,
 'Czech Republic': 6000,
 'Denmark': 18000,
 'Dominican Republic': 1200,
 'Finland': 2400,
 'France': 53100,
 'Germany': 42000,
 'Greece': 1200,
 'Guatemala': 1200,
 'Hong Kong': 1200,
 'Hungary': 3600,
 'Iceland': 1200,
 'India': 2400,
 'Ireland': 69900,
 'Israel': 1200,
 'Italy': 37800,
 'Japan': 2400,
 'Jersey': 1200,
 'Kuwait': 1200,
 'Latvia': 1200,
 'Luxembourg': 1200,
 'Malaysia': 1200,
 'Malta': 4800,
 'Mauritius': 3600,
 'Moldova': 1200,
 'Monaco': 2400,
 'Netherlands': 44700,
 'New Zealand': 7200,
 'Norway': 21600,
 'Philippines': 2400,
 'Poland': 2400,
 'Romania': 1200,
 'Russia': 3600,
 'South Africa': 12300,
 'South Korea': 1200,
 'Spain': 16800,
 'Sweden': 22800,
 'Switzerland': 76800,
 'Thailand': 4800,
 'The Bahamas': 2400,
 'Turkey': 7200,
 'U