## division

In [1]:
7 / 2

3.5

In [2]:
7 // 2

3

In [3]:
7 % 2

1

## tuple
- fixed-length
- immutable

In [4]:
tup = 4, 5, 6
tup

(4, 5, 6)

In [5]:
(4, None, 'foo') + (6, 0) + ('bar',)

(4, None, 'foo', 6, 0, 'bar')

In [6]:
('foo', 'bar') * 2

('foo', 'bar', 'foo', 'bar')

In [7]:
(4, 3, 2) + (1, 2, 6)

(4, 3, 2, 1, 2, 6)

In [8]:
(4, 3, 2) * 2

(4, 3, 2, 4, 3, 2)

In [9]:
tup[1]

5

Unpacking tuples

In [10]:
tup = (4, 5, 6)
a, b, c = tup
b

5

Iteration

In [11]:
seq = [( 1, 2, 3), ( 4, 5, 6), ( 7, 8, 9)]

for a, b, c in seq:
    print('a={0}, b={1}, c={2}'.format(a, b, c))

a=1, b=2, c=3
a=4, b=5, c=6
a=7, b=8, c=9


Tuple methods

In [12]:
tup = (1, 2, 2, 2, 3, 4, 5, 6)
tup.count(2)

3

In [13]:
tup[1]

2

zip

In [14]:
tup1 = tuple(['foo', 'bar'])
tup2 = tuple(['one', 'two'])
tuple(zip(tup1, tup2))

(('foo', 'one'), ('bar', 'two'))

## array

In [15]:
import numpy as np
arr_a = np.array([1, 2, 3])
arr_a

array([1, 2, 3])

Insert one element into array

In [16]:
np.append(arr_a, 1)

array([1, 2, 3, 1])

Append 2 arrays

In [17]:
arr_b = np.array([1, 6, 7])
np.append(arr_a, arr_b)

array([1, 2, 3, 1, 6, 7])

## list
- variable-length
- can be modified in place
- `list`, `[]`

Convert a tuple to list

In [18]:
tup = ('foo', 'bar')
list(tup)

['foo', 'bar']

In [19]:
type(tup)

tuple

In [20]:
type(list(tup))

list

Adding elements

In [21]:
a_list = list(tup)
a_list

['foo', 'bar']

In [22]:
a_list.append('new')
a_list

['foo', 'bar', 'new']

In [23]:
a_list.insert(1, 'red')
a_list

['foo', 'red', 'bar', 'new']

Removing elements

In [24]:
a_list.pop(2) # in-place

'bar'

In [25]:
a_list

['foo', 'red', 'new']

In [26]:
a_list.remove('foo') # in-place
a_list

['red', 'new']

Check if a list contains a value using the `in` keyword

In [27]:
'red' in a_list

True

In [28]:
'red' not in a_list

False

In [29]:
'red'.isin(a_list)

AttributeError: 'str' object has no attribute 'isin'

Concatenating & combining lists

In [30]:
[4, None, 'bar'] + ['foo', 2, (6, 2)]

[4, None, 'bar', 'foo', 2, (6, 2)]

In [31]:
[4, None, 'bar'] * 2

[4, None, 'bar', 4, None, 'bar']

In [32]:
x = [4, None, 'bar']
x.extend(['new', 'bar']) # faster than '+'
x

[4, None, 'bar', 'new', 'bar']

In [33]:
[4, 3, 2] + [1, 2, 6]

[4, 3, 2, 1, 2, 6]

In [34]:
[4, 3, 2] * 2

[4, 3, 2, 4, 3, 2]

Sorting

In [35]:
a = [7, 4, 8, 2, 5]
a.sort()
a

[2, 4, 5, 7, 8]

In [36]:
a = [7, 4, 8, 2, 5]
sorted(a)

[2, 4, 5, 7, 8]

Slicing

In [37]:
seq = [7, 1, 3, 6, 2, 8, 9, 12, 16]

In [38]:
seq[2]

3

In [39]:
seq[[2]]

TypeError: list indices must be integers or slices, not list

In [40]:
seq[1:3]

[1, 3]

In [41]:
seq[1:]

[1, 3, 6, 2, 8, 9, 12, 16]

In [42]:
seq[:3]

[7, 1, 3]

In [43]:
seq[-3:-1]

[9, 12]

In [44]:
seq[-3:]

[9, 12, 16]

In [45]:
seq[:-2]

[7, 1, 3, 6, 2, 8, 9]

In [46]:
seq[::-1]

[16, 12, 9, 8, 2, 6, 3, 1, 7]

In [47]:
seq[::2]

[7, 3, 2, 9, 16]

Iteration

In [48]:
for i in seq:
    print(i)

7
1
3
6
2
8
9
12
16


In [49]:
for i, c in enumerate(seq):
    print(i, c)

0 7
1 1
2 3
3 6
4 2
5 8
6 9
7 12
8 16


list comprehension

`[expr for val in collection if condition]`

In [50]:
strings = ['f', 'bar', 'car', 'dove']
[string.upper() for string in strings]

['F', 'BAR', 'CAR', 'DOVE']

In [51]:
[string.upper() for string in strings if len(string) > 2]

['BAR', 'CAR', 'DOVE']

Zip

In [52]:
seq1 = ['foo', 'bar']
seq2 = ['one', 'two']
list(zip(seq1, seq2))

[('foo', 'one'), ('bar', 'two')]

In [53]:
# simultaneously iterate over multiple sequences
for i, (a, b) in enumerate(zip(seq1, seq2)):
    print('{0}: {1}, {2}'.format(i, a, b))

0: foo, one
1: bar, two


In [54]:
# unzip
pitchers = [('Nolan', 'Ryan'), ('Schilling', 'Curt')]
first_name, last_name = zip(*pitchers)
print('first_name:', first_name)
print('last_name:', last_name)

first_name: ('Nolan', 'Schilling')
last_name: ('Ryan', 'Curt')


Reversed

In [55]:
list(reversed(range(10)))

[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

## dict

In [56]:
d1 = {'a': 1, 'b': 2}
d1

{'a': 1, 'b': 2}

In [57]:
d1['b']

2

In [58]:
'a' in d1

True

Add a new key

In [59]:
d1[7] = 'seven'
d1

{'a': 1, 'b': 2, 7: 'seven'}

Update value & add new key

In [60]:
d1.update({'b':'new b value', 'c':12})
d1

{'a': 1, 'b': 'new b value', 7: 'seven', 'c': 12}

Delete key

In [61]:
d1[5] = 'some value'
d1['dummy'] = 'another value'
d1

{'a': 1,
 'b': 'new b value',
 7: 'seven',
 'c': 12,
 5: 'some value',
 'dummy': 'another value'}

In [62]:
del d1[5]
d1

{'a': 1, 'b': 'new b value', 7: 'seven', 'c': 12, 'dummy': 'another value'}

In [63]:
ret = d1.pop('dummy')
print('ret:', ret)
print(d1)

ret: another value
{'a': 1, 'b': 'new b value', 7: 'seven', 'c': 12}


Creating dicts from sequences

In [64]:
map = {}
for key, value in zip(range(5), reversed(range(5))):
    map[key] = value

map

{0: 4, 1: 3, 2: 2, 3: 1, 4: 0}

In [65]:
dict(zip(range(5), reversed(range(5))))

{0: 4, 1: 3, 2: 2, 3: 1, 4: 0}

Default values

In [66]:
words = ['apple', 'bar', 'book', 'foo']
by_letter = {}

for word in words:
    letter = word[0]
    if letter not in by_letter:
        by_letter[letter] = [word]
    else:
        by_letter[letter].append(word)

by_letter

{'a': ['apple'], 'b': ['bar', 'book'], 'f': ['foo']}

In [67]:
words = ['apple', 'bar', 'book', 'foo']
by_letter_2 = {}

for word in words:
    letter = word[0]
    by_letter_2.setdefault(letter, []).append(word)

by_letter_2

{'a': ['apple'], 'b': ['bar', 'book'], 'f': ['foo']}

Iteration

In [68]:
europe = {'Spain': 'Madrid',
          'France': 'Paris',
          'Germany': 'Berlin',
          'Norway': 'Oslo',
          'Italy': 'Rome',
          'Poland': 'Warsaw',
          'Australia': 'Canberra'}

for country, city in europe.items():
    print('The capital of {0} is {1}.'.format(country, city))

The capital of Spain is Madrid.
The capital of France is Paris.
The capital of Germany is Berlin.
The capital of Norway is Oslo.
The capital of Italy is Rome.
The capital of Poland is Warsaw.
The capital of Australia is Canberra.


dict comprehension

`{key-expr : value-expr for value in collection if condition}`

In [69]:
words = ['apple', 'banana', 'candy']
{word[0]:word for word in words}

{'a': 'apple', 'b': 'banana', 'c': 'candy'}

## set
- unordered collection of unique elements
- `set()`
- `{}`

In [70]:
set([2, 2, 2, 3, 3, 1, 1])

{1, 2, 3}

In [71]:
{2, 2, 2, 3, 1, 1, 3}

{1, 2, 3}

Add element

In [72]:
s = {4, 2, 5, 1, 4}
s

{1, 2, 4, 5}

In [73]:
# "add" adds an element
s.add(3)
s

{1, 2, 3, 4, 5}

In [74]:
s.add(3)
s

{1, 2, 3, 4, 5}

In [75]:
# "update" adds another iterable set, list or tuple
s.update({5, 7, 1})
s

{1, 2, 3, 4, 5, 7}

In [76]:
s.update([19, 15, 12])
s

{1, 2, 3, 4, 5, 7, 12, 15, 19}

In [77]:
s.update(tuple([18, 12, 20]))
s

{1, 2, 3, 4, 5, 7, 12, 15, 18, 19, 20}

Remove/delete one element

In [78]:
s.remove(0)

KeyError: 0

In [79]:
s.remove(1)
s

{2, 3, 4, 5, 7, 12, 15, 18, 19, 20}

Sorting

In [80]:
s.sort()

AttributeError: 'set' object has no attribute 'sort'

In [81]:
sorted(s)

[2, 3, 4, 5, 7, 12, 15, 18, 19, 20]

Slicing

In [82]:
s[0]

TypeError: 'set' object is not subscriptable

In [83]:
s[[0]]

TypeError: 'set' object is not subscriptable

In [84]:
list(s)[0]

2

Combining sets

In [85]:
{1, 2, 3} + {1, 2, 3}

TypeError: unsupported operand type(s) for +: 'set' and 'set'

In [86]:
# union
print({1, 2, 3}.union({1, 2, 3}))
print({1, 2, 3} | {4, 5, 6})

{1, 2, 3}
{1, 2, 3, 4, 5, 6}


Iteration

In [87]:
for i in s:
    print(i)

2
3
4
5
7
12
15
18
19
20


In [88]:
for i in enumerate(s):
    print(i)

(0, 2)
(1, 3)
(2, 4)
(3, 5)
(4, 7)
(5, 12)
(6, 15)
(7, 18)
(8, 19)
(9, 20)


Zip

In [89]:
s1 = {5, 3, 6}
s2 = set([2, 7, 1])
zip(s1, s2)

<zip at 0x11322c888>

In [90]:
set(zip(s1, s2))

{(3, 1), (5, 2), (6, 7)}

Set operations

In [91]:
a = {1, 2, 3}
b = {2, 3, 4, 5, 6}

In [92]:
# union
print(a.union(b))
print(a | b)

{1, 2, 3, 4, 5, 6}
{1, 2, 3, 4, 5, 6}


In [93]:
# intersection
print(a.intersection(b))
print(a & b)

{2, 3}
{2, 3}


In [94]:
# subset
a_set = {1, 2, 3, 4, 5}
{1, 2, 3}.issubset(a_set)

True

In [95]:
a_set.issuperset({1, 2, 3})

True

set comprehension

`{expr for value in collection if condition}`

In [96]:
strings = ['f', 'bar', 'car', 'dove']
{len(string) for string in strings}

{1, 3, 4}

## string

Add

In [97]:
m = 'string'
m + 'g'

'stringg'

Update / Replace

In [98]:
strA = "Game of Thrones"
strA.replace('e', 'E')

'GamE of ThronEs'

Iteration

In [99]:
for i in m:
    print(i)

s
t
r
i
n
g


Slicing

In [100]:
m[1]

't'

In [101]:
m[[1]]

TypeError: string indices must be integers

## numpy

In [102]:
import numpy as np
rdm_ary = np.random.randn(2, 3)
rdm_ary

array([[-0.5425803 ,  1.10660782, -0.2253057 ],
       [ 0.63488708, -0.28611817,  1.42039329]])

In [103]:
rdm_ary * 10

array([[-5.425803  , 11.06607815, -2.253057  ],
       [ 6.34887081, -2.86118173, 14.20393286]])

In [104]:
rdm_ary.ndim

2

In [105]:
rdm_ary.shape

(2, 3)

In [106]:
# zero
np.zeros((2, 3))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [107]:
# arange
np.arange(5)

array([0, 1, 2, 3, 4])

### dtype

In [108]:
np.array([1, 2, 3], dtype=np.float64)

array([1., 2., 3.])

In [109]:
np.array([1, 2, 3], dtype=np.int32)

array([1, 2, 3], dtype=int32)

### arithmetic

In [110]:
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr

array([[1, 2, 3],
       [4, 5, 6]])

In [111]:
arr - arr

array([[0, 0, 0],
       [0, 0, 0]])

In [112]:
arr * arr

array([[ 1,  4,  9],
       [16, 25, 36]])

### boolean arrays

In [113]:
bools = np.array([False, True, True])
bools

array([False,  True,  True])

In [114]:
bools.any()

True

In [115]:
bools.all()

False

### sorting

In [116]:
arr = np.random.randn(10)
arr

array([ 0.04431106,  2.71421597,  1.33844144,  0.43402978,  0.85884941,
        0.87402295,  0.21369091,  0.05295622, -0.84428921,  0.27257602])

In [117]:
arr.sort() # in place
arr

array([-0.84428921,  0.04431106,  0.05295622,  0.21369091,  0.27257602,
        0.43402978,  0.85884941,  0.87402295,  1.33844144,  2.71421597])

### unique

In [118]:
names = np.array(['Tom', 'Jerry', 'Amy', 'Jerry'])
names

array(['Tom', 'Jerry', 'Amy', 'Jerry'], dtype='<U5')

In [119]:
np.unique(names)

array(['Amy', 'Jerry', 'Tom'], dtype='<U5')

In [120]:
sorted(set(names))

['Amy', 'Jerry', 'Tom']

## pandas series

Creation

In [121]:
import pandas as pd
obj = pd.Series([4, -7, 2])
obj

0    4
1   -7
2    2
dtype: int64

In [122]:
mdata = {'Ohio': 35000, 'Texas': 70000, 'Utah': 5000}
obj2 = pd.Series(mdata)
obj2

Ohio     35000
Texas    70000
Utah      5000
dtype: int64

In [123]:
states = ['Califonia', 'Ohio', 'Texas', 'Utah']
obj3 = pd.Series(mdata, index=states)
obj3

Califonia        NaN
Ohio         35000.0
Texas        70000.0
Utah          5000.0
dtype: float64

arithmetic

In [124]:
obj[[1]] * 2

1   -14
dtype: int64

In [125]:
obj * 2

0     8
1   -14
2     4
dtype: int64

In [126]:
obj2 + obj3

Califonia         NaN
Ohio          70000.0
Texas        140000.0
Utah          10000.0
dtype: float64

`in`

In [127]:
-7 in obj

False

In [128]:
0 in obj

True

Check `null`

In [129]:
pd.isnull(obj3)

Califonia     True
Ohio         False
Texas        False
Utah         False
dtype: bool

In [130]:
pd.notnull(obj3)

Califonia    False
Ohio          True
Texas         True
Utah          True
dtype: bool

### drop

In [131]:
new_obj = obj.drop(1)
new_obj

0    4
2    2
dtype: int64

### indexing, selection, filtering

In [132]:
obj = pd.Series([4, -7, 2], index=['a', 'b', 'c'])
obj

a    4
b   -7
c    2
dtype: int64

In [133]:
obj['b']

-7

In [134]:
obj[1]

-7

In [135]:
obj[[1]]

b   -7
dtype: int64

In [136]:
obj[1:2]

b   -7
dtype: int64

In [137]:
obj[0:]

a    4
b   -7
c    2
dtype: int64

In [138]:
obj[:2]

a    4
b   -7
dtype: int64

In [139]:
# slicing with labels behaves differently than normal Python slicing in that the end-point is inclusive
obj['b':'c']

b   -7
c    2
dtype: int64

In [140]:
obj[[1, 2, 0]]

b   -7
c    2
a    4
dtype: int64

### arithmetic method

In [141]:
s1 = pd.Series([3, -1, 4], index=['a', 'b', 'd'])
s2 = pd.Series([2, -5, 0], index=['b', 'c', 'd'])
s1

a    3
b   -1
d    4
dtype: int64

In [142]:
s2

b    2
c   -5
d    0
dtype: int64

In [143]:
s1 + s2

a    NaN
b    1.0
c    NaN
d    4.0
dtype: float64

### sorting

In [144]:
obj = pd.Series(range(4), index=['b', 'd', 'a', 'c'])
obj

b    0
d    1
a    2
c    3
dtype: int64

In [145]:
obj.sort_index()

a    2
b    0
c    3
d    1
dtype: int64

In [146]:
obj.sort_values()

b    0
d    1
a    2
c    3
dtype: int64

### ranking

In [147]:
ser = pd.Series([7, -1, 3, 7, 0, 3])
ser

0    7
1   -1
2    3
3    7
4    0
5    3
dtype: int64

In [148]:
ser.rank(method='first')

0    5.0
1    1.0
2    3.0
3    6.0
4    2.0
5    4.0
dtype: float64

### `is_unique`

In [149]:
obj = pd.Series(range(5), index=['a', 'b', 'c', 'b', 'a'])
obj

a    0
b    1
c    2
b    3
a    4
dtype: int64

In [150]:
obj.index.is_unique

False

### `.isin()`

In [151]:
obj.isin([1, 2])

a    False
b     True
c     True
b    False
a    False
dtype: bool

## pandas dataframe

In [152]:
data = {'state': ['Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'Year':[2000, 2001, 2000, 2001],
        'pop':[1.5, 1.7, 2.4, 2.9]}
df = pd.DataFrame(data)
df

Unnamed: 0,state,Year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Nevada,2000,2.4
3,Nevada,2001,2.9


In [153]:
df[['state']]

Unnamed: 0,state
0,Ohio
1,Ohio
2,Nevada
3,Nevada


### drop

In [154]:
df.drop(2) # row

Unnamed: 0,state,Year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
3,Nevada,2001,2.9


In [155]:
df.drop('pop', axis='columns')

Unnamed: 0,state,Year
0,Ohio,2000
1,Ohio,2001
2,Nevada,2000
3,Nevada,2001


In [156]:
df.drop(columns='pop')

Unnamed: 0,state,Year
0,Ohio,2000
1,Ohio,2001
2,Nevada,2000
3,Nevada,2001


### indexing, selection, filtering

In [157]:
df = pd.DataFrame(np.arange(16).reshape(4, 4),
                  index=['Ohio', 'Colorado', 'Utah', 'New York'],
                  columns=['one', 'two', 'three', 'four'])
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [158]:
df['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [159]:
df[['two']]

Unnamed: 0,two
Ohio,1
Colorado,5
Utah,9
New York,13


In [160]:
df[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


`loc` & `iloc`

In [161]:
df.loc['Ohio']

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int64

In [162]:
df.loc['Ohio', 'two']

1

In [163]:
df.loc[:'Utah', 'two']

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

In [164]:
df.iloc[0]

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int64

In [165]:
df.iloc[0, 1]

1

In [166]:
df.iloc[:, :2]

Unnamed: 0,one,two
Ohio,0,1
Colorado,4,5
Utah,8,9
New York,12,13


### arithmetic method

In [167]:
df1 = pd.DataFrame(np.arange(4).reshape(2, 2),
                   index=['a', 'b'],
                   columns=list('bc'))
df2 = pd.DataFrame(np.arange(6).reshape(3, 2),
                   index=['b', 'c', 'd'],
                   columns=list('ab'))
df1

Unnamed: 0,b,c
a,0,1
b,2,3


In [168]:
df2

Unnamed: 0,a,b
b,0,1
c,2,3
d,4,5


In [169]:
df1 + df2

Unnamed: 0,a,b,c
a,,,
b,,3.0,
c,,,
d,,,


Operations between DataFrame and Series

In [170]:
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [171]:
ser = df.iloc[0]
ser

one      0
two      1
three    2
four     3
Name: Ohio, dtype: int64

In [172]:
df - ser

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,4,4,4,4
Utah,8,8,8,8
New York,12,12,12,12


### `apply`, `applymap`, `map`

In [173]:
df.apply(lambda x: x.max()-x.min())

one      12
two      12
three    12
four     12
dtype: int64

In [174]:
df.apply(lambda x: x.max()-x.min(), axis='columns')

Ohio        3
Colorado    3
Utah        3
New York    3
dtype: int64

In [175]:
df.applymap(lambda x: '%.2f' % x)

Unnamed: 0,one,two,three,four
Ohio,0.0,1.0,2.0,3.0
Colorado,4.0,5.0,6.0,7.0
Utah,8.0,9.0,10.0,11.0
New York,12.0,13.0,14.0,15.0


In [176]:
df['two'].map(lambda x: '%.2f' % x)

Ohio         1.00
Colorado     5.00
Utah         9.00
New York    13.00
Name: two, dtype: object

### sorting

In [177]:
df.sort_index()

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
New York,12,13,14,15
Ohio,0,1,2,3
Utah,8,9,10,11


In [178]:
df.sort_index(axis='columns')

Unnamed: 0,four,one,three,two
Ohio,3,0,2,1
Colorado,7,4,6,5
Utah,11,8,10,9
New York,15,12,14,13


In [179]:
df.sort_values('one')

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### ranking

In [180]:
df.rank()

Unnamed: 0,one,two,three,four
Ohio,1.0,1.0,1.0,1.0
Colorado,2.0,2.0,2.0,2.0
Utah,3.0,3.0,3.0,3.0
New York,4.0,4.0,4.0,4.0


In [181]:
df.rank(axis='columns')

Unnamed: 0,one,two,three,four
Ohio,1.0,2.0,3.0,4.0
Colorado,1.0,2.0,3.0,4.0
Utah,1.0,2.0,3.0,4.0
New York,1.0,2.0,3.0,4.0


### `.isin()`

In [182]:
df[df['two'].isin([2, 5, 6, 9])]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11


## date & time

### datetime -> string

In [183]:
from datetime import datetime
stamp = datetime(2011, 3, 3)
stamp

datetime.datetime(2011, 3, 3, 0, 0)

In [184]:
str(stamp)

'2011-03-03 00:00:00'

In [185]:
stamp.strftime('%Y-%m-%d')

'2011-03-03'

### string -> datetime

In [186]:
# method 1
value = '2019-01-20'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2019, 1, 20, 0, 0)

In [187]:
# method 2
pd.to_datetime(value)

Timestamp('2019-01-20 00:00:00')

In [188]:
# method 3
from dateutil.parser import parse
parse(value)

datetime.datetime(2019, 1, 20, 0, 0)