### Lists, Tuples, Dictionaries, Sets, Dataframes -- basics

#### References
1. Python for data analysis
2. Think stats: exploratory data analysis
3. https://pandas.pydata.org
#### Purpose
1. Work in the abstract 
2. Keep up to date with changes in the library
3. Explore new ways of doing common tasks --- get better

In [1]:
import pandas as pd
import numpy as np

### Tuples:

In [2]:
# start with the start
# tuples
tup = 4, 5, 6
nested_tup = (4, 5, 6), (7, 8)
# call tuple on a list
a = [1, 2, 3]
b = tuple(a)
tup_string = tuple('string')
print(tup_string[1], nested_tup, b)


t ((4, 5, 6), (7, 8)) (1, 2, 3)


In [3]:
# unpacking tuples
tup = (4,5,6)
a, b, c = tup
b

5

In [4]:
# unpacking nested tuples
tup = 4, 5, (6, 7)
a, b, (c, d) = tup
d

7

In [5]:
# swapping variables
a, b = 1, 2
b, a = a, b
b

1

In [6]:
# iterating over sequences of tuples (or lists)
seq = [(1, 2, 3,), (4,5,6), (7,8,9)]
def iter_print(s):
    """Print one formatted line for every three-item element of ``s``.

    Each element of ``s`` is unpacked into exactly three values, which are
    printed as ``a=..., b=..., c=...``.
    """
    template = 'a={0}, b={1}, c={2}'
    for first, second, third in s:
        line = template.format(first, second, third)
        print(line)
iter_print(seq)
# note how each name (a, b, c) is bound to the element at the matching position of each tuple

a=1, b=2, c=3
a=4, b=5, c=6
a=7, b=8, c=9


In [7]:
# a more or less new feature in Python (the reason I do this)
# *rest collects the remaining elements, so you can pick out just the first few from a sequence
# JavaScript has a similar feature

values = 1, 2, 3, 4, 5, 6
a, b, *rest = values
print((a,b), rest)

(1, 2) [3, 4, 5, 6]


### Lists - Arrays:

In [8]:
# lists adding and removing elements
a_string = ['ham', 'spam', 'eggs', 'butter', 'toast', 'jelly']
a_sequence = [1, 2, 3, 4, 5, 6, 7, 8]
a_string.append('bacon')
a_string.insert(2, 'coffee')
a_string

['ham', 'spam', 'coffee', 'eggs', 'butter', 'toast', 'jelly', 'bacon']

In [9]:
# pop
# removes the element at the indicated index
# returns the value removed
# a_sequence = [1, 2, 3, 4, 5, 6, 7, 8]
a_sequence.pop(3)


4

In [10]:
# remove
a_string.remove('spam')
# if you load this cell again and DO NOT reload "a_string"  or change 'spam' then:
# ValueError: list.remove(x): x not in list
a_string

['ham', 'coffee', 'eggs', 'butter', 'toast', 'jelly', 'bacon']

In [11]:
# concatenating and combining lists
two_lists = a_string + a_sequence
c = ['f', 4, 6, 9]
two_lists.extend(c)
two_lists
# see the difference between + / extend and append in the next cell

['ham',
 'coffee',
 'eggs',
 'butter',
 'toast',
 'jelly',
 'bacon',
 1,
 2,
 3,
 5,
 6,
 7,
 8,
 'f',
 4,
 6,
 9]

In [12]:
# with append the list is added as an element
# to better understand you could always get: len(a_string) before appending
# then do that again after appending, the length should increase by one only
a_string.append(a_sequence)
a_string

['ham',
 'coffee',
 'eggs',
 'butter',
 'toast',
 'jelly',
 'bacon',
 [1, 2, 3, 5, 6, 7, 8]]

In [13]:
# see how the list is just stuck on to the end
# calling 
a_string[-1]
# gives another list

[1, 2, 3, 5, 6, 7, 8]

In [14]:
# sorting lists
# first get rid of the hanging list
a_string.pop(-1)

[1, 2, 3, 5, 6, 7, 8]

In [15]:
a_string

['ham', 'coffee', 'eggs', 'butter', 'toast', 'jelly', 'bacon']

In [16]:
# okay onwards
# sort this in place
a_string.sort()
a_string

['bacon', 'butter', 'coffee', 'eggs', 'ham', 'jelly', 'toast']

In [17]:
# sort it by a key
a_string.sort(key=len)
a_string

['ham', 'eggs', 'bacon', 'jelly', 'toast', 'butter', 'coffee']

In [18]:
# good to know stuff on lists
import bisect
bisect.bisect(a_string, 'eggs')

3

In [19]:
bisect.bisect(a_string, 'bacon')

0

In [20]:
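# bisect assumes that the list is already sorted in ascending (here: alphabetical) order
# note: a_string was last sorted by length (cell 17), not alphabetically,
# so the positions returned here are not reliable insertion points
# test the theory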
bisect.bisect(a_string, 'toast')
# should output 7

7

In [21]:
# or whatever the last element
# slicing and notation always good to review this stuff
d = list('hammerdirt')
d

['h', 'a', 'm', 'm', 'e', 'r', 'd', 'i', 'r', 't']

In [22]:
# okay here we go
a_slice = d[2:7]
# insert some stuff
d[2:7] = ['w', 'o', 'r', 'k','s']
print(a_slice, d)

['m', 'm', 'e', 'r', 'd'] ['h', 'a', 'w', 'o', 'r', 'k', 's', 'i', 'r', 't']


In [23]:
# index from the end
d[-5:-1]
# notice how the last element is not included

['k', 's', 'i', 'r']

In [24]:
# get back to original
e = list('hammerdirt')
# getting every second element
e[::2]

['h', 'm', 'e', 'd', 'r']

In [25]:
# getting every third element
e[::3]

['h', 'm', 'd', 't']

In [26]:
#write it backwards
e[::-1]

['t', 'r', 'i', 'd', 'r', 'e', 'm', 'm', 'a', 'h']

In [27]:
# mapping values to position
# (the enumerate function itself is not covered in detail here)
def map_it(e):
    """Return a dict mapping each element of ``e`` to its index.

    When an element occurs more than once, the index of its last
    occurrence is the one kept.
    """
    return {element: position for position, element in enumerate(e)}
map_it(e)
# spits out a dictionary that maps each list element to its list position
# (a repeated element keeps the position of its last occurrence)

{'h': 0, 'a': 1, 'm': 3, 'e': 4, 'r': 8, 'd': 6, 'i': 7, 't': 9}

In [28]:
e

['h', 'a', 'm', 'm', 'e', 'r', 'd', 'i', 'r', 't']

In [29]:
# difference between sorted and sort
# sort works in place; sorted returns a new list
# observe
a_string.sort()
# no output

In [30]:
sorted(e)
# output

['a', 'd', 'e', 'h', 'i', 'm', 'm', 'r', 'r', 't']

In [31]:
e
# original list is untouched

['h', 'a', 'm', 'm', 'e', 'r', 'd', 'i', 'r', 't']

In [32]:
# zip 
seq_1 = ['a', 'b', 'c', 'd', 'e']
seq_2 = [1, 2, 3, 4, 5]
seq_3 = zip(seq_1, seq_2)
list(seq_3)

[('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)]

In [33]:
# unpacking zipped elements
mapping ={}
for b, (c, d) in enumerate(zip(seq_1, seq_2)):
    print('this is '+ str(b))
    mapping[b] = c, d
  
mapping

this is 0
this is 1
this is 2
this is 3
this is 4


{0: ('a', 1), 1: ('b', 2), 2: ('c', 3), 3: ('d', 4), 4: ('e', 5)}

In [34]:
# which means that you could call this
print(mapping[0], mapping[0][0])

('a', 1) a


In [35]:
# this is a good one if you receive data in pairs but you want to group element-wise
# for example you want to put eggs, ham and hash in one group and toast, butter, jelly in another 

break_fast = [('ham', 'toast'), ('eggs', 'jelly'), ('hash', 'butter')]
eggs, toast = zip(*break_fast)
print(eggs, toast)

('ham', 'eggs', 'hash') ('toast', 'jelly', 'butter')


### Dictionaries:

In [36]:
# dicts
empty = {}
# use what we know to fill it :
g = ['happy', 'abrupt', 'messy', 'manic', 'excellent', 'rigorous', 'debonair', 'idealistic', 'radical', 'tasty']
e = list('hammerdirt')
# I know it's not fair that I throw in the zip function there,
# scroll back up and look at the zip function to better understand what's going on
for i, (f, h) in enumerate(zip(e, g)):
    empty[h]=f
empty
# if the variables are reversed in the assignment (empty[f] = h) then we would be missing
# some keys in the dict --- dictionary keys must be unique, so repeated letters overwrite each other

{'happy': 'h',
 'abrupt': 'a',
 'messy': 'm',
 'manic': 'm',
 'excellent': 'e',
 'rigorous': 'r',
 'debonair': 'd',
 'idealistic': 'i',
 'radical': 'r',
 'tasty': 't'}

In [37]:
# check to see if the key 'h' is in the 'empty' dictionary:
'h' in empty

False

In [38]:
# but happy is !
'happy' in empty

True

In [39]:
# the `in` operator tests the keys of a dict, not its values (use `in empty.values()` for values)
# then you could follow up with something like this
if 'happy' in empty:
    print(empty['happy'])

h


In [40]:
# there you get the value of 'happy'
# change the value of happy by setting it
empty['happy'] = 'H'

In [41]:
if 'happy' in empty:
    print(empty['happy'])

H


In [42]:
# pop or delete to remove key/value pairs from a dict
empty['add_me'] = 'A'
empty['add_me2'] = 'B'

In [43]:
empty

{'happy': 'H',
 'abrupt': 'a',
 'messy': 'm',
 'manic': 'm',
 'excellent': 'e',
 'rigorous': 'r',
 'debonair': 'd',
 'idealistic': 'i',
 'radical': 'r',
 'tasty': 't',
 'add_me': 'A',
 'add_me2': 'B'}

In [44]:
# use pop to remove a value and store it for later
gone = empty.pop('add_me')
del empty['add_me2']
print('What did you remove? ' + gone)

What did you remove? A


In [45]:
# another way to make a dict from two lists
# faster than the for loop in block 36
empty = dict(zip(g,e))
empty

{'happy': 'h',
 'abrupt': 'a',
 'messy': 'm',
 'manic': 'm',
 'excellent': 'e',
 'rigorous': 'r',
 'debonair': 'd',
 'idealistic': 'i',
 'radical': 'r',
 'tasty': 't'}

In [46]:
# getting a list of dict keys is a common operation
# there is a built-in method for that (we have seen it before)
# one way to do it:
empty_keys = list(empty.keys())

In [47]:
# another way, save some typing
empty_keys_2 = [*empty]

In [48]:
print(empty_keys_2, empty_keys)
# same output

['happy', 'abrupt', 'messy', 'manic', 'excellent', 'rigorous', 'debonair', 'idealistic', 'radical', 'tasty'] ['happy', 'abrupt', 'messy', 'manic', 'excellent', 'rigorous', 'debonair', 'idealistic', 'radical', 'tasty']


In [49]:
# put this back to a dict there are several options:
# write a function
def make_dict(a):
    """Group the words in ``a`` by their first character.

    Returns a plain dict whose keys are first characters and whose values
    are lists of the words that start with that character, in input order.
    """
    grouped = {}
    for word in a:
        initial = word[0]
        try:
            # a bucket for this first character already exists
            grouped[initial].append(word)
        except KeyError:
            # first word seen with this initial: start a new bucket
            grouped[initial] = [word]
    return grouped
make_dict(empty_keys_2)

{'h': ['happy'],
 'a': ['abrupt'],
 'm': ['messy', 'manic'],
 'e': ['excellent'],
 'r': ['rigorous', 'radical'],
 'd': ['debonair'],
 'i': ['idealistic'],
 't': ['tasty']}

In [50]:
# another way is to set the default
def make_dict_two(a):
    """Group the words in ``a`` by first character using ``dict.setdefault``.

    Returns a plain dict mapping each first character to the list of words
    that start with it, in input order.
    """
    grouped = {}
    for word in a:
        # setdefault returns the existing list, or inserts and returns a new one
        bucket = grouped.setdefault(word[0], [])
        bucket.append(word)
    return grouped
make_dict_two(empty_keys_2)
# same result we save a few lines

{'h': ['happy'],
 'a': ['abrupt'],
 'm': ['messy', 'manic'],
 'e': ['excellent'],
 'r': ['rigorous', 'radical'],
 'd': ['debonair'],
 'i': ['idealistic'],
 't': ['tasty']}

In [51]:
# finally use an import
from collections import defaultdict
def make_dict_three(a):
    """Group the items in ``a`` by first character using a ``defaultdict``.

    Returns a ``defaultdict(list)`` mapping each first character to the list
    of items that start with it, in input order.
    """
    grouped = defaultdict(list)
    for word in a:
        initial = word[0]
        # a missing key is created as an empty list on first access
        grouped[initial].append(word)
    return grouped
make_dict_three(empty_keys)

defaultdict(list,
            {'h': ['happy'],
             'a': ['abrupt'],
             'm': ['messy', 'manic'],
             'e': ['excellent'],
             'r': ['rigorous', 'radical'],
             'd': ['debonair'],
             'i': ['idealistic'],
             't': ['tasty']})

### Sets:

In [52]:
# sets are one way to remove duplicates from a list
h = [2, 3, 3, 2, 5, 9, 5]
set(h)

{2, 3, 5, 9}

In [53]:
# the union of two sets
i = {'h', 'a', 'm', 'm', 'e', 'r', 'd', 'i', 'r', 't'}
j = {'h', 'a', 'm', 'e', 'r', 'd', 'i', 't'}
i.union(j)

{'a', 'd', 'e', 'h', 'i', 'm', 'r', 't'}

In [54]:
# the binary operator works also
i|j

{'a', 'd', 'e', 'h', 'i', 'm', 'r', 't'}

In [55]:
# the intersection (i and j hold the same elements, so here it equals the union)
i.intersection(j)

{'a', 'd', 'e', 'h', 'i', 'm', 'r', 't'}

In [56]:
# or the & works also
i & j

{'a', 'd', 'e', 'h', 'i', 'm', 'r', 't'}

#### DataFrames and Series

In [57]:
obj = pd.Series([4.5, 7.3, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

In [58]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [59]:
# notice the NaN value on obj2? e was added to the index with no value
obj, obj2

(d    4.5
 b    7.3
 a   -5.3
 c    3.6
 dtype: float64, a   -5.3
 b    7.3
 c    3.6
 d    4.5
 e    NaN
 dtype: float64)

In [60]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California'])

In [61]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [62]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [63]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [64]:
frame.loc[['a', 'b', 'c', 'd'], states]
# this is deprecated and reindex needs to be used

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [65]:
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)
# no error 

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [66]:
# drop method
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])

In [67]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [68]:
new_obj = obj.drop('c')

In [69]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [70]:
states.append('New York')
data = pd.DataFrame(np.arange(16).reshape((4,4,)), index=states, columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Texas,0,1,2,3
Utah,4,5,6,7
California,8,9,10,11
New York,12,13,14,15


In [71]:
data.drop(states[:2])

Unnamed: 0,one,two,three,four
California,8,9,10,11
New York,12,13,14,15


In [72]:
cols = ['one', 'two', 'three', 'four']
data.drop(cols[1], axis=1)
# you can always use inplace=True if you want to change the df 

Unnamed: 0,one,three,four
Texas,0,2,3
Utah,4,6,7
California,8,10,11
New York,12,14,15


### indexing selection and filtering

#### indexing refers to the syntax obj[,,,,] 

In [73]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [74]:
# general indexing rules apply on a series
# NOTE(review): `b` below is the variable b, not the label 'b'. It still holds 4
# from the enumerate loop in cell 33, so obj[b] returns the last value (4.0).
# Use obj['b'] to look the value up by its label instead.
print(obj[b], obj[1])
# a slice with integers is positional and excludes the end position
print(obj[2:4])

4.0 1.0
c    2.0
d    3.0
dtype: float64


In [75]:
# boolean
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

### slicing

In [76]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [77]:
obj['b':'c']
# the endpoint is inclusive.... a departure from standard Python slicing

b    1.0
c    2.0
dtype: float64

In [78]:
# change the values of a series
obj_2 = obj.copy()
obj_2['b':'c'] = 5

In [79]:
obj_2

a    0.0
b    5.0
c    5.0
d    3.0
e    4.0
dtype: float64

In [80]:
# slice some dataframes
data

Unnamed: 0,one,two,three,four
Texas,0,1,2,3
Utah,4,5,6,7
California,8,9,10,11
New York,12,13,14,15


In [81]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Utah,4,5,6,7
California,8,9,10,11
New York,12,13,14,15


In [82]:
# note for dataframes the index syntax should work
data[:2]
# ok that is reassuring

Unnamed: 0,one,two,three,four
Texas,0,1,2,3
Utah,4,5,6,7


In [83]:
# establish a truth table
data < 5

Unnamed: 0,one,two,three,four
Texas,True,True,True,True
Utah,True,False,False,False
California,False,False,False,False
New York,False,False,False,False


### Move this to another note book