<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Chapter 2: A Crash Course in Python


In [0]:
# The pound sign marks the start of a comment. Python itself
# ignores the comments, but they're helpful for anyone reading the code.
# Indentation (two spaces per level here) is what delimits each block.
for i in [1, 2, 3, 4, 5]:
  for j in [1, 2, 3, 4, 5]:
    print(j)      # first line of the inner "for j" block
    print(i + j)  # last line of the inner "for j" block
  print (i)       # last line of the outer "for i" block
print ("done looping")  # not indented, so it runs once after both loops finish

In [0]:
# Whitespace (including line breaks) is ignored inside parentheses,
# so a long expression can be spread over several lines.
long_winded_computation = (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 +
                           13 + 14 + 15 + 16 + 17 + 18 + 19 + 20)  # 1 + ... + 20 == 210

In [0]:
# Use the "whitespace is ignored inside brackets" rule to make things easier to read.
# Both names below are bound to equal 3x3 nested lists; only the layout differs.
list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
easier_to_read_list_of_lists = [[1, 2, 3],
                                [4, 5, 6],
                                [7, 8, 9]]

In [0]:

# You can also use a backslash to indicate that a statement continues on the next line.
# The two physical lines below form the single statement `two_plus_three = 2 + 3`.
two_plus_three = 2 + \
                 3

In [0]:
# Whitespace formatting can cause issues when copying and pasting code:
# some interactive shells may treat the blank line below as the end of the
# "for" block (NOTE: shell behavior is not shown here; verify in your shell).
for i in [1, 2, 3, 4, 5]:

  # notice the blank line
  print(i)

In [0]:
# Jupyter totally doesn't care

In [0]:
# Python uses import to get stuff from other modules
import re
# Compile a pattern matching one or more digits; re.I is the case-insensitive flag.
my_regex = re.compile("[0-9]+", re.I)

In [0]:
# Bare expression: as the last line of a notebook cell its value is shown as the cell output.
my_regex

In [0]:
# If the name `re` is already used for something else in your code,
# you can import the module under an alias instead.
import re as regex
# Same pattern as before, reached through the alias.
my_regex = regex.compile("[0-9]+", regex.I)

In [0]:
# Standard convention to import matplotlib is
import matplotlib.pyplot as plt
# plt.plot(...)

In [0]:
# You can import specific names from a module and use them without qualification
from collections import defaultdict, Counter
lookup = defaultdict(int)  # a dict whose missing keys are created with int(), i.e. 0
my_counter = Counter()     # an empty Counter

In [0]:
# Be careful about importing everything, which may overwrite stuff you already have
match = 10
from re import *  # re exports a function called `match`, which rebinds the name above
print(match)      # so this prints the re.match function, not 10

In [0]:
# Functions!
def double(x):
  """Return x multiplied by 2.

  A docstring like this one is optional; it is where you explain what the
  function does.
  """
  doubled = x * 2
  return doubled

In [0]:
# Python's functions are first class: they can be bound to variables
# and handed to other functions as arguments.
def apply_to_one(f):
  """Call f with the single argument 1 and return whatever f returns."""
  result = f(1)
  return result

# Bind the existing function object to a second name (no call happens here).
my_double = double
x = apply_to_one(my_double)  # calls my_double(1)
print(x)
# Lambdas!
# A lambda is a short anonymous function; this one is called with 1, giving 5.
y = apply_to_one(lambda x: x + 4)
print(y)
another_double = lambda x: 2 * x # don't do this
# The def below rebinds another_double, replacing the lambda assigned above.
def another_double(x):
  """Do this instead"""
  return 2 * x

In [0]:
# Default args: a parameter with a default may be omitted by the caller.
def my_print(message="my default message"):
  """Print message, or the default text when no argument is supplied."""
  print(message)

my_print("hello")  # prints 'hello'
my_print()         # prints 'my default message'

In [0]:
def full_name(first="What's-his-name", last="Something"):
  """Return first and last joined by a single space.

  Either argument may be omitted (or passed by keyword), in which case its
  placeholder default is used.
  """
  return " ".join((first, last))

print(full_name("Joel", "Grus"))  # both given positionally
print(full_name("Joel"))          # last falls back to its default
print(full_name(last="Grus"))     # first falls back to its default

In [0]:
# Strings can be delimited by single or double quotes (the quotes just have to match)
single_quote_string = 'data science'
double_quoted_string = "data science"

# Backslashes encode special characters
tab_string = "\t" # represents the tab character
len(tab_string) # should be 1

In [0]:
# Raw strings (r"...") keep backslashes as literal backslash characters
not_tab_string = r"\t" # two chars: a backslash and the letter t
len(not_tab_string) # should be 2

In [0]:
# Multi-line strings are written with three double quotes;
# the line breaks below become part of the string.
multi_line_string = """This is the first line.
and this is the second line
and this is the third line"""

In [0]:
# Print the string; its embedded newlines produce three output lines.
print(multi_line_string)

In [0]:
# Python 3.6 added the f-string, which simplifies substitution into strings.
# All three variables below end up holding the same text, "Joel Grus".
first_name = "Joel"
last_name = "Grus"
full_name_one_way = first_name + " " + last_name                # string addition
full_name_second_way = "{0} {1}".format(first_name, last_name)  # str.format
full_name_THE_way = f"{first_name} {last_name}"                 # f-string

In [0]:
# We'll use exceptions occasionally.
# 0 / 0 raises ZeroDivisionError; the except clause catches it so the cell does not crash.
try:
  print( 0 / 0)
except ZeroDivisionError:
  print("cannot divide by zero")


In [0]:
# Probably the most fundamental data structure in Python is the list.
integer_list = [1, 2, 3]
heterogeneous_list = ["string", 0.1, True]  # elements may have different types
list_of_lists = [integer_list, heterogeneous_list, []]  # lists can contain lists
list_length = len(integer_list)  # 3
list_sum = sum(integer_list)     # 6
print(list_length, list_sum)

In [0]:
# Get or set list items with square brackets
x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
zero = x[0]    # indexes start at 0
one = x[1]
nine = x[-1]   # negative indexes count from the end: -1 is the last element
eight = x[-2]  # -2 is the next-to-last element
x[0] = -1      # x is now [-1, 1, 2, ..., 9]; `zero` keeps the old value 0
print(zero, one, nine, eight, x)

In [0]:
# Slicing in Python. Don't forget you can slice strings and other "sequential" types.
# NOTE: `x` is not defined in this cell; presumably it is the 10-element list from the previous cell.
first_three = x[:3]           # from the start up to (not including) index 3
three_to_end = x[3:]          # from index 3 to the end
one_to_four = x[1:5] # inclusive of first, exclusive of last. Not sure how I feel about that.
last_three = x[-3:]           # the final three elements
without_first_and_last = x[1:-1]
copy_of_x = x[:]              # a new list with the same elements
print(first_three, three_to_end, one_to_four, last_three, without_first_and_last, copy_of_x)

In [0]:
# A slice can take a third argument, the stride (which may be negative)
every_third = x[::3]       # elements at indexes 0, 3, 6, ...
five_to_three = x[5:2:-1]  # elements at indexes 5, 4, 3 (walking backwards)
print(every_third, five_to_three)

In [0]:
# *in* operator to test membership.
# Only the value of the last expression in a notebook cell is displayed.
1 in [1, 2, 3]  # True
0 in [1, 2, 3]  # False

In [0]:
# You can concatenate and modify relatively easily
x = [1, 2, 3]
x.extend([4, 5, 6])  # modifies x in place: x is now [1, 2, 3, 4, 5, 6]
print(x)

In [0]:
# List addition builds a new list and leaves x unmodified.
x = [1, 2, 3]
y = x + [4, 5, 6]  # y is [1, 2, 3, 4, 5, 6]; x is still [1, 2, 3]
print(y)

In [0]:
# More often we append to lists one item at a time.
x = [1, 2, 3]
x.append(0)  # x is now [1, 2, 3, 0]
print(x)
y = x[-1]    # the element just appended: 0
print(y)
z = len(x)   # 4
print(z)

In [0]:
# You can also unpack a list when you know how many elements it contains
x, y = [1, 2]  # now x is 1 and y is 2
print(x, y)

In [0]:
# Common idiom is to use underscore for values you intend to throw away
_, y = [1, 2]  # y is 2; the first element is bound to the ordinary name `_`
print(_, y) # Underscore is still a thing, though

In [0]:
# Tuples are lists' immutable cousins. Written with parentheses (or nothing) instead of brackets.
my_list = [1, 2]
my_tuple = (1, 2)
other_tuple = 3, 4   # the parentheses are optional
my_list[1] = 3       # lists can be modified: my_list is now [1, 3]
# Assigning to a tuple element raises TypeError, which is caught here.
try:
  my_tuple[1] = 3
except TypeError:
  print("cannot modify a tuple")

# Tuples are a convenient way to return multiple values from functions:
def sum_and_product(x, y):
  """Return the two-element tuple (x + y, x * y)."""
  added = x + y
  multiplied = x * y
  return added, multiplied

sp = sum_and_product(2, 3)     # the whole tuple: (5, 6)
print(sp)
s, p = sum_and_product(5, 10)  # unpacked: s is 15, p is 50
print(s, p)

In [0]:
# Tuples and lists can also be used for multiple assignment
x, y = 1, 2  # now x is 1, y is 2
print(x, y)
x, y = y, x  # Pythonic way to swap variables; now x is 2, y is 1
print(x, y)

In [0]:
# Dictionaries associate values with keys
empty_dict = {} # Pythonic
empty_dict2 = dict() # less Pythonic
grades = {"Joel": 80, "Tim": 95} # dictionary literal: key "Joel" -> 80, key "Tim" -> 95


In [0]:
# Look up the value for a key using square brackets
joels_grade = grades["Joel"]
print(joels_grade)

In [0]:
# You'll get a KeyError exception if you look up a key that is not in the dictionary
try:
  kates_grade = grades["Kate"]
except KeyError:
  print("no grade for Kate!")

In [0]:
# You can check for the existence of a key with "in"
# (the True/False notes assume grades still holds only Joel and Tim).
joel_has_grade = "Joel" in grades # True
kate_has_grade = "Kate" in grades # False

In [0]:
# The "get" method returns a default value instead of raising for a missing key
joels_grade = grades.get("Joel", 0) # equals 80
kates_grade = grades.get("Kate", 0) # equals 0
no_ones_grade = grades.get("No One") # default is None
print(joels_grade, kates_grade, no_ones_grade)

In [0]:
# Assigning key/value pairs is also done with square brackets
grades["Tim"] = 99  # replaces old value
grades["Kate"] = 100 # adds third entry
num_students = len(grades)  # number of keys in the dictionary
print(num_students)

In [0]:
# Dictionaries are a convenient way to represent structured data.
# Values can be of different types: strings, an int and a list here.
tweet = {
    "user": "joelgrus",
    "text": "Data Science is Awesome",
    "retweet_count": 100,
    "hashtags": ["#data","#science","#datascience","#awesome","#yolo"]
}
print(tweet)

In [0]:
# You can look at all the keys, values or items at once, as well
tweet_keys = tweet.keys() # iterable for the keys
tweet_values = tweet.values() # iterable for the values
tweet_items = tweet.items() # iterable for the (key, value) tuples

print("user" in tweet_keys) # True, but not Pythonic
print("user" in tweet) # Pythonic way
print("joelgrus" in tweet_values) # True (slow but the only way to check)
# Dictionary keys must be "hashable" so you can't use lists. Use tuples or strings
print(tweet_keys)

In [0]:
# A convenience container that will automatically add missing keys is the defaultdict.
# Without it, you have to handle missing keys manually, as in the three options below.
# Each option rebuilds word_counts from scratch and ends with the same counts.
document = ["the", "quick", "brown", "fox", "jumped", "over", "the", "log"]

# Option 1: check for the key before updating.
word_counts = {}
for word in document:
  if word in word_counts:
    word_counts[word] +=1
  else:
    word_counts[word] = 1

# Option 2: just try the update and handle the KeyError for first occurrences.
word_counts = {}
for word in document:
  try:
    word_counts[word] += 1
  except KeyError:
    word_counts[word] = 1

# Option 3: use get(), which returns 0 for a word not seen yet.
word_counts = {}
for word in document:
  previous_count = word_counts.get(word, 0)
  word_counts[word] = previous_count + 1

In [0]:
# Show the word -> count mapping built in the previous cell.
print(word_counts)

In [0]:
# Same word count, but defaultdict creates the missing entry for us.
from collections import defaultdict

word_counts = defaultdict(int) # int() produces 0
for word in document:
  word_counts[word] += 1  # a word not seen before starts from 0
print(word_counts)


In [0]:
# Also useful with list or dict or your own function as the default factory
dd_list = defaultdict(list) # list() produces an empty list
dd_list[2].append(1) # now dd_list contains {2: [1]}
dd_dict = defaultdict(dict) # dict() produces an empty dict
dd_dict["Joel"]["City"] = "Seattle" # {"Joel": {"City": "Seattle"}}
dd_pair = defaultdict(lambda: [0,0]) # any zero-argument callable works as the factory
dd_pair[2][1] = 1 # now dd_pair contains {2: [0, 1]}

In [0]:
# A Counter turns a sequence of values into a defaultdict(int)-like object mapping keys to counts
from collections import Counter
c = Counter([0, 1, 2, 3, 4])     # every value appears once, so every count is 1
word_counts = Counter(document)  # one-line replacement for the manual counting loops
print(word_counts)

In [0]:
# Print the 10 most common words and their counts
# (most_common yields (word, count) pairs, unpacked in the loop header).
for word, count in word_counts.most_common(10):
  print(word, count)

In [0]:
# Sets are a collection of distinct elements
primes_below_10 = {2, 3, 5, 7}
s = set() # since you can't initialize an empty set with brackets because that's an empty dict
s.add(1)    # s is now {1}
s.add(2)    # s is now {1, 2}
s.add(2)    # s is still {1, 2}: adding an existing element changes nothing
x = len(s)  # 2
y = 2 in s  # True
z = 3 in s  # False
print(s, x, y, z)

In [0]:
# "in" is very fast for sets
stopwords_list = ["a", "an", "at"] + ["other", "words"] + ["yet", "you"]
"zip" in stopwords_list # False, but a list has to be checked element by element
stopwords_set = set(stopwords_list)  # same words, stored as a set
"zip" in stopwords_set # very fast

In [0]:
# Sets are also good for finding the distinct elements in a collection
item_list = [1, 2, 3, 1, 2, 3]
num_items = len(item_list)          # 6
item_set = set(item_list)           # {1, 2, 3}
num_distinct_items = len(item_set)  # 3
distinct_item_list = list(item_set) # back to a list, duplicates removed
print(num_items, num_distinct_items)

In [0]:
# Control flow: if / elif / else. Both conditions below are false,
# so the else branch is the one that assigns message.
if 1 > 2:
  message = "if only 1 were greater than two..."
elif 1 > 3:
  message = "elif stands for 'else if'"
else:
  message = "when all else fails use else (if you want to)"
print(message)


In [0]:
# Ternary if-then-else on a single line
x = 3
parity = "even" if x % 2 == 0 else "odd"  # 3 % 2 is 1, so parity is "odd"
print(parity)

In [0]:
# While loop: repeats as long as the condition holds (x goes 0, 1, ..., 9)
x = 0
while x < 10:
  print(f"{x} is less than 10")
  x += 1

In [0]:
# For ... in: range(10) supplies the numbers 0, 1, ..., 9, so the output matches the while loop
for x in range(10):
  print(f"{x} is less than 10")

In [0]:
# Continue and break: this loop prints 0, 1, 2 and 4
for x in range(10):
  if x == 3:
    continue # go to next iteration
  if x == 5:
    break # quit entirely
  print(x)

In [0]:
# Booleans are capitalized: True and False
one_is_less_than_two = 1 < 2      # equals True
true_equals_false = True == False # equals False

In [0]:
# Python uses "None" to indicate a nonexistent value (similar to other languages' null)
x = None
assert x == None # not Pythonic
assert x is None # Pythonic

In [0]:
# Python lets you use any value where it expects a Boolean. The following are all Falsy
# (each line is a bare expression that is evaluated and discarded):
False
None
[] # empty list 
{} # empty dict
"" # empty string
set() # empty set
0
0.0

In [0]:
# Illustration only (kept commented out because the function does not exist):
#s = some_function_that_returns_a_string()
#if s:
#  first_char = s[0]
#else:
#  first_char = ""

# or

# first_char = s and s[0] # since "and" returns the second value when the first is truthy, and the first value when it's not

# NOTE: `x` is not defined in this cell; presumably it comes from an earlier cell.
safe_x = x or 0                     # 0 whenever x is falsy (None, 0, "", ...)
safe_x = x if x is not None else 0  # 0 only when x is None

# all() returns True when every element is truthy; any() returns True when at least one is
all([True, 1, {3}]) # True
all([True, 1, []]) # False
any([True, 1, {}]) # True
all([]) # True
any([]) # False


In [0]:
# Sorting
x = [4, 1, 2, 3]
y = sorted(x)  # sorted() returns a new list [1, 2, 3, 4] and leaves x unchanged
x.sort()       # list.sort() sorts x in place
print(y, x)

In [0]:
# Sort the list by absolute value from largest to smallest
x = sorted([-4, 1, -2, 3], key=abs, reverse=True)  # is [-4, 3, -2, 1]


In [0]:
# Sort the words and counts from highest count to lowest.
# Each item is a (word, count) pair, so index 1 selects the count as the sort key.
wc = sorted(word_counts.items(),
            key=lambda word_and_count: word_and_count[1],
            reverse=True)
print(wc)

In [0]:
# List comprehensions - creating lists from other lists based on some criteria
even_numbers = [x for x in range(5) if x % 2 == 0]  # [0, 2, 4]
print(even_numbers)
squares = [x * x for x in range(5)]                 # [0, 1, 4, 9, 16]
print(squares)
even_squares = [x * x for x in even_numbers]        # [0, 4, 16]
print(even_squares)

In [0]:
# You can similarly turn lists into dicts or sets
square_dict = {x: x * x for x in range(5)}  # {0: 0, 1: 1, 2: 4, 3: 9, 4: 16}
print(square_dict)
square_set = {x * x for x in [1, -1]}       # {1}: both inputs square to 1
print(square_set)

In [0]:
# If you don't need the value from the list, it is common to use an underscore as the variable
zeros = [0 for _ in even_numbers]  # one 0 per element, so the same length as even_numbers
print(zeros)

In [0]:
# List comprehensions can contain multiple fors:
# 100 pairs (0, 0), (0, 1), ..., (9, 8), (9, 9); the last "for" varies fastest.
pairs = [(x, y)
          for x in range(10)
          for y in range(10)]
print(pairs)

In [0]:
# And later fors can use the results of earlier ones:
increasing_pairs = [(x, y)              # only pairs with x < y
                    for x in range(10)  
                    for y in range(x + 1, 10)]  # range(lo, hi) equals [lo, lo + 1, ..., hi - 1]
print(increasing_pairs)

In [0]:
# Using types and automated tests are two ways to verify correct code in Python. 
# In this book, we'll keep tests simple with asserts, which raise AssertionError
# when the condition is false; the optional second operand is the error message.
assert 1 + 1 == 2
assert 1 + 1 == 2, "1 + 1 should equal 2 but didn't"
# Uncomment to see a failing assert:
# assert 1 + 2 == 2, "1 + 1 should equal 2 but didn't"

In [0]:
# Asserting that the functions you write do what you expect
def smallest_item(xs):
  """Return the smallest element of xs, as found by the built-in min."""
  smallest = min(xs)
  return smallest

assert smallest_item([10, 20, 5, 40]) == 5
assert smallest_item([1, 0, -1, 2]) == -1

In [0]:
# A less common use of assert is to check the inputs to a function:
def smallest_item(xs):
  """Return the smallest element of xs.

  Raises AssertionError (via the assert below) when xs is empty or
  otherwise falsy.
  """
  assert xs, "empty list has no smallest item"
  smallest = min(xs)
  return smallest

In [3]:
# Classes in Python
class CountingClicker:
  """A tally counter that can be clicked, read and reset.

  (A class can/should have a docstring, just like a function.)
  """

  def __init__(self, count = 0):
    """Create a clicker whose tally starts at `count` (0 by default)."""
    self.count = count

  def click(self, num_times = 1):
    """Advance the tally by num_times (one click by default)."""
    self.count += num_times

  def read(self):
    """Return the current tally."""
    return self.count

  def reset(self):
    """Set the tally back to 0."""
    self.count = 0

  def __repr__(self):
    """Return a string of the form CountingClicker(count=<tally>)."""
    return "CountingClicker(count={})".format(self.count)

# Three ways to construct an instance: default, positional and keyword argument.
clicker1 = CountingClicker()
clicker2 = CountingClicker(100)
clicker3 = CountingClicker(count=100)
print(clicker1.count, clicker2.count)

# Exercise click / read / reset on a fresh instance, checking each step with an assert.
clicker = CountingClicker()
assert clicker.read() == 0, "clicker should start with count 0"
clicker.click()
clicker.click()
assert clicker.read() == 2, "after two clicks, clicker should have count 2"
clicker.reset()
assert clicker.read() == 0, "after reset, clicker should be back to 0"

0 100


In [0]:
# A subclass inherits all the behavior of its parent class.
class NoResetClicker(CountingClicker):
  """A CountingClicker whose reset() does nothing.

  Everything other than reset() is inherited unchanged from CountingClicker.
  """

  def reset(self):
    """Override the parent's reset so that the count is left untouched."""
    return None

# The inherited read() and click() work as before; only reset() behaves differently.
clicker2 = NoResetClicker()
assert clicker2.read() == 0
clicker2.click()
assert clicker2.read() == 1
clicker2.reset()  # overridden to do nothing
assert clicker2.read() == 1, "reset shouldn't do anything"

In [0]:
# Generators are used for lazy evaluation: values are produced one at a time, on demand.
# The built-in "range" is already lazy, so this hand-written version is for illustration only.
def generate_range(n):
  """Yield 0, 1, 2, ... for as long as the next value is less than n."""
  current = 0
  while current < n:
    yield current  # hand one value to the consumer, then resume here on the next request
    current += 1

for i in generate_range(10):
  print(f"i: {i}")

In [0]:
# A generator can even describe an infinite sequence,
# as long as the consumer stops asking for values at some point.
def natural_numbers():
  """Yield 1, 2, 3, 4, 5, ... without end."""
  current = 1
  while True:
    yield current
    current += 1

In [0]:
# Another way to create generators is a "for" comprehension wrapped in parentheses
evens_below_20 = (i for i in generate_range(20) if i % 2 == 0)
print(evens_below_20)  # prints the generator object itself, not the even numbers

In [0]:
# Generator comprehensions don't do any work until you iterate over them using "for" or "next".
# None of these computations does anything until we iterate, which is why building
# a pipeline on top of an endless source does not hang.
data = natural_numbers()
evens = (x for x in data if x % 2 == 0)
even_squares = (x ** 2 for x in evens)
even_squares_ending_in_six = (x for x in even_squares if x % 10 == 6)


In [0]:
# Printing generators shows the generator objects; it does not iterate over them.
print(data, evens, even_squares, even_squares_ending_in_six)

In [0]:
# Python's enumerate function gives you the index as well as the value in a list or generator
names = ["Alice", "Bob", "Charlie", "Debbie"]

# not Pythonic: loop over indexes and look each element up again
for i in range(len(names)):
  print(f"name {i} is {names[i]}")

# also not Pythonic: maintain the index by hand
i = 0
for name in names:
  print(f"name {i} is {names[i]}")
  i += 1

# Pythonic: enumerate yields (index, element) pairs, so the element is used
# directly and no indexing into names is needed
for i, name in enumerate(names):
  print(f"name {i} is {name}")

In [0]:
# Random numbers are kinda important in data science
import random
random.seed(10) # this ensures we get the same results every time

# random.random() produces numbers uniformly between 0 and 1
four_uniform_randoms = [random.random() for _ in range(4)]
print(four_uniform_randoms)

In [0]:
# Re-seeding with the same value restarts the sequence, so both prints show the same number.
random.seed(10)
print(random.random())
random.seed(10)
print(random.random())

In [0]:
# random.randrange takes 1 or 2 arguments and picks an element from the corresponding range
print(random.randrange(10))  # choose randomly from range(10) = [0, 1, 2, ...9]
print(random.randrange(3, 6)) # choose randomly from range(3, 6) = [3, 4, 5]

In [0]:
# Shuffle: random.shuffle reorders the list in place
up_to_ten = [*range(1,11)]  # unpack the range into a list: [1, 2, ..., 10]
random.shuffle(up_to_ten)
print(up_to_ten)

In [0]:
# random.choice picks one element at random from a sequence
my_best_friend = random.choice(["Alice", "Bob", "Charlie"])
print(my_best_friend)

In [0]:
# Here's how you choose random numbers without replacement (i.e. with no duplicates)
lottery_numbers = range(60)
winning_numbers = random.sample(lottery_numbers, 6)  # six distinct numbers from 0..59
print(winning_numbers)

In [0]:
# With replacement (duplicates allowed): make several independent calls to random.choice
four_with_replacement = [random.choice(range(10)) for _ in range(4)]
print(four_with_replacement)

In [0]:
# Regular expressions
import re

re_examples = [                                      # All of these are truthy, because
               not re.match("a", "cat"),              # 'cat' doesn't start with 'a'
               re.search("a", "cat"),                 # 'cat' has an 'a' in it
               not re.search("c", "dog"),             # 'dog' doesn't have a 'c' in it
               3 == len(re.split("[ab]", "carbs")),   # split on a or b to ['c','r','s']
               "R-D-" == re.sub("[0-9]", "-", "R2D2") # replace digits with dashes
]
assert all(re_examples) # all regex examples should be truthy

In [0]:
# Zip and argument unpacking
list1 = ['a', 'b', 'c']
list2 = [1, 2, 3]

# zip is lazy: it yields one tuple of corresponding elements at a time,
# so pass it to list() to materialize all of the pairs
pairs = list(zip(list1, list2))    # is [('a', 1), ('b', 2), ('c', 3)]
print(pairs)

In [0]:
# Zip stops as soon as the shortest input ends. You can also 'unzip' with the unpack operator:
# the asterisk passes each element of pairs to zip as a separate argument.
letters, numbers = zip(*pairs) # same as letters, numbers = zip(('a',1), ('b',2),('c',3))
print(letters, numbers)

In [0]:
# Argument unpacking works with any function:
def add(a, b):
  """Return a + b."""
  result = a + b
  return result

add(1, 2) # returns 3
# Passing the list itself supplies only one argument, which raises TypeError.
try:
  add([1,2])
except TypeError:
  print("add expects two inputs")

add(*[1,2]) # returns 3

In [0]:
# args and kwargs
def doubler(f):
  """Return a new one-argument function that computes 2 * f(x)."""
  # The inner function keeps a reference to f through its closure.
  def g(x):
    inner_result = f(x)
    return 2 * inner_result
  # Hand the newly built function back to the caller.
  return g

In [0]:
# Works sometimes: here f takes exactly one argument, which is what doubler's g expects
def f1(x):
  """Return x + 1."""
  return x + 1

g = doubler(f1)
assert g(3) == 8, "(3 + 1) * 2 should equal 8"
assert g(-1) == 0, "(-1 + 1) * 2 should equal 0"

In [0]:
# But not with functions that take more than one argument
def f2(x, y):
  """Return x + y."""
  return x + y

g = doubler(f2)
# Calling g with two arguments raises TypeError, which is caught here.
try:
  g(1, 2)
except TypeError:
  print("as defined, g only takes one argument")

In [0]:
# We can solve this problem with argument unpacking and a little bit of magic
def magic(*args, **kwargs):
  """Print the positional arguments (a tuple) and the keyword arguments (a dict)."""
  labelled = (("unnamed args:", args), ("keyword args:", kwargs))
  for label, received in labelled:
    print(label, received)

magic(1, 2, 3, key="word", key2="word2")

In [0]:
# Unpacking works the other way around, too: a list and a dict can supply the arguments
def other_way_magic(x, y, z):
  """Return x + y + z."""
  first_two = x + y
  return first_two + z

x_y_list = [1, 2]   # supplies x and y positionally
z_dict = {"z": 3}   # supplies z by keyword
assert other_way_magic(*x_y_list, **z_dict) == 6, "1 + 2 + 3 should be 6"

In [0]:
# Here's the correct way to write the doubler
def doubler_correct(f):
  """Return a function computing 2 * f(...), whatever arguments f expects."""
  def g(*args, **kwargs):
    """Forward every positional and keyword argument to f and double the result."""
    inner_result = f(*args, **kwargs)
    return 2 * inner_result
  return g

# The two-argument function that broke the first doubler now works: (1 + 2) * 2 == 6.
g = doubler_correct(f2)
assert g(1, 2) == 6, "doubler should work now"

In [0]:
# Generally, your code will be more correct and more readable if you are explicit about
# which arguments you require; reach for args and kwargs only when you have no other option.

# Python doesn't usually care about the types of objects as long as we use them in valid ways
def add(a, b):
  """Return a + b for any pair of operands that support +."""
  result = a + b
  return result

# All of these are valid uses of add
assert add(10, 5) == 15
assert add([1, 2], [3]) == [1, 2, 3]
assert add("hi ", "there") == "hi there"

# But this one is not: int + str raises TypeError
try:
  add(10, "five")
except TypeError:
  print("cannot add an int to a string")

In [0]:
# Newer versions of Python let you add type annotations to make the intent of code clearer.
# Python itself does not enforce them, so add still works on anything that supports +.
# A tool such as mypy can check the annotations before the code is run.
def add(a: int, b: int) -> int:
  """Return a + b; the int annotations are hints only."""
  result = a + b
  return result

# Both calls still succeed, even though the second one passes strings
add(10, 5)
add("hi", "there")

In [0]:
# Use the typing module to help clarify type annotations
from typing import List # note capital L

def total(xs: List[float]) -> float:
  """Return the sum of the numbers in xs (0 for an empty list)."""
  # Sum the argument; the function object `total` itself is not iterable.
  return sum(xs)
  

In [0]:
# You can use type annotations on variables, but most of the time the type is obvious
x: int = 5

# But sometimes it is not
values = [] # what type?
best_so_far = None # what type?

# So we use the Optional type hint
from typing import Optional

# NOTE: List is not imported in this cell; presumably it comes from the earlier typing import.
values: List[int] = []
best_so_far: Optional[float] = None # allowed to be either a float or None


In [0]:
# The type annotations in this snippet are all unnecessary
from typing import Dict, Iterable, Tuple

# keys are strings, values are ints
counts: Dict[str, int] = {'data': 1, 'science': 2}

# lists and generators are both iterable
lazy = True
if lazy:
  evens: Iterable[int] = (x for x in range(10) if x % 2 == 0)
else:
  evens = [0, 2, 4, 6, 8]

# tuples specify a type for each element
triple: Tuple[int, float, int] = (10, 2.3, 5)

In [0]:
# Type hints for functions that are passed around as values
from typing import Callable

# Callable[[str, int], str] describes a function that takes
# two arguments, a string and an int, and returns a string.
def twice(repeater: Callable[[str,int], str], s: str) -> str:
  """Call repeater(s, 2) and return its result."""
  repeated = repeater(s, 2)
  return repeated

def comma_repeater(s: str, n: int) -> str:
  """Return n copies of s joined by ', '."""
  return ', '.join([s] * n)

assert twice(comma_repeater, "type hints") == "type hints, type hints"

In [0]:
# Type annotations are just Python objects, so we can assign them to variables to make them easier to refer to
Number = int
Numbers = List[Number]

def total(xs: Numbers) -> Number:
  """Return the sum of the numbers in xs (0 for an empty list)."""
  return sum(xs)

# Backward-compatible alias: the function was first defined under the misspelled name `tota`.
tota = total