<a href="https://colab.research.google.com/github/jamestheengineer/data-science-from-scratch-Python/blob/master/Chapter_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Chapter 2: A Crash Course in Python


In [17]:
# Anything after a pound sign is a comment: the interpreter skips it,
# but it helps whoever reads the code next.
# Indentation (not braces) decides which statements belong to which loop.
for i in range(1, 6):
  for j in range(1, 6):
    print(j)
    print(i + j)
  print(i)
print("done looping")

1
2
2
3
3
4
4
5
5
6
1
1
3
2
4
3
5
4
6
5
7
2
1
4
2
5
3
6
4
7
5
8
3
1
5
2
6
3
7
4
8
5
9
4
1
6
2
7
3
8
4
9
5
10
5
done looping


In [0]:
# Whitespace is ignored inside parentheses and brackets, so a long expression
# can be wrapped across lines without any continuation character
long_winded_computation = (1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 +
                           13 + 14 + 15 + 16 + 17 + 18 + 19 + 20)

In [0]:
# Use that freedom to lay out nested literals so they are easier to read;
# both names below are bound to equal lists
list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
easier_to_read_list_of_lists = [[1, 2, 3],
                                [4, 5, 6],
                                [7, 8, 9]]

In [0]:

# A trailing backslash also indicates that a statement continues on the next line
two_plus_three = 2 + \
                 3

In [21]:
# Whitespace formatting can cause issues when copying and pasting code
for i in [1, 2, 3, 4, 5]:

  # notice the blank line inside the indented loop body
  print(i)

1
2
3
4
5


In [0]:
# Jupyter totally doesn't care: the loop in the previous cell runs fine despite the blank line

In [0]:
# Python uses import to get stuff from other modules
import re
# re.I (ignore case) is only here to show how a flag is passed; digits have no
# case, so it cannot change what this pattern matches
my_regex = re.compile("[0-9]+", re.I)

In [24]:
# A bare expression on the last line of a cell displays its repr
my_regex

re.compile(r'[0-9]+', re.IGNORECASE|re.UNICODE)

In [0]:
# If you already have a different `re` in your code, you can import the module under an alias
import re as regex
my_regex = regex.compile("[0-9]+", regex.I)

In [0]:
# Standard convention to import matplotlib is
import matplotlib.pyplot as plt
# plt.plot(...)

In [0]:
# You can import specific names from a module and use them without qualification
from collections import defaultdict, Counter
lookup = defaultdict(int) # missing keys default to int(), i.e. 0
my_counter = Counter() # starts out empty

In [28]:
# Be careful about importing everything, which may overwrite stuff you already have
match = 10
from re import * # uh oh, re has a match function, so `match` is rebound
print(match) # prints the function, not 10

<function match at 0x7fde95742268>


In [0]:
# Functions!
def double(x):
  """
  This is where you put an optional docstring that explains what the function
  does. For example, this function multiplies its input by 2.
  """
  return x * 2

In [0]:
# Python's functions are first class, which means we can assign them to variables 
# and pass them to other functions
def apply_to_one(f):
  """Invoke the callable f on the integer 1 and hand back whatever it returns."""
  result = f(1)
  return result

my_double = double # functions are objects, so they can be bound to another name
x = apply_to_one(my_double) # equals 2

# Lambdas! (short anonymous functions)
y = apply_to_one(lambda x: x + 4) # equals 5

another_double = lambda x: 2 * x # don't do this
def another_double(x):
  """Do this instead: return x multiplied by 2 from a properly named def."""
  doubled = 2 * x
  return doubled

In [31]:
# Default args: a parameter with a default may be omitted by the caller
def my_print(message = "my default message"):
  """Print message; when called with no argument, print the default text."""
  print(message)

my_print("hello") # prints 'hello'
my_print() # prints 'my default message'

hello
my default message


In [32]:
def full_name(first = "What's-his-name", last = "Something"):
  """Return first and last joined by a single space; both have placeholder defaults."""
  return " ".join((first, last))

print(full_name("Joel", "Grus")) # both arguments given positionally
print(full_name("Joel")) # last falls back to its default
print(full_name(last="Grus")) # keyword argument; first falls back to its default

Joel Grus
Joel Something
What's-his-name Grus


In [33]:
# Strings can be delimited by single or double quotes (but the quotes have to match)
single_quote_string = 'data science'
double_quoted_string = "data science"

# Backslashes encode special characters
tab_string = "\t" # represents the tab character
len(tab_string) # is 1

1

In [34]:
# Prefix a string with r to make it "raw": backslashes stay literal backslashes
not_tab_string = r"\t" # the two characters '\' and 't'
len(not_tab_string) # is 2

2

In [0]:
# Multi-line strings with three double quotes; the line breaks become part of the string
multi_line_string = """This is the first line.
and this is the second line
and this is the third line"""

In [36]:
# Each line of the literal is printed on its own line
print(multi_line_string)

This is the first line.
and this is the second line
and this is the third line


In [0]:
# Python 3.6 added the f-string, which simplifies substitution into strings
first_name = "Joel"
last_name = "Grus"
full_name_one_way = first_name + " " + last_name # string addition
full_name_second_way = "{0} {1}".format(first_name, last_name) # str.format
full_name_THE_way = f"{first_name} {last_name}" # f-string; all three equal "Joel Grus"

In [38]:
# We'll use exceptions occasionally: put the risky code in try, the handling in except
try:
  print( 0 / 0) # the division raises ZeroDivisionError before print is ever called
except ZeroDivisionError:
  print("cannot divide by zero")


cannot divide by zero


In [39]:
# Probably the most fundamental data structure in Python is the list.
integer_list = [1, 2, 3]
heterogeneous_list = ["string", 0.1, True] # elements need not share a type
list_of_lists = [integer_list, heterogeneous_list, []] # lists can be nested
list_length = len(integer_list) # equals 3
list_sum = sum(integer_list) # equals 6
print(list_length, list_sum)

3 6


In [40]:
# Get or set list items with square brackets
x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
zero = x[0] # equals 0, lists are 0-indexed
one = x[1] # equals 1
nine = x[-1] # equals 9, negative indexes count from the end
eight = x[-2] # equals 8
x[0] = -1 # now x is [-1, 1, 2, 3, ..., 9]
print(zero, one, nine, eight, x)

0 1 9 8 [-1, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [41]:
# Slicing in Python. Don't forget you can slice strings and other "sequential" types
# (x still starts with -1 here because of the assignment in the previous cell)
first_three = x[:3] # [-1, 1, 2]
three_to_end = x[3:] # [3, 4, ..., 9]
one_to_four = x[1:5] # [1, 2, 3, 4]; inclusive of first, exclusive of last. Not sure how I feel about that.
last_three = x[-3:] # [7, 8, 9]
without_first_and_last = x[1:-1] # [1, 2, ..., 8]
copy_of_x = x[:] # [-1, 1, 2, ..., 9]
print(first_three, three_to_end, one_to_four, last_three, without_first_and_last, copy_of_x)

[-1, 1, 2] [3, 4, 5, 6, 7, 8, 9] [1, 2, 3, 4] [7, 8, 9] [1, 2, 3, 4, 5, 6, 7, 8] [-1, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [42]:
# A slice can take a third argument, the stride, which may be negative
every_third = x[::3] # [-1, 3, 6, 9]
five_to_three = x[5:2:-1] # [5, 4, 3]
print(every_third, five_to_three)

[-1, 3, 6, 9] [5, 4, 3]


In [43]:
# *in* operator to test membership; on a list it examines the elements one at a time
1 in [1, 2, 3] # True
0 in [1, 2, 3] # False (only this last value is shown as the cell's output)

False

In [44]:
# You can concatenate and modify relatively easily
x = [1, 2, 3]
x.extend([4, 5, 6]) # modifies x in place; x is now [1, 2, 3, 4, 5, 6]
print(x)

[1, 2, 3, 4, 5, 6]


In [45]:
# List addition leaves the original alone
x = [1, 2, 3]
y = x + [4, 5, 6] # y is a new list [1, 2, 3, 4, 5, 6]; x is unchanged
print(y)

[1, 2, 3, 4, 5, 6]


In [46]:
# More frequently we append to lists one item at a time
x = [1, 2, 3]
x.append(0) # x is now [1, 2, 3, 0]
print(x)
y = x[-1] # equals 0
print(y)
z = len(x) # equals 4
print(z)

[1, 2, 3, 0]
0
4


In [47]:
# You can also unpack a list when you know how many elements it contains
# (a mismatch between the two sides raises ValueError)
x, y = [1, 2] # now x is 1, y is 2
print(x, y)

1 2


In [48]:
# Common idiom is to use underscore for a value you are going to throw away
_, y = [1, 2] # now y == 2; we don't care about the first element
print(_, y) # Underscore is still an ordinary variable, though (it holds 1)

1 2


In [49]:
# Tuples are lists' immutable cousins. Parentheses (or nothing) instead of brackets
my_list = [1, 2]
my_tuple = (1, 2)
other_tuple = 3, 4 # the parentheses are optional
my_list[1] = 3 # my_list is now [1, 3]
try:
  my_tuple[1] = 3 # tuples do not support item assignment
except TypeError:
  print("cannot modify a tuple")

# Tuples are a convenient way to return multiple values from functions:
def sum_and_product(x, y):
  """Return the pair (x + y, x * y) as a tuple."""
  total = x + y
  product = x * y
  return total, product
sp = sum_and_product(2, 3) # sp is the tuple (5, 6)
print(sp)
s, p = sum_and_product(5, 10) # unpacked: s is 15, p is 50
print(s, p)

cannot modify a tuple
(5, 6)
15 50


In [50]:
# Tuples and lists can also be used for multiple assignment
x, y = 1, 2 # now x is 1, y is 2
print(x, y)
x, y = y, x # Pythonic way to swap variables; now x is 2, y is 1
print(x, y)

1 2
2 1


In [0]:
# Dictionaries associate values with keys
empty_dic = {} # Pythonic
empty_dic2 = dict() # less Pythonic
grades = {"Joel": 80, "Tim": 95} # dictionary literal


In [52]:
# Look up the value for a key using square brackets
joels_grade = grades["Joel"] # equals 80
print(joels_grade)

80


In [53]:
# You'll get a KeyError exception if you look up a key that's not in the dictionary
try:
  kates_grade = grades["Kate"]
except KeyError:
  print("no grade for Kate!")

no grade for Kate!


In [0]:
# You can check for the existence of a key with "in"
joel_has_grade = "Joel" in grades # True
kate_has_grade = "Kate" in grades # False

In [55]:
# The "get" method returns a default value instead of raising an exception
# when the key is missing
joels_grade = grades.get("Joel", 0) # equals 80
kates_grade = grades.get("Kate", 0) # equals 0
no_ones_grade = grades.get("No One") # default is None
print(joels_grade, kates_grade, no_ones_grade)

80 0 None


In [56]:
# Key/value pairs are assigned with the same square brackets
grades["Tim"] = 99  # replaces old value
grades["Kate"] = 100 # adds third entry
num_students = len(grades) # equals 3
print(num_students)

3


In [57]:
# Dictionaries are a convenient way to represent structured data
tweet = {
    "user": "joelgrus",
    "text": "Data Science is Awesome",
    "retweet_count": 100,
    "hashtags": ["#data","#science","#datascience","#awesome","#yolo"]
}
print(tweet)

{'user': 'joelgrus', 'text': 'Data Science is Awesome', 'retweet_count': 100, 'hashtags': ['#data', '#science', '#datascience', '#awesome', '#yolo']}


In [58]:
# Besides looking up specific keys, you can look at all of them
tweet_keys = tweet.keys() # iterable for the keys
tweet_values = tweet.values() # iterable for the values
tweet_items = tweet.items() # iterable for the (key, value) tuples

"user" in tweet_keys # True, but not Pythonic
"user" in tweet # Pythonic way of checking for a key
# Membership is tested with `in`. Using `is` here would compare the string with
# the view object itself, which is always False (and a SyntaxWarning on 3.8+).
"joelgrus" in tweet_values # True (slow but the only way to check)
# Dictionary keys must be "hashable" so you can't use lists. Use tuples or strings
print(tweet_keys)

dict_keys(['user', 'text', 'retweet_count', 'hashtags'])


In [0]:
# Counting words by hand means deciding what to do the first time a key is seen.
# Three ways of handling that with a plain dict follow; a defaultdict makes
# all of them unnecessary.
document = ["the", "quick", "brown", "fox", "jumped", "over", "the", "log"]

# 1) Test for the key before touching it
word_counts = {}
for word in document:
  if word not in word_counts:
    word_counts[word] = 1
  else:
    word_counts[word] += 1

# 2) Just try the update and handle the missing key when it raises
word_counts = {}
for word in document:
  try:
    word_counts[word] = word_counts[word] + 1
  except KeyError:
    word_counts[word] = 1

# 3) Let dict.get supply 0 for words not seen yet
word_counts = {}
for word in document:
  previous_count = word_counts.get(word, 0)
  word_counts[word] = previous_count + 1

In [60]:
# Counts left by the last of the three loops (all three produce the same dict)
print(word_counts)

{'the': 2, 'quick': 1, 'brown': 1, 'fox': 1, 'jumped': 1, 'over': 1, 'log': 1}


In [61]:
# A defaultdict is like a regular dict, except that looking up a missing key
# first adds a value for it using the zero-argument function you supplied
from collections import defaultdict

word_counts = defaultdict(int) # int() produces 0
for word in document:
  word_counts[word] += 1 # a new word starts at 0 and is then incremented
print(word_counts)


defaultdict(<class 'int'>, {'the': 2, 'quick': 1, 'brown': 1, 'fox': 1, 'jumped': 1, 'over': 1, 'log': 1})


In [0]:
# Also useful with list or dict or your own function
dd_list = defaultdict(list) # list() produces an empty list
dd_list[2].append(1) # now dd_list contains {2: [1]}
dd_dict = defaultdict(dict) # dict() produces an empty dict
dd_dict["Joel"]["City"] = "Seattle" # {"Joel": {"City": "Seattle"}}
dd_pair = defaultdict(lambda: [0,0]) # each missing key gets a fresh [0, 0]
dd_pair[2][1] = 1 # now dd_pair contains {2: [0, 1]}

In [63]:
# A Counter turns a sequence of values into a defaultdict(int)-like object mapping keys to counts
from collections import Counter
c = Counter([0, 1, 2, 3, 4]) # every value occurs once: {0: 1, 1: 1, 2: 1, 3: 1, 4: 1}
word_counts = Counter(document) # one line instead of a counting loop
print(word_counts)

Counter({'the': 2, 'quick': 1, 'brown': 1, 'fox': 1, 'jumped': 1, 'over': 1, 'log': 1})


In [64]:
# Print the 10 most common words and their counts
# (document has only 7 distinct words, so all of them are listed)
for word, count in word_counts.most_common(10):
  print(word, count)

the 2
quick 1
brown 1
fox 1
jumped 1
over 1
log 1


In [65]:
# Sets are a collection of distinct elements
primes_below_10 = {2, 3, 5, 7}
s = set() # {} would create an empty dict, so an empty set has to be spelled set()
s.add(1) # s is now {1}
s.add(2) # s is now {1, 2}
s.add(2) # s is still {1, 2}
x = len(s) # equals 2
y = 2 in s # equals True
z = 3 in s # equals False
print(s, x, y, z)

{1, 2} 2 True False


In [66]:
# "in" is very fast for sets
stopwords_list = ["a", "an", "at"] + ["other", "words"] + ["yet", "you"]
"zip" in stopwords_list # False, but has to check every element
stopwords_set = set(stopwords_list)
"zip" in stopwords_set # False as well, and very fast to check

False

In [67]:
# Also good for finding the distinct elements in a collection
item_list = [1, 2, 3, 1, 2, 3]
num_items = len(item_list) # 6
item_set = set(item_list) # {1, 2, 3}
num_distinct_items = len(item_set) # 3
distinct_item_list = list(item_set) # the distinct items as a list (sets promise no particular order)
print(num_items, num_distinct_items)

6 3


In [68]:
# Control flow: the first branch whose condition is true runs, the rest are skipped
if 1 > 2:
  message = "if only 1 were greater than two..."
elif 1 > 3:
  message = "elif stands for 'else if'"
else:
  message = "when all else fails use else (if you want to)"
print(message)


when all else fails use else (if you want to)


In [69]:
# Ternary if-then-else on one line: <value if true> if <condition> else <value if false>
x = 3
parity = "even" if x % 2 == 0 else "odd"
print(parity)

odd


In [70]:
# While: repeats the body as long as the condition holds
x = 0
while x < 10:
  print(f"{x} is less than 10")
  x += 1 # without this increment the loop would never end

0 is less than 10
1 is less than 10
2 is less than 10
3 is less than 10
4 is less than 10
5 is less than 10
6 is less than 10
7 is less than 10
8 is less than 10
9 is less than 10


In [71]:
# For ... in: the more common way to write the same loop
for x in range(10): # range(10) yields the numbers 0, 1, ..., 9
  print(f"{x} is less than 10")

0 is less than 10
1 is less than 10
2 is less than 10
3 is less than 10
4 is less than 10
5 is less than 10
6 is less than 10
7 is less than 10
8 is less than 10
9 is less than 10


In [72]:
# Continue and break: prints 0, 1, 2 and 4 only
for x in range(10):
  if x == 3:
    continue # go to next iteration
  if x == 5:
    break # quit entirely
  print(x)

0
1
2
4


In [0]:
# Booleans are capitalized
one_is_less_than_two = 1 < 2 # equals True
true_equals_false = True == False # equals False

In [0]:
# Python uses "None" for null
x = None
assert x == None # not Pythonic: == can be redefined by a class's __eq__
assert x is None # Pythonic: identity check against the single None object

In [75]:
# Python lets you use any value where it expects a Boolean. The following are all Falsy
False
None
[] # empty list 
{} # empty dict
"" # empty string
set() # empty set
0
0.0
# Pretty much anything else gets treated as truthy

0.0

In [76]:
#s = some_function_that_returns_a_string()
#if s:
#  first_char = s[0]
#else:
#  first_char = ""

# or

# first_char = s and s[0] # since `and` returns the second value when the first is truthy, and the first value when it's not

# If x is either a number or possibly None, "x or 0" is definitely a number
# NOTE(review): relies on x still being defined by an earlier cell (it is set
# to None in the "None" cell when the notebook is run top to bottom)
safe_x = x or 0
safe_x = x if x is not None else 0 # replaces only None; arguably more readable

# all() returns True when every element is truthy; any() when at least one is
all([True, 1, {3}]) # True
all([True, 1, []]) # False, [] is falsy
any([True, 1, {}]) # True
all([]) # True, there are no falsy elements in the list
any([]) # False, there are no truthy elements in the list


False

In [77]:
# Sorting
x = [4, 1, 2, 3]
y = sorted(x) # y is a new list [1, 2, 3, 4]; x is unchanged
x.sort() # sorts in place; now x is [1, 2, 3, 4]
print(y, x)

[1, 2, 3, 4] [1, 2, 3, 4]


In [0]:
# Sort the list by absolute value from largest to smallest
x = sorted([-4, 1, -2, 3], key=abs, reverse=True) # is [-4, 3, -2, 1]


In [80]:
# Sort the words and counts from highest count to lowest
# (the sort is stable, so words with equal counts keep their existing order)
wc = sorted(word_counts.items(),
            key=lambda word_and_count: word_and_count[1], # compare on the count
            reverse=True)
print(wc)

[('the', 2), ('quick', 1), ('brown', 1), ('fox', 1), ('jumped', 1), ('over', 1), ('log', 1)]
