# List comprehension, generators, iteration exercises

## 1. Comprehension

Convert the following for loops into comprehensions:

In [1]:
# Exercise input: collect i-2 for every second integer starting at -5 and
# staying below 10.
l = []
for i in range(-5, 10, 2):
    l.append(i-2)
l

[-7, -5, -3, -1, 1, 3, 5, 7]

In [2]:
# Solution: the value passed to append() becomes the comprehension's output expression.
[i-2 for i in range(-5, 10, 2)]

[-7, -5, -3, -1, 1, 3, 5, 7]

In [3]:
# Exercise input: keep the numbers in range(100) whose last decimal digit is 4.
l = []
for i in range(100):
    if i % 10 == 4:
        l.append(i)
l

[4, 14, 24, 34, 44, 54, 64, 74, 84, 94]

In [4]:
# Solution: the loop's `if` becomes a filter placed after the `for` clause.
[i for i in range(100) if i % 10 == 4]

[4, 14, 24, 34, 44, 54, 64, 74, 84, 94]

In [5]:
# Exercise input: keep the odd elements of l1.
l1 = [12, 1, 0, 13, -3, -4, 0, 2]
l2 = []

for e in l1:
    # -3 % 2 evaluates to 1 in Python, so negative odd numbers pass this test too
    if e % 2 == 1:
        l2.append(e)
l2

[1, 13, -3]

In [6]:
# Solution: same filter as a comprehension (reuses l1 defined in the previous cell).
[e for e in l1 if e % 2 == 1]

[1, 13, -3]

In [7]:
# Exercise input: map every element of l1 to True if it is odd and to False otherwise.
l1 = [12, 1, 0, 13, -3, -4, 0, 2]
l2 = []

for e in l1:
    if e % 2 == 1:
        l2.append(True)
    else:
        l2.append(False)
l2

[False, True, False, True, True, False, False, False]

In [8]:
# solution 1: a conditional expression that mirrors the if/else of the loop
[True if e % 2 == 1 else False for e in l1]
# solution 2: the comparison already evaluates to a bool, so no if/else is needed
[e % 2 == 1 for e in l1]

[False, True, False, True, True, False, False, False]

In [9]:
# Exercise input: the product of every pair (x, y) with x taken from l1 and
# y taken from l2; all y values are visited before moving on to the next x.
l1 = [3, 5, 7, 11, 13, 17, 19]
l2 = [2, 4, 6, 8, 10]

products = []

for x in l1:
    for y in l2:
        products.append(x*y)
products

[6,
 12,
 18,
 24,
 30,
 10,
 20,
 30,
 40,
 50,
 14,
 28,
 42,
 56,
 70,
 22,
 44,
 66,
 88,
 110,
 26,
 52,
 78,
 104,
 130,
 34,
 68,
 102,
 136,
 170,
 38,
 76,
 114,
 152,
 190]

In [10]:
# Solution: the `for` clauses are written in the same order as the nested loops (outer loop first).
[x*y for x in l1 for y in l2]

[6,
 12,
 18,
 24,
 30,
 10,
 20,
 30,
 40,
 50,
 14,
 28,
 42,
 56,
 70,
 22,
 44,
 66,
 88,
 110,
 26,
 52,
 78,
 104,
 130,
 34,
 68,
 102,
 136,
 170,
 38,
 76,
 114,
 152,
 190]

In [11]:
# Exercise input: products of the pairs (x, y) whose sum is divisible by 3.
l1 = [3, 5, 7, 11, 13, 17, 19]
l2 = [2, 4, 6, 8, 10]

products = []

for x in l1:
    for y in l2:
        if (x + y) % 3 == 0:
            products.append(x*y)

In [12]:
# Solution: a trailing `if` filters the pairs produced by the two `for` clauses.
[x * y for x in l1 for y in l2 if (x + y) % 3 == 0]

[18, 20, 50, 14, 56, 44, 110, 26, 104, 68, 170, 38, 152]

In [13]:
# Exercise input: for each fruit build a row in which the character at
# index i is repeated i+1 times; mtx collects one row per fruit.
fruits = ["apple", "plum", "pear", "avocado"]

mtx = []
for fruit in fruits:
    row = []
    for i, c in enumerate(fruit):
        row.append(c*(i+1))
    mtx.append(row)
    
mtx

[['a', 'pp', 'ppp', 'llll', 'eeeee'],
 ['p', 'll', 'uuu', 'mmmm'],
 ['p', 'ee', 'aaa', 'rrrr'],
 ['a', 'vv', 'ooo', 'cccc', 'aaaaa', 'dddddd', 'ooooooo']]

In [14]:
# Solution: nested comprehension - the inner one builds a row, the outer one iterates over the fruits.
[[c*(i+1) for i, c in enumerate(fruit)] for fruit in fruits]

[['a', 'pp', 'ppp', 'llll', 'eeeee'],
 ['p', 'll', 'uuu', 'mmmm'],
 ['p', 'ee', 'aaa', 'rrrr'],
 ['a', 'vv', 'ooo', 'cccc', 'aaaaa', 'dddddd', 'ooooooo']]

In [15]:
# Exercise input: count how many times each character occurs in text.
text = "ababaacdsadb"

char_freqs = {}

for c in text:
    try:
        char_freqs[c] += 1
    except KeyError:
        # first occurrence of c: there was no counter to increment yet
        char_freqs[c] = 1
        
char_freqs

{'a': 5, 'b': 3, 'c': 1, 'd': 2, 's': 1}

In [16]:
# Iterating over set(text) visits each distinct character only once, so the
# count of a character is not recomputed for every one of its occurrences.
# Note that text.count(c) still scans the whole string once per distinct character.
{c: text.count(c) for c in set(text)}

{'a': 5, 'b': 3, 'c': 1, 'd': 2, 's': 1}

In [17]:
# Exercise input: merge d1 and d2, keeping the larger value for every key.
d1 = {"a": 1, "b": 3, "c": 2}
d2 = {"a": 2, "b": 1}

d3 = {}

# Dictionary key views support set operations directly, so `|` yields the
# union of the keys without wrapping each view in set().
for key in d1.keys() | d2.keys():
    # a key that is missing from one of the dicts counts as 0 there
    max_val = max(d1.get(key, 0), d2.get(key, 0))
    d3[key] = max_val

d3

{'a': 2, 'b': 3, 'c': 2}

In [18]:
# Solution: dict comprehension over the union of the keys of d1 and d2 (defined
# in the previous cell); key views support `|` directly, no set() wrappers needed.
{key: max(d1.get(key, 0), d2.get(key, 0)) for key in d1.keys() | d2.keys()}

{'a': 2, 'b': 3, 'c': 2}

## 2. Generators

The following piece of code downloads a small sample of the Hungarian Webcorpus. We will work on this in later exercises.

The corpus contains a single word per line, and sentence boundaries are denoted by empty lines.

The file has 4 columns separated by TABs:
1. original word
2. lemma (stemmed word)
3. morphological analysis
4. morphological analysis candidates.

Take a look at the file before continuing.

In [19]:
import os
import urllib.request

# Name of the corpus sample; the code below expects the archive to extract
# to a file with this name.
fn = 'web2-4p-9-17'
zipname = fn + '.zip'

# Download the archive only if it is not already present locally.
if not os.path.exists(zipname):
    print("Downloading corpus")
    webcorp_url = "http://avalon.aut.bme.hu/~judit/resources/webcorp_parts/web2-4p-9-17.zip"
    # urlretrieve replaces urllib.request.URLopener, which has been deprecated
    # since Python 3.3 and is no longer available in Python 3.14.
    urllib.request.urlretrieve(webcorp_url, zipname)

# Extract the archive only if the corpus file is missing.
if not os.path.exists(fn):
    from zipfile import ZipFile
    with ZipFile(zipname) as myzip:
        myzip.extractall()

## 2.1. Write a generator function that yields one sentence at a time as a list of tokens. Make sure to yield the very last sentence of the file as well.

In [20]:
def read_sentences(filename):
    """Yield the sentences of a word-per-line file, each as a list of tokens.

    The file is read as UTF-8. A line that is empty or contains only
    whitespace ends the current sentence; consecutive blank lines do not
    produce empty sentences. From every other line only the text before the
    first TAB is kept. A sentence that is not followed by a blank line at the
    end of the file is yielded as well.
    """
    tokens = []
    with open(filename, encoding="utf8") as corpus:
        for raw_line in corpus:
            if raw_line.strip():
                # regular line: keep the first TAB-separated column only
                first_column, _, _ = raw_line.partition("\t")
                tokens.append(first_column)
                continue
            # blank line: hand out the sentence collected so far, if any
            if tokens:
                yield tokens
            tokens = []
        # the file may end without a closing blank line
        if tokens:
            yield tokens
    
# Sanity checks for read_sentences. The expected numbers below are presumably
# specific to the corpus sample stored in fn - re-check them if the file changes.
sentence_iter = read_sentences(fn)
sentence = next(sentence_iter)

# the first sentence is a list of 19 tokens
assert(len(sentence) == 19)
assert isinstance(sentence, list)

# the same generator continues with the second sentence
next_sentence = next(sentence_iter)
assert len(next_sentence) == 7

# a new call starts again from the beginning of the file
sentences = read_sentences(fn)

import types
# calling the function must return a generator, not a materialized list
assert isinstance(sentences, types.GeneratorType)

sentences = list(sentences)

# total number of sentences, including the last one in the file
assert(len(sentences) == 90764)

## 2.2. Write a generator function that yields one sentence at a time but skips short sentences. The minimum length (in tokens) should be a parameter of the generator which defaults to 5.

In [21]:
def read_long_sentences(filename, min_length=5):
    """Yield the sentences of ``filename`` that contain at least ``min_length`` tokens.

    Sentences come from ``read_sentences``; shorter ones are skipped.
    """
    yield from (
        tokens
        for tokens in read_sentences(filename)
        if not len(tokens) < min_length
    )
    
# Sanity checks for read_long_sentences. The expected counts are presumably
# specific to the corpus sample stored in fn. `types` was imported in the
# previous cell.
sentences = read_long_sentences(fn)
assert isinstance(sentences, types.GeneratorType)

# default minimum length (5 tokens)
sentences = list(sentences)
assert len(sentences) == 85163

# explicit minimum length of 15 tokens
sentences = read_long_sentences(fn, 15)

sentences = list(sentences)
assert len(sentences) == 50059

## 3. Binary search tree

Create a binary search tree class for integers. Write tests for your solution as well.

Implement the following:
- iteration protocol for the tree. Traversal should be in-order (increasing order).
- sum(tree) - sum of all the elements
- min(tree), max(tree) - smallest, largest element
- len(tree) - number of nodes

In [22]:
# there are many ways to implement binary search trees, this is my approach

class Tree:
    """Binary search tree holding distinct integers.

    Each ``Tree`` object is either an empty placeholder (``is_leaf`` is True,
    ``value``/``left``/``right`` are None) or an inner node that stores one
    value and owns two child ``Tree`` objects. A freshly created tree is a
    single placeholder.

    Iteration yields the stored values in increasing order, which is what
    makes ``sum(tree)``, ``min(tree)``, ``max(tree)`` and ``x in tree`` work.
    Insertion and traversal are loop-based rather than recursive, so a
    degenerate tree (e.g. values inserted in sorted order) that is deeper
    than the interpreter's recursion limit does not raise RecursionError.
    """

    def __init__(self, values=None):
        """Create an empty tree and insert every element of ``values``, if given."""
        self.value = None
        self.left = None
        self.right = None
        self.is_leaf = True
        if values is not None:
            for v in values:
                self.insert(v)

    def insert(self, value):
        """Insert ``value``; a value that is already stored is ignored.

        Raises:
            TypeError: if ``value`` is not an ``int``.
        """
        if not isinstance(value, int):
            raise TypeError("Tree only supports integers")
        # Walk down until the value is found or an empty placeholder is reached.
        node = self
        while not node.is_leaf:
            if value == node.value:
                return
            node = node.left if value < node.value else node.right
        # Turn the placeholder into an inner node with two fresh placeholders.
        node.value = value
        node.left = Tree()
        node.right = Tree()
        node.is_leaf = False

    def __len__(self):
        """Return the number of stored values."""
        return sum(1 for _ in self)

    def __iter__(self):
        """Yield the stored values in increasing order (in-order traversal)."""
        # inner nodes whose own value and right subtree are still to be visited
        pending = []
        node = self
        while pending or not node.is_leaf:
            # descend to the leftmost inner node, remembering the path
            while not node.is_leaf:
                pending.append(node)
                node = node.left
            node = pending.pop()
            yield node.value
            node = node.right

    def __str__(self):
        """Return the values in increasing order, separated by ``", "``."""
        return ", ".join(str(v) for v in self)
            
# Sanity checks for Tree: start from an empty tree and insert values one by one.
t = Tree() 
assert len(t) == 0
t.insert(2)
# Tree defines no __contains__, so `in` is answered by iterating over the tree
assert 2 in t
assert len(t) == 1

t.insert(-10)
t.insert(100)
assert len(t) == 3

# inserting the same value again
t.insert(2)
assert len(t) == 3
# print() uses Tree.__str__: the values in increasing order
print(t)

# min, max and sum are supported for iterables
assert sum(t) == 92
assert min(t) == -10
assert max(t) == 100

-10, 2, 100
