# Procesiranje podatkov

## List Comprehension

### How to Create Lists in Python

#### Using for Loops

In [1]:
# Collect the squares of 0..9 with an explicit loop and append().
squares = []
for n in range(10):
    squares.append(n ** 2)

print(squares)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


#### Using map() Objects

In [2]:
# Sample transaction amounts and the tax rate applied to them.
TAX_RATE = 0.08
txns = [1.09, 23.56, 57.84, 4.56, 6.78]


def get_price_with_tax(txn):
    """Return the amount ``txn`` with ``TAX_RATE`` added on top (unrounded)."""
    tax_multiplier = 1 + TAX_RATE
    return txn * tax_multiplier

In [3]:
final_prices = map(get_price_with_tax, txns)

In [4]:
final_prices = list(final_prices)

In [5]:
final_prices

[1.1772000000000002, 25.4448, 62.467200000000005, 4.9248, 7.322400000000001]

In [6]:
# Round every taxed price to two decimal places, again using map().
rounded = list(map(lambda price: round(price, 2), final_prices))

print(rounded)

[1.18, 25.44, 62.47, 4.92, 7.32]


#### Using List Comprehensions

    squares = []
    for i in range(10):
        squares.append(i * i)

    print(squares)

In [7]:
squares = [i * i for i in range(10)]

In [8]:
squares

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

    new_list = [expression for member in iterable]

In [9]:
TAX_RATE = .08
txns = [1.09, 23.56, 57.84, 4.56, 6.78]


def get_price_with_tax(txn):
    """Return ``txn`` increased by ``TAX_RATE``; rounding is left to the caller."""
    taxed = txn * (1 + TAX_RATE)
    return taxed

In [10]:
# Apply the tax and round to two decimal places in a single comprehension.
final_prices = [round(get_price_with_tax(txn), 2) for txn in txns]
print(final_prices)

[1.18, 25.44, 62.47, 4.92, 7.32]


### Benefits of Using List Comprehensions

### How to Supercharge Your Comprehensions

#### Using Conditional Logic

    new_list = [expression for member in iterable]

    new_list = [expression for member in iterable (if conditional)]

In [11]:
sentence = 'the rocket came back from mars'

# Keep only the characters that are lowercase vowels, in order of appearance.
vowels = [char for char in sentence if char in 'aeiou']

print(vowels)

['e', 'o', 'e', 'a', 'e', 'a', 'o', 'a']


In [12]:
# Adjacent string literals are joined into one string, which avoids a
# backslash continuation inside the literal.
sentence = (
    'The rocket, who was named Ted, came back '
    'from Mars because he missed his friends.'
)

In [13]:
def is_consonant(letter):
    """Return True when ``letter`` is alphabetic and is not one of ``aeiou``.

    The vowel check is case-insensitive because the letter is lower-cased
    before the comparison.
    """
    if not letter.isalpha():
        return False
    return letter.lower() not in 'aeiou'

In [14]:
consonants = [i.lower() for i in sentence if is_consonant(i)]

In [15]:
print(consonants)

['t', 'h', 'r', 'c', 'k', 't', 'w', 'h', 'w', 's', 'n', 'm', 'd', 't', 'd', 'c', 'm', 'b', 'c', 'k', 'f', 'r', 'm', 'm', 'r', 's', 'b', 'c', 's', 'h', 'm', 's', 's', 'd', 'h', 's', 'f', 'r', 'n', 'd', 's']


    new_list = [expression (if conditional) for member in iterable]

In [16]:
original_prices = [1.25, -9.45, 10.22, 3.78, -5.92, 1.16]
# Positive prices pass through unchanged; everything else is replaced by 0.
prices = [price if price > 0 else 0 for price in original_prices]
print(prices)

[1.25, 0, 10.22, 3.78, 0, 1.16]


In [17]:
def get_price(price):
    """Return ``price`` when it is positive, otherwise 0."""
    if price > 0:
        return price
    return 0

prices = [get_price(i) for i in original_prices]

In [18]:
print(prices)

[1.25, 0, 10.22, 3.78, 0, 1.16]


#### Using Set and Dictionary Comprehensions

In [19]:
quote = "life, uh, finds a way"
# A set comprehension discards duplicates, so each vowel appears at most once.
unique_vowels = {letter for letter in quote if letter in 'aeiou'}

print(unique_vowels)

{'i', 'a', 'e', 'u'}


In [20]:
# Dictionary comprehension: map each number 0..9 to its square.
squares = {n: n ** 2 for n in range(10)}
print(squares)

{0: 0, 1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 6: 36, 7: 49, 8: 64, 9: 81}


### When Not to Use a List Comprehension in Python

#### Watch Out for Nested Comprehensions

In [21]:
cities = ['Austin', 'Tacoma', 'Topeka', 'Sacramento', 'Charlotte']

In [22]:
temps = {city: [0 for _ in range(7)] for city in cities}

In [23]:
temps

{'Austin': [0, 0, 0, 0, 0, 0, 0],
 'Tacoma': [0, 0, 0, 0, 0, 0, 0],
 'Topeka': [0, 0, 0, 0, 0, 0, 0],
 'Sacramento': [0, 0, 0, 0, 0, 0, 0],
 'Charlotte': [0, 0, 0, 0, 0, 0, 0]}

In [24]:
matrix = [[i for i in range(5)] for _ in range(6)]

In [25]:
matrix

[[0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4]]

In [26]:
# 3x3 grid in which every row repeats its own index.
matrix = [
    [0, 0, 0],
    [1, 1, 1],
    [2, 2, 2],
]

In [27]:
flat = [num for row in matrix for num in row]

In [28]:
flat

[0, 0, 0, 1, 1, 1, 2, 2, 2]

In [29]:
# Flatten a 3x3 matrix into a single list using explicit nested loops.
matrix = [
    [0, 0, 0],
    [1, 1, 1],
    [2, 2, 2],
]
flat = []
for row in matrix:  # outer loop: one row at a time
    for num in row:  # inner loop: every value inside that row
        flat.append(num)

# Bare expression so the notebook displays the flattened list.
flat

[0, 0, 0, 1, 1, 1, 2, 2, 2]

#### Profile to Optimize Performance

In [30]:
import random
import timeit
TAX_RATE = .08
# 100,000 random integer amounts, each drawn from range(100).
txns = [random.randrange(100) for _ in range(100_000)]


def get_price(txn):
    """Return ``txn`` with ``TAX_RATE`` applied on top."""
    gross = txn * (1 + TAX_RATE)
    return gross

In [31]:
def get_prices_with_map():
    """Return the taxed price of every entry in ``txns``, built with map()."""
    return list(map(get_price, txns))

def get_prices_with_comprehension():
    """Return the taxed price of every entry in ``txns``, built with a list comprehension."""
    return [get_price(txn) for txn in txns]

def get_prices_with_loop():
    """Return the taxed price of every entry in ``txns``, built with a for loop and append()."""
    prices = []
    for txn in txns:
        prices.append(get_price(txn))
    return prices

In [32]:
timeit.timeit(get_prices_with_map, number=100)

1.6914971999940462

In [33]:
timeit.timeit(get_prices_with_comprehension, number=100)

2.2438498999981675

In [34]:
timeit.timeit(get_prices_with_loop, number=100)

2.796779600001173

## Lambda function

In [35]:
def identity(x):
    """Return the argument unchanged."""
    return x

In [36]:
lambda x: x

<function __main__.<lambda>(x)>

In [37]:
def add_one(x):
    """Return ``x + 1``."""
    return x + 1

In [38]:
lambda x: x + 1

<function __main__.<lambda>(x)>

In [39]:
(lambda x: x + 1)(2)

3

In [40]:
# A lambda can be bound to a name and then called like a regular function.
# PEP 8 recommends `def` for named functions, so this form is shown for
# illustration only.
add_one = lambda x: x + 1
add_one(2)

3

In [41]:
# Exercise
# Two-argument addition written with `lambda` (new way).
new_add = lambda a, b: a + b
new_add(4,7)

11

In [42]:
unsorted = [('b', 6), ('a', 10), ('d', 0), ('c', 4)]

In [43]:
# Sort by the second element of each tuple (the integer).
print(sorted(unsorted, key=lambda x: x[1]))

[('d', 0), ('c', 4), ('b', 6), ('a', 10)]


### Uses of Lambda Expressions

#### Classic Functional Constructs

```python
list(map(lambda x: x.upper(), ['cat', 'dog', 'cow']))

list(filter(lambda x: 'o' in x, ['cat', 'dog', 'cow']))

from functools import reduce
reduce(lambda acc, x: f'{acc} | {x}', ['cat', 'dog', 'cow'])
```

#### Key Functions

In [44]:
ids = ['id1', 'id2', 'id30', 'id3', 'id22', 'id100']

# Plain sorted() compares the strings character by character.
print(sorted(ids))

# Sorting on the number after the two-character 'id' prefix gives numeric order.
sorted_ids = sorted(ids, key=lambda ident: int(ident[2:]))
print(sorted_ids)

['id1', 'id100', 'id2', 'id22', 'id3', 'id30']
['id1', 'id2', 'id3', 'id22', 'id30', 'id100']


#### UI Frameworks

#### Python Interpreter

### Are Lambdas Pythonic or Not?

## The Map Function

    # Pseudocode for map.
    def map(func, seq):
        # Return `Map` object with
        # the function applied to every
        # element.
        return Map(
            func(x)
            for x in seq
        )

In [45]:
values = [1, 2, 3, 4, 5]

# map() returns a lazy map object, so each call is wrapped in list() to
# materialize the results.
add_10 = list(map(lambda value: value + 10, values))
add_20 = list(map(lambda value: value + 20, values))

print(add_10)

[11, 12, 13, 14, 15]


In [46]:
print(add_20)

[21, 22, 23, 24, 25]


### Vaja

In [47]:
# Exercise: read the log, then use map() to take the first
# whitespace-separated field (the IP address) of every line.
with open('data/example_log.txt') as log_file:
    lines = log_file.readlines()

ip_addresses = list(map(lambda line: line.split()[0], lines))
print(ip_addresses[:10])

['200.155.108.44', '36.139.255.202', '50.112.115.219', '10.0.25.26', '233.154.7.24', '241.220.141.78', '191.198.138.97', '172.40.187.145', '225.119.46.80', '97.218.117.229']


### Alternatives to map

In [48]:
list(map(lambda x: x.capitalize(), ['cat', 'dog', 'cow']))

['Cat', 'Dog', 'Cow']

In [49]:
[x.capitalize() for x in ['cat', 'dog', 'cow']]

['Cat', 'Dog', 'Cow']

In [50]:
# List-comprehension alternative to the map() exercise above.
with open('data/example_log.txt') as log_file:
    lines = log_file.readlines()

ip_addresses = [entry.split()[0] for entry in lines]
print(ip_addresses[:10])

['200.155.108.44', '36.139.255.202', '50.112.115.219', '10.0.25.26', '233.154.7.24', '241.220.141.78', '191.198.138.97', '172.40.187.145', '225.119.46.80', '97.218.117.229']


## The Filter Function

    # Pseudocode for filter.
    def filter(evaluate, seq):
        # Return `Filter` object that keeps only
        # the elements for which the evaluate
        # function returns a truthy value.
        return Filter(
            x for x in seq
            if evaluate(x)
        )

In [1]:
values = list(range(1, 11))

# filter() returns a lazy filter object; list() materializes the kept elements.
even = list(filter(lambda number: number % 2 == 0, values))
odd = list(filter(lambda number: number % 2 == 1, values))

print(even)

[2, 4, 6, 8, 10]


In [2]:
print(odd)

[1, 3, 5, 7, 9]


### Vaja

In [53]:
# Exercise: extract the IPs with map(), then use filter() to keep those whose
# first octet is at most 20.
with open('data/example_log.txt') as log_file:
    lines = log_file.readlines()

ip_addresses = list(map(lambda line: line.split()[0], lines))
filtered_ips = list(
    filter(lambda ip: int(ip.split('.')[0]) <= 20, ip_addresses)
)
print(filtered_ips[:10])

['10.0.25.26', '4.31.18.29', '10.3.25.58', '5.237.70.145', '4.186.143.85', '7.205.198.134', '2.98.108.99', '20.123.163.219', '17.192.186.123', '19.137.101.141']


### Alternatives to filter

In [54]:
list(filter(lambda x: x%2 == 0, range(11)))

[0, 2, 4, 6, 8, 10]

In [55]:
[x for x in range(11) if x%2 == 0]

[0, 2, 4, 6, 8, 10]

In [56]:
# List-comprehension alternative to the map()/filter() exercise above.
with open('data/example_log.txt') as log_file:
    lines = log_file.readlines()

ip_addresses = [entry.split()[0] for entry in lines]
filtered_ips = [
    address
    for address in ip_addresses
    if int(address.split('.')[0]) <= 20
]
print(filtered_ips[:10])

['10.0.25.26', '4.31.18.29', '10.3.25.58', '5.237.70.145', '4.186.143.85', '7.205.198.134', '2.98.108.99', '20.123.163.219', '17.192.186.123', '19.137.101.141']


## The Reduce Function

In [57]:
from functools import reduce

values = [1, 2, 3, 4]

# Fold the list pairwise into a running total: ((1 + 2) + 3) + 4.
summed = reduce(lambda total, item: total + item, values)
print(summed)

10


<img alt="diagram of reduce" src="https://dq-content.s3.amazonaws.com/263/s5_reduce_function.svg">

In [58]:
from functools import reduce

values = [1, 2, 3, 4, 5]

# The accumulator is returned untouched at every step, so the fold ends with
# the first element. `_` marks the argument that is deliberately ignored.
first_value = reduce(lambda first, _: first, values)
print(first_value)

1


### Vaja

In [59]:
from functools import reduce

# Exercise: share of log lines whose IP has a first octet of at most 20,
# computed only with map(), filter() and reduce().
with open('data/example_log.txt') as file:
    lines = file.readlines()

ip_addresses = list(map(lambda line: line.split()[0], lines))
filtered_ips = list(filter(lambda ip: int(ip.split('.')[0]) <= 20, ip_addresses))

# Count with reduce(): start the accumulator at 0 and add 1 per element.
# The explicit initializer keeps the result an int for one-element and empty
# inputs; without it reduce() returns the sole element itself (a str) or
# raises TypeError on an empty sequence.
count_all = reduce(lambda count, _: count + 1, lines, 0)
count_filtered = reduce(lambda count, _: count + 1, filtered_ips, 0)
# An empty log file makes count_all 0 and this division raise ZeroDivisionError.
ratio = count_filtered / count_all

print(ratio)

0.07770304186326674


### Alternatives to reduce

In [60]:
import functools
pairs = [(1, 'a'), (2, 'b'), (3, 'c')]
functools.reduce(lambda acc, pair: acc + pair[0], pairs, 0)

6

In [61]:
pairs = [(1, 'a'), (2, 'b'), (3, 'c')]

In [62]:
sum(x[0] for x in pairs)

6

In [63]:
pairs = [(1, 'a'), (2, 'b'), (3, 'c')]
sum(x for x, _ in pairs)

6

In [64]:
# Comprehension/generator alternative to the reduce() exercise above.
with open('data/example_log.txt') as log_file:
    lines = log_file.readlines()

ip_addresses = [entry.split()[0] for entry in lines]
filtered_ips = [
    address
    for address in ip_addresses
    if int(address.split('.')[0]) <= 20
]
# Count by summing 1 for every element.
count_all = sum(1 for _ in lines)
count_filtered = sum(1 for _ in filtered_ips)
ratio = count_filtered / count_all
print(ratio)

0.07770304186326674


In [65]:
# Alternative: both collections are lists, so len() gives the counts directly.
with open('data/example_log.txt') as log_file:
    lines = log_file.readlines()

ip_addresses = [entry.split()[0] for entry in lines]
filtered_ips = [
    address
    for address in ip_addresses
    if int(address.split('.')[0]) <= 20
]
ratio = len(filtered_ips) / len(lines)
print(ratio)

0.07770304186326674


## Primer: Parsanje IPjev

In [66]:
!head -n 3 data/example_log.txt

200.155.108.44 - - [30/Nov/2017:11:59:54 +0000] "PUT /categories/categories/categories HTTP/1.1" 401 963 "http://www.yates.com/list/tags/category/" "Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHTML, like Gecko) Chrome/13.0.864.0 Safari/5332"
36.139.255.202 - - [30/Nov/2017:11:59:54 +0000] "PUT /search HTTP/1.1" 404 171 "https://www.butler.org/main/tag/category/home.php" "Mozilla/5.0 (Macintosh; PPC Mac OS X 10_5_0) AppleWebKit/5332 (KHTML, like Gecko) Chrome/15.0.813.0 Safari/5332"
50.112.115.219 - - [30/Nov/2017:11:59:54 +0000] "POST /main/blog HTTP/1.1" 404 743 "http://deleon-bender.com/categories/category.html" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_5 rv:2.0; apn-IN) AppleWebKit/531.48.1 (KHTML, like Gecko) Version/4.0 Safari/531.48.1"


In [67]:
with open('data/example_log.txt') as file:
    lines = file.readlines()

> [ipaddress — IPv4/IPv6 manipulation library](https://docs.python.org/3/library/ipaddress.html)

In [68]:
import ipaddress

def is_ip_private(ip):
    """Return True when ``ip`` is private but neither reserved nor loopback.

    ``ip`` is parsed with ``ipaddress.ip_address``, which raises ``ValueError``
    when the value is not a valid IPv4 or IPv6 address.
    """
    address = ipaddress.ip_address(ip)
    if not address.is_private:
        return False
    return not (address.is_reserved or address.is_loopback)

In [69]:
# For every log line whose first field is a private IP, collect the ninth
# whitespace-separated field (index 8) as an integer status code.
status_codes = [
    int(fields[8])
    for fields in (log.split() for log in lines)
    if is_ip_private(fields[0])
]

In [70]:
print(status_codes)

[404, 401, 404, 404, 200, 200, 404, 200, 200, 200, 200, 200, 404, 401, 401, 401, 404, 200, 401, 200, 200, 200, 200, 401, 200, 200, 401, 401, 404, 401, 404, 200, 401, 401, 200, 401, 200, 200, 401, 200, 404, 404, 200, 401, 200, 200, 404, 404, 404, 200, 200, 401, 401, 200, 200, 401, 404, 401, 404, 401, 200, 200]


In [71]:
from collections import Counter

In [72]:
status_counter = Counter(status_codes)

In [73]:
status_counter.most_common()

[(200, 28), (401, 19), (404, 15)]

In [74]:
for code, count in status_counter.most_common():
    print(f'Status code {code}: {count}x')

Status code 200: 28x
Status code 401: 19x
Status code 404: 15x


### Pandas alternativa

In [75]:
import ipaddress

def is_ip_private(ip):
    """Tell whether ``ip`` is a private address that is not reserved or loopback.

    Parsing is delegated to ``ipaddress.ip_address``; an invalid address makes
    it raise ``ValueError``.
    """
    parsed = ipaddress.ip_address(ip)
    excluded = parsed.is_reserved or parsed.is_loopback
    return parsed.is_private and not excluded

In [76]:
import pandas as pd
# Load the space-separated access log; the file has no header row, so the
# columns are labelled 0..9.
df = pd.read_csv('data/example_log.txt', sep=' ', header=None)
# Keep only column 0 (IP address) and column 6 (status code) and give them
# readable names. Method chaining replaces the in-place mutations.
df = (
    df.drop(columns=[1, 2, 3, 4, 5, 7, 8, 9])
    .rename(columns={0: 'ip', 6: 'status_code'})
)
# Pass the function directly; wrapping it in a lambda adds nothing.
df['is_private'] = df['ip'].apply(is_ip_private)
# The boolean column is already a valid mask, so no `== True` comparison.
status_codes_private_ips = df[df['is_private']].copy()

In [77]:
result = status_codes_private_ips['status_code'].value_counts().to_dict()
result

{200: 28, 401: 19, 404: 15}