# Procesiranje podatkov

## List Comprehension

In [1]:
squares = []

for i in range(10):
    squares.append(i*i)

print(squares)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [2]:
# map() funkcija

cene = [1.03, 25.26, 23.8, 5.89]
DAVEK = 0.22

def dodaj_davek(cena):
    return cena * (1 + DAVEK)


In [4]:
map(dodaj_davek, cene)

[1.2566, 30.8172, 29.036, 7.1857999999999995]

In [5]:
for val in map(dodaj_davek, cene):
    print(val)

1.2566
30.8172
29.036
7.1857999999999995


In [7]:
koncne_cene = list(map(dodaj_davek, cene))

zaokrozene = []

for cena in koncne_cene:
    zaokrozene.append(round(cena, 2))

print(zaokrozene)

[1.26, 30.82, 29.04, 7.19]


In [8]:
squares = []

for i in range(10):
    squares.append(i*i)

print(squares)

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


    new_list = [expression for member in iterable]

In [9]:
squares  = [i*i for i in range(10)]

In [11]:
koncne_cene = [round(dodaj_davek(cena), 2) for cena in cene]
koncne_cene

[1.26, 30.82, 29.04, 7.19]

    new_list = [expression for member in iterable (if conditional)]

In [13]:
stavek = 'python je uporaben'
samoglasniki = [i.upper() for i in stavek if i in 'aeiou']
samoglasniki

['O', 'E', 'U', 'O', 'A', 'E']

In [14]:
besedilo = """Python je interpretni visokoravni večnamenski programski jezik, ki ga je ustvaril Guido van Rossum leta 1990. Jezik je dobil ime po priljubljeni angleški televizijski nanizanki Leteči cirkus Montyja Pythona. Python podpira dinamične podatkovne tipe, kar ga naredi drugačnega od npr. Jave ali družine C."""

In [19]:
urejene_besede = [beseda.lower().replace('.','').replace(',','') for beseda in besedilo.split() if len(beseda) > 5]
urejene_besede

['python',
 'interpretni',
 'visokoravni',
 'večnamenski',
 'programski',
 'jezik',
 'ustvaril',
 'rossum',
 'priljubljeni',
 'angleški',
 'televizijski',
 'nanizanki',
 'leteči',
 'cirkus',
 'montyja',
 'pythona',
 'python',
 'podpira',
 'dinamične',
 'podatkovne',
 'naredi',
 'drugačnega',
 'družine']

    new_list = [expression (if conditional) for member in iterable]

In [24]:

cene = [1.03, 25.26, 23.8, 5.89]
DAVEK1 = 0.22
DAVEK2 = 0.35
 
def dodaj_davek(cena, DAVEK):
    return cena * (1 + DAVEK)

koncne_cene = [dodaj_davek(cena, DAVEK1) if cena < 10 else dodaj_davek(cena, DAVEK2) for cena in cene if cena > 2]
koncne_cene


[34.101000000000006, 32.13, 7.1857999999999995]

In [27]:
#  Dictionary Comprehensions

squares = {str(i): i*i for i in range(10)}
squares

{'0': 0,
 '1': 1,
 '2': 4,
 '3': 9,
 '4': 16,
 '5': 25,
 '6': 36,
 '7': 49,
 '8': 64,
 '9': 81}

In [29]:
matrix = [[i for i in range(5)] for _ in range(6)]
matrix

[[0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4],
 [0, 1, 2, 3, 4]]

In [30]:
import random
import timeit

TAX_RATE = .08
txns = [random.randrange(100) for _ in range(100000)]

def get_price(txn):
    return txn * (1 + TAX_RATE)

In [31]:
def get_prices_with_map():
    return list(map(get_price, txns))

def get_prices_with_comprehension():
    return [get_price(txn) for txn in txns]

def get_prices_with_loop():
    prices = []
    for txn in txns:
        prices.append(get_price(txn))
    return prices

In [32]:
%%timeit
get_prices_with_map()

12.6 ms ± 286 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [33]:
%%timeit
get_prices_with_comprehension()

15.7 ms ± 269 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [34]:
%%timeit
get_prices_with_loop()

18.5 ms ± 158 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Lambda function

In [35]:
def add_10(x):
    return x + 10

In [36]:
lambda x: x + 10

<function __main__.<lambda>(x)>

In [37]:
ids = ['id25', 'id35','id2','id45','id50','id125']

sorted(ids)

['id125', 'id2', 'id25', 'id35', 'id45', 'id50']

In [38]:
sorted(ids, key=lambda x: int(x[2:]))

['id2', 'id25', 'id35', 'id45', 'id50', 'id125']

In [39]:
a = lambda x: x + 10

In [40]:
a(45)

55

## The Map Function

In [41]:
vrednosti = [1,2,3,4,5]

In [44]:

add10 = list(map(lambda x: x + 10, vrednosti))
add10

[11, 12, 13, 14, 15]

In [47]:
# NALOGA 1: pridobimo IPje v list

with open('./data/example_log.txt') as f:
    lines = f.readlines()
    ip_adderesses = list(map(lambda x: x.split()[0], lines))
    print(ip_adderesses[:10])

['200.155.108.44', '36.139.255.202', '50.112.115.219', '10.0.25.26', '233.154.7.24', '241.220.141.78', '191.198.138.97', '172.40.187.145', '225.119.46.80', '97.218.117.229']


In [48]:
# Uporaba list comprehension
with open('./data/example_log.txt') as f:
    lines = f.readlines()
    ip_adderesses = [line.split()[0] for line in lines]
    print(ip_adderesses[:10])

['200.155.108.44', '36.139.255.202', '50.112.115.219', '10.0.25.26', '233.154.7.24', '241.220.141.78', '191.198.138.97', '172.40.187.145', '225.119.46.80', '97.218.117.229']


### Filter Function

In [52]:
vrednosti = list(range(15))
lihe = list(filter(lambda x: x % 2 == 1, vrednosti))
lihe

[1, 3, 5, 7, 9, 11, 13]

In [54]:
lihe = [x for x in vrednosti if x % 2 == 1]
lihe

[1, 3, 5, 7, 9, 11, 13]

In [55]:
# NALOGA 2
with open('./data/example_log.txt') as f:
    lines = f.readlines()
    ip_adderesses = list(map(lambda x: x.split()[0], lines))
    filtered_ips = list(filter(lambda x: int(x.split('.')[0]) <= 20, ip_adderesses))
    print(filtered_ips[:10])

['10.0.25.26', '4.31.18.29', '10.3.25.58', '5.237.70.145', '4.186.143.85', '7.205.198.134', '2.98.108.99', '20.123.163.219', '17.192.186.123', '19.137.101.141']


In [56]:
with open('./data/example_log.txt') as f:
    lines = f.readlines()
    ip_adderesses = [line.split()[0] for line in lines]
    filtered_ips = [ip for ip in ip_adderesses if int(ip.split('.')[0]) <= 20]
    print(filtered_ips[:10])


['10.0.25.26', '4.31.18.29', '10.3.25.58', '5.237.70.145', '4.186.143.85', '7.205.198.134', '2.98.108.99', '20.123.163.219', '17.192.186.123', '19.137.101.141']


### Primer: Parsanje IPjev

In [57]:
with open('./data/example_log.txt') as f:
    lines = f.readlines()

In [58]:
import ipaddress

def is_ip_private(ip):
    ip_object = ipaddress.ip_address(ip)
    return ip_object.is_private and not ip_object.is_reserved and not ip_object.is_loopback

In [61]:
is_ip_private('212.251.66.5')

False

In [62]:
is_ip_private('10.0.14.4')

True

In [63]:
status_codes = []

for line in lines:
    ip = line.split()[0]
    if is_ip_private(ip):
        status_codes.append(int(line.split()[8]))


In [66]:
from collections import Counter

dict(Counter(status_codes))

{404: 15, 401: 19, 200: 28}

### Pandas alternativa

In [67]:
import ipaddress

def is_ip_private(ip):
    ip_object = ipaddress.ip_address(ip)
    return ip_object.is_private and not ip_object.is_reserved and not ip_object.is_loopback

In [68]:
import pandas as pd

In [84]:
df = pd.read_csv('data/example_log.txt', sep=' ', header=None)
df = df[[0,6]]
df.rename(columns={0:'ip', 6:'status_code'}, inplace=True)

df['is_private'] = df['ip'].apply(lambda ip: is_ip_private(ip))

private_ips_df = df[df['is_private'] == True].copy()
result = private_ips_df['status_code'].value_counts().to_dict()

private_ips_df.head()


Unnamed: 0,ip,status_code,is_private
3,10.0.25.26,404,True
13,172.16.45.5,401,True
19,10.3.25.58,404,True
631,10.182.249.118,404,True
731,10.56.168.95,200,True


In [85]:
result

{200: 28, 401: 19, 404: 15}