In [None]:
### DECORATORS

In [10]:
def apply(func, a, b):
    return func(a, b)

def add(a, b):
    return a + b

def sub(a, b):
    return a - b

apply(add, 1, 2), apply(sub, 1, 2)


(3, -1)

In [11]:
def power(n):
    def func(number):
        return number**n
    return func

pow2 = power(2)
pow3 = power(3)

pow2(3), pow3(3)


(9, 27)

In [12]:
import time
import random

def stopwatch(f):
    def func():
        tic = time.time()
        result = f()
        print(f"this function took: {time.time() - tic}")
        return result
    return func

def sleep_random():
    time.sleep(random.random())
    return "Done!"

timed_sleep = stopwatch(sleep_random)


In [13]:
sleep_random()
timed_sleep()


this function took: 0.939068078994751


'Done!'

In [14]:
import time
import random

def stopwatch(f):
    def func(*args, **kwargs):
        tic = time.time()
        result = f(*args, **kwargs)
        print(f"this function took: {time.time() - tic}")
        return result
    return func

def sleep_random(s):
    t = s + random.random()
    time.sleep(t)
    return "Done"

timed_sleep = stopwatch(sleep_random)


In [16]:
sleep_random(s=2)
timed_sleep(s=2)


this function took: 2.238582134246826


'Done'

In [17]:
import time
import random

def stopwatch(f):
    def func(*args, **kwargs):
        tic = time.time()
        result = f(*args, **kwargs)
        print(f"this function took: {time.time() - tic}")
        return result
    return func

@stopwatch
def sleep_random(s):
    t = s + random.random()
    time.sleep(t)
    return f"Done"


In [18]:
sleep_random(1)
sleep_random(2)
sleep_random(3)


this function took: 1.0470223426818848
this function took: 2.7133960723876953
this function took: 3.5265591144561768


'Done'

In [19]:
import time
import random

def stopwatch(f):
    def func(*args, **kwargs):
        tic = time.time()
        result = f(*args, **kwargs)
        print(f"this function took: {time.time() - tic}")
        return result
    return func

@stopwatch
def sleep_random(s):
    """This function sleeps at least for `s` seconds."""
    return time.sleep(s + random.random())

timed_sleep = stopwatch(sleep_random)


In [20]:
help(sleep_random)


Help on function func in module __main__:

func(*args, **kwargs)



In [21]:
import time
import random
from functools import wraps

def print_call1(f):
    @wraps(f)
    def func(*args, **kwargs):
        print(f"print-call 1 args: {args}")
        result = f(*args, **kwargs)
        return result
    return func


def print_call2(f):
    @wraps(f)
    def func(*args, **kwargs):
        print(f"print-call 2 args: {args}")
        result = f(*args, **kwargs)
        return result
    return func

@print_call2
@print_call1
@print_call2
@print_call1
def sleep_random(s):
    """This function sleeps at least for `s` seconds."""
    return time.sleep(s + random.random()/100)

sleep_random(1.5)


print-call 2 args: (1.5,)
print-call 1 args: (1.5,)
print-call 2 args: (1.5,)
print-call 1 args: (1.5,)


In [22]:
import time
import random
from functools import wraps

def loggg(show_name=True, show_time=True):
    def stopwatch(f):
        @wraps(f)
        def func(*args, **kwargs):
            tic = time.time()
            result = f(*args, **kwargs)
            log_text = "call"
            if show_name:
                log_text = f"{log_text} {f.__name__}"
            if show_time:
                log_text = f"{log_text} time:{time.time() - tic}"
            print(log_text)
            return result
        return func
    return stopwatch

@loggg(show_name=False, show_time=True)
def sleep_random(s):
    """This function sleeps at least for `s` seconds."""
    return time.sleep(s + random.random()/100)

sleep_random(1)


call time:1.0043551921844482


In [23]:
import time
import random
from functools import wraps

def loggg(func_in=None, *, show_name=True, show_time=True):
    def stopwatch(f):
        @wraps(f)
        def func(*args, **kwargs):
            tic = time.time()
            result = f(*args, **kwargs)
            result = "call"
            if show_name:
                result = f"{result} {f.__name__}"
            if show_time:
                result = f"{result} time:{time.time() - tic}"
            return result
        return func

    # This is where the "magic" happens.
    if func_in is None:
        return stopwatch
    else:
        return stopwatch(func_in)

@loggg
def sleep_random(s):
    """This function sleeps at least for `s` seconds."""
    return time.sleep(s + random.random()/100)

sleep_random(1)


'call sleep_random time:1.0069313049316406'

In [24]:
from functools import wraps
import datetime as dt

def log_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        print(f"just ran step {func.__name__} shape={result.shape} took {time_taken}s")
        return result
    return wrapper


In [30]:
import pandas as pd

df = pd.read_csv("bigmac.csv")

@log_step
def start_pipeline(dataf):
    return dataf.copy()

@log_step
def set_dtypes(dataf):
    return (dataf
            .assign(date=lambda d: pd.to_datetime(d['date']))
            .sort_values(['currency_code', 'date']))

@log_step
def remove_outliers(dataf, min_row_country=32):
    countries = (dataf
                .groupby('currency_code')
                .agg(n=('name', 'count'))
                .loc[lambda d: d['n'] >= min_row_country]
                .index)
    return (dataf
            .loc[lambda d: d['currency_code'].isin(countries)])

df_new = (df
  .pipe(start_pipeline)
  .pipe(set_dtypes)
  .pipe(remove_outliers, min_row_country=20))


just ran step start_pipeline shape=(1330, 6) took 0:00:00.000096s
just ran step set_dtypes shape=(1330, 6) took 0:00:00.027047s
just ran step remove_outliers shape=(1248, 6) took 0:00:00.005588s


In [34]:
!pip install retry
from retry import retry

import logging
logging.basicConfig()

@retry(ValueError, tries=5, delay=0.5)
def randomly_fails(p=0.5):
    if random.random() < p:
        raise ValueError("no bueno!")
    return "Done!"

randomly_fails()


Collecting retry
  Downloading retry-0.9.2-py2.py3-none-any.whl (8.0 kB)
Collecting py<2.0.0,>=1.4.26 (from retry)
  Downloading py-1.11.0-py2.py3-none-any.whl (98 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: py, retry
Successfully installed py-1.11.0 retry-0.9.2




'Done!'

In [None]:
### PARTIAL FIT

In [37]:
!pip install pandas scikit-learn altair




In [38]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.model_selection import train_test_split

# Prepare Data
X, y, w = make_regression(n_features=2, n_samples=4000,
                          random_state=42, coef=True, noise=1.0)
y = y + 1.5

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.5,
                                                    random_state=42)


In [39]:
# Run a Baseline Model
mod_lm = LinearRegression()
mod_lm.fit(X_train, y_train)

# Keep the MSE number around for safe-keeps.
normal_mse_test = np.mean((mod_lm.predict(X_test) - y_test)**2)


In [40]:
# Run for Stats
mod_pac = SGDRegressor()
data = []


for i, x in enumerate(X_train):
    # This is where we learn on a single datapoint
    mod_pac.partial_fit([x], [y_train[i]])

    # This is where we measure and save stats
    data.append({
        'c0': mod_pac.intercept_[0],
        'c1': mod_pac.coef_.flatten()[0],
        'c2': mod_pac.coef_.flatten()[1],
        'mse_test': np.mean((mod_pac.predict(X_test) - y_test)**2),
        'normal_mse_test': normal_mse_test,
        'i': i
    })

df_stats = pd.DataFrame(data)


In [41]:
import altair as alt

alt.data_transformers.disable_max_rows()

pltr1 = (pd.melt(df_stats[['i', 'c1', 'c2']], id_vars=["i"]))
pltr2 = (pd.melt(df_stats[['i', 'normal_mse_test', 'mse_test']], id_vars=["i"]))

p1 = (alt.Chart(pltr1, title='SGD evolution of weights')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=300, height=150)
        .interactive())

p2 = (alt.Chart(pltr2, title='SGD evolution of mse')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=350, height=150)
        .interactive())

p1 | p2


In [42]:
from sklearn.linear_model import PassiveAggressiveRegressor

# Set jump coefficients
c_cold, c_warm = 0.1, 0.01

# Run for Stats
mod_pac = PassiveAggressiveRegressor(C=c_cold)
data = []

for i, x in enumerate(X_train):
    mod_pac.partial_fit([x], [y_train[i]])
    data.append({
        'c0': mod_pac.intercept_[0],
        'c1': mod_pac.coef_.flatten()[0],
        'c2': mod_pac.coef_.flatten()[1],
        'mse_test': np.mean((mod_pac.predict(X_test) - y_test)**2),
        'normal_mse_test': normal_mse_test,
        'i': i
    })
    if i == 500:
        mod_pac.C = c_warm

df_stats = pd.DataFrame(data)


In [43]:
alt.data_transformers.disable_max_rows()

pltr1 = (pd.melt(df_stats[['i', 'c1', 'c2']], id_vars=["i"]))
pltr2 = (pd.melt(df_stats[['i', 'normal_mse_test', 'mse_test']], id_vars=["i"]))

q1 = (alt.Chart(pltr1, title='PA evolution of weights')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=300, height=150)
        .interactive())

q2 = (alt.Chart(pltr2, title='PA evolution of mse')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=350, height=150)
        .interactive())

(p1 | p2) & (q1 | q2)


In [44]:
import pandas as pd
from sklearn.datasets import make_regression

X, y, w = make_regression(n_features=2, n_samples=4000,
                    random_state=42, coef=True, noise=1.0)
y = y + 1.5

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.5,
                                                    random_state=42)

df_save = pd.DataFrame(X).assign(y=y)
df_save.columns = ["x1", "x2", "y"]
df_save.to_csv("batch_example.csv", index=False)


In [45]:
chunked = pd.read_csv("batch_example.csv", chunksize=1000)
for chunk in chunked:
    print(chunk)


           x1        x2           y
0    0.703440  2.154929  105.937806
1    0.233043 -0.718065  -33.129001
2    0.038003  0.120031    7.485904
3    0.138078 -1.886129  -85.280455
4    1.451144  0.959271   51.824080
..        ...       ...         ...
995  0.720997 -0.545186  -21.753520
996  0.520756 -0.327806  -12.341583
997 -0.079641  0.452372   21.827234
998 -0.781156 -0.259800  -13.139468
999 -0.006071  0.838491   41.078189

[1000 rows x 3 columns]
            x1        x2           y
1000  0.335058  0.316156   17.435744
1001 -1.196789  0.893698   39.177474
1002 -0.657035  0.994558   46.519187
1003 -1.222128  0.712998   29.077980
1004  1.515445  0.381734   23.242345
...        ...       ...         ...
1995 -0.089234 -0.037571   -0.030647
1996 -1.016683 -0.244080  -12.443826
1997 -1.111458  0.246505    9.386135
1998 -0.569833  0.329509   15.378565
1999 -1.448014 -2.198806 -107.512385

[1000 rows x 3 columns]
            x1        x2          y
2000  0.479003 -0.861310 -37.474159
20

In [46]:
mod = SGDRegressor()
chunked = pd.read_csv("batch_example.csv", chunksize=1000)

for chunk in chunked:
    x_to_train = chunk[['x1', 'x2']].values
    y_to_train = chunk['y'].values
    mod.partial_fit(x_to_train, y_to_train)


In [47]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier, LogisticRegression
from sklearn.model_selection import train_test_split

# First generate data
X, y = make_classification(n_samples=20000, n_features=2, n_redundant=0,
                     random_state=42, n_clusters_per_class=1)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.5,
                                                    random_state=42)

# Next train a baseline model.
mod_lmc = LogisticRegression()
mod_lmc.fit(X_train, y_train)

normal_acc_train = np.mean(mod_lmc.predict(X_train) == y_train)
normal_acc_test = np.mean(mod_lmc.predict(X_test) == y_test)


In [48]:
mod_sgd = SGDClassifier()
data = []

for i, x in enumerate(X_train):
    # Pay attention to `classes` here, we need it!
    mod_sgd.partial_fit([x], [y_train[i]], classes=[0, 1])
    data.append({
        'c1': mod_sgd.coef_.flatten()[0],
        'c2': mod_sgd.coef_.flatten()[1],
        'mod_sgd': np.mean(mod_sgd.predict(X_test) == y_test),
        'normal_acc_test': normal_acc_test,
        'i': i
    })

df_stats = pd.DataFrame(data)


In [49]:
pltr1 = (pd.melt(df_stats[['i', 'c1', 'c2']], id_vars=["i"]))
pltr2 = (pd.melt(df_stats[['i', 'normal_acc_test', 'mod_sgd']], id_vars=["i"]))

q1 = (alt.Chart(pltr1, title='SGD evolution of weights')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=300, height=150)
        .interactive())

q2 = (alt.Chart(pltr2, title='PA evolution of accuracy')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=350, height=150)
        .interactive())

(q1 | q2)


In [50]:
mod_sgd = SGDClassifier()
data = []

# We've added an extra loop here
for i, x in enumerate(X_train):
    # Pay attention to `classes` here, we need it!
    mod_sgd.partial_fit([x], [y_train[i]], classes=[0, 1])
    data.append({
        'c1': mod_sgd.coef_.flatten()[0],
        'c2': mod_sgd.coef_.flatten()[1],
        'mod_sgd': np.mean(mod_sgd.predict(X_test) == y_test),
        'normal_acc_test': normal_acc_test,
        'i': i
    })

df_stats = pd.DataFrame(data)


In [51]:
mod_sgd = SGDClassifier()
data = []

for j in range(3):
    for i, x in enumerate(X_train):
        # Pay attention to `classes` here, we need it!
        mod_sgd.partial_fit([x], [y_train[i]], classes=[0, 1])
        data.append({
            'c1': mod_sgd.coef_.flatten()[0],
            'c2': mod_sgd.coef_.flatten()[1],
            'mod_sgd': np.mean(mod_sgd.predict(X_test) == y_test),
            'normal_acc_test': normal_acc_test,
            'i': i + X_train.shape[0] * j
        })

df_stats = pd.DataFrame(data)
