In [10]:
import numpy as np 
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.model_selection import train_test_split

In [11]:
# Prepare Data
X, y, w = make_regression(n_features=2, n_samples=4000, 
                          random_state=42, coef=True, noise=1.0)
y = y + 1.5

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.5,
                                                    random_state=42)

# Run a Baseline Model
mod_lm = LinearRegression()
mod_lm.fit(X_train, y_train)

# Keep the MSE number around for safe-keeps.
normal_mse_test = np.mean((mod_lm.predict(X_test) - y_test)**2)

In [12]:
# Run for Stats
mod_pac = SGDRegressor()
data = []


for i, x in enumerate(X_train):
    mod_pac.partial_fit([x], [y_train[i]])
    data.append({
        'c0': mod_pac.intercept_[0],
        'c1': mod_pac.coef_.flatten()[0],
        'c2': mod_pac.coef_.flatten()[1],
        'mse_test': np.mean((mod_pac.predict(X_test) - y_test)**2),
        'normal_mse_test': normal_mse_test,
        'i': i
    })

df_stats = pd.DataFrame(data)

In [13]:
import altair as alt

alt.data_transformers.disable_max_rows()

pltr1 = (pd.melt(df_stats[['i', 'c1', 'c2']], id_vars=["i"]))
pltr2 = (pd.melt(df_stats[['i', 'normal_mse_test', 'mse_test']], id_vars=["i"]))

p1 = (alt.Chart(pltr1, title='SGD evolution of weights')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=300, height=150)
        .interactive())

p2 = (alt.Chart(pltr2, title='SGD evolution of mse')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=350, height=150)
        .interactive())

p1 | p2

In [44]:
from sklearn.linear_model import PassiveAggressiveRegressor

# Set jump coefficients
c_cold, c_warm = 0.1, 0.01

# Run for Stats
mod_pac = PassiveAggressiveRegressor(C=c_cold)
data = []

for i, x in enumerate(X_train):
    mod_pac.partial_fit([x], [y_train[i]])
    data.append({
        'c0': mod_pac.intercept_[0],
        'c1': mod_pac.coef_.flatten()[0],
        'c2': mod_pac.coef_.flatten()[1],
        'mse_test': np.mean((mod_pac.predict(X_test) - y_test)**2),
        'normal_mse_test': normal_mse_test,
        'i': i
    })
    if i == 500:
        mod_pac.C = c_warm

df_stats = pd.DataFrame(data)

In [45]:
alt.data_transformers.disable_max_rows()

pltr1 = (pd.melt(df_stats[['i', 'c1', 'c2']], id_vars=["i"]))
pltr2 = (pd.melt(df_stats[['i', 'normal_mse_test', 'mse_test']], id_vars=["i"]))

q1 = (alt.Chart(pltr1, title='PA evolution of weights')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=300, height=150)
        .interactive())

q2 = (alt.Chart(pltr2, title='PA evolution of mse')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=350, height=150)
        .interactive())

(p1 | p2) & (q1 | q2)

In [79]:
X, y, w = make_regression(n_features=2, n_samples=4000, 
                          random_state=42, coef=True, noise=1.0)
y = y + 1.5

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.5,
                                                    random_state=42)

df_save = pd.DataFrame(X).assign(y=y)
df_save.columns = ["x1", "x2", "y"]
df_save.to_csv("batch_example.csv", index=False)

In [103]:
chunked = pd.read_csv("batch_example.csv", chunksize=1000)
for chunk in chunked:
    print(chunk)

           x1        x2           y
0    0.703440  2.154929  105.937806
1    0.233043 -0.718065  -33.129001
2    0.038003  0.120031    7.485904
3    0.138078 -1.886129  -85.280455
4    1.451144  0.959271   51.824080
..        ...       ...         ...
995  0.720997 -0.545186  -21.753520
996  0.520756 -0.327806  -12.341583
997 -0.079641  0.452372   21.827234
998 -0.781156 -0.259800  -13.139468
999 -0.006071  0.838491   41.078189

[1000 rows x 3 columns]
            x1        x2           y
1000  0.335058  0.316156   17.435744
1001 -1.196789  0.893698   39.177474
1002 -0.657035  0.994558   46.519187
1003 -1.222128  0.712998   29.077980
1004  1.515445  0.381734   23.242345
...        ...       ...         ...
1995 -0.089234 -0.037571   -0.030647
1996 -1.016683 -0.244080  -12.443826
1997 -1.111458  0.246505    9.386135
1998 -0.569833  0.329509   15.378565
1999 -1.448014 -2.198806 -107.512385

[1000 rows x 3 columns]
            x1        x2          y
2000  0.479003 -0.861310 -37.474159
20

In [102]:
mod = SGDRegressor()
chunked = pd.read_csv("batch_example.csv", chunksize=1000)

for chunk in chunked:
    x_to_train = chunk[['x1', 'x2']].values
    y_to_train = chunk['y'].values
    mod.partial_fit(x_to_train, y_to_train)

In [104]:
np.mean((mod.predict(X_test) - y_test)**2)

1.0195954388448722

In [110]:
import numpy as np 
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier, LogisticRegression
from sklearn.model_selection import train_test_split

In [130]:
X, y = make_classification(n_samples=20000, n_features=2, n_redundant=0, 
                           random_state=42, n_clusters_per_class=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.5,
                                                    random_state=42)

In [139]:
mod_lmc = LogisticRegression()
mod_lmc.fit(X_train, y_train)

normal_acc_train = np.mean(mod_lmc.predict(X_train) == y_train)
normal_acc_test = np.mean(mod_lmc.predict(X_test) == y_test)

mod_sgd = SGDClassifier()
data = []

for j in range(3):
    for i, x in enumerate(X_train):
        mod_sgd.partial_fit([x], [y_train[i]], classes=[0, 1])
        data.append({
            'c1': mod_sgd.coef_.flatten()[0],
            'c2': mod_sgd.coef_.flatten()[1],
            'mod_sgd': np.mean(mod_sgd.predict(X_test) == y_test),
            'normal_acc_test': normal_acc_test,
            'i': i + X_train.shape[0] * j
        })

df_stats = pd.DataFrame(data)

In [140]:
pltr1 = (pd.melt(df_stats[['i', 'c1', 'c2']], id_vars=["i"]))
pltr2 = (pd.melt(df_stats[['i', 'normal_acc_test', 'mod_sgd']], id_vars=["i"]))

q1 = (alt.Chart(pltr1, title='SGD evolution of weights')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=300, height=150)
        .interactive())

q2 = (alt.Chart(pltr2, title='PA evolution of accuracy')
        .mark_line()
        .encode(x='i', y='value', color='variable', tooltip=['i', 'value', 'variable'])
        .properties(width=350, height=150)
        .interactive())

(q1 | q2)