In [32]:
import pandas as pd 
from joblib import dump
from rich.console import Console

from sklearn.pipeline import make_pipeline, make_union
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Shared rich Console instance; later cells call `console.log` on it to report progress.
console = Console()

In [33]:
# Read the dataset, then keep only the rows whose 'split' column is 'train'
df = pd.read_csv("clinc_oos-plus.csv")
df = df[df['split'] == 'train']
console.log("Training data loaded.")

# Inputs come from the 'text' column (as a plain list), targets from 'label'
X = df['text'].tolist()
y = df['label']

# Token counts feeding a logistic regression, built and fitted in one expression
pipe = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y)
console.log("ML Pipeline fitted.")

# Persist the fitted pipeline to disk with joblib
dump(pipe, 'pipe.joblib')
console.log("Joblib pickle saved.")

In [50]:
from joblib import load

# NOTE: a joblib file is a pickle, and unpickling can execute arbitrary code.
# Only load artifacts that come from a source you trust.
trained = load("pipe.joblib")

In [51]:
# Display the reloaded object to confirm which steps it contains.
trained

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression', LogisticRegression())])

In [52]:
# Sanity check: the reloaded pipeline can predict on a single input string.
trained.predict(["hello"])

array([82])

In [53]:
class EvilThing:
    """Stand-in "model" whose only capability is a ``predict`` method."""

    def predict(self, X):
        """Print a taunt to stdout, then return the label 1 once per item in ``X``."""
        print("fooled you!")
        labels = []
        for _ in X:
            labels.append(1)
        return labels

In [54]:
# Instance of the impostor class; it is written to disk in a later cell.
evil_pipe = EvilThing()

In [55]:
from joblib import dump, load

# Serialise the impostor with the same joblib `dump` call used for the real pipeline.
dump(evil_pipe, "pipe-evil.joblib")

['pipe-evil.joblib']

In [57]:
from joblib import dump, load

# The impostor loads like any other joblib file and answers `predict`,
# so nothing at load time distinguishes it from a genuine pipeline.
pipe_loaded = load("pipe-evil.joblib")
pipe_loaded.predict(["hello"])

fooled you!


[1]

In [26]:
import hashlib 

def calc_checksum(path, algorithm="md5", chunk_size=1 << 20):
    """Print the hex digest of the file at ``path``.

    Parameters
    ----------
    path : str or path-like
        File to hash; it is opened in binary mode.
    algorithm : str, default "md5"
        Name of a fixed-length algorithm accepted by ``hashlib.new``. The
        default reproduces the original MD5 output. MD5 is not
        collision-resistant, so pass e.g. ``"sha256"`` when the digest is
        meant to detect deliberate tampering.
    chunk_size : int, default 1 MiB
        Number of bytes requested from the file per read.

    Returns
    -------
    None
        The digest is written to stdout, followed by a newline.
    """
    hasher = hashlib.new(algorithm)

    with open(path, "rb") as f:
        # Stream the file so a large artifact is never held in memory whole;
        # `read` returns b"" at end of file, which stops the iteration.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)

    print(hasher.hexdigest())

# Hash the genuine artifact and the impostor; the two printed digests differ,
# so comparing against a digest recorded at save time would expose the swap.
calc_checksum("pipe.joblib")
calc_checksum("pipe-evil.joblib")

04a415025a812c2a69cb3552d83ee275
0b119f868ac251eee25af5c4b0c2064d


In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_wine

# Switch to the wine dataset; this rebinds the X and y used for the text pipeline above.
X, y = load_wine(return_X_y=True)

# max_iter is raised to 10_000 — presumably so the solver converges on this data; TODO confirm.
clf = LogisticRegression(max_iter=10_000)

In [80]:
# Snapshot the attribute names before and after fitting so the two lists can be compared.
prop_before = dir(clf)
clf.fit(X, y)
prop_after = dir(clf)

In [81]:
# Attribute names that are present only after `fit` has run.
[e for e in prop_after if e not in prop_before]

['classes_', 'coef_', 'intercept_', 'n_features_in_', 'n_iter_']

In [84]:
# Inspect one of the attributes that fitting added.
clf.intercept_

array([-16.93248808,  22.88678838,  -5.9543003 ])

In [72]:
import h5py

def save_coefficients(classifier, filename):
    """Write ``coef_``, ``intercept_`` and ``classes_`` of a linear model to a .h5 file.

    The file at ``filename`` is opened in ``'w'`` mode and receives three
    datasets named "coef", "intercept" and "classes", created in that order.
    """
    dataset_to_attr = (
        ("coef", "coef_"),
        ("intercept", "intercept_"),
        ("classes", "classes_"),
    )
    with h5py.File(filename, 'w') as store:
        for dataset_name, attr_name in dataset_to_attr:
            store.create_dataset(dataset_name, data=getattr(classifier, attr_name))

def load_coefficients(classifier, filename):
    """Copy the "coef", "intercept" and "classes" datasets of a .h5 file onto a linear model.

    All three datasets are read while the file is open; only afterwards are
    they assigned to ``coef_``, ``intercept_`` and ``classes_``. Nothing is
    returned: ``classifier`` is modified in place.
    """
    with h5py.File(filename, 'r') as store:
        loaded = {name: store[name][:] for name in ("coef", "intercept", "classes")}
    # Dataset names map to estimator attributes by appending an underscore.
    for name, values in loaded.items():
        setattr(classifier, name + "_", values)

In [85]:
# Store the fitted wine classifier's parameters as plain arrays instead of pickling the object.
save_coefficients(clf, "clf.h5")

In [86]:
# A fresh estimator that is never fitted: it receives the stored arrays instead.
lr = LogisticRegression()
load_coefficients(lr, "clf.h5")

In [87]:
# The restored estimator can predict even though `fit` was never called on it.
lr.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [88]:
# Display the text pipeline that was built and fitted in the training cell.
pipe

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression', LogisticRegression())])