In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from urllib.parse import urlparse
from nltk.corpus import stopwords
from nltk import tokenize
from sklearn.model_selection import train_test_split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import joblib
import os
import re
import string
import html

from pprint import pprint
from time import time
from datetime import datetime

In [None]:
import nltk
# Download the NLTK data used further down: the 'stopwords' corpus feeds
# stopwords.words('english'); 'punkt_tab' is presumably the tokenizer data
# required by tokenize.word_tokenize -- TODO confirm for your NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
import os
from dotenv import load_dotenv, dotenv_values 
# Load settings from a local .env file (if any) so that os.getenv('LOCAL_DB_CONNSTR')
# below can find the database connection string.
load_dotenv() 

## Priveedly: Training a Simple Content Recommender (Classifier) for Personal Use

This notebook is originally for use with [Priveedly](https://blog.kjamistan.com/priveedly-your-private-and-personal-content-reader-and-recommender.html), a personal use content aggregator system available on [GitHub](https://github.com/kjam/priveedly).

- There is a YouTube video to walk you through the notebook at a high level, in case it is helpful! 
- There are some links below to learn more about how to use scikit-learn.
- I welcome feedback and contributions via GitHub!
- Most importantly: HAVE FUN playing with ML concepts!


# Getting text from Postgres into Pandas

In [None]:
# Reminder only: if a cached CSV from an earlier run exists, the database
# cells below can be skipped in favour of the "load cleaned df" cell.
cleaned_csv_present = os.path.isfile('data/cleaned.csv')
if cleaned_csv_present:
    print("SKIP TO LOADING CLEANED DF!!!")

In [None]:
# The connection string is read from the LOCAL_DB_CONNSTR environment variable.
db_connection_string = os.getenv('LOCAL_DB_CONNSTR')
engine = create_engine(db_connection_string)

In [None]:
# Site posts published on or after 2023-01-01, including the `interesting` label.
sites_query = "select title, url, description, site_name, interesting from sites_sitepost WHERE published::date >= '2023-01-01'"
sites_df = pd.read_sql(sql=sites_query, con=engine)

In [None]:
# Feed entries published on or after 2023-01-01; the feed's title is exposed
# as `site_name` so the columns line up with the other sources.
feeds_query = "select feeds_feedentry.title as title, feeds_feedentry.url as url, feeds_feedentry.description as description, feeds_feed.title as site_name, interesting from feeds_feedentry JOIN feeds_feed ON feeds_feed.id = feed_id WHERE published::date >= '2023-01-01'"
feeds_df = pd.read_sql(sql=feeds_query, con=engine)

In [None]:
# Reddit posts published on or after 2023-01-01; the subreddit name is exposed
# as `site_name` so the columns line up with the other sources.
# NOTE(review): the JOIN matches sites_redditpost.id against sites_subreddit.id.
# If posts reference their subreddit through a separate foreign-key column, this
# pairs posts with unrelated subreddits (and drops posts whose id has no
# subreddit with the same id) -- confirm against the database schema.
reddit_query = "select sites_redditpost.title as title, sites_redditpost.url as url, sites_redditpost.description as description, sites_subreddit.name as site_name, interesting from sites_redditpost JOIN sites_subreddit ON sites_redditpost.id = sites_subreddit.id WHERE published::date >= '2023-01-01'"
reddit_df = pd.read_sql(sql=reddit_query, con=engine)

In [None]:
# Stack the three sources into one frame. ignore_index=True gives every row a
# unique label; without it each source keeps its own 0..n index and the
# combined frame has duplicate row labels.
content_df = pd.concat([reddit_df, sites_df, feeds_df], ignore_index=True)

# Evaluating target

In [None]:
# Store the target as integers (0/1) rather than booleans.
content_df['interesting'] = content_df['interesting'].astype(int)

In [None]:
# How many rows fall into each class?
content_df['interesting'].value_counts()

In [None]:
# Share of rows in the first class listed by value_counts() (the most frequent
# one with pandas' default ordering).
first_class_count = content_df['interesting'].value_counts().iloc[0]
first_class_count / len(content_df)

In [None]:
# Bar chart of the class counts.
content_df['interesting'].value_counts().plot(kind='bar')

# Preparing the text data

You'll need to take this code and put it into the priveedly rate_all.py script (see management_commands/rate_all.py) once you are running your pipeline in production. 

If you are using non-English languages, you will probably want to play around and adjust this preparation to fit what works for you. I would love it if you contributed any interesting additional notebooks to the repo! :)

In [None]:
def tokenize_url(url_str):
    """Split a URL into a host and space-separated path/query words.

    Returns a 3-tuple ``(netloc, path_words, query_words)``:
    - ``netloc`` is the host part as parsed by ``urlparse``;
    - ``path_words`` is the path with ``/`` and ``-`` turned into spaces;
    - ``query_words`` is the query string with ``?`` and ``=`` turned into spaces.
    """
    parts = urlparse(url_str)
    path_words = parts.path.replace('/', ' ').replace('-', ' ')
    query_words = parts.query.replace('?', ' ').replace('=', ' ')
    return parts.netloc, path_words, query_words

def prepare_content(pandas_row):
    """Build the raw text for one row from its title, description and site name.

    Parameters
    ----------
    pandas_row : object with ``title``, ``description`` and ``site_name``
        attributes (for example a row passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    str
        The available text fields joined by single spaces. Fields that are not
        strings (database NULLs arrive as ``None``/``NaN``) are skipped instead
        of raising ``TypeError`` inside ``str.join``.

    The URL is not part of the returned text; the previous version tokenized it
    but never used the result, so that dead call has been removed.
    """
    fields = (pandas_row.title, pandas_row.description, pandas_row.site_name)
    return ' '.join(field for field in fields if isinstance(field, str))

# Runs of digits, commas, periods, '$' and '%'. Note that this also matches
# ordinary sentence commas and periods, not only the ones inside numbers.
CLEAN_NUMBERS = re.compile('[0-9,\\.$\\%]+')
# A letter directly followed by digits, or digits directly followed by a letter.
# Only referenced from a commented-out line in remove_tags_and_lowercase.
CLEAN_NUMBERS_AND_ONE_LETTER = re.compile('([a-z]\\d+)|(\\d+[a-z])|(\\d+[a-z]\\d+)')
# Runs of punctuation characters (ASCII ranges plus curly quotes and an en dash).
# NOTE(review): the exact set depends on how the '-' ranges inside the class are
# parsed -- verify against sample text before editing this pattern.
CLEAN_REPEATED_PUNCTUATION = re.compile('[!\\-\\/:-@-`’–{-~"“”\\[\\]]+')

def remove_tags_and_lowercase(text):
    """Strip HTML markup, numbers and punctuation from ``text`` and lowercase it.

    Steps, in order:
    1. If the text contains at least one HTML tag, replace it with the tag-free
       text extracted by BeautifulSoup.
    2. Unescape HTML entities (``&amp;`` -> ``&``).
    3. Remove everything matched by CLEAN_NUMBERS, then everything matched by
       CLEAN_REPEATED_PUNCTUATION.
    4. Lowercase the result.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # some parts from https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    # Parse once, always with the built-in parser: parsing a second time without
    # naming a parser makes bs4 warn on every call and pick whichever parser is
    # installed, so the extracted text could differ between machines.
    soup = BeautifulSoup(text, "html.parser")
    if soup.find():
        text = soup.get_text()
    # NOTE(review): encoding to and decoding from 'unicode_escape' looks like a
    # round trip that leaves the text unchanged -- confirm whether it is needed.
    cleantext = html.unescape(text).encode('unicode_escape').decode('unicode_escape')
    # you can try this line or other similar things if you want to be more deliberate about cleaning!
    #cleantext = re.sub(CLEAN_NUMBERS_AND_ONE_LETTER, '', cleantext)
    cleantext = re.sub(CLEAN_NUMBERS, '', cleantext)
    cleantext = re.sub(CLEAN_REPEATED_PUNCTUATION, '', cleantext)
    return cleantext.lower()

# Tokens to discard: English stop words plus the single ASCII punctuation characters.
removal = set(stopwords.words('english')) | set(string.punctuation)

def tokenize_content(text):
    """Clean ``text`` and return its word tokens, minus the ``removal`` tokens."""
    kept_tokens = []
    for token in tokenize.word_tokenize(remove_tags_and_lowercase(text)):
        if token.lower() in removal:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [None]:
# One raw text string per row, built by prepare_content.
content_df['full_text'] = content_df.apply(prepare_content, axis='columns')

In [None]:
# Tokenize every raw text and store the kept tokens as one space-joined string.
content_df['cleaned_text'] = [
    ' '.join(tokenize_content(raw_text)) for raw_text in content_df['full_text']
]

In [None]:
# 20 random rows for a visual check of the cleaning.
sample = content_df.sample(n=20)

In [None]:
# Compare the raw text with its cleaned version side by side.
sample[["full_text", "cleaned_text"]]

In [None]:
# Cache the prepared frame. Create the folder first so a fresh checkout does
# not fail, and leave the row index out so reloading the file does not gain a
# stray "Unnamed: 0" column.
os.makedirs("data", exist_ok=True)
content_df.to_csv("data/cleaned.csv", index=False)

### Now you can always load this way

In [None]:
# Reload the cached frame. read_csv turns empty fields into NaN, so a row whose
# cleaned text was empty would reach the vectorizer as a float; restore those
# to empty strings.
content_df = pd.read_csv("data/cleaned.csv").fillna({"cleaned_text": ""})

### Dealing with class imbalance

My classes are really lopsided. Yours might be different! If you notice that yours are more even, you can use the orig_X_train as the X_train (and so forth!).

To help with my lopsided classes, I will use [Imbalanced Learn](https://imbalanced-learn.org/).

In [None]:
# sampling_strategy=0.15: requested minority/majority ratio after resampling.
# random_state is fixed so re-running the notebook duplicates the same rows.
oversampler = RandomOverSampler(sampling_strategy=0.15, random_state=0)

In [None]:
# 70/30 split of the original (not oversampled) data, stratified on the target
# so both parts keep the same class ratio. random_state is fixed so the split
# is the same on every run.
orig_X_train, orig_X_test, orig_y_train, orig_y_test = train_test_split(
    content_df.cleaned_text,
    content_df.interesting,
    test_size=0.3,
    stratify=content_df.interesting,
    random_state=0,
)

In [None]:
# Class counts in the original train and test splits.
Counter(orig_y_train), Counter(orig_y_test)

In [None]:
# Oversample the whole dataset. The sampler is given a 2-D array, so the text
# column is reshaped to (n_rows, 1).
text_column = content_df["cleaned_text"].to_numpy().reshape(-1, 1)
X_res, y_res = oversampler.fit_resample(text_column, content_df["interesting"].to_numpy())

In [None]:
# Class counts after oversampling the whole dataset.
Counter(y_res)

In [None]:
# Oversample ONLY the training split. Splitting a dataset that has already been
# oversampled puts copies of the same minority rows in both train and test, so
# the test score would be measured on rows the model has effectively seen.
# (X_res / y_res, resampled from the whole dataset, are therefore not used here.)
X_train_res, y_train = oversampler.fit_resample(
    orig_X_train.to_numpy().reshape(-1, 1), orig_y_train.to_numpy()
)
X_train = X_train_res.flatten()
# The test set is the untouched hold-out split with its natural class balance.
X_test, y_test = orig_X_test.to_numpy(), orig_y_test.to_numpy()

In [None]:
# Class counts in the sets that are used for training and evaluation below.
Counter(y_train), Counter(y_test)

### Let's build some NLP pipelines with Scikit-learn!

Scikit-learn is a great library for building machine learning models, especially with smaller personalized datasets, like this one! It has everything you need to get started and a great learning community and documentation.

Want to learn more about scikit-learn and different machine learning models? Check out:

- [Scikit-learn crash course](https://www.youtube.com/watch?v=0B5eIE_1vpU)
- [Scikit-learn online learning course](https://inria.github.io/scikit-learn-mooc/)
- [Calmcode](https://calmcode.io)
- [probabl's YouTube Channel (some advanced topics)](https://www.youtube.com/@probabl_ai)

Hat tip to [Vincent](https://github.com/koaning) for helping me assemble these resources!

In [None]:
# TF-IDF features followed by a support vector classifier.
svc_pipeline = Pipeline(steps=[
    ("vect", TfidfVectorizer()),
    ("clf", SVC()),  # more complex, but maybe not worth it
])

In [None]:
# TF-IDF features followed by Complement Naive Bayes.
bayes_pipeline = Pipeline(steps=[
    ("vect", TfidfVectorizer()),
    ("clf", ComplementNB()), # better at imbalance
])

In [None]:
# TF-IDF features followed by logistic regression.
logreg_pipeline = Pipeline(steps=[
    ("vect", TfidfVectorizer()),
    ("clf", LogisticRegression()),  # simple, but maybe good enough
])

For looking up parameters to test, take a look at the following:

- [TF-IDF Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
- [SVC Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)
- [Complement Naive Bayes Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html)
- [LogisticRegression Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
# Vectorizer settings shared by every model ("vect__" addresses the TF-IDF step).
base_parameter_grid = dict(
    vect__max_df=(0.8, 0.9),
    vect__min_df=(0.01, 0.03),
    vect__ngram_range=((1, 1), (1, 2)),  # unigrams or bigrams
    # vect__norm=("l1", "l2"),
)

In [None]:
# Classifier settings to try for the SVC pipeline.
svc_parameter_grid = dict(
    clf__C=(1, 10), # inverse of regularization strength (smaller = more regularization)
    clf__kernel=('rbf', 'sigmoid', 'poly'),
)


In [None]:
# Classifier settings to try for the Complement Naive Bayes pipeline:
# 13 log-spaced values from 1e-6 to 1e6.
cnb_parameter_grid = dict(
    clf__alpha=np.logspace(start=-6, stop=6, num=13), # Additive (Laplace/Lidstone) smoothing parameter
)

In [None]:
# Classifier settings to try for the logistic regression pipeline.
logreg_parameter_grid = dict(
    clf__C=(1, 10), # inverse of regularization strength (smaller = more regularization)
    clf__solver=("lbfgs", "liblinear", "newton-cholesky"),
)

### Start by testing each model separately

You can eventually productionize this with Weights and Biases, or just find the type of model that works best for your data and stick with that, updating only the training dataset over time. 

After you get your first model or two working, you will likely also decide: oh, I really only want to test SVC, or I like having a fast LR model. Or even: I want to compare these simple models with a deep learning model or a local LLM.

To test each one, change the lines below to reflect your changes:

- use the parameter grid you set up above
- change the model_name to something you will remember
- change the estimator to the pipeline that you are evaluating

In [None]:
# Shared vectorizer settings plus the settings of the model under test.
# CHANGE HERE: logreg_parameter_grid, cnb_parameter_grid, svc_parameter_grid
parameter_grid = {**base_parameter_grid, **logreg_parameter_grid}
model_name = "LR" # CHANGE HERE suggestion: LR, CNB, SVC

# Try 20 randomly chosen settings from the grid (seeded, 4 parallel jobs).
random_search = RandomizedSearchCV(
    logreg_pipeline, # CHANGE HERE: logreg_pipeline, bayes_pipeline, svc_pipeline
    parameter_grid,
    n_iter=20,
    n_jobs=4,
    random_state=0,
    verbose=1,
)

print("Performing grid search...", "Hyperparameters to be evaluated:", sep="\n")
pprint(parameter_grid)

In [None]:
# Run the search on the training data and report the wall-clock time.
search_start = time()
random_search.fit(X_train, y_train)
print(f"Done in {time() - search_start:.3f}s")

In [None]:
# Show the winning value for every parameter that was part of the search.
print("Best parameters combination found:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameter_grid):
    print("{}: {}".format(param_name, best_parameters[param_name]))

In [None]:
# Compare the cross-validated score of the best setting with the test-set score.
test_accuracy = random_search.score(X_test, y_test)
cv_accuracy = random_search.best_score_
print(f"Accuracy of the best parameters using CV random search: {cv_accuracy:.3f}")
print(f"Accuracy on test set: {test_accuracy:.3f}")

In [None]:
# Predictions of the fitted search object on the test set (used for the confusion matrix).
y_pred = random_search.predict(X_test)

In [None]:
# Readable names for the two target values (0 and 1).
human_labels = dict(enumerate(('not interesting', 'interesting')))

In [None]:
# Confusion matrix of test-set predictions, labelled with the readable class names.
test_confusion = confusion_matrix(y_test, y_pred)
class_labels = [human_labels[c] for c in random_search.classes_]
disp = ConfusionMatrixDisplay(confusion_matrix=test_confusion, display_labels=class_labels)
disp.plot()

In [None]:
# Write a small text record of this experiment: the best value of every searched
# parameter and both accuracy figures, one entry per line.
experiment_time = datetime.now().strftime("%Y%m%d_%H_%M")
os.makedirs("experiments", exist_ok=True)  # a fresh checkout may not have the folder yet
with open("experiments/{}_{}.txt".format(experiment_time, model_name), 'w') as documentation_file:
    for param_name in sorted(parameter_grid.keys()):
        # write() adds no line break by itself; without "\n" all entries run together
        documentation_file.write(f"{param_name}: {best_parameters[param_name]}\n")
    documentation_file.write(f"Accuracy on the random search: {random_search.best_score_:.3f}\n")
    documentation_file.write(f"Accuracy on test set: {test_accuracy:.3f}\n")

In [None]:
# Copy the configuration of the search's best estimator onto the pipeline.
# NOTE(review): best_parameters comes from best_estimator_.get_params(), which
# presumably also contains the step objects themselves; that is likely what makes
# this pipeline usable for prediction below without an explicit fit -- confirm.
logreg_pipeline.set_params(**best_parameters) # CHANGE THIS: logreg_pipeline, bayes_pipeline, svc_pipeline

In [None]:
# Save the pipeline next to the experiment notes; create the folder first so a
# fresh checkout does not fail with FileNotFoundError.
os.makedirs("experiments/models", exist_ok=True)
joblib.dump(logreg_pipeline, "experiments/models/{}_{}_pipeline.pkl".format(experiment_time, model_name)) # CHANGE THIS: logreg_pipeline, bayes_pipeline, svc_pipeline

In [None]:
# `pipeline` is the name the investigation cells below work with.
pipeline = logreg_pipeline # CHANGE THIS: logreg_pipeline, bayes_pipeline, svc_pipeline

If you ever want to load again, you can just:


In [None]:
# CHANGE THIS to the saved experiment you want to reload. The file is only
# loaded when it exists, so "Run All" on another machine keeps working instead
# of stopping here with FileNotFoundError.
saved_pipeline_path = 'experiments/models/20250121_19_46_SVC_pipeline.pkl'
if os.path.isfile(saved_pipeline_path):
    # joblib files are pickles: only load files you created yourself.
    pipeline = joblib.load(saved_pipeline_path)
else:
    print(f"{saved_pipeline_path} not found; `pipeline` left unchanged")

### Investigating / interpreting your model

So now you have an idea of the accuracy, but will it work for what you want to use it for? 

Let's say that it's really good at recognizing exactly your interests based on some silly keywords that you don't think will hold in practice. Or let's say you're also just curious about what keywords might be most interesting to you and want to have a look at the inner workings of your system. Either way, it's a good idea to investigate the model in order to qualitatively compare the models you've trained and determine which model you want to use.

The following parts of the notebook can help you investigate and figure out how you think about the model decisions.

#### Note: LIME Text Explainer doesn't appear to work for my data with SVC, but that might be different for you! Let me know if it does!

In [None]:
from lime.lime_text import LimeTextExplainer

# Give LIME the readable class names, in the order of the pipeline's classes_.
lime_class_names = [human_labels[c] for c in pipeline.classes_]
explainer = LimeTextExplainer(class_names=lime_class_names)

In [None]:
# 20 random rows for each value of `interesting`, used as examples to explain.
sample_df = content_df.groupby("interesting").sample(n=20)

In [None]:
# Show the pipeline's steps by name ('vect' and 'clf' are used below).
pipeline.named_steps

In [None]:
# Pull the two pipeline steps out so they can be inspected separately.
vectorizer, estimator = pipeline.named_steps['vect'], pipeline.named_steps['clf']

In [None]:
# this is a fix for the SVC problem in LIME (see https://github.com/marcotcr/lime/issues/465)
def classifier_fn(X):
    """Score raw texts with the classifier's decision function, for LIME.

    ``X`` is passed to the vectorizer's ``transform``; the returned array holds
    the ``decision_function`` values reshaped to one column (shape ``(-1, 1)``).
    """
    features = vectorizer.transform(X)
    scores = estimator.decision_function(features)
    return np.array(scores).reshape(-1, 1)

In [None]:
# Decide once how LIME should query the model; the choice is the same for
# every example, so it does not belong inside the loop.
if hasattr(pipeline, 'predict_proba'):
    explain_kwargs = dict(classifier_fn=pipeline.predict_proba, labels=pipeline.classes_)
elif "SVC" in str(estimator): # this is hacky :(
    explain_kwargs = dict(classifier_fn=classifier_fn, labels=(0,))
else:
    # Previously this case fell through with `exp` unset (or left over from an
    # earlier run) and displayed a wrong or missing explanation.
    explain_kwargs = None
    print('model has neither predict_proba nor an SVC decision function; skipping LIME')

if explain_kwargs is not None:
    for example in sample_df.cleaned_text:
        try:
            exp = explainer.explain_instance(example, **explain_kwargs)
            exp.show_in_notebook()
        except Exception as e:
            # best effort: report the failing example and carry on with the next
            print(e)
            print('problem with this example')

In [None]:
from sklearn.inspection import permutation_importance

# Rank the vocabulary for each class; how depends on what the estimator exposes.
if hasattr(estimator, 'feature_log_prob_'): # bayesian
    class_log_probs = estimator.feature_log_prob_
    neg_class_prob_sorted = class_log_probs[0, :].argsort()[::-1]
    pos_class_prob_sorted = class_log_probs[1, :].argsort()[::-1]
elif hasattr(estimator, 'coef_'): # logreg
    # ascending coefficient order = negative ranking; reversed = positive ranking
    coef_ascending = estimator.coef_[0, :].argsort()
    pos_class_prob_sorted = coef_ascending[::-1]
    neg_class_prob_sorted = coef_ascending
elif hasattr(estimator, 'kernel'): # svm
    # this is inefficient and it might run out of memory or timeout :(
    # if this happens restart kernel and don't rerun
    X = vectorizer.transform(X_train).toarray()
    perm_importance = permutation_importance(estimator, X, y_train)
    importance_ascending = perm_importance.importances_mean.argsort()
    pos_class_prob_sorted = importance_ascending
    neg_class_prob_sorted = importance_ascending[::-1]

feature_names = vectorizer.get_feature_names_out()

# First 100 tokens of each ranking.
print(np.take(feature_names, neg_class_prob_sorted[:100]))
print(np.take(feature_names, pos_class_prob_sorted[:100]))


In [None]:
def find_word_rank(query):
    """Print where the token ``query`` sits in the positive/negative rankings.

    Looks the token up in ``feature_names``, finds its position in
    ``pos_class_prob_sorted`` and ``neg_class_prob_sorted`` and prints the
    ranking in which it appears earlier (the negative one on a tie). Prints
    ``'token not found'`` when the token is not in the vocabulary.

    The earlier version compared tuples of arrays and relied on NumPy raising
    ``ValueError`` for unknown tokens, which depends on the NumPy version; the
    lookup is now done with explicit integer positions.
    """
    matches = np.where(feature_names == query)[0]
    if matches.size == 0:
        print('token not found')
        return
    feature_index = matches[0]
    pos_rank = int(np.where(pos_class_prob_sorted == feature_index)[0][0])
    neg_rank = int(np.where(neg_class_prob_sorted == feature_index)[0][0])
    if pos_rank < neg_rank:
        print("ranked in positive score at position #{} out of {}".format(pos_rank, pos_class_prob_sorted.shape[0]))
    else:
        print("ranked in negative score at position #{} out of {}".format(neg_rank, neg_class_prob_sorted.shape[0]))


In [None]:
# Example lookup: where does the token "crypto" rank?
find_word_rank("crypto")

In [None]:
# Example lookup: where does the token "cryptography" rank?
find_word_rank("cryptography")

### If this is the main one you want to use, store it as pipeline.pkl and upload it to your server :)

In [None]:
# Save the chosen pipeline under the fixed name pipeline.pkl in the working directory.
joblib.dump(pipeline, "pipeline.pkl")