# model
> Use a classifier to detect differences between two chat datasets.

In [None]:
#|default_exp model

In [None]:
#|export
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from ft_drift.parse import ChatData

In [None]:
#|export
def prep_data(f1='file_a.jsonl',
              f2='file_b.jsonl'):
    """Load two chat JSONL files and combine them into one labelled DataFrame.

    Each file is read with `ChatData.load_jsonl` and rendered with `.to_md()`.
    Items coming from `f1` get label 0, items coming from `f2` get label 1.

    Returns a DataFrame with columns `text` and `label`, with all `f1` rows
    first followed by all `f2` rows.
    """
    # Load both files first, then render both, in f1 -> f2 order.
    chats = [ChatData.load_jsonl(path) for path in (f1, f2)]
    texts_a, texts_b = (chat.to_md() for chat in chats)

    # The source file is the classification target: 0 = f1, 1 = f2.
    combined_labels = [0] * len(texts_a) + [1] * len(texts_b)
    combined_text = texts_a + texts_b

    return pd.DataFrame({'text': combined_text, 'label': combined_labels})

In [None]:
# Smoke test on the default files: the combined frame must keep every row
# from both inputs (2284 + 2284 = 4568).
_df = prep_data()
assert len(_df) == 4568

Loaded 2284 rows from file_a.jsonl
Loaded 2284 rows from file_b.jsonl


In [None]:
#|export
def model(df, test_size=0.2, random_state=42, min_df=.05):
    """Fit a model and calculate diagnostics.

    Trains a TF-IDF + random-forest pipeline that predicts `label` from `text`,
    scores it on a held-out split and attaches the diagnostics to the fitted
    pipeline.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `text` column (documents) and a `label` column
        (binary target), e.g. the frame returned by `prep_data`.
    test_size : float or int, default 0.2
        Forwarded to `train_test_split`; size of the held-out split.
    random_state : int, default 42
        Seed used for both the train/test split and the random forest.
    min_df : float or int, default .05
        Forwarded to `TfidfVectorizer`; n-grams with a document frequency
        below this threshold are dropped (a float is a proportion of
        documents, an int is an absolute count).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline, with two extra attributes set on it:
        `roc_auc` (ROC AUC on the held-out split) and `top_features`
        (DataFrame with `Feature` and `Importance` columns, sorted by
        descending importance).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=test_size, random_state=random_state)

    # Tokens are runs of word characters plus | < > - and backticks, so format
    # markers such as <|UI-FORMAT|> and ```json survive as single tokens.
    pat = r"(?u)[\w|<>\-`]+"
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(min_df=min_df,
                                  ngram_range=(1, 3),
                                  lowercase=False,  # keep case as written
                                  token_pattern=pat)
        ),
        ('clf', RandomForestClassifier(random_state=random_state))
    ])

    # Fit on the training split and score the positive class (label 1)
    # on the held-out split.
    pipeline.fit(X_train, y_train)
    y_pred_prob = pipeline.predict_proba(X_test)[:, 1]

    # Diagnostics are stored as ad-hoc attributes on the pipeline object.
    pipeline.roc_auc = roc_auc_score(y_test, y_pred_prob)

    # Rank the TF-IDF n-grams by random-forest importance, highest first.
    feature_names = pipeline.named_steps['tfidf'].get_feature_names_out()
    importances = pipeline.named_steps['clf'].feature_importances_
    order = np.argsort(importances)[::-1]
    pipeline.top_features = pd.DataFrame({'Feature': feature_names[order],
                                          'Importance': importances[order]})
    return pipeline

In [None]:
# Fit the classifier on the combined, labelled frame.
clf = model(_df)

In [None]:
# ROC AUC on the held-out split, as stored on the pipeline by model().
clf.roc_auc

0.9652849641638879

In [None]:
# Highest-importance n-grams first: these are the tokens that best separate the two files.
clf.top_features.head(15)

Unnamed: 0,Feature,Importance
0,<|END-UI-FORMAT|> Role,0.071926
1,<|UI-FORMAT|> id,0.051172
2,Role function <|JSON-FORMAT|>,0.051035
3,<|END-UI-FORMAT|> Role assistant,0.05068
4,<|UI-FORMAT|>,0.050151
5,<|END-JSON-FORMAT|> Role assistant,0.048927
6,<|END-JSON-FORMAT|> Role,0.047124
7,<|JSON-FORMAT|>,0.046406
8,```json id,0.042374
9,assistant ```json,0.039353
