In [1]:
import os

import pandas as pd
import numpy as np

# Name of the embedding model; used below both to look up the tiktoken
# encoding and as the `model` argument of the embeddings request.
EMBEDDING_MODEL = 'text-embedding-3-small'

In [176]:
# Read the raw headline file. Fields are separated by the literal token `_!_`;
# a multi-character separator requires the python parsing engine.
data_file = os.path.join(
    os.environ['GEEKTIME_AI_COURSE_DATA'],
    'toutiao_cat_data.txt',
)
df = pd.read_csv(
    data_file,
    sep='_!_',
    names=['id', 'code', 'category', 'title', 'keywords'],
    index_col='id',
    engine='python',
).fillna('')  # missing fields become '' so the string concatenation below never yields NaN

# Text passed to the embedding model: stripped title and keywords under fixed labels.
df['combined'] = '标题: ' + df['title'].str.strip() + '; 关键字: ' + df['keywords'].str.strip()

In [177]:
# https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
import tiktoken

# Tokenizer that tiktoken associates with the embedding model (cl100k_base).
encoding = tiktoken.encoding_for_model(EMBEDDING_MODEL)
# Token count of every combined text, stored alongside the text itself.
df['n_tokens'] = df['combined'].map(lambda text: len(encoding.encode(text)))

In [178]:
# Keep only rows whose combined text is at most 8000 tokens long.
# `.copy()` gives df_filtered its own data: later cells add and update an
# `embedding` column on it, and without the copy pandas flags the frame as a
# slice of `df` and emits SettingWithCopyWarning on those writes.
df_filtered = df[df.n_tokens <= 8000].copy()
# Row counts before and after filtering (displayed as the cell output).
len(df), len(df_filtered)

(382688, 382688)

In [212]:
import backoff
from openai import OpenAI, RateLimitError

# API client used by get_embeddings. Both the endpoint URL and the key are read
# from environment variables, so a missing variable raises KeyError here.
client = OpenAI(
    base_url=os.environ['OPENAI_API_BASE'],
    api_key=os.environ['OPENAI_API_KEY'],
)


@backoff.on_exception(backoff.expo, RateLimitError)
def get_embeddings(texts):
    """Embed a batch of texts with EMBEDDING_MODEL in a single API request.

    Args:
        texts: iterable of strings. Newlines inside each string are replaced
            by spaces before the request is sent.

    Returns:
        The embeddings found in the response, in the order the API returned
        them; an empty list when `texts` is empty (no request is made).

    RateLimitError is retried with exponential backoff by the decorator.
    No max_tries / max_time is configured, so retries are unbounded; any
    other exception propagates to the caller.
    """
    texts = [text.replace('\n', ' ') for text in texts]
    if not texts:
        # Nothing to embed: skip the request instead of sending an empty input.
        return []
    response = client.embeddings.create(
        input=texts,
        model=EMBEDDING_MODEL,
    )
    return [item.embedding for item in response.data]

In [217]:
# Add an all-NaN, object-dtype `embedding` column (object so that each cell can
# later hold a whole vector). The placeholder Series is built on df_filtered's
# own index instead of relying on a length-1 Series being realigned, and
# `assign` returns a new frame, so the write does not raise
# SettingWithCopyWarning when df_filtered came from a boolean-mask selection.
# Re-running this cell resets the column, exactly as before.
df_filtered = df_filtered.assign(
    embedding=pd.Series(np.nan, index=df_filtered.index, dtype='object')
)

In [None]:
# Embed the next batch: the first 1000 rows that do not have an embedding yet.
# Re-run this cell to work through the remaining rows batch by batch.
combined = df_filtered.loc[df_filtered.embedding.isna(), 'combined'].head(1000)

if combined.empty:
    # Every row is already embedded; do not send an empty request to the API.
    print('No rows left to embed.')
else:
    embeddings = get_embeddings(combined.to_list())
    # Re-attach the row ids so `update` writes each vector to the matching row.
    embedding = pd.Series(embeddings, index=combined.index, name='embedding')
    df_filtered.update(embedding)

df_filtered.head()

In [2]:
# Path of the parquet file that holds the frame together with its embeddings;
# the directory comes from the same environment variable as the raw data file.
saved_file = os.path.join(
    os.environ['GEEKTIME_AI_COURSE_DATA'],
    'toutiao_cat_data_embeddings.parquet',
)

In [None]:
# Persist the filtered frame, including its `embedding` column, to saved_file.
df_filtered.to_parquet(path=saved_file)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Reload the persisted frame and keep only the rows that already have an embedding.
training_data = pd.read_parquet(saved_file)
df_training = training_data.loc[training_data['embedding'].notna()]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the embedded rows for evaluation; the fixed seed keeps the
# split repeatable. Embeddings are the features, `category` is the label.
X_train, X_test, y_train, y_test = train_test_split(
    df_training.embedding.tolist(),
    df_training.category,
    test_size=0.2,
    random_state=42,
)

# Seed the forest as well: without random_state the bootstrap samples and
# feature subsets differ on every run, so the reported scores would not be
# reproducible even though the split is.
clf = RandomForestClassifier(n_estimators=300, random_state=42)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

# zero_division=0 matches the logistic-regression cell: classes that are never
# predicted get a score of 0 without emitting UndefinedMetricWarning.
report = classification_report(y_test, preds, zero_division=0)
print(report)

In [6]:
from sklearn.linear_model import LogisticRegression

# 80/20 train/test split with a fixed seed; embeddings are the features and
# `category` is the label.
X_train, X_test, y_train, y_test = train_test_split(
    df_training['embedding'].tolist(),
    df_training['category'],
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression().fit(X_train, y_train)
preds, probas = clf.predict(X_test), clf.predict_proba(X_test)

# zero_division=0: metrics with a zero denominator are reported as 0 without a warning.
report = classification_report(y_test, preds, zero_division=0)
print(report)

                    precision    recall  f1-score   support

  news_agriculture       0.93      0.88      0.90        32
          news_car       0.97      0.95      0.96        37
      news_culture       0.81      0.86      0.83        35
          news_edu       0.84      0.84      0.84        32
news_entertainment       0.79      0.91      0.85        33
      news_finance       0.91      0.78      0.84        37
         news_game       0.87      0.87      0.87        15
        news_house       0.87      0.93      0.90        29
     news_military       0.78      0.81      0.79        31
       news_sports       1.00      0.95      0.97        39
         news_tech       0.79      0.85      0.82        27
       news_travel       0.74      0.74      0.74        23
        news_world       0.82      0.79      0.81        29
             stock       0.00      0.00      0.00         1

          accuracy                           0.86       400
         macro avg       0.79      0.8