# News classification using OpenAI API

In [None]:
%load_ext autoreload
%autoreload 2

from openai_api import classify
import pandas as pd

Load the required credentials from the `.env` file.

In [None]:
import openai
from pydantic_settings import BaseSettings, SettingsConfigDict

class Credentials(BaseSettings):
    """Secrets loaded from the project's dotenv file.

    Every field is declared without a default value. Of the four, this
    notebook itself only reads ``openai_api_key``.
    """

    # NOTE(review): phone / api_hash / api_id are not used anywhere in this
    # notebook — presumably they belong to the scraper's account; confirm.
    phone: str
    api_hash: str
    api_id: str
    # Key that is assigned to ``openai.api_key`` once the settings are loaded.
    openai_api_key: str

    # pydantic v2 configuration style; the nested ``class Config`` form is
    # deprecated there and triggers a deprecation warning on class creation.
    model_config = SettingsConfigDict(
        env_file="../config/.env",
        env_file_encoding="utf-8",
    )
        
# Category names (in Russian) handed to `classify` as the set of allowed answers.
categories = [
    "Политика",
    "Финансы",
    "Спорт",
    "Знаменитости",
    "Блог",
    "История",
    "Видеоигры",
    "Кино",
    "Технологии",
    "Наука",
    "Анонс",
    "Музыка",
    "Литература",
    "Медицина",
    "Быт",
]

# Build the settings object (configured to read ../config/.env) and pass the
# key on to the openai module.
creds = Credentials()
openai.api_key = creds.openai_api_key

Read the dataset collected by the scraper.

In [None]:
# Load the dataset; later cells read its `text`, `label` and `predicted` columns.
df = pd.read_csv(filepath_or_buffer="../data/result.csv")

Retrieving labels from OpenAI

In [None]:
from tqdm import tqdm
import logging

LOGGING_FORMAT = "[%(levelname)s] [%(asctime)s] %(message)s"
logging.basicConfig(format=LOGGING_FORMAT, datefmt="%m-%d %H:%M:%S", force=True, level=logging.INFO)

# NOTE(review): `results` is never appended to in this cell; it is kept only in
# case another cell reads it — remove if nothing does.
results = []
errors = 0

# A first run may start from a file that has no predictions yet.
if 'predicted' not in df.columns:
    df['predicted'] = None
# A column that is entirely empty is parsed as float64; cast it to object so
# string labels can be stored without an incompatible-dtype assignment.
df['predicted'] = df['predicted'].astype(object)

for i, text in tqdm(df['text'].items(), total=len(df)):
    # Resume support: rows that already carry a prediction are not sent again.
    if not pd.isnull(df.at[i, 'predicted']):
        continue

    # NOTE(review): the meaning of the trailing positional argument `1` is
    # defined by `openai_api.classify` — confirm against that module.
    label = classify(text, categories, "gpt-3.5-turbo", 1)
    if not label:
        # A falsy result is treated as a failed classification; the row stays
        # empty so that a later run picks it up again.
        errors += 1
        df.at[i, 'predicted'] = None
        continue

    # `.at` writes into `df` itself. The former `df['predicted'][i] = ...` was
    # chained assignment, which warns and, under pandas Copy-on-Write, does not
    # modify `df` at all.
    df.at[i, 'predicted'] = label

    # Checkpoint after every successful label so an interrupted run keeps its
    # progress.
    # NOTE(review): this writes "result.csv" in the working directory, while
    # the dataset is read from "../data/result.csv" — the saved progress is not
    # picked up by the next run unless the file is moved; confirm intent.
    df.to_csv("result.csv", index=False)

logging.info("Classification finished; %d rows could not be labelled", errors)

Accuracy and confusion matrix of the predictions, measured against the human-annotated labels.

In [None]:
# Share of all rows whose predicted category equals the human-assigned label.
# Rows with a missing prediction or label never compare equal, so they count
# as misses.
len(df.loc[df['predicted'].eq(df['label'])]) / len(df)

In [None]:
from sklearn.metrics import confusion_matrix
from seaborn import heatmap
# Drop every row that has a missing value in any column (this removes the rows
# without a prediction); re-running the cell leaves `df` unchanged.
df = df.dropna()

# One explicit, sorted label list drives both the matrix and the tick labels.
# Without `labels=`, confusion_matrix orders rows/columns by sorted label and
# also includes labels that occur only in `predicted`, whereas
# `df['label'].unique()` is in order of first appearance — the ticks would then
# be misaligned or have the wrong length.
labels = sorted(set(df['label']) | set(df['predicted']))

# Rows are the human labels, columns are the predicted categories.
ax = heatmap(
    confusion_matrix(df['label'], df['predicted'], labels=labels),
    xticklabels=labels,
    yticklabels=labels,
    cmap='viridis',
)
ax.set(xlabel="Predicted category", ylabel="Human label");