# Basic Training Notebook

In [None]:
import os
import pandas
import mlflow
import catboost
import dateutil
import datetime
from pathlib import Path

from tokenizers import BertWordPieceTokenizer
from sklearn.feature_extraction.text import CountVectorizer

from news_driven_investing.io import read_partitioned_pandas_asset
from news_driven_investing.resources import ENGLISH_STOP_WORDS
from news_driven_investing.preprocessing import compute_stock_price_overnight_changes
from news_driven_investing.config.settings import settings

In [None]:
# Load the transcript asset and discard rows whose transcript is missing or
# begins with "Exception" (missing values are filled with "Exception" first so
# the same prefix test removes them).
X = read_partitioned_pandas_asset("video_transcript")
transcript_unusable = X["transcript"].fillna("Exception").str.startswith("Exception")
X = X[~transcript_unusable]

# Invert settings.YOUTUBE_CHANNELS (values become lookup keys) and use it to
# rewrite the channel-id column.
# NOTE(review): presumably this maps raw channel ids to readable names — confirm.
channel_lookup = {
    mapped_from: mapped_to
    for mapped_to, mapped_from in settings.YOUTUBE_CHANNELS.items()
}
X["snippet.channelId"] = X["snippet.channelId"].replace(channel_lookup)

y = read_partitioned_pandas_asset("stock_prices")

Calculate the overnight price change: from the close of one trading day to the open of the next trading day.

In [None]:
# Replace the raw stock-price frame with the output of the project helper.
# NOTE(review): presumably the close-to-next-open change per trading day —
# confirm against compute_stock_price_overnight_changes.
y = compute_stock_price_overnight_changes(y)

In [None]:
# Parse every publish timestamp, strip the time-of-day component, and store
# the result back as a pandas datetime column (one value per calendar day).
published_at = X["snippet.publishedAt"].apply(dateutil.parser.parse)
X["snippet.publishedAt"] = pandas.to_datetime(published_at.dt.date)

In [None]:
# Order rows by publish date (ascending, the sort_values default).
# NOTE(review): train_test_split further down shuffles by default, so this
# ordering is not carried into the train/test/validate split.
X = X.sort_values(by="snippet.publishedAt")

In [None]:
import catboost
from sklearn.model_selection import train_test_split

In [None]:
# Name of the column used as the label when the catboost Pools are built.
target = "snippet.channelId"

In [None]:
# Both free-text columns are passed to CatBoost as text features.
text_columns = ["transcript", "snippet.title"]


def _as_pool(frame):
    """Wrap the text columns and the target column of *frame* in a catboost.Pool."""
    return catboost.Pool(
        frame[text_columns],
        frame[target],
        text_features=text_columns,
    )


clf = catboost.CatBoostClassifier(verbose=20, od_type="Iter", early_stopping_rounds=21)

# 80% of the rows go to training; the remaining 20% is halved into the test
# and validation sets.
train, holdout = train_test_split(X, test_size=0.2)
test, validate = train_test_split(holdout, test_size=0.5)

data_train, data_test, data_validate = (
    _as_pool(part) for part in (train, test, validate)
)

In [None]:
# Train on the training pool; the validation pool is supplied as eval_set
# (the classifier was configured with early_stopping_rounds=21).
clf.fit(data_train, eval_set=data_validate)

In [None]:
# Predict labels for the test pool, which fit() received neither as training
# data nor as eval_set.
ypred = clf.predict(data_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual labels and columns the predicted ones. The previous call passed the
# arguments the other way round, which transposed the matrix.
actual = test[target]
predicted = ypred.ravel()  # predict() may return an (n, 1) column; flatten it

# Pin the label order explicitly. By default scikit-learn orders the matrix by
# the sorted union of labels, whereas Series.unique() returns order of
# appearance (and only the labels present in the test set), so naming the axes
# from unique() could attach names to the wrong rows/columns.
labels = sorted(set(actual) | set(predicted))

M = confusion_matrix(actual, predicted, labels=labels)
M = pandas.DataFrame(
    M,
    index=pandas.Index(labels, name="actual"),
    columns=pandas.Index(labels, name="predicted"),
)
M

In [None]:
# f1_score expects (y_true, y_pred). Per-class F1 is symmetric in the two
# arguments, but average="weighted" weights each class by its support in
# y_true, so the ground-truth labels must come first.
f1_score(test[target], ypred, average="weighted")