# Basic Training Notebook

In [None]:
import os
import pandas
import mlflow
import dateutil
import datetime
import catboost
from pathlib import Path

from tokenizers import BertWordPieceTokenizer
from sklearn.feature_extraction.text import CountVectorizer

from news_driven_investing.io import read_partitioned_pandas_asset
from news_driven_investing.resources import ENGLISH_STOP_WORDS
from news_driven_investing.preprocessing import compute_stock_price_overnight_changes
from news_driven_investing.config.settings import settings

In [None]:
# Load the stored video transcripts. Rows whose transcript is missing or starts
# with "Exception" are dropped: missing values are filled with the same marker
# so that one prefix test removes both cases.
data = read_partitioned_pandas_asset("video_transcript")
has_transcript = ~data["transcript"].fillna("Exception").str.startswith("Exception")
# Take an explicit copy so the column assignment below writes to an independent
# frame rather than to a filtered selection (avoids SettingWithCopyWarning).
data = data[has_transcript].copy()

# settings.YOUTUBE_CHANNELS is inverted here, so every value found in the
# "snippet.channelId" column is replaced by its key in that mapping.
channel_by_value = {value: key for key, value in settings.YOUTUBE_CHANNELS.items()}
data["snippet.channelId"] = data["snippet.channelId"].replace(channel_by_value)

# Stock prices; a later cell passes them to compute_stock_price_overnight_changes.
y = read_partitioned_pandas_asset("stock_prices")

In [None]:
# Only the analyzer callable of the vectorizer is kept.
# NOTE(review): min_df is a vocabulary-pruning option applied during fit(); it
# presumably has no influence on the analyzer built here — confirm.
vectorizer = CountVectorizer(
    stop_words=ENGLISH_STOP_WORDS,
    min_df=1e-4,
    lowercase=True,
)
analyzer = vectorizer.build_analyzer()

In [None]:
# Run both text columns through the analyzer and glue the resulting tokens back
# into one space-separated string per row.
data["snippet.title"] = data["snippet.title"].apply(
    lambda text: " ".join(analyzer(text))
)
data["transcript"] = data["transcript"].apply(
    lambda text: " ".join(analyzer(text))
)

In [None]:
# Train a fresh WordPiece tokenizer on title and transcript together; each row
# contributes one training text ("<title> <transcript>").
tokenizer = BertWordPieceTokenizer()
training_texts = data["snippet.title"] + " " + data["transcript"]
tokenizer.train_from_iterator(training_texts, min_frequency=10)

In [None]:
# Re-encode both text columns with the trained tokenizer and store the token
# strings joined by single spaces.
data["snippet.title"] = data["snippet.title"].apply(
    lambda text: " ".join(tokenizer.encode(text).tokens)
)
data["transcript"] = data["transcript"].apply(
    lambda text: " ".join(tokenizer.encode(text).tokens)
)

Calculate the price change from the close of one trading day to the open of the next trading day.

In [None]:
# Replace the raw price frame with the output of the project helper.
# NOTE(review): later cells expect this result to provide a "date" column and a
# "Microsoft" column — confirm against the helper's implementation.
y = compute_stock_price_overnight_changes(y)

In [None]:
# `import dateutil` alone does not guarantee that the `parser` submodule is
# loaded; import it explicitly so this cell does not depend on another library
# having imported it first.
import dateutil.parser

# Parse the publication timestamps, then truncate them to calendar days:
# .dt.date keeps only the date part, and to_datetime converts those dates back
# into pandas datetimes so they can be used for date arithmetic and merging.
data["snippet.publishedAt"] = data["snippet.publishedAt"].apply(dateutil.parser.parse)
data["snippet.publishedAt"] = data["snippet.publishedAt"].dt.date
data["snippet.publishedAt"] = pandas.to_datetime(data["snippet.publishedAt"])

In [None]:
# Collapse to one row per publication day: all titles of a day are joined into
# a single string, and likewise all transcripts.
per_day = data.groupby(["snippet.publishedAt"], as_index=False)
data = per_day[["snippet.title", "transcript"]].agg(" ".join)

In [None]:
# Calendar day after publication; used below as the join key against the
# price data.
# NOTE(review): this adds one calendar day, not one trading day — confirm that
# days whose successor has no price row are meant to be dropped by the merge.
data["next_day"] = data["snippet.publishedAt"] + datetime.timedelta(days=1)

In [None]:
# Attach the price data of the following calendar day to each day's text.
# The inner join keeps only rows whose "next_day" has a matching "date" in y.
data = data.merge(
    y,
    how="inner",
    left_on="next_day",
    right_on="date",
)

In [None]:
# Name of the column in the merged frame that is used as the regression label.
target = "Microsoft"

In [None]:
import catboost
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed so the train/test/validation partition, and therefore every metric
# computed from it, is the same on each run.
SPLIT_SEED = 0


def _as_pool(frame):
    """Wrap the feature columns and the target column of *frame* in a Pool."""
    return catboost.Pool(
        frame[features],
        frame[target],
        text_features=text_features,
    )


rgs = catboost.CatBoostRegressor(
    iterations=500,
    od_type="Iter",
    early_stopping_rounds=51,
    verbose=50,
    learning_rate=0.01,
)

# The two text columns are the only model inputs.
text_features = ["snippet.title", "transcript"]
features = text_features

# 80 % of the rows go to training; the remaining 20 % are halved into a test
# and a validation part.
# NOTE(review): the rows are one-per-day but are shuffled before splitting; a
# chronological split (shuffle=False) may be the safer choice — confirm.
train, holdout = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)
test, validate = train_test_split(holdout, test_size=0.5, random_state=SPLIT_SEED)

# From here on train/test/validate are catboost Pools, not DataFrames.
train = _as_pool(train)
test = _as_pool(test)
validate = _as_pool(validate)

In [None]:
# Fit on the training pool; the validation pool is supplied as the evaluation
# set that is monitored during training.
rgs.fit(train, eval_set=validate)

In [None]:
# Predictions for the test pool, which was not passed to fit().
ypred = rgs.predict(test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the model on the test pool. scikit-learn metrics take
# the ground truth first and the prediction second; keywords make that explicit.
mean_absolute_error(y_true=test.get_label(), y_pred=ypred)

In [None]:
# Show the dimensions of the test pool.
test.shape

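Baseline performance: always predict the mean of the training labels (the default `DummyRegressor` strategy). The regressor ignores its input features, so random numbers are passed as placeholders.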

In [None]:
from sklearn.dummy import DummyRegressor
import numpy as np

In [None]:
# The baseline is fitted on random numbers shaped like the training pool,
# together with the real training labels.
placeholder_inputs = np.random.random(train.shape)
d = DummyRegressor()
d.fit(placeholder_inputs, train.get_label())

In [None]:
# Baseline predictions for the test pool, again fed with random inputs of the
# matching shape.
test_placeholder_inputs = np.random.random(test.shape)
ypred_base = d.predict(test_placeholder_inputs)

In [None]:
# Mean absolute error of the baseline on the test pool. scikit-learn metrics
# take the ground truth first and the prediction second; keywords make that
# explicit.
mean_absolute_error(y_true=test.get_label(), y_pred=ypred_base)