In [None]:
import nltk
import sklearn
import pandas as pd
import gensim
import FuzzyTM
import contractions
import os
import re
import numpy as np
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer

import matplotlib.pyplot as plt

import plotly.graph_objects as go

import pandas as pd
from datetime import datetime

# Preprocessing
- What documents do we include in the corpus?
- How do we create the labels for the target variable?
- Has the corpus from Kaggle been pre-processed in some way? Do we use other preprocessing steps as well? (use at least stop-word removal)

In [None]:
# Load the daily AAPL price history and derive the binary movement label:
# 1 when the day closed above its open, 0 otherwise.
df_stock = pd.read_csv("../data/raw/AAPL.csv")

closed_higher = df_stock['Close'].gt(df_stock['Open'])
df_stock['Label'] = closed_higher.astype('int64')
df_stock

In [None]:
# Candlestick chart of the daily prices: one candle per row of df_stock.
candles = go.Candlestick(
    x=df_stock['Date'],
    open=df_stock['Open'],
    high=df_stock['High'],
    low=df_stock['Low'],
    close=df_stock['Close'],
)
fig = go.Figure(data=[candles])

# Fixed canvas size instead of automatic sizing. update_layout() is the last
# expression of the cell, so its return value is what the cell displays.
fig.update_layout(autosize=False, width=1500, height=800)
# fig.show()

In [None]:
# Load the raw news articles. Later cells read its 'title', 'ticker',
# 'content' and 'release_date' columns.
df_text = pd.read_csv("../data/raw/us_equities_news_dataset.csv")

In [None]:
# Preview the raw news frame.
df_text

## Select relevant documents (corpus)

In [None]:
# For each Apple-related keyword, report how many article titles match its regex.
# NOTE(review): the patterns are substring searches without word boundaries, so
# e.g. '[Ii][Oo][Ss]' also matches inside longer words - confirm this is intended.
title_keyword_patterns = [
    ("Apple", '[Aa]pple'),
    ("iPad", 'i[Pp]ad'),
    ("iPod", 'i[Pp]od'),
    ("iPhone", 'i[Pp]hone'),
    ("Steve Jobs", '[Ss]teve [Jj]obs'),
    ("Tim Cook", '[Tt]im [Cc]ook'),
    ("iOS", '[Ii][Oo][Ss]'),
    ("MacOS", '[Mm]ac[Oo][Ss]'),
    ("Macbook", '[Mm]ac[Bb]ook'),
    ("Mac Pro", '[Mm]ac [Pp]ro'),
]

for keyword, pattern in title_keyword_patterns:
    n_matches = df_text['title'].str.contains(pattern).sum()
    print("There are {} documents of which the title contains the word {}".format(str(n_matches), keyword))

In [None]:
# Collect the index labels of every article whose title matches at least one
# of the Apple-related patterns. `doc_idx` stays a plain list (first-seen
# order, no repeats) because the next cell appends to it.
apple_title_patterns = ['[Aa]pple', 'i[Pp]ad', 'i[Pp]od', 'i[Pp]hone', '[Ss]teve [Jj]obs', '[Tt]im [Cc]ook']

doc_idx = []
seen_idx = set()  # set lookup instead of scanning the growing list for every candidate
for pattern in apple_title_patterns:
    # na=False: a missing title counts as "no match" rather than yielding a
    # NaN entry, which cannot be used as a boolean mask.
    title_matches = df_text['title'].str.contains(pattern, na=False)
    for idx in df_text.index[title_matches]:
        if idx not in seen_idx:
            seen_idx.add(idx)
            doc_idx.append(idx)

print(len(np.unique(doc_idx)))

In [None]:
# Also include every article filed under the AAPL ticker. Repeats of rows that
# were already selected by title are fine here: the count below is taken over
# the unique labels.
aapl_rows = df_text['ticker'] == 'AAPL'
doc_idx.extend(df_text.index[aapl_rows])

print(len(np.unique(doc_idx)))

In [None]:
# Collapse the collected index labels to a sorted array without repeats.
doc_idx = np.unique(doc_idx)

In [None]:
# doc_idx holds index *labels* taken from df_text.index, so select by label
# (.loc). Positional .iloc only gives the same rows while df_text keeps its
# default 0..n-1 index.
corpus = df_text.loc[doc_idx].reset_index(drop=True)

## Add stock movement label & match dates
- Add stock movement label to each news article in corpus.
    - Additionally, the merge sorts the corpus by date.
    - Additionally, the inner join matches the dates of the two data sources.
    - This means we only include days that have both news articles and (recorded) stock movements.

In [None]:
# Attach the stock movement label of the release day to every article.
# - how='inner' (pandas' default, spelled out) keeps only dates present in both frames.
# - sort=True orders the result by the join key.
# - the deprecated `copy=` keyword and the redundant `on=None` are dropped.
# NOTE(review): assumes 'release_date' and 'Date' use the same date format - confirm.
stock_labels = df_stock[['Date', 'Label']]
corpus = corpus.merge(
    stock_labels,
    how='inner',
    left_on='release_date',
    right_on='Date',
    sort=True,
).drop(columns='Date')
corpus


## Remove duplicates

In [None]:
# Remove duplicate articles in three passes and report the row count after each.
print(f"{corpus.shape[0]} original number of articles")

# Exact duplicates: rows are compared through their string form and the
# surviving index labels select the rows to keep. The selection has to be
# assigned back - .loc returns a new frame and leaves `corpus` untouched.
corpus = corpus.loc[corpus.astype(str).drop_duplicates().index]
print(f"{corpus.shape[0]} after dropping perfect duplicates")

# Same article text, whatever the other columns say.
corpus = corpus.drop_duplicates(subset=['content'])
print(f"{corpus.shape[0]} after dropping duplicates on content")

# Same headline on the same day.
corpus = corpus.drop_duplicates(subset=['title', 'release_date'])
print(f"{corpus.shape[0]} after dropping duplicates on title and release_date only")

## Contractions removal

In [None]:
# Expand contractions in the article text with a single column-wise map
# instead of one positional .iloc assignment per row.
# The column is still addressed by position 4 as before; presumably this is
# 'content' (the column tokenized in the next step) - TODO confirm.
text_col = corpus.columns[4]
corpus[text_col] = corpus[text_col].map(contractions.fix)

## Tokenize text

In [None]:
# Tokenize each article into a list of word tokens. Mapping over the single
# 'content' column avoids building a full row object per article, which
# DataFrame.apply(axis=1) does.
corpus['unigrams'] = corpus['content'].map(nltk.word_tokenize)

## Remove article markup

In [None]:
# Inspect the first row at column position 10 - presumably the 'unigrams'
# token list; TODO confirm the position.
corpus.iloc[0,10]

In [None]:
# Inspect the first row at column position 4, the same position the
# contraction step rewrites - presumably the article 'content'; TODO confirm.
corpus.iloc[0,4]

In [None]:
# TODO: remove newline characters ("\n") from the article text (not implemented yet)

## Removing stop words & lowercasing

In [None]:
# NLTK's English stop-word list, held as a set for fast membership tests in
# the filtering cell below.
stop_words = set(stopwords.words('english'))

In [None]:
# Lowercase every token and discard the ones whose lowercase form is a stop word.
corpus['unigrams'] = corpus['unigrams'].map(
    lambda tokens: [lowered for lowered in map(str.lower, tokens) if lowered not in stop_words]
)

## Punctuation removal

In [None]:
# Keep only purely alphabetic tokens. This removes punctuation tokens and also
# any token containing digits or other non-letter characters.
corpus['unigrams'] = corpus['unigrams'].map(
    lambda tokens: list(filter(str.isalpha, tokens))
)


## Remove nonsensical words

In [None]:
# Vocabulary filter: keep a token only when it appears in NLTK's English word
# list, extended with the Apple-specific terms below.
# NOTE(review): this runs before lemmatization, so any inflected form missing
# from the word list is dropped here - confirm that is intended.
custom_words = ["apple", "ipad", "ipod", "iphone", "macos", "osx", "macbook", "steve", "jobs", "tim", "cook"]
english_words = set(words.words())
english_words.update(custom_words)

corpus['unigrams'] = corpus['unigrams'].map(
    lambda tokens: [token for token in tokens if token in english_words]
)

## Lemmatization

In [None]:
# Lemmatization (slow step; the original note reports about 16 seconds)
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Lemmatize each token, leaving the entries of `custom_words` unchanged."""
    return [
        token if token in custom_words else lemmatizer.lemmatize(token)
        for token in tokens
    ]


corpus['unigrams'] = corpus['unigrams'].map(lemmatize_tokens)

## Top word count

In [None]:
from collections import Counter

# Ten most frequent tokens of the third article. Positional access (.iloc) is
# used because the de-duplication step drops rows without resetting the index,
# so the label 2 is not guaranteed to exist any more.
counted_doc = Counter(corpus['unigrams'].iloc[2])
counted_doc.most_common(10)

# Modeling
- Use BoW representation. But there are still many modeling decisions to be made. Motivate these decisions, pay attention to feature selection and the measures/metrics you use to represent the words/text.
- Which classification algorithm do we choose? Why?
- Which parameters do we use? Why?

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# The documents are already lists of cleaned tokens, so the tokenizer is the
# identity function and lowercasing is switched off. token_pattern=None states
# explicitly that the default regex is unused, which avoids the "token_pattern
# will not be used" UserWarning.
vectorizer = CountVectorizer(tokenizer=lambda doc: doc, token_pattern=None, lowercase=False)

# X: bag-of-words count matrix (documents x vocabulary); df_bow: dense, labelled copy for inspection.
X = vectorizer.fit_transform(corpus['unigrams'])
df_bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df_bow.head()

In [None]:
#df_bow.columns[1000:10000]

In [None]:
# Observe low frequency words: the 30% of the vocabulary with the smallest corpus-wide counts
word_totals = df_bow.sum(axis=0)
n_lowest = int(len(df_bow.columns) * 0.3)
word_totals.sort_values(ascending=True).iloc[:n_lowest]

In [None]:
# Observe high frequency words: at most 50 words, taken from the top 30% of the vocabulary by count
n_top = min(int(len(df_bow.columns) * 0.3), 50)
df_bow.sum(axis=0).sort_values(ascending=False).iloc[:n_top]

In [None]:
# Import the class explicitly: a bare `import sklearn` does not load the
# `feature_extraction.text` submodule, so attribute access on `sklearn` only
# works when some earlier cell happened to import it.
from sklearn.feature_extraction.text import TfidfTransformer

# Create the tf-idf representation using the bag-of-words matrix.
# norm=None keeps the raw tf-idf weights (no per-document normalization).
tfidf_transform = TfidfTransformer(norm=None)
X_tfidf = tfidf_transform.fit_transform(df_bow)

In [None]:
# tf-idf matrix with one row per document and one column per vocabulary word.
X_tfidf

In [None]:
# Dense copy of the tf-idf matrix.
# NOTE(review): the cells visible here fit the models on X_tfidf itself, not
# on this array - confirm the dense copy is needed.
X_tfidf_array = X_tfidf.toarray()

# Train & Evaluate
- What is the experimental design used for training and evaluating?
- How does the model perform? What metrics do we use and why?

## Cross-Validation for time series

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Features (tf-idf matrix) and target (stock movement label).
X, y = X_tfidf, corpus["Label"]

# Splits for time-ordered data.
split_params = dict(
    n_splits=5,  # 5 might not make a lot of sense. Maybe better to do shorter time intervals -> n_splits = ~200
    gap=0,
    max_train_size=None,
    test_size=None,
)
ts_cv = TimeSeriesSplit(**split_params)

# Materialize the (train, test) index pairs so they can be inspected.
all_splits = list(ts_cv.split(X, y))

## Models

In [None]:
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_validate


def evaluate(model, X, y, cv):
    """Cross-validate `model` and print a summary of the fold scores.

    Parameters
    ----------
    model : estimator passed unchanged to `cross_validate`.
    X : feature matrix.
    y : target labels.
    cv : cross-validation splitter (or anything `cross_validate` accepts as `cv`).

    Returns
    -------
    dict
        The raw `cross_validate` result (fit/score times and one `test_<scorer>`
        array per scorer).

    Notes
    -----
    Prints mean +/- standard deviation over the folds for accuracy, mean
    absolute error and root mean squared error, plus the summed fit time.
    Assuming y and the predictions are 0/1 labels (TODO confirm), the mean
    absolute error is simply the misclassification rate, i.e. 1 - accuracy.
    """
    scoring = ["accuracy", "neg_mean_absolute_error", "neg_root_mean_squared_error"]

    # Suppress the non-convergence warnings with the public `warnings` API
    # rather than the private sklearn.utils._testing helper.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)

    acc = cv_results["test_accuracy"]
    # The scorers are negated ("higher is better"); flip the sign back to plain errors.
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    fit_time = cv_results["fit_time"]
    print(
        f"===== {model} =====   [fit time: {sum(fit_time):.1f}s]\n"
        f"Accuracy:                {acc.mean():.3f} +/- {acc.std():.3f}\n"
        f"Mean Absolute Error:     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )
    return cv_results

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Perceptron

# SGDClassifier and Perceptron shuffle the training data, so they get a fixed
# seed to make the reported scores reproducible across runs.
RANDOM_STATE = 42

models = [
    LogisticRegression(),
    SGDClassifier(random_state=RANDOM_STATE),
    Perceptron(random_state=RANDOM_STATE),
]  # RidgeClassifier takes ~2min to run
cv_results_all = []

# Evaluate every model on the same time-series splits and keep the raw results.
for model in models:
    cv_results = evaluate(model, X, y, ts_cv)
    cv_results_all.append(cv_results)

### Hyperparameter tuning?