# Import libraries

In [None]:
import nltk
import sklearn
import pandas as pd
import gensim
import FuzzyTM
import contractions
import os
import re
import numpy as np
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt

import plotly.graph_objects as go

import pandas as pd
from datetime import datetime

# Stock price dataset
- What documents do we include in the corpus?
- How do we create the labels for the target variable?
- Has the corpus from Kaggle been pre-processed in some way? Do we use other preprocessing steps as well? (use at least stop-word removal)

## Load stock price data (read)

In [None]:
# Read the AAPL stock price csv (relative path) and preview the first rows
stock_csv_path = "../data/raw/AAPL.csv"
df_stock = pd.read_csv(stock_csv_path)
df_stock.head()

## Labeling (preprocessing)

In [None]:
# Binary target: 1 when the closing price is strictly above the opening price, else 0
# (rows where the comparison is not true, including missing prices, get 0)
closed_higher = df_stock['Close'] > df_stock['Open']
df_stock['Label'] = closed_higher.astype('int64')
df_stock

## Visualization (EDA)

In [None]:
# Candlestick chart of the AAPL Open/High/Low/Close columns over Date
candles = go.Candlestick(
    x=df_stock['Date'],
    open=df_stock['Open'],
    high=df_stock['High'],
    low=df_stock['Low'],
    close=df_stock['Close'],
)
fig = go.Figure(data=[candles])

# Fixed canvas size; this call is the cell's last expression, so its return value
# is what the notebook displays
fig.update_layout(autosize=False, width=1500, height=800)
# fig.show()

# News articles dataset

## Load news articles data (read)

In [None]:
# Read the news articles csv (relative path) and preview the first rows
news_csv_path = "../data/raw/us_equities_news_dataset.csv"
df_text = pd.read_csv(news_csv_path)
df_text.head()

In [None]:
# Report the (rows, columns) size of the raw news articles dataframe
print(df_text.shape)

## Preprocessing

### Define corpus (document filtering)

In [None]:
# Inspect and count articles with titles relevant to Apple.
# Each entry is (word shown in the message, regex searched for in the title).
title_patterns = [
    ("Apple", '[Aa]pple'),
    ("iPad", 'i[Pp]ad'),
    ("iPod", 'i[Pp]od'),
    ("iPhone", 'i[Pp]hone'),
    ("Steve Jobs", '[Ss]teve [Jj]obs'),
    ("Tim Cook", '[Tt]im [Cc]ook'),
    ("iOS", '[Ii][Oo][Ss]'),
    ("MacOS", '[Mm]ac[Oo][Ss]'),
    ("Macbook", '[Mm]ac[Bb]ook'),
    ("Mac Pro", '[Mm]ac [Pp]ro'),
]
for shown_word, title_regex in title_patterns:
    # The patterns are substring searches, so they also hit inside longer words
    n_titles = df_text['title'].str.contains(title_regex).sum()
    print("There are {} documents of which the title contains the word {}".format(n_titles, shown_word))

In [None]:
# Index-list of the articles whose title matches any of the Apple-related patterns.
apple_title_patterns = ['[Aa]pple', 'i[Pp]ad', 'i[Pp]od', 'i[Pp]hone', '[Ss]teve [Jj]obs', '[Tt]im [Cc]ook']

# One alternation regex gives a single pass over the titles and each row at most
# once, replacing the quadratic "idx not in doc_idx" list scan.
# na=False counts a missing title as "no match" instead of producing NaN in the
# mask, which boolean indexing would reject.
title_mask = df_text['title'].str.contains('|'.join(apple_title_patterns), na=False)

# Kept as a plain list so later cells can still append to it
doc_idx = df_text.index[title_mask].tolist()

print(f"The index-list now includes {len(np.unique(doc_idx))} unique articles")

In [None]:
# Extend the index-list with every article whose ticker is AAPL.
# Indices already in the list may be added again; duplicates are only collapsed
# when counting below.
has_aapl_ticker = df_text['ticker'] == 'AAPL'
doc_idx.extend(df_text.index[has_aapl_ticker])

print(f"The index-list now includes {len(np.unique(doc_idx))} unique articles")

In [None]:
# Collapse the index-list to its unique values (np.unique returns them sorted, as an array)
doc_idx = np.unique(doc_idx)

In [None]:
# Only keep articles relevant to Apple in the corpus and reset index of the resulting dataframe.
# doc_idx holds index *labels* collected from df_text.index, so select by label (.loc);
# positional .iloc only gives the same rows while df_text has a default 0..n-1 index.
corpus = df_text.loc[doc_idx].reset_index(drop=True)

### Add stock movement label & match dates
- Add stock movement label to each news article in corpus.
    - Additionally, it sorts on the date
    - Additionally, it matches the dates of the two data sources because of the inner join
    - This means we only include days that have both news articles and (recorded) stock movements

In [None]:
# Attach the stock movement label to every article by joining release_date to Date.
# Inner join: articles without a matching stock row are dropped.
# sort=True orders the result by the join key.
stock_labels = df_stock[['Date', 'Label']]
corpus = corpus.merge(
    stock_labels,
    how='inner',
    left_on='release_date',
    right_on='Date',
    sort=True,
    copy=False,
)
# 'Date' duplicates 'release_date' after the join
corpus = corpus.drop(columns=['Date'])
display(corpus)

### Remove duplicate documents

In [None]:
# Drop duplicate articles in three passes and report the corpus size after each.
print(f"{corpus.shape[0]} original number of articles")

# Pass 1: rows identical in every column. Rows are compared as strings.
# The original computed this selection but never assigned it, so nothing was dropped.
corpus = corpus.loc[~corpus.astype(str).duplicated()]
print(f"{corpus.shape[0]} after dropping perfect duplicates")

# Pass 2: same article body
corpus = corpus.drop_duplicates(subset=['content'])
print(f"{corpus.shape[0]} after dropping duplicates on content")

# Pass 3: same title published on the same date
corpus = corpus.drop_duplicates(subset=['title', 'release_date'])
print(f"{corpus.shape[0]} after dropping duplicates on title and release_date only")

# Renumber rows 0..n-1 so positional and label-based access agree again
corpus = corpus.reset_index(drop=True)

### Contractions removal

In [None]:
# Run contractions.fix over every article body in one column-wise pass instead of
# one positional single-cell write per row.
# NOTE(review): the original addressed this column positionally (column 4); it is
# assumed to be 'content', the column that is tokenized later — confirm.
corpus['content'] = corpus['content'].map(contractions.fix)

### Remove markup & disclaimers

In [None]:
def striphtml(data):
    """Return ``data`` with every ``<...>`` span removed.

    The match is non-greedy and does not cross newlines, so a tag that is
    broken over two lines is left in place.
    """
    return re.sub(r'<.*?>', '', data)

def strip_newline(data):
    """Return ``data`` with each newline character replaced by a single space."""
    return data.replace('\n', ' ')

# Boilerplate disclaimer phrases, compiled once and applied in this order.
# Raw strings keep \w and \s as regex escapes rather than (invalid) string escapes.
_DISCLAIMER_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'[Aa]t the time of publication [\w\s]+did not own any direct investments in securities mentioned in this article',
        r'[Hh]e may be an owner indirectly as an investor in a fund',
        r'[Ff]or previous columns [\w\s]+customers can click on',
        r'[Tt]he opinions expressed are his own',
    )
)


def strip_disclaimer(data):
    """Return ``data`` with the known disclaimer phrases removed.

    Only the matched phrase is deleted; surrounding punctuation and
    whitespace are left as they are.
    """
    for pattern in _DISCLAIMER_PATTERNS:
        data = pattern.sub('', data)
    return data


# Clean every article body in one column-wise chain instead of five positional
# single-cell writes per row. The order of the steps is unchanged.
# NOTE(review): the original addressed this column positionally (column 4); it is
# assumed to be 'content', the column that is tokenized later — confirm.
corpus['content'] = (
    corpus['content']
    .map(striphtml)
    .map(strip_newline)
    .map(strip_disclaimer)
    .str.replace('[^a-zA-Z]', ' ', regex=True)  # remove any further special characters
    .str.replace('  +', ' ', regex=True)        # remove extra spaces
)

### Tokenize text

In [None]:
# Tokenize every article body into a list of word tokens (takes about half a minute).
# Mapping over the single column avoids building a row object per article and
# returns a Series of token lists.
corpus['unigrams'] = corpus['content'].map(nltk.word_tokenize)

### Removing stop words & lowercasing

In [None]:
# NLTK's English stop words, extended with words from standard intros and endings
more_stopwords = ["columnist", "expressed", "eric", "editing"]
stop_words = set(stopwords.words('english')).union(more_stopwords)

In [None]:
# Lowercase every token and keep only the ones that are not stop words
corpus['unigrams'] = corpus['unigrams'].map(
    lambda tokens: [lowered for lowered in map(str.lower, tokens) if lowered not in stop_words]
)

### Punctuation removal

In [None]:
# Keep only tokens made up entirely of alphabetic characters
corpus['unigrams'] = corpus['unigrams'].map(
    lambda tokens: [tok for tok in tokens if tok.isalpha()]
)

### Remove nonsensical words (NOT IN USE CURRENTLY)

In [None]:
# english_words = set(words.words())
# custom_words = ["apple", "ipad", "ipod", "iphone", "macos", "osx", "macbook", "steve", "jobs", "tim", "cook"]
# for word in custom_words:
#     english_words.add(word)

# for idx, document in corpus.iterrows():
#     no_nonsense = [token for token in document['unigrams'] if token in english_words]
#     corpus.at[idx, 'unigrams'] = no_nonsense

### Lemmatization

In [None]:
# Lemmatization (takes about 16 seconds)
lemmatizer = WordNetLemmatizer()

# Tokens in this list are passed through as-is instead of being lemmatized
custom_words = ["apple", "ipad", "ipod", "iphone", "macos", "osx", "macbook", "steve", "jobs", "tim", "cook"]
protected_words = frozenset(custom_words)

corpus['unigrams'] = corpus['unigrams'].map(
    lambda tokens: [
        tok if tok in protected_words else lemmatizer.lemmatize(tok)
        for tok in tokens
    ]
)

## Preprocessed corpus exploration

In [None]:
# Report the (rows, columns) size of the corpus dataframe
print(f"Corpus has shape: {corpus.shape}")

In [None]:
# Bar chart of the number of documents per category
category_counts = corpus['category'].value_counts()
category_counts.plot.bar(
    color=['royalblue', 'orange'],
    title=f"Corpus category distribution ({len(corpus)} documents total)",
    figsize=(15, 8),
)

In [None]:
# Document counts per provider: print the ten largest, plot the five largest
provider_counts = corpus['provider'].value_counts()
print(provider_counts.head(10))
provider_counts.head(5).plot.bar(
    color='royalblue',
    title=f"Corpus top 5 provider distribution ({len(corpus)} documents total)",
    figsize=(15, 8),
)

# Bag-of-Words (BoW) representation

## BoW creation (Document-Term Matrix)

In [None]:
# Documents are already token lists, so the "tokenizer" hands each one through
# unchanged; lowercasing is switched off in the vectorizer for the same reason.
count_vectorizer = CountVectorizer(
    tokenizer=lambda doc: doc,
    ngram_range=(1, 1),
    lowercase=False,
)
X = count_vectorizer.fit_transform(corpus['unigrams'])

# Dense document x term count table with the vocabulary as column names
vocabulary = count_vectorizer.get_feature_names_out()
df_bow = pd.DataFrame(X.toarray(), columns=vocabulary)
df_bow.head()

## BoW exploration

In [None]:
# Total count per word over all documents, most frequent first
df_bow_frequency = df_bow.sum(axis=0).sort_values(ascending=False)
word_count_df = pd.DataFrame({
    'Tag': df_bow_frequency.index,
    'Count': df_bow_frequency.values,
})
display(word_count_df.head(10))
print(word_count_df['Count'].describe())

In [None]:
# Plot the first 10 rows of the word count table
# (assumes word_count_df is ordered by descending count)
top_words = word_count_df.head(10)
top_words.plot.bar(
    x='Tag',
    y='Count',
    color=['royalblue'],
    title="BoW top 10 most frequent words",
    figsize=(15, 8),
)

In [None]:
def calculate_frequency_count_fraction(df, frequency_threshold, print_bool = False):
    """Return the fraction of rows in ``df`` whose ``Count`` is at most ``frequency_threshold``.

    Parameters
    ----------
    df : object with a ``Count`` column (one row per word)
    frequency_threshold : largest count that is still included
    print_bool : when truthy, also print the result as a percentage

    Returns
    -------
    A value between 0 and 1 (not a percentage).
    """
    frequency_fraction_count = (df.Count <= frequency_threshold).sum() / len(df)
    if print_bool:
        # Printed as a real percentage, and worded "at most" to match the <= above
        print(f"{frequency_fraction_count:.1%} of the unique words occur at most {frequency_threshold} time(s)")
    return frequency_fraction_count

In [None]:
# Fraction of the vocabulary with a count <= t, for every threshold t in 0..99
frac_list = [
    calculate_frequency_count_fraction(word_count_df, threshold)
    for threshold in range(100)
]

plt.figure(figsize=(20, 10))
plt.title("Cumulative distribution of maximum word count-frequency over entire corpus", size=20)
plt.xlabel("Maximum word frequency")
plt.ylabel("Fractional proportion")
plt.xticks(ticks=np.arange(0, 100, 5))
plt.grid()
plt.plot(frac_list, marker='o')

## BoW filtering

In [None]:
# Same pass-through tokenizer as before, now with document-frequency limits:
# min_df=15 and max_df=0.95 drop the rarest and the most widespread terms.
count_vectorizer = CountVectorizer(
    tokenizer=lambda doc: doc,
    ngram_range=(1, 1),
    lowercase=False,
    min_df=15,
    max_df=0.95,
)
# X is kept as a dense array from here on
X = count_vectorizer.fit_transform(corpus['unigrams']).toarray()
vocabulary = count_vectorizer.get_feature_names_out()
df_bow = pd.DataFrame(X, columns=vocabulary)
df_bow.head()

In [None]:
# Total count per word over all documents, most frequent first
df_bow_frequency = df_bow.sum(axis=0).sort_values(ascending=False)
word_count_df = pd.DataFrame({
    'Tag': df_bow_frequency.index,
    'Count': df_bow_frequency.values,
})
display(word_count_df.head(10))
print(word_count_df['Count'].describe())

In [None]:
# Turn the bag-of-words counts into tf-idf weights; norm=None leaves the rows un-normalised
tfidf_transform = sklearn.feature_extraction.text.TfidfTransformer(norm=None)

# Document x words matrix, and a dense copy of it for the models below
X_tfidf = tfidf_transform.fit_transform(df_bow)
X_tfidf_array = X_tfidf.toarray()

# Train & Evaluate
- What is the experimental design used for training and evaluating?
- How does the model perform? What metrics do we use and why?

## Cross-Validation for time series

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Features: dense tf-idf matrix. Target: the stock movement label per article.
y = corpus["Label"]
X = X_tfidf_array

# 5 might not make a lot of sense. Maybe better to do shorter time intervals -> n_splits = ~200
# NOTE(review): splits are made over rows (articles), so articles released on the
# same date can land on both sides of a split — confirm this is acceptable.
ts_cv = TimeSeriesSplit(n_splits=5, gap=0, max_train_size=None, test_size=None)

# Materialise the (train indices, test indices) pairs for inspection
all_splits = [fold for fold in ts_cv.split(X, y)]

## Models

In [None]:
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_validate


def evaluate(model, X, y, cv):
    """Cross-validate ``model`` on ``X``/``y`` and print a short accuracy summary.

    Parameters
    ----------
    model : estimator handed to ``cross_validate``
    X, y : features and target
    cv : cross-validation splitter handed to ``cross_validate``

    Returns
    -------
    The dictionary returned by ``cross_validate``, scored on accuracy.
    """
    # Suppress non-convergence warnings for the duration of the fit only, using
    # the standard library instead of scikit-learn's private testing helper.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        cv_results = cross_validate(
            model,
            X,
            y,
            cv=cv,
            scoring=["accuracy"],
        )
    acc = cv_results["test_accuracy"]
    fit_time = cv_results["fit_time"]
    print(
        f"===== {model} =====   [fit time: {sum(fit_time):.1f}s]\n"
        f"Accuracy:                {acc.mean():.3f} +/- {acc.std():.3f}\n"
    )
    return cv_results

In [None]:
from sklearn.linear_model import LogisticRegression


def gridsearch_count_vectorizer(df_bow):
    """Score a logistic regression for each CountVectorizer setting in a small grid.

    Every combination of n-gram range and minimum document frequency is
    vectorized from ``corpus['unigrams']`` and cross-validated with ``evaluate``.
    Reads the notebook-level names ``corpus``, ``y``, ``ts_cv`` and ``evaluate``.

    Parameters
    ----------
    df_bow : unused; kept so existing calls keep working.

    Returns
    -------
    DataFrame with one row per combination and the columns ``ngram``,
    ``term_frequency`` and ``accuracy`` (mean test accuracy over the folds).
    """
    ngrams = [(1, 1), (1, 2), (1, 3), (2, 2), (3, 3)]
    termfreqlist = [2, 3]

    rows = []
    for ngram in ngrams:
        for termfreq in termfreqlist:
            # NOTE(review): termfreq is applied as min_df (minimum document
            # frequency); the original looped over it without using it — confirm intent.
            count_vectorizer = CountVectorizer(
                tokenizer=lambda doc: doc,
                ngram_range=ngram,
                lowercase=False,
                min_df=termfreq,
                max_df=0.95,
            )
            # Left sparse: the n-gram vocabularies are too wide to densify safely
            X = count_vectorizer.fit_transform(corpus['unigrams'])

            model = LogisticRegression()
            cv_results = evaluate(model, X, y, ts_cv)

            rows.append({
                'ngram': ngram,
                'term_frequency': termfreq,
                'accuracy': cv_results['test_accuracy'].mean(),
            })

    return pd.DataFrame(rows, columns=['ngram', 'term_frequency', 'accuracy'])

In [None]:
# Run the vectorizer grid search and show the resulting score table
score_df = gridsearch_count_vectorizer(df_bow)
score_df

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Perceptron

# Classifiers to compare, all with default settings except the L1/saga logistic regression.
# RidgeClassifier takes ~2min to run, so it is not in the list.
models = [
    LogisticRegression(),
    SGDClassifier(),
    Perceptron(),
    LogisticRegression(solver='saga', penalty='l1'),
    RandomForestClassifier(),
]

# One cross_validate result per model, in the same order as `models`
cv_results_all = []
for model in models:
    cv_results = evaluate(model, X, y, ts_cv)
    cv_results_all.append(cv_results)

### Hyperparameter tuning?