In [518]:
import pandas as pd
from pathlib import Path
import numpy as np
from numpy.random import seed
seed(1)
import calendar
import os
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
import hvplot.pandas
import matplotlib.pyplot as plt
from collections import Counter
from panel.interact import interact
import panel as pn
pn.extension('plotly')
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import tensorflow as tf
from tensorflow import random
random.set_seed(2)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

from keras.callbacks import EarlyStopping
earlystopping = EarlyStopping(monitor='loss', patience=3)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kn_na\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [408]:
def read_file(ticker):
    file_path = Path('companies_tweet_data_10y/' + ticker + '.csv')
    tweet_df = pd.read_csv(file_path, 
                           parse_dates=[['date','time']], 
                           infer_datetime_format=True, 
                           usecols=['date',"tweet",'time','retweets_count','likes_count'])
    tweet_df.set_index('date_time', inplace=True)
    tweet_df.index.name = None
    tweet_df.sort_index(axis=0, inplace=True)
    
    tweet_df = tweet_df.between_time('00:00:00','15:59:00', include_end=False)

    # Drop time in the index labels
    tweet_df.index = tweet_df.index.date
    tweet_df=tweet_df[(tweet_df['retweets_count']>=tweet_df['retweets_count'].mean())\
                      | (tweet_df['likes_count']>=tweet_df['likes_count'].mean())]
    
    return tweet_df

In [409]:
def sentiment_analysis(df):
    tweet_sentiments = []

    for tweet in df["tweet"]:
        try:
            sentiment = analyzer.polarity_scores(tweet) # get sentiment score
            compound = sentiment["compound"]
            pos = sentiment["pos"]
            neu = sentiment["neu"]
            neg = sentiment["neg"]
        
            tweet_sentiments.append({"compound": compound,
                                     "positive": pos,
                                     "negative": neg,
                                     "neutral": neu
                                    })
        
        except AttributeError:
            pass
    
    # Create DataFrame
    sentiments_df = pd.DataFrame(tweet_sentiments, index=df.index)

    # Reorder DataFrame columns
    df = df.join(sentiments_df)
    
    sentiment_score_df = df.groupby(level=0)[['compound']].count()
    sentiment_score_df["avg_sentiments"] = df.groupby(level=0)[['compound']].mean()
    #sentiment_score_df.rename(columns= {'compound':"article_counts"}, inplace=True)
    sentiment_score_df.drop(columns='compound', inplace=True)
    
    return sentiment_score_df

In [410]:
def stock_prices():
    load_dotenv()
    
    alpaca_api_key = os.getenv("ALPACA_API_KEY")
    alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

    # Create the Alpaca API object
    api = tradeapi.REST(
        alpaca_api_key,
        alpaca_secret_key,
        api_version = "v2"
    )
    
    #start_date = pd.Timestamp("2009-07-15", tz="America/New_York").isoformat()
    start_date = pd.Timestamp("2015-01-15", tz="America/New_York").isoformat()
    end_date = pd.Timestamp("2021-01-15", tz="America/New_York").isoformat()
    tickers = ["PFE", "MRNA", "REGN", "MSFT", "SNE"]

    # Create the shares DataFrame
    timeframe = "1D"

    stock_tickers = api.get_barset(
        tickers,
        timeframe,
        start = start_date,
        end = end_date
    ).df

    stock_tickerspx = pd.DataFrame()
    stock_tickerspx["PFE"] = stock_tickers["PFE"]["close"]
    stock_tickerspx["MRNA"] = stock_tickers["MRNA"]["close"]
    stock_tickerspx["REGN"] = stock_tickers["REGN"]["close"]
    stock_tickerspx["MSFT"] = stock_tickers["MSFT"]["close"]
    stock_tickerspx["SNE"] = stock_tickers["SNE"]["close"]

    # Drop the time component of the date
    stock_tickerspx.index = stock_tickers.index.date
    index = stock_tickerspx.index
    index.name = "Date"
    stock_tickerspx.reset_index(level=0, inplace=True)
    stock_tickerspx.set_index(pd.to_datetime(stock_tickerspx['Date'], infer_datetime_format=True), 
                              inplace=True, 
                              drop=True
                             )
    stock_tickerspx.drop(columns=['Date'], inplace=True)

    return stock_tickerspx

In [411]:
def manipulate_df(df, ticker):
    signals_df1 = df[[ticker]].copy()
    short_window = 20
    long_window = 100
    signals_df1["SMA20"] = signals_df1[ticker].rolling(window=short_window).mean()
    signals_df1["SMA100"] = signals_df1[ticker].rolling(window=long_window).mean()
    signals_df1["Signal"] = 0.0        
    signals_df1["Signal"][short_window:] = np.where(
        signals_df1["SMA20"][short_window:] > signals_df1["SMA100"][short_window:], 1.0, 0.0
        )   
    signals_df1["Entry/Exit"] = signals_df1["Signal"].diff()
    signals_df1['daily_return']=signals_df1[ticker].pct_change()
    signals_df1['actual_signal'] = np.where(signals_df1['daily_return'] > 0, 1.0, 0.0)
    signals_df1["SMA_shift"]=signals_df1['SMA20'].shift()
    signals_df1.dropna(inplace=True, subset=['daily_return'])
    
    return signals_df1

In [412]:
def final_df(df1, df2, ticker):
    df3=df1.join([df2])
    df3=df3[[ticker,"SMA20","daily_return","actual_signal","avg_sentiments",'SMA_shift']]
    
    return df3

## Data Creation

### Regression 1 Feature

In [413]:
def dataset_regression_1f(df, ticker):
    df = df[['SMA_shift',ticker,'avg_sentiments']]
    df.dropna(inplace=True, subset=['avg_sentiments','SMA_shift',ticker])
    X=df[['SMA_shift']].values.reshape(-1,1)
    y=df[[ticker]].values.reshape(-1,1)

    split = int(0.8 * len(X))

    X_train = X[: split]
    X_test = X[split:]

    y_train = y[: split]
    y_test = y[split:]
    
    return X, X_train, X_test, y, y_train, y_test

In [414]:
def scaler_regression_1f(X, X_train, X_test, y, y_train, y_test):
    # Create a MinMaxScaler object
    scaler = MinMaxScaler()

    # Fit the MinMaxScaler object with the features data X
    scaler.fit(X)

    # Scale the features training and testing sets
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit the MinMaxScaler object with the target data Y
    scaler.fit(y)

    # Scale the target training and testing sets
    y_train = scaler.transform(y_train)
    y_test = scaler.transform(y_test)
    
    return X, X_train, X_test, y, y_train, y_test

### Regression 2 Features

In [415]:
def dataset_regression_2f(df, ticker):
    df = df[['SMA_shift',ticker,'avg_sentiments']]
    df.dropna(inplace=True, subset=['avg_sentiments','SMA_shift',ticker])
    X=df[['SMA_shift','avg_sentiments']]
    y=df[[ticker]]

    split = int(0.8 * len(X))

    X_train = X[: split]
    X_test = X[split:]

    y_train = y[: split]
    y_test = y[split:]
    
    return X, X_train, X_test, y, y_train, y_test

In [416]:
def scaler_regression_2f(X, X_train, X_test, y, y_train, y_test):
    # Create a MinMaxScaler object
    scaler = MinMaxScaler()

    # Fit the MinMaxScaler object with the features data X
    scaler.fit(X)

    # Scale the features training and testing sets
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit the MinMaxScaler object with the target data Y
    scaler.fit(np.array(y).reshape(-1, 1))

    # Scale the target training and testing sets
    y_train = scaler.transform(np.array(y_train).reshape(-1, 1))
    y_test = scaler.transform(np.array(y_test).reshape(-1, 1))
    
    return X, X_train, X_test, y, y_train, y_test

### Classification 1 Feature

In [460]:
def dataset_classification_1f(df, ticker):
    df = df[['SMA_shift',ticker,'avg_sentiments','actual_signal']]
    df.dropna(inplace=True, subset=['avg_sentiments','SMA_shift',ticker])
    X=df[['SMA_shift']].values.reshape(-1,1)
    y=df[['actual_signal']].values.reshape(-1,1)

    split = int(0.8 * len(X))

    X_train = X[: split]
    X_test = X[split:]

    y_train = y[: split]
    y_test = y[split:]
    
    return X, X_train, X_test, y, y_train, y_test

In [461]:
def scaler_classification_1f(X, X_train, X_test):
# Create a MinMaxScaler object
    scaler = MinMaxScaler()

# Fit the MinMaxScaler object with the features data X
    scaler.fit(X)

# Scale the features training and testing sets
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

# # Fit the MinMaxScaler object with the target data Y
# scaler.fit(y3)
    return X, X_train, X_test
# # Scale the target training and testing sets
# y3_train = scaler.transform(y3_train)
# y3_test = scaler.transform(y3_test)

### Classification 2 Features

In [492]:
def dataset_classification_2f(df, ticker):
    df = df[['SMA_shift',ticker,'avg_sentiments','actual_signal']]
    df.dropna(inplace=True, subset=['avg_sentiments','SMA_shift',ticker])
    X=df[['SMA_shift','avg_sentiments']].values.reshape(-1,2)
    y=df[['actual_signal']].values.reshape(-1,1)

    split = int(0.8 * len(X))

    X_train = X[: split]
    X_test = X[split:]

    y_train = y[: split]
    y_test = y[split:]
    
    return X, X_train, X_test, y, y_train, y_test

In [493]:
def scaler_classification_2f(X, X_train, X_test):
# Create a MinMaxScaler object
    scaler = MinMaxScaler()

# Fit the MinMaxScaler object with the features data X
    scaler.fit(X)

# Scale the features training and testing sets
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

# # Fit the MinMaxScaler object with the target data Y
# scaler.fit(y3)
    return X, X_train, X_test
# # Scale the target training and testing sets
# y3_train = scaler.transform(y3_train)
# y3_test = scaler.transform(y3_test)

## Model

## Regression

### Dense 1 feature

In [462]:
def dense_lr_1f(X_train, y_train):
    model = Sequential()
    model.add(Dense(20, activation='relu', input_shape=(1,)))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam',
                     loss='mse',
                     metrics=['mae'])
    history = model.fit(X_train, y_train, 
                        batch_size=20, 
                        epochs=500, 
                        verbose=0, 
                        callbacks=[earlystopping]
                       )
    return history, model

In [418]:
def score_dense_lr_1f(history):
    df = pd.DataFrame(history.history, 
                      index=range(1, len(history.history["loss"]) + 1)
                     )
    df=df.rename(columns={'loss': 'loss_1f', 'mae': 'mae_1f'})
    
    return df

In [419]:
def pred_dense_lr_1f(model, X_test, y_test):
    predicted = model.predict(X_test)
    result = pd.DataFrame(predicted)
    result.columns = ['predict_1feature']
    result['actual'] = y_test
    #result.plot()
    return result

### LSTM 1 feature

In [420]:
def lstm_lr_1f(X_train, y_train):
    model = Sequential()
    model.add(LSTM(20,
             #dropout=0.2,
             #recurrent_dropout=0.2,
             return_sequences=True,
             input_shape=(1,1)))
    model.add(Dropout(0.2))
    # model_2.add(Dense(5, activation='relu'))
    # model_2.add(Dropout(0.2))
    model.add(LSTM(units=20, return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(units=20))
    model.add(Dropout(0.2))

    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['mae'])
    X_train_res=X_train[:,:,np.newaxis]
    history=model.fit(X_train_res, y_train, 
                      batch_size=20, 
                      epochs=500, 
                      verbose=0, 
                      callbacks=[earlystopping]
                     )
    return history, model

In [421]:
def score_lstm_lr_1f(history):
    df = pd.DataFrame(history.history, 
                      index=range(1, len(history.history["loss"]) + 1)
                     )
    df=df.rename(columns={'loss': 'loss_1f', 'mae': 'mae_1f'})
    
    return df

In [422]:
def pred_lstm_lr_1f(model, X_test, y_test):
    predicted = model.predict(X_test[:,:,np.newaxis])
    result = pd.DataFrame(predicted)
    result.columns = ['predict_1feature']
    result['actual'] = y_test
    #result.plot()
    return result

### Dense 2 features

In [423]:
def dense_lr_2f(X_train, y_train):
    model = Sequential()
    model.add(Dense(20, activation='relu', input_shape=(2,)))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam',
                     loss='mse',
                     metrics=['mae'])
    history = model.fit(X_train, y_train, 
                        batch_size=20, 
                        epochs=500, 
                        verbose=0, 
                        callbacks=[earlystopping]
                       )
    return history, model

In [424]:
def score_dense_lr_2f(history):
    df = pd.DataFrame(history.history, 
                      index=range(1, len(history.history["loss"]) + 1)
                     )
    df=df.rename(columns={'loss': 'loss_2f', 'mae': 'mae_2f'})
    
    return df

In [425]:
def pred_dense_lr_2f(model, X_test, y_test):
    predicted = model.predict(X_test)
    result = pd.DataFrame(predicted)
    result.columns = ['predict_2features']
    result['actual'] = y_test
    #result.plot()
    return result

### LSTM 2 features

In [426]:
def lstm_lr_2f(X_train, y_train):
    model = Sequential()
    model.add(LSTM(20,
             #dropout=0.2,
             #recurrent_dropout=0.2,
             return_sequences=True,
             input_shape=(2,1)))
    model.add(Dropout(0.2))
    # model_2.add(Dense(5, activation='relu'))
    # model_2.add(Dropout(0.2))
    model.add(LSTM(units=20, return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(units=20))
    model.add(Dropout(0.2))

    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=['mae'])
    X_train_res=X_train[:,:,np.newaxis]
    history=model.fit(X_train_res, y_train, 
                      batch_size=20, 
                      epochs=500, 
                      verbose=0, 
                      callbacks=[earlystopping]
                     )
    return history, model

In [427]:
def score_lstm_lr_2f(history):
    df = pd.DataFrame(history.history, 
                      index=range(1, len(history.history["loss"]) + 1)
                     )
    df=df.rename(columns={'loss': 'loss_2f', 'mae': 'mae_2f'})
    
    return df

In [428]:
def pred_lstm_lr_2f(model, X_test, y_test):
    predicted = model.predict(X_test[:,:,np.newaxis])
    result = pd.DataFrame(predicted)
    result.columns = ['predict_2features']
    result['actual'] = y_test
    #result.plot()
    return result

## Classification

### Dense 1 Feature

In [456]:
def dense_cl_1f(X_train, y_train):
    model = Sequential()
    model.add(Dense(20, activation='relu', input_shape=(1,)))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
           	      metrics=['acc'])
    history=model.fit(X_train, y_train, 
                      batch_size=20, 
                      epochs=500, 
                      verbose=0, 
                      callbacks=[earlystopping]
                     )
    return history, model

In [469]:
def eva_dense_cl_1f(model, X_test, y_test):
    accuracy = model.evaluate(X_test, y_test, verbose=0)[1]
    dic = {'Dense': accuracy}
    return dic

### LSTM 1 Feature

In [475]:
def lstm_cl_1f(X_train, y_train):
    model = Sequential()
    model.add(LSTM(20,
                 #dropout=0.2,
                 #recurrent_dropout=0.2,
                 return_sequences=True,
                 input_shape=(1,1)))
    model.add(Dropout(0.2))
    # model_2.add(Dense(5, activation='relu'))
    # model_2.add(Dropout(0.2))
    model.add(LSTM(units=20, return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(units=20))
    model.add(Dropout(0.2))

    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc'])
    X_train_res = X_train[:,:,np.newaxis]
    history = model.fit(X_train_res, y_train, 
                        batch_size=20, 
                        epochs=500, 
                        verbose=0, 
                        callbacks=[earlystopping]
                       )
    
    return history, model

In [476]:
def eva_lstm_cl_1f(model, X_test, y_test):
    accuracy = model.evaluate(X_test[:,:,np.newaxis], y_test, verbose=0)[1]
    dic = {'LSTM': accuracy}
    return dic

### Dense 2 Features

In [497]:
def dense_cl_2f(X_train, y_train):
    model = Sequential()
    model.add(Dense(20, activation='relu', input_shape=(2,)))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
           	      metrics=['acc'])
    history=model.fit(X_train, y_train, 
                      batch_size=20, 
                      epochs=500, 
                      verbose=0, 
                      callbacks=[earlystopping]
                     )
    return history, model

In [498]:
def eva_dense_cl_2f(model, X_test, y_test):
    accuracy = model.evaluate(X_test, y_test, verbose=0)[1]
    dic = {'DENSE': accuracy}
    return dic

### LSTM 2 Features

In [502]:
def lstm_cl_2f(X_train, y_train):
    model = Sequential()
    model.add(LSTM(20,
                 #dropout=0.2,
                 #recurrent_dropout=0.2,
                 return_sequences=True,
                 input_shape=(2,1)))
    model.add(Dropout(0.2))
    # model_2.add(Dense(5, activation='relu'))
    # model_2.add(Dropout(0.2))
    model.add(LSTM(units=20, return_sequences=True))
    model.add(Dropout(0.2))

    model.add(LSTM(units=20))
    model.add(Dropout(0.2))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['acc'])
    X_train_res = X_train[:,:,np.newaxis]
    history = model.fit(X_train_res, y_train, 
                        batch_size=20, 
                        epochs=500, 
                        verbose=0, 
                        callbacks=[earlystopping]
                       )
    
    return history, model

In [503]:
def eva_lstm_cl_2f(model, X_test, y_test):
    accuracy = model.evaluate(X_test[:,:,np.newaxis], y_test, verbose=0)[1]
    dic = {'LSTM': accuracy}
    return dic

### Other Models: Logistic Regression, SVC, Decision Tree Classifier, Random Forest Classifier, XGB Classifier

In [523]:
def sklearn_cl_models(X_train, y_train, X_test, y_test):
    # Logistic regression
    algorithm1 = LogisticRegression(random_state=1)
    # Support vector machine
    algorithm2 = SVC(kernel='rbf', random_state=1)
    # Decision Tree Classifier
    algorithm3 = DecisionTreeClassifier(random_state=1)
    # Random Forest
    algorithm4 = RandomForestClassifier(random_state=1)
    # XGBoost
    algorithm5 = XGBClassifier(random_state=1)
    # Create a list of the five model instances
    algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]
    
    # Create a balanced set of samples, create a StratifiedKFold instance
    stratifiedkfold = StratifiedKFold(n_splits=5)
    
    dic = {}

    
    for algorithm in algorithms:
        algorithm.fit(X_train, y_train)
        score = algorithm.score(X_test, y_test)
        name = algorithm.__class__.__name__
        dic[name] = score 
    
#     for algorithm in algorithms:
#         # Conduct cross validation for each one of the five models
#         scores = cross_val_score(algorithm, X_train, y_train, cv=stratifiedkfold)
#         score = scores.mean()
#         name = algorithm.__class__.__name__
#         dic[name] = score 
    
    return dic

In [528]:
def sklearn_cl_models_cross_validation(X, y):
    # Logistic regression
    algorithm1 = LogisticRegression(random_state=1)
    # Support vector machine
    algorithm2 = SVC(kernel='rbf', random_state=1)
    # Decision Tree Classifier
    algorithm3 = DecisionTreeClassifier(random_state=1)
    # Random Forest
    algorithm4 = RandomForestClassifier(random_state=1)
    # XGBoost
    algorithm5 = XGBClassifier(random_state=1)
    # Create a list of the five model instances
    algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]
    
    # Create a balanced set of samples, create a StratifiedKFold instance
    stratifiedkfold = StratifiedKFold(n_splits=5)
    
    dic = {}
    
    for algorithm in algorithms:
        # Conduct cross validation for each one of the five models
        scores = cross_val_score(algorithm, X, y, cv=stratifiedkfold)
        score = scores.mean()
        name = algorithm.__class__.__name__
        dic[name + ' (Cross Validation)'] = score 
    
    return dic

In [529]:
def svc_optimizer(X_train, y_train, X_test, y_test):
    # Optimize the parameters
    params = {'C':[1, 10, 100, 1000, 10000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
    algorithm = SVC(random_state=1)

    stratifiedkfold = StratifiedKFold(n_splits=5)
    gs = GridSearchCV(algorithm, params, cv=stratifiedkfold)
    gs.fit(X_train, y_train)

    # Based on the best parameters, predict y values from test data
    best = gs.best_estimator_
    best_pred = best.predict(X_test)

    # Accuracy
    score = best.score(X_test, y_test)
    
    dic = {'SVC (Grid Search)': score}
    
    return dic

## Visualization

### Regression

In [429]:
def score_loss_lr_1fvs2f(df_1f, df_2f):
    df=pd.concat([df_1f,df_2f], axis=1)
    return df[['loss_1f','loss_2f']].hvplot()

In [430]:
def score_mae_lr_1fvs2f(df_1f, df_2f):
    df=pd.concat([df_1f,df_2f], axis=1)
    return df[['mae_1f','mae_2f']].hvplot()

In [431]:
def pred_lr_1fvs2f(df_1f, df_2f):
    df=df_1f.merge(df_2f, on='actual')
    return df.hvplot()

### Classification

In [None]:
def score_df(
    dict1, dict2, dict3, dict4, dict5
    dict6, dict7, dict8, dict9, dict10
):
    dict1.update(dict2)
    dict1.update(dict3)
    dict1.update(dict4)
    dict_1f = dict1.update(dict5)

    dict6.update(dict7)
    dict6.update(dict8)
    dict6.update(dict9)
    dict_2f = dict6.update(dict10)

    df = pd.DataFrame([dict_1f, dict_2f], index=['1 Feature', '2 Features']).T

    return df

# Application to 5 Companies

## MSFT

## Data Creation

In [432]:
df=read_file("MSFT")
df1=sentiment_analysis(df)
df2=stock_prices()
df3=manipulate_df(df2, 'MSFT')
df_MSFT=final_df(df3, df1, 'MSFT')
df_MSFT.head()

Unnamed: 0_level_0,MSFT,SMA20,daily_return,actual_signal,avg_sentiments,SMA_shift
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-16,46.22,,0.016271,1.0,,
2015-01-20,46.39,,0.003678,1.0,0.4767,
2015-01-21,45.91,,-0.010347,0.0,0.0,
2015-01-22,47.12,,0.026356,1.0,,
2015-01-23,47.18,,0.001273,1.0,,


### Regression

### Dense 1 feature

In [433]:
X, X_train, X_test, y, y_train, y_test=dataset_regression_1f(df_MSFT, 'MSFT')
X, X_train, X_test, y, y_train, y_test=scaler_regression_1f(X, X_train, X_test, y, y_train, y_test)

In [434]:
msft_dense_lr_1f_history, msft_dense_lr_1f_model =dense_lr_1f(X_train, y_train)

In [435]:
score_dense_lr_1f_df = score_dense_lr_1f(msft_dense_lr_1f_history)

In [436]:
pred_dense_lr_1f_df = pred_dense_lr_1f(msft_dense_lr_1f_model, X_test, y_test)

### LSTM 1 feature

In [437]:
msft_lstm_lr_1f_history, msft_lstm_lr_1f_model =lstm_lr_1f(X_train, y_train)

In [438]:
score_lstm_lr_1f_df = score_lstm_lr_1f(msft_lstm_lr_1f_history)

In [439]:
pred_lstm_lr_1f_df = pred_lstm_lr_1f(msft_lstm_lr_1f_model, X_test, y_test)

### Dense 2 features

In [440]:
X, X_train, X_test, y, y_train, y_test=dataset_regression_2f(df_MSFT, 'MSFT')
X, X_train, X_test, y, y_train, y_test=scaler_regression_2f(X, X_train, X_test, y, y_train, y_test)

In [441]:
msft_dense_lr_2f_history, msft_dense_lr_2f_model = dense_lr_2f(X_train, y_train)

In [442]:
score_dense_lr_2f_df = score_dense_lr_2f(msft_dense_lr_2f_history)

In [443]:
pred_dense_lr_2f_df = pred_dense_lr_2f(msft_dense_lr_2f_model, X_test, y_test)

### LSTM 2 features

In [444]:
msft_lstm_lr_2f_history, msft_lstm_lr_2f_model = lstm_lr_2f(X_train, y_train)

In [445]:
score_lstm_lr_2f_df = score_lstm_lr_2f(msft_lstm_lr_2f_history)

In [446]:
pred_lstm_lr_2f_df = pred_lstm_lr_2f(msft_lstm_lr_2f_model, X_test, y_test)

### Visualization

In [447]:
score_loss_lr_1fvs2f(score_dense_lr_1f_df, score_dense_lr_2f_df)

In [448]:
score_mae_lr_1fvs2f(score_dense_lr_1f_df, score_dense_lr_2f_df)

In [449]:
pred_lr_1fvs2f(pred_dense_lr_1f_df,pred_lstm_lr_2f_df)

In [450]:
score_loss_lr_1fvs2f(score_lstm_lr_1f_df, score_lstm_lr_2f_df)

In [451]:
score_mae_lr_1fvs2f(score_lstm_lr_1f_df, score_lstm_lr_2f_df)

In [452]:
pred_lr_1fvs2f(pred_lstm_lr_1f_df,pred_lstm_lr_2f_df)

### Classification

### Dense 1 Feature

In [524]:
X, X_train, X_test, y, y_train, y_test = dataset_classification_1f(df_MSFT, "MSFT")
X, X_train, X_test = scaler_classification_1f(X, X_train, X_test)

In [470]:
history, model = dense_cl_1f(X_train, y_train)
dic_dense_cl_1f = eva_dense_cl_1f(model, X_test, y_test)

{'Dense 1 Feature': 0.5319148898124695}

### LSTM 1 Feature

In [477]:
history, model = lstm_cl_1f(X_train, y_train)
dic_lstm_cl_1f = eva_lstm_cl_1f(model, X_test, y_test)

{'LSTM 1 Feature': 0.5319148898124695}

### Logistic Regression, SVC, Decision Tree Classifier, Random Forest Classifier, XGB Classifier

In [525]:
sklearn_cl_models_1f = sklearn_cl_models(X_train, y_train, X_test, y_test)



{'LogisticRegression': 0.5319148936170213,
 'SVC': 0.5319148936170213,
 'DecisionTreeClassifier': 0.5319148936170213,
 'RandomForestClassifier': 0.5319148936170213,
 'XGBClassifier': 0.46808510638297873}

In [530]:
sklearn_cl_models_cross_validation_1f = sklearn_cl_models_cross_validation(X, y)



{'LogisticRegression': 0.5355825734549138,
 'SVC': 0.5157244174265451,
 'DecisionTreeClassifier': 0.4032624113475178,
 'RandomForestClassifier': 0.4060688956433637,
 'XGBClassifier': 0.39896656534954406}

In [531]:
svc_optimizer_1f = svc_optimizer(X_train, y_train, X_test, y_test)

{'SVC Grid Search': 0.5319148936170213}

### Dense 2 Features

In [526]:
X, X_train, X_test, y, y_train, y_test = dataset_classification_2f(df_MSFT, "MSFT")
X, X_train, X_test = scaler_classification_2f(X, X_train, X_test)

In [500]:
history, model = dense_cl_2f(X_train, y_train)
dic_dense_cl_2f = eva_dense_cl_2f(model, X_test, y_test)

{'DENSE 2 Features': 0.5602836608886719}

### LSTM 2 Features

In [504]:
history, model = lstm_cl_2f(X_train, y_train)
dic_lstm_cl_2f = eva_lstm_cl_2f(model, X_test, y_test)

{'LSTM 2 Features': 0.5319148898124695}

### Logistic Regression, SVC, Decision Tree Classifier, Random Forest Classifier, XGB Classifier

In [527]:
sklearn_cl_models_2f = sklearn_cl_models(X_train, y_train, X_test, y_test)



{'LogisticRegression': 0.5319148936170213,
 'SVC': 0.5319148936170213,
 'DecisionTreeClassifier': 0.48936170212765956,
 'RandomForestClassifier': 0.5035460992907801,
 'XGBClassifier': 0.46808510638297873}

In [532]:
sklearn_cl_models_cross_validation_2f = sklearn_cl_models_cross_validation(X, y)



{'LogisticRegression': 0.5355825734549138,
 'SVC': 0.5157244174265451,
 'DecisionTreeClassifier': 0.4032624113475178,
 'RandomForestClassifier': 0.4060688956433637,
 'XGBClassifier': 0.39896656534954406}

In [533]:
svc_optimizer_2f = svc_optimizer(X_train, y_train, X_test, y_test)

{'SVC Grid Search': 0.5319148936170213}

## Scores between 1 Feature and 2 Features

In [537]:
score_df = score_df(dic_dense_cl_1f,
                    dic_lstm_cl_1f,
                    sklearn_cl_models_1f,
                    sklearn_cl_models_cross_validation_1f,
                    svc_optimizer_1f,
                    dic_dense_cl_2f,
                    dic_lstm_cl_2f,
                    sklearn_cl_models_2f,
                    sklearn_cl_models_cross_validation_2f,
                    svc_optimizer_2f
                   )

Unnamed: 0,DecisionTreeClassifier,LSTM 2 Features,LogisticRegression,RandomForestClassifier,SVC,XGBClassifier
0,,0.531915,,,,
1,0.403262,,0.535583,0.406069,0.515724,0.398967
