# Import Price Data From Kraken API:

In [6]:
import krakenex 
from pykrakenapi import KrakenAPI
import sys
import time
import urllib.request
import json
import requests
import pandas as pd
import numpy as np 

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.decomposition import PCA

In [7]:
api = krakenex.API()
k = KrakenAPI(api)

In [8]:
ohlc = k.get_ohlc_data("BTCUSD")
print(ohlc)

(                           time     open     high      low    close     vwap  \
dtime                                                                          
2021-06-09 20:00:00  1623268800  36291.8  36384.6  36291.8  36354.1  36338.7   
2021-06-09 19:59:00  1623268740  36261.9  36284.2  36261.9  36281.7  36279.3   
2021-06-09 19:58:00  1623268680  36249.3  36338.3  36244.5  36268.2  36275.7   
2021-06-09 19:57:00  1623268620  36297.5  36309.6  36260.1  36270.0  36273.0   
2021-06-09 19:56:00  1623268560  36374.3  36374.3  36290.0  36290.0  36364.1   
...                         ...      ...      ...      ...      ...      ...   
2021-06-09 08:05:00  1623225900  33898.8  33938.3  33881.4  33899.8  33901.4   
2021-06-09 08:04:00  1623225840  33932.1  33954.2  33861.3  33901.8  33914.6   
2021-06-09 08:03:00  1623225780  34033.0  34036.8  33925.0  33925.0  33964.8   
2021-06-09 08:02:00  1623225720  34140.1  34140.1  34015.5  34047.6  34059.4   
2021-06-09 08:01:00  1623225660  34138.

In [9]:
def fetch_OHLC_data(symbol, timeframe):
    pair_split = symbol.split('/')
    symbol = pair_split[0] + pair_split[1]
    url = f'https://api.kraken.com/0/public/OHLC?pair={symbol}&interval={timeframe}'
    response = requests.get(url)
    if response.status_code == 200: 
        j = json.loads(response.text)
        result = j['result']
        keys = []
        for item in result:
            keys.append(item)
        if keys[0] != 'last':
            data = pd.DataFrame(result[keys[0]],
                                columns=['unix', 'open', 'high', 'low', 'close', 'vwap', 'volume', 'tradecount'])
        else:
            data = pd.DataFrame(result[keys[1]],
                                columns=['unix', 'open', 'high', 'low', 'close', 'vwap', 'volume', 'tradecount'])

        data['date'] = pd.to_datetime(data['unix'], unit='s')
        data['volume_from'] = data['volume'].astype(float) * data['close'].astype(float)
    return data

In [10]:
btc = fetch_OHLC_data(symbol="BTC/USD", timeframe="1440")
btc = btc.drop(columns=["unix", "open", "high", "low", "volume_from"])
btc = btc.set_index("date")
btc['close'] = btc['close'].astype(float)
btc['returns'] = btc.close.pct_change()
btc['change_in_price'] = btc['returns']
btc.change_in_price[btc.change_in_price > 0] = 1
btc.change_in_price[btc.change_in_price <= 0] = 0
btc.dropna(inplace=True)
btc['change_in_price'] = btc['change_in_price'].astype(int)
btc

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,close,vwap,volume,tradecount,returns,change_in_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-22,10663.8,10665.9,12842.48586265,54100,0.043506,1
2019-06-23,10814.1,10837.5,5943.46891708,34849,0.014094,1
2019-06-24,11020.6,10843.5,5457.41272587,26308,0.019095,1
2019-06-25,11765.1,11340.2,11623.88144358,40223,0.067555,1
2019-06-26,12933.7,12684.3,25637.82859518,100171,0.099328,1
...,...,...,...,...,...,...
2021-06-05,35534.6,36066.4,5532.75325986,39833,-0.035636,0
2021-06-06,35789.0,35930.0,3187.54640121,23692,0.007159,1
2021-06-07,33587.6,35319.4,6165.92045486,42615,-0.061511,0
2021-06-08,33420.0,32712.0,10548.32694320,72896,-0.004990,0


# Generate Quantitative Trading Signals:

In [11]:
def generate_signals(btc):
    """Generates trading signals for a given dataset."""
    # Grab just the `date` and `close` from the IEX dataset
    signals_df = btc.copy()
    # Set the short window and long windows
    short_window = 50
    long_window = 100

    # Generate the short and long moving averages (50 and 100 days, respectively)
    signals_df["SMA50"] = signals_df["close"].rolling(window=short_window).mean()
    signals_df["SMA100"] = signals_df["close"].rolling(window=long_window).mean()

    signals_df["Signal"] = 0.0
    # Generate the trading signal 0 or 1,
    # where 0 is when the SMA50 is under the SMA100, and
    # where 1 is when the SMA50 is higher (or crosses over) the SMA100
    signals_df["Signal"][short_window:] = np.where(
        signals_df["SMA50"][short_window:] > signals_df["SMA100"][short_window:],
        1.0,
        0.0,
    )
    # Calculate the points in time at which a position should be taken, 1 or -1
    signals_df["Entry/Exit"] = signals_df["Signal"].diff()
    return signals_df

In [12]:
mov_avg = generate_signals(btc)
mov_avg.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,close,vwap,volume,tradecount,returns,change_in_price,SMA50,SMA100,Signal,Entry/Exit
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-06-05,35534.6,36066.4,5532.75325986,39833,-0.035636,0,47714.53,51769.148,0.0,0.0
2021-06-06,35789.0,35930.0,3187.54640121,23692,0.007159,1,47228.606,51663.893,0.0,0.0
2021-06-07,33587.6,35319.4,6165.92045486,42615,-0.061511,0,46775.562,51538.067,0.0,0.0
2021-06-08,33420.0,32712.0,10548.3269432,72896,-0.00499,0,46329.562,51419.813,0.0,0.0
2021-06-09,36281.7,34685.0,8188.31487431,56248,0.085628,1,45925.196,51286.423,0.0,0.0


In [13]:
bollinger_window = 20

# Calculate rolling mean and standard deviation
all_sig = mov_avg.copy()
all_sig['bollinger_mid_band'] = mov_avg['close'].rolling(window=bollinger_window).mean()
all_sig['bollinger_std'] = mov_avg['close'].rolling(window=20).std()

# Calculate upper and lowers bands of bollinger band
all_sig['bollinger_upper_band']  = all_sig['bollinger_mid_band'] + (all_sig['bollinger_std'] * 1)
all_sig['bollinger_lower_band']  = all_sig['bollinger_mid_band'] - (all_sig['bollinger_std'] * 1)

# Calculate bollinger band trading signal
all_sig['bollinger_long'] = np.where(all_sig['close'] < all_sig['bollinger_lower_band'], 1.0, 0.0)
all_sig['bollinger_short'] = np.where(all_sig['close'] > all_sig['bollinger_upper_band'], -1.0, 0.0)
all_sig['bollinger_signal'] = all_sig['bollinger_long'] + all_sig['bollinger_short']
all_sig.tail()

Unnamed: 0_level_0,close,vwap,volume,tradecount,returns,change_in_price,SMA50,SMA100,Signal,Entry/Exit,bollinger_mid_band,bollinger_std,bollinger_upper_band,bollinger_lower_band,bollinger_long,bollinger_short,bollinger_signal
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-06-05,35534.6,36066.4,5532.75325986,39833,-0.035636,0,47714.53,51769.148,0.0,0.0,37872.175,2412.901126,40285.076126,35459.273874,0.0,0.0,0.0
2021-06-06,35789.0,35930.0,3187.54640121,23692,0.007159,1,47228.606,51663.893,0.0,0.0,37484.035,2047.953661,39531.988661,35436.081339,0.0,0.0,0.0
2021-06-07,33587.6,35319.4,6165.92045486,42615,-0.061511,0,46775.562,51538.067,0.0,0.0,37018.655,1795.580802,38814.235802,35223.074198,1.0,0.0,1.0
2021-06-08,33420.0,32712.0,10548.3269432,72896,-0.00499,0,46329.562,51419.813,0.0,0.0,36852.54,1967.902434,38820.442434,34884.637566,1.0,0.0,1.0
2021-06-09,36281.7,34685.0,8188.31487431,56248,0.085628,1,45925.196,51286.423,0.0,0.0,36635.895,1759.399142,38395.294142,34876.495858,0.0,0.0,0.0


In [14]:
from ta.momentum import RSIIndicator

In [15]:
rsi_14 = RSIIndicator(close = all_sig.close, window = 14)
all_sig["RSI"] = rsi_14.rsi()
all_sig["RSI_long"] = np.where(all_sig["RSI"] < 30, 1.0, 0.0)
all_sig["RSI_short"] = np.where(all_sig["RSI"] > 70, -1.0, 0.0)
all_sig["RSI_signal"] = all_sig["RSI_long"] + all_sig["RSI_short"]
all_sig.tail()

Unnamed: 0_level_0,close,vwap,volume,tradecount,returns,change_in_price,SMA50,SMA100,Signal,Entry/Exit,...,bollinger_std,bollinger_upper_band,bollinger_lower_band,bollinger_long,bollinger_short,bollinger_signal,RSI,RSI_long,RSI_short,RSI_signal
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-05,35534.6,36066.4,5532.75325986,39833,-0.035636,0,47714.53,51769.148,0.0,0.0,...,2412.901126,40285.076126,35459.273874,0.0,0.0,0.0,37.564257,0.0,0.0,0.0
2021-06-06,35789.0,35930.0,3187.54640121,23692,0.007159,1,47228.606,51663.893,0.0,0.0,...,2047.953661,39531.988661,35436.081339,0.0,0.0,0.0,38.258192,0.0,0.0,0.0
2021-06-07,33587.6,35319.4,6165.92045486,42615,-0.061511,0,46775.562,51538.067,0.0,0.0,...,1795.580802,38814.235802,35223.074198,1.0,0.0,1.0,34.667528,0.0,0.0,0.0
2021-06-08,33420.0,32712.0,10548.3269432,72896,-0.00499,0,46329.562,51419.813,0.0,0.0,...,1967.902434,38820.442434,34884.637566,1.0,0.0,1.0,34.402798,0.0,0.0,0.0
2021-06-09,36281.7,34685.0,8188.31487431,56248,0.085628,1,45925.196,51286.423,0.0,0.0,...,1759.399142,38395.294142,34876.495858,0.0,0.0,0.0,42.47957,0.0,0.0,0.0


In [16]:
# Generate the trading signal 0 or 1,
# where 0 is when the close is under the VWAP, and
# where 1 is when the close is higher (or crosses over) the VWAP
all_sig["VWAP_long"] = np.where(all_sig["close"] < all_sig["vwap"], 0)
all_sig["VWAP_short"] = np.where(all_sig["close"] > all_sig["vwap"], 1)
all_sig["VWAP_signal"] = all_sig["VWAP_short"] + all_sig["VWAP_long"]
all_sig.tail()


TypeError: '<' not supported between instances of 'float' and 'str'

# PCA Analysis 

In [None]:
#all_sig_pca = all_sig.dropna()

In [None]:
#sig_scaled = StandardScaler().fit_transform(all_sig_pca)
#all_sig_scaled = pd.DataFrame(data=sig_scaled)
#all_sig_scaled = all_sig

In [None]:
#pca = PCA(n_components=3)
#sig_pca = pca.fit_transform(sig_scaled)
#sig_df = pd.DataFrame(data=sig_pca, columns=["pc1", "pc2", "pc3"])
#sig_df

In [None]:
#pca.explained_variance_ratio_

# Random Forest PCA

In [None]:
#avg_chng = all_sig["returns"]

In [None]:
#X = sig_df.copy()
#X.head()

In [None]:
#y=all_sig['returns'].values.reshape(-1,1)
#y=y[99:]
#y[:5]

# Random Forest With Raw Variables

In [None]:
X = all_sig.copy()
X.dropna(inplace=True)
X.drop(columns=['close','returns','change_in_price'], inplace=True)
X.head()

In [None]:
y=all_sig['change_in_price'].values.reshape(-1,1)
y=y[99:]
y[:5]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
scaler = StandardScaler()

In [None]:
X_scaler = scaler.fit(X_train)

In [None]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=78)

In [None]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
predictions = rf_model.predict(X_test_scaled)

In [None]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

acc_score = accuracy_score(y_test, predictions)

In [None]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
importances = rf_model.feature_importances_

In [None]:
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

In [None]:
importances = pd.DataFrame(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
importances.set_index(importances[1], inplace=True)
importances.drop(columns=1, inplace=True)
importances.rename(columns={0: 'Feature Importances'}, inplace=True)
importances_sorted = importances.sort_values(by='Feature Importances')
importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False)

# Random Forest Training for Algo Trading

##### From day 3 Algo Trading Module

In [None]:
all_sig

In [None]:
x_var_list = ['Signal', 'Entry/Exit']
all_sig[x_var_list]

In [None]:
all_sig.dropna(subset=x_var_list, inplace=True)
all_sig.dropna(subset=['returns'],inplace=True)
all_sig = all_sig.replace([np.inf, -np.inf], np.nan)
all_sig.head()

In [None]:
all_sig['Positive Return'] = np.where(all_sig['returns'] > 0, 1.0, 0.0)
all_sig

In [None]:
training_start = all_sig.index.min().strftime(format='%Y-%m-%d')
training_end = '2019-12-31'
testing_start = '2019-06-21'
testing_end = all_sig.index.max().strftime(format='%Y-%m-%d')
print(f"Training Start: {training_start}")
print(f"Training End: {training_end}")
print(f"Testing Start: {testing_start}")
print(f"Testing End: {testing_end}")

In [None]:
# Construct the X_train and y_train datasets
X_train = all_sig[x_var_list][training_start:training_end]
y_train = all_sig['Positive Return'][training_start:training_end]

X_train.tail()

In [None]:
y_train.tail()

In [None]:
# Construct the X test and y test datasets
X_test = all_sig[x_var_list][testing_start:testing_end]
y_test = all_sig['Positive Return'][testing_start:testing_end]

X_test.tail()

In [None]:
y_test.tail()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
# Fit a SKLearn linear regression using just the training set (X_train, Y_train):
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0)
model.fit(X_train, y_train)

# Make a prediction of "y" values from the X_test dataset
predictions = model.predict(X_test)

# Assemble actual y data (Y_test) with predicted y data (from just above) into two columns in a dataframe:
Results = y_test.to_frame()
Results["Predicted Value"] = predictions
Results

In [None]:
# Save the pre-trained model
from joblib import dump, load
dump(model, 'random_forest_model.joblib')

### Plot Cumulative Returns:

In [None]:
initial_capital = 1000000

In [None]:
cumulative_return_capital = initial_capital * (1 + (result['Return'] * results['Predicted Value']


# Deep Learning Price Prediction: 

In [None]:
import numpy as np
import pandas as pd
import hvplot.pandas

from sklearn.datasets import make_blobs, make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Set the random seed for reproducibility
# Note: This is for the homework solution, but it is good practice to comment this out and run multiple experiments to evaluate your model
from numpy.random import seed
seed(1)
from tensorflow import random
random.set_seed(2)

In [17]:
all_sig = all_sig.drop(columns=["close", "returns"])
all_sig

Unnamed: 0_level_0,vwap,volume,tradecount,change_in_price,SMA50,SMA100,Signal,Entry/Exit,bollinger_mid_band,bollinger_std,bollinger_upper_band,bollinger_lower_band,bollinger_long,bollinger_short,bollinger_signal,RSI,RSI_long,RSI_short,RSI_signal
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2019-06-22,10665.9,12842.48586265,54100,1,,,0.0,,,,,,0.0,0.0,0.0,,0.0,0.0,0.0
2019-06-23,10837.5,5943.46891708,34849,1,,,0.0,0.0,,,,,0.0,0.0,0.0,,0.0,0.0,0.0
2019-06-24,10843.5,5457.41272587,26308,1,,,0.0,0.0,,,,,0.0,0.0,0.0,,0.0,0.0,0.0
2019-06-25,11340.2,11623.88144358,40223,1,,,0.0,0.0,,,,,0.0,0.0,0.0,,0.0,0.0,0.0
2019-06-26,12684.3,25637.82859518,100171,1,,,0.0,0.0,,,,,0.0,0.0,0.0,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-05,36066.4,5532.75325986,39833,0,47714.530,51769.148,0.0,0.0,37872.175,2412.901126,40285.076126,35459.273874,0.0,0.0,0.0,37.564257,0.0,0.0,0.0
2021-06-06,35930.0,3187.54640121,23692,1,47228.606,51663.893,0.0,0.0,37484.035,2047.953661,39531.988661,35436.081339,0.0,0.0,0.0,38.258192,0.0,0.0,0.0
2021-06-07,35319.4,6165.92045486,42615,0,46775.562,51538.067,0.0,0.0,37018.655,1795.580802,38814.235802,35223.074198,1.0,0.0,1.0,34.667528,0.0,0.0,0.0
2021-06-08,32712.0,10548.32694320,72896,0,46329.562,51419.813,0.0,0.0,36852.540,1967.902434,38820.442434,34884.637566,1.0,0.0,1.0,34.402798,0.0,0.0,0.0


In [18]:
def window_data(all_sig, window, feature_col_number, target_col_number):
    X = []
    y = []
    for i in range(len(all_sig) - window - 1):
        features = all_sig.iloc[i:(i + window), feature_col_number]
        target = all_sig.iloc[(i + window), target_col_number]
        X.append(features)
        y.append(target)
    return np.array(X), np.array(y).reshape(-1, 1)

In [19]:
# Predict Closing Prices using a 10 day window of previous closing prices
# Then, experiment with window sizes anywhere from 1 to 10 and see how the model performance changes
window_size = 10

# Column index 0 is the 'fng_value' column
# Column index 1 is the `Close` column
feature_column = 0
target_column = 1
X, y = window_data(all_sig, window_size, feature_column, target_column)

In [20]:
# Use 70% of the data for training and the remaineder for testing
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]

In [21]:
from sklearn.preprocessing import MinMaxScaler
# Use the MinMaxScaler to scale data between 0 and 1.
x_train_scaler = MinMaxScaler()
x_test_scaler = MinMaxScaler()
y_train_scaler = MinMaxScaler()
y_test_scaler = MinMaxScaler()

x_train_scaler.fit(X_train)
y_train_scaler.fit(y_train)

X_train = x_train_scaler.transform(X_train)
y_train = y_train_scaler.transform(y_train)

x_test_scaler.fit(X_test)
y_test_scaler.fit(y_test)

X_test = x_test_scaler.transform(X_test)
y_test = y_test_scaler.transform(y_test)

In [22]:
# Reshape the features for the model
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [24]:
# Build the LSTM model. 
# The return sequences need to be set to True if you are adding additional LSTM layers, but 
# You don't have to do this for the final layer. 
# Note: The dropouts help prevent overfitting
# Note: The input shape is the number of time steps and the number of indicators
# Note: Batching inputs has a different input shape of Samples/TimeSteps/Features

model = Sequential()

number_units = 30
dropout_fraction = 0.2

model.add(LSTM(
    units=number_units,
    return_sequences=True,
    input_shape=(X_train.shape[1], 1))
    )
model.add(Dropout(dropout_fraction))

model.add(LSTM(units=number_units, return_sequences=True))
model.add(Dropout(dropout_fraction))

model.add(LSTM(units=number_units))
model.add(Dropout(dropout_fraction))

model.add(Dense(1))

In [25]:
# Compile the model
model.compile(optimizer="adam", loss="mean_squared_error")

In [26]:
# Summarize the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 10, 30)            3840      
_________________________________________________________________
dropout (Dropout)            (None, 10, 30)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 30)            7320      
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 30)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 30)                7320      
_________________________________________________________________
dropout_2 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 3

In [27]:
# Train the model
# Use at least 10 epochs
# Do not shuffle the data
# Experiement with the batch size, but a smaller batch size is recommended
model.fit(X_train, y_train, epochs=10, shuffle=False, batch_size=1, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1aba7882a08>

In [28]:
# Evaluate the model
model.evaluate(X_test, y_test)



0.024514904245734215

In [29]:
# Make some predictions
predicted = model.predict(X_test)

In [30]:
# Recover the original prices instead of the scaled version
predicted_prices = y_test_scaler.inverse_transform(predicted)
real_prices = y_test_scaler.inverse_transform(y_test.reshape(-1, 1))

In [31]:
# Create a DataFrame of Real and Predicted values
stocks = pd.DataFrame({
    "Real": real_prices.ravel(),
    "Predicted": predicted_prices.ravel()
}, index = df.index[-len(real_prices): ]) 
stocks.head()

NameError: name 'df' is not defined

In [None]:
# Plot the real vs predicted values as a line chart
stocks.plot()

# Attempt Sentiment Analysis Using Twitter: 

In [None]:
import tweepy 
from textblob import TextBlob
from wordcloud import WordCloud
import pandas as pd
import numpy as np
import re 
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

In [None]:
API_KEY="ixfnArRC0LrqGw9oMsKWaeeHH"
API_SECRET_KEY="bavWCrmk1pGMFTyUkScWgLwtbGt7TYbTqzIFBGUc3H4xAgIn33"
BEARER_TOKEN="1049318615261097984-2TXX3rME99h28i7ZuE6LmirwUtW0KW"
BEARER_TOKEN_SECRET="EznHh57YvAkj299l4GdqtNsnfQ7Ge9UZYPMPROkUA2ukC"
AZURE_KEY1="934412a2f06d4a37bf1fc3cbb513a944"
AZURE_KEY2="52cf1c0ec04d4fb085c583ea5680d646"

In [None]:
consumerKey = API_KEY
consumerSecret = API_SECRET_KEY
accessToken = BEARER_TOKEN
accessTokenSecret = BEARER_TOKEN_SECRET

In [None]:
authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)

In [None]:
authenticate.set_access_token(accessToken, accessTokenSecret)

In [None]:
api = tweepy.API(authenticate, wait_on_rate_limit=True)

In [None]:
#save the id of the oldest tweet less one
username = "elonmusk"

alltweets = []	
new_tweets = api.user_timeline(screen_name = username,count=200)

#save most recent tweets
alltweets.extend(new_tweets)

#save the id of the oldest tweet less one

oldest = alltweets[-1].id - 1

posts = api.user_timeline(screen_name = username, count = 200, max_id=oldest, tweet_mode="extended")

In [None]:
type(posts[0])

In [None]:
print("Show the 5 recent tweets: \n ")
i = 1
for tweet in posts[0:5]:
  print(str(i) + ')' +  tweet.full_text + "\n")
  i = i+1

In [None]:
for status in api.user_timeline():
    print(status.id)

In [None]:
df = pd.DataFrame([tweet.full_text for tweet in posts], columns = ['Tweets'])

In [None]:
df.head()

In [None]:
# Create a function to clean the tweets
def cleanTxt(text):
 text = re.sub('@[A-Za-z0–9]+', '', text) #Removing @mentions
 text = re.sub('#', '', text) # Removing '#' hash tag
 text = re.sub('RT[\s]+', '', text) # Removing RT
 text = re.sub('https?:\/\/\S+', '', text) # Removing hyperlink
 
 return text


# Clean the tweets
df['Tweets'] = df['Tweets'].apply(cleanTxt)

In [None]:
for status in 

In [None]:
# Create a function to get the subjectivity
def getSubjectivity(text):
   return TextBlob(text).sentiment.subjectivity

# Create a function to get the polarity
def getPolarity(text):
   return  TextBlob(text).sentiment.polarity


# Create two new columns 'Subjectivity' & 'Polarity'
df['Subjectivity'] = df['Tweets'].apply(getSubjectivity)
df['Polarity'] = df['Tweets'].apply(getPolarity)

In [None]:
df

In [None]:
# Show the new dataframe with columns 'Subjectivity' & 'Polarity'
df

'''
Let's see how well the sentiments are distributed. 
A good way to accomplish this task is by understanding the common words by plotting word clouds.
A word cloud (also known as text clouds or tag clouds) is a visualization, the more a specific word 
appears in text, the bigger and bolder it appears in the word cloud.
Let’s visualize all the words in the data using the word cloud plot.
'''

allWords = ' '.join([twts for twts in df['Tweets']])
wordCloud = WordCloud(width=500, height=300, random_state=21, max_font_size=110).generate(allWords)


plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Create a function to compute negative (-1), neutral (0) and positive (+1) analysis
def getAnalysis(score):
 if score < 0:
  return 'Negative'
 elif score == 0:
  return 'Neutral'
 else:
  return 'Positive'


df['Analysis'] = df['Polarity'].apply(getAnalysis)

# Show the dataframe
df

In [None]:
# Printing positive tweets 
print('Printing positive tweets:\n')
j=1
sortedDF = df.sort_values(by=['Polarity']) #Sort the tweets
for i in range(0, sortedDF.shape[0] ):
  if( sortedDF['Analysis'][i] == 'Positive'):
    print(str(j) + ') '+ sortedDF['Tweets'][i])
    print()
    j= j+1

In [None]:

# Printing negative tweets  
print('Printing negative tweets:\n')
j=1
sortedDF = df.sort_values(by=['Polarity'],ascending=False) #Sort the tweets
for i in range(0, sortedDF.shape[0] ):
  if( sortedDF['Analysis'][i] == 'Negative'):
    print(str(j) + ') '+sortedDF['Tweets'][i])
    print()
    j=j+1

In [None]:
# Plotting
plt.figure(figsize=(8,6))
for i in range(0, df.shape[0]):
 plt.scatter(df["Polarity"][i], df["Subjectivity"][i], color='Blue') # plt.scatter(x,y,color)
 
plt.title('Sentiment Analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()

In [None]:
# Print the percentage of positive tweets
ptweets = df[df.Analysis == 'Positive']
ptweets = ptweets['Tweets']
ptweets

round( (ptweets.shape[0] / df.shape[0]) * 100 , 1)

In [None]:
# Print the percentage of negative tweets
ntweets = df[df.Analysis == 'Negative']
ntweets = ntweets['Tweets']
ntweets

round( (ntweets.shape[0] / df.shape[0]) * 100, 1)

In [None]:
# Show the value counts
df['Analysis'].value_counts()

# Plotting and visualizing the counts
plt.title('Sentiment Analysis')
plt.xlabel('Sentiment')
plt.ylabel('Counts')
df['Analysis'].value_counts().plot(kind = 'bar')
plt.show()

# Perform Random Forest Analysis for Classification:

In [None]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [None]:
# Define features set
X = all_sig.copy()
X.drop("close", axis=1, inplace=True)
X.head()

In [None]:
# Define target vector
y = all_sig["close"].ravel()
y[:5]

In [None]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [None]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [None]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=1000, random_state=78)

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [None]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)