In [0]:
# ! pip install kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d dgawlik/nyse
! unzip nyse.zip -d data_nyse
! rm nyse.zip
! ls
! pip install  matplotlib
! pip install  seaborn
##https://www.kaggle.com/dgawlik/nyse##

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading nyse.zip to /content
 82% 25.0M/30.7M [00:01<00:00, 14.2MB/s]
100% 30.7M/30.7M [00:01<00:00, 21.8MB/s]
Archive:  nyse.zip
replace data_nyse/fundamentals.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [0]:
# mpld3 is imported by the next cell ("zoomable graphs"), so it has to be installed first.
! pip install mpld3
# ! ls

In [0]:
import pandas as pd
import numpy as np

import mpld3#zoomable graphs
mpld3.enable_notebook()

# Suppress every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
plt.style.use("fivethirtyeight")
%matplotlib inline
pd.set_option('display.max_columns', None)#Display all columns
pd.set_option('display.max_rows', None)#Display all rows (no truncation: avoid displaying whole frames)

Context
This dataset is a playground for fundamental and technical analysis. It is said that 30% of traffic on stocks is already generated by machines. Can trading be fully automated? If not, there is still a lot to learn from historical data.

Content
Dataset consists of following files:

*   **prices.csv**: raw, as-is daily prices. Most of the data spans from 2010 to the end of 2016; for companies new to the stock market the date range is shorter. There were approx. 140 stock splits in that period, and this file does not account for them.
*    **prices-split-adjusted.csv**: same as prices, but with adjustments for stock splits added.
*    **securities.csv**: general description of each company, with its division into sectors.
*    **fundamentals.csv**: metrics extracted from annual SEC 10-K filings (2012-2016); these should be enough to derive most of the popular fundamental indicators.
Acknowledgements
Prices were fetched from Yahoo Finance, fundamentals are from Nasdaq Financials, extended by some fields from EDGAR SEC databases.

Inspiration
Here are a couple of things one could try out with this data:

One day ahead prediction: Rolling Linear Regression, ARIMA, Neural Networks, LSTM
Momentum/Mean-Reversion Strategies
Security clustering, portfolio construction/hedging
Which company has the biggest chance of going bankrupt? Which one is undervalued (and how did prices behave afterwards)? What is the return on investment?

In [0]:
# Ticker symbol of the company analysed by the single-stock cells of this notebook.
company_name = 'EA'

# Import Data

In [0]:
# Locations of the four CSV files extracted into ./data_nyse by the setup cell.
prices_data_path = './data_nyse/prices.csv'
securities_data_path = './data_nyse/securities.csv'
prices_split_data_path = './data_nyse/prices-split-adjusted.csv'
fundamentals_data_path = './data_nyse/fundamentals.csv'

# Read each file into its own DataFrame (same order as the paths above).
prices_data, securities_data, prices_split_data, fundamentals_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        prices_data_path,
        securities_data_path,
        prices_split_data_path,
        fundamentals_data_path,
    )
)

# Inspect Data

In [0]:
# First five rows of the raw daily prices.
prices_data.head(n=5)

In [0]:
# First five rows of the company descriptions.
securities_data.head(n=5)

In [0]:
# First five rows of the split-adjusted daily prices.
prices_split_data.head(n=5)

In [0]:
# First five rows of the fundamentals table.
fundamentals_data.head(n=5)

Function to give us an idea of our Dataframes

In [0]:
def observe_data(dataframeName):
    """Print a structural and statistical overview of a DataFrame.

    Emits, in order: the ``info()`` summary, the shape, the total number of
    cells, the number of rows, the number of columns and the ``describe()``
    table.

    Parameters
    ----------
    dataframeName : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    dataframeName.info()
    row_count = len(dataframeName)
    column_count = len(dataframeName.columns)
    print(f"Dataframe Shape: {dataframeName.shape}")
    print(f"Dataframe Size: {dataframeName.size}")
    print(f"Number Of Rows: {row_count}")
    print(f"Number Of Columns: {column_count}")
    # describe() reports count, mean, std, min, quartiles and max of the numeric columns;
    # a single column can be inspected with dataframe.columnName.describe()
    print(dataframeName.describe())

In [0]:
# Overview of the split-adjusted prices used by most of the analysis below.
observe_data(dataframeName=prices_split_data)

Number of Companies Per Sector/Industry

In [0]:
# Horizontal bars: one per GICS sector, length = number of companies in that sector.
plt.figure(figsize=(15, 6))
ax = sns.countplot(data=securities_data, y='GICS Sector')
plt.xticks(rotation=45)

Correlation among sectors - The higher the correlation, the more likely that the stocks are moving in the same direction

In [0]:
# Short column names: 'symbol' is the key used to join with the price tables.
column_aliases = {'Ticker symbol': 'symbol', 'GICS Sector': 'sector'}
securities_data = securities_data.rename(columns=column_aliases)

In [0]:
# Attach each ticker's sector to its daily prices.
# A pre-existing 'sector' column is dropped first so the cell is idempotent: re-running
# it would otherwise yield 'sector_x' / 'sector_y' and break every later cell that
# reads prices_data['sector'].
prices_data = (
    prices_data
    .drop(columns='sector', errors='ignore')
    .merge(securities_data[['symbol', 'sector']], on='symbol')
)
prices_data['date'] = pd.to_datetime(prices_data['date'])
prices_data.head()

In [0]:
# Keep only quotes dated 2016-01-01 or later (the cut-off can be changed later).
from_2016 = prices_data['date'] >= '2016-01-01'
prices_data = prices_data[from_2016]

In [0]:
# Mean closing price per date and sector; 'date' is turned back into a regular column.
sector_pivot = (
    prices_data
    .pivot_table(index=['date'], columns=['sector'], values='close')
    .reset_index()
)

In [0]:
# Pairwise correlation between the sector price series.
# The 'date' column is removed before corr(): it is not a sector, and recent pandas
# versions no longer skip non-numeric columns silently.
plt.figure(figsize = (10,10))
sns.heatmap(sector_pivot.drop(columns='date').corr(), annot=True, cmap="coolwarm")

When building a diversified portfolio, investors seek negatively correlated stocks. Doing so reduces the risk of catastrophic losses in the portfolio and helps the investor sleep better at night. Assume the portfolio consists of two stocks that are negatively correlated. This implies that when one performs worse than usual, the other will likely do better than usual. Risk takers, however, may prefer positively correlated stocks for a higher expected return and, of course, higher risk.

In [0]:
# Daily log return per ticker, stored as 1 + log(close_t / close_{t-1}).
# Rows are ordered by (symbol, date) and the previous close is taken within each
# symbol, so a return never mixes two companies and does not depend on the row order
# left behind by the merge.
prices_data = prices_data.sort_values(['symbol', 'date'])
previous_close = prices_data.groupby('symbol')['close'].shift(1)
prices_data['return'] = np.log(prices_data['close'] / previous_close) + 1
# 'good' marks rows that have an earlier quote of the same symbol; the first row of
# every symbol has no return and is removed.
prices_data['good'] = previous_close.notna()
prices_data = prices_data[prices_data['good']].dropna()

In [0]:
# Rate subtracted from the annualised return in every Sharpe ratio of this notebook.
risk_free = 0.032
returns_by_sector = prices_data.groupby('sector')['return']
# 'return' holds 1 + daily log return, hence the "- 1" before scaling by 252 days.
# NOTE(review): 'stdev' is the daily standard deviation while 'return' is annualised
# (the portfolio simulation further down scales its stdev by sqrt(252)). Confirm
# whether this is intended before changing it: the ">= 1" sector filter depends on
# this scale.
sector_df = pd.DataFrame({
    'return': (returns_by_sector.mean() - 1) * 252,
    'stdev': returns_by_sector.std(),
})
sector_df['sharpe'] = (sector_df['return'] - risk_free) / sector_df['stdev']
plt.figure(figsize = (12,8))
ax = sns.barplot(x= sector_df['sharpe'], y = sector_df.index)

The Sharpe ratio is often used to describe how good a portfolio is: the higher the Sharpe ratio, the better the risk-adjusted return. A Sharpe ratio above 1 is generally considered acceptable by investors, so we keep only the sectors whose Sharpe ratio is at least 1.

In [0]:
# Names of the sectors whose Sharpe ratio is at least 1.
meets_threshold = sector_df['sharpe'] >= 1
port_list = sector_df.loc[meets_threshold].index
port_list

Function plotting historical closing values of a specific company from *prices_split_data* dataframe

In [0]:
# List every ticker found in the split-adjusted prices. The count in the message is
# computed from the data rather than hard-coded as 500.
symbols = prices_split_data['symbol'].unique()
string = ', '.join(symbols)
print("The {} companies are: {}".format(len(symbols), string))

In [0]:
def plot_closing_values(company_name):
    """Plot the split-adjusted closing prices of one company.

    Rows of ``prices_split_data`` whose ``symbol`` equals ``company_name`` are
    selected and their ``close`` values are drawn against the frame's index.
    The figure is titled with ``company_name``.
    """
    is_company = prices_split_data["symbol"] == company_name
    closing_prices = prices_split_data.loc[is_company, "close"]
    closing_prices.plot()
    plt.ylabel('Close')
    plt.title(company_name)

In [0]:
# ! pip install plotly

In [0]:
import plotly.graph_objects as go
def plot_candlestick_chart(company_name):
    """Show a plotly candlestick chart for one company.

    The open/high/low/close columns of the rows of ``prices_split_data`` whose
    ``symbol`` equals ``company_name`` are plotted against their ``date``.
    """
    quotes = prices_split_data.loc[prices_split_data["symbol"] == company_name]
    candles = go.Candlestick(
        x=quotes.date,
        open=quotes.open,
        high=quotes.high,
        low=quotes.low,
        close=quotes.close,
    )
    go.Figure(data=[candles]).show()

In [0]:
# Closing-price line chart for the selected company.
plot_closing_values(company_name=company_name)

In [0]:
# Candlestick chart for the selected company.
plot_candlestick_chart(company_name=company_name)

A moving average (MA) is a widely used indicator in technical analysis that helps smooth out price action by filtering out the “noise” from random short-term price fluctuations.

In [0]:
def _plot_with_averages(dates, close, overlays, company_name):
    """Draw the closing price plus labelled moving-average overlays, then show the figure.

    ``overlays`` is an iterable of ``(label, series, color)`` tuples.
    """
    plt.plot(dates, close, label=str(company_name))
    for label, series, color in overlays:
        plt.plot(dates, series, label=label, color=color)
    plt.legend(loc='upper left')
    plt.show()


def moving_average(company_name):
    """Plot a company's closing price with simple and exponential moving averages.

    Three figures are shown in sequence: the closing price alone, the price with
    its 20/50/100-day simple moving averages (SMA), and the price with its
    20/50/100-day exponential moving averages (EMA, ``adjust=False``).

    Parameters
    ----------
    company_name : str
        Ticker looked up in the ``symbol`` column of ``prices_split_data``.
    """
    periods = ((20, 'orange'), (50, 'magenta'), (100, 'red'))

    selected = prices_split_data.loc[prices_split_data["symbol"] == company_name, ['close', 'date']]
    # Independent, chronologically ordered copy with real datetimes: the moving
    # averages need time order, and datetimes give a proper time axis instead of one
    # categorical tick per date string.
    df = (selected
          .assign(date=pd.to_datetime(selected['date']))
          .sort_values('date')
          .reset_index(drop=True))

    plt.plot(df.date, df.close)
    plt.show()

    # SMA - Simple Moving Average
    sma = [('{} {} Day SMA'.format(company_name, days), df.close.rolling(days).mean(), color)
           for days, color in periods]
    _plot_with_averages(df.date, df.close, sma, company_name)

    # EMA - Exponential Moving Average
    ema = [('{} {} Day EMA'.format(company_name, days), df.close.ewm(span=days, adjust=False).mean(), color)
           for days, color in periods]
    _plot_with_averages(df.date, df.close, ema, company_name)



In [0]:
# Price with SMA and EMA overlays for the selected company.
moving_average(company_name=company_name)

After obtaining the list of sectors, we choose from each sector the stock with the highest return. In real life you should choose several. In this example, however, I pick only one per sector to keep the illustration simple.

In [0]:
from scipy.stats.mstats import gmean

# Filled by get_stock(): one ticker per selected sector and, at the same position,
# that ticker's annualised performance.
port_stock = []
return_stock = []
def get_stock(sector):
    """Append the best-performing stock of ``sector`` to the portfolio lists.

    Performance is ``(gmean(return) - 1) * 252`` computed per symbol over the
    rows of ``prices_data`` belonging to ``sector``. The symbol with the highest
    value is appended to ``port_stock`` and its performance to ``return_stock``.
    Nothing is appended when the sector has no rows.

    Parameters
    ----------
    sector : str
        Value matched against the ``sector`` column of ``prices_data``.
    """
    sector_prices = prices_data[prices_data['sector'] == sector]
    if sector_prices.empty:
        return
    # Only the sector's own symbols are ranked, so nothing is recomputed for the
    # whole market on every call.
    performance = (
        sector_prices.groupby('symbol')['return']
        .apply(lambda x : (gmean(x) - 1) * 252)
        .sort_values(ascending = False)
    )
    port_stock.append(performance.index[0])
    # .iloc: positional access; plain [] on a label-indexed Series is label-based.
    return_stock.append(performance.iloc[0])

for sector in port_list:
    get_stock(sector)

return_stock

In [0]:
# One column of returns per selected stock, indexed by date.
# pivot() is called with keyword arguments (positional ones are rejected by
# pandas >= 2.0). pivot sorts the columns alphabetically, so they are put back in
# port_stock order: the simulated weight vectors and return_stock follow that order,
# and the covariance matrix built from port_df must line up with them.
port_df = (
    prices_data[prices_data['symbol'].isin(port_stock)]
    .pivot(index='date', columns='symbol', values='return')
    .loc[:, port_stock]
)

Each portfolio will have different stock weights, or the allocation of your investment into each stock.

In [0]:
# Monte Carlo over random portfolio weights.
N_SIMULATIONS = 1000
# Local, seeded generator: the simulation is reproducible across runs without touching
# numpy's global random state.
rng = np.random.RandomState(42)
# The covariance of the daily returns does not depend on the weights: compute it once.
daily_cov = port_df.cov().values

return_pred = []
weight_pred = []
std_pred = []
for i in range(N_SIMULATIONS):
    # Dirichlet(1, ..., 1): non-negative weights that sum to 1.
    random_matrix = rng.dirichlet(np.ones(len(port_stock)), size=1)[0]
    # Daily portfolio stdev sqrt(w' C w), annualised with sqrt(252).
    port_std = np.sqrt(np.dot(random_matrix.T, np.dot(daily_cov, random_matrix))) * np.sqrt(252)
    port_return = np.dot(return_stock, random_matrix)
    return_pred.append(port_return)
    std_pred.append(port_std)
    weight_pred.append(random_matrix)

In [0]:
# One row per simulated portfolio: weights, annualised return, volatility and Sharpe ratio.
pred_output = (
    pd.DataFrame({'weight': weight_pred, 'return': return_pred, 'stdev': std_pred})
    .assign(sharpe=lambda sims: (sims['return'] - risk_free) / sims['stdev'])
)
pred_output.head()

In [0]:
# idxmax()/idxmin() return index labels, so the rows are fetched with the label-based
# .loc rather than the positional .iloc.
max_pos = pred_output.loc[pred_output['sharpe'].idxmax()]   # highest Sharpe ratio
safe_pos = pred_output.loc[pred_output['stdev'].idxmin()]   # lowest volatility

After running 1000 simulations, we finally plot the results and highlight two options for the portfolio: the best-performing one (highest Sharpe ratio) and the safest one (lowest volatility) for the risk-averse.

In [0]:
# Risk/return cloud of the simulated portfolios, coloured by Sharpe ratio.
plt.subplots(figsize=(15,10))
volatility = pred_output['stdev']
simulated_return = pred_output['return']
plt.scatter(volatility, simulated_return, c=pred_output['sharpe'], cmap='OrRd')
plt.colorbar()
plt.xlabel('Volatility')
plt.ylabel('Return')

# Red triangle: highest-Sharpe portfolio; green triangle: lowest-volatility portfolio.
plt.scatter(max_pos['stdev'], max_pos['return'], marker='^', color='r', s=500)
plt.scatter(safe_pos['stdev'], safe_pos['return'], marker='<', color='g', s=500)

In [0]:
# Summary and per-stock allocation (in percent) of the highest-Sharpe portfolio.
best_sharpe = max_pos.sharpe.round(3)
best_volatility = max_pos.stdev.round(3)
print("The highest sharpe porfolio is {} sharpe, at {} volitality".format(best_sharpe, best_volatility))

for ticker, weight in zip(port_stock, max_pos.weight):
    print("{} : {}%".format(ticker, (weight * 100).round(3)))

In [0]:
# Summary and per-stock allocation (in percent) of the lowest-volatility portfolio.
safest_risk = safe_pos.stdev.round(3)
safest_sharpe = safe_pos.sharpe.round(3)
print("The safest porfolio is {} risk, {} sharpe".format(safest_risk, safest_sharpe))
for ticker, weight in zip(port_stock, safe_pos.weight):
    print("{} : {}%".format(ticker, (weight * 100).round(3)))

# Feature Engineering

In [0]:
# Per-day features for the selected company, in chronological order.
company_data = (
    prices_split_data[prices_split_data['symbol'] == company_name]
    .drop(columns='symbol')
    .sort_values(by='date', ascending=True)
    .reset_index(drop=True)
)
# 'average': mean of the day's open, high, low and close.
company_data['average'] = company_data[['open', 'high', 'low', 'close']].mean(axis=1)
# 'EMA20' / 'EMA5': exponential moving averages of 'average'.
for span in (20, 5):
    company_data[f'EMA{span}'] = company_data['average'].ewm(span=span, adjust=False).mean()
# 'dist_EMA20' / 'dist_EMA5': distance of 'average' from each EMA, in percent of the EMA.
for span in (20, 5):
    ema = company_data[f'EMA{span}']
    company_data[f'dist_EMA{span}'] = (company_data['average'] - ema) / ema * 100
company_data.head()

In [0]:
# use volume weighted averaged OHLC.mean to represent market average
# comparison with SPY500 shows that market_average calculated this way is representative of overall market

# Only the tickers with the maximum number of rows take part (presumably those quoted
# on every trading day), so every date aggregates the same set of companies.
rows_per_symbol = prices_split_data.groupby('symbol').size()
tickers_with_all_dates = rows_per_symbol[rows_per_symbol == rows_per_symbol.max()].index.to_list()
market = (prices_split_data.loc[prices_split_data['symbol'].isin(tickers_with_all_dates)]
          .assign(**{'average': lambda df: df.loc[:,['open','high','low','close']].mean(axis=1), 
                     'price x volume': lambda df: df['average']*df['volume']})
          .groupby('date')
          # the string 'sum' selects pandas' own aggregation (np.sum is deprecated here)
          .agg(**{'price x volume sum': pd.NamedAgg(column='price x volume', aggfunc='sum'), 
                  'volume sum': pd.NamedAgg(column='volume', aggfunc='sum')})
          .assign(**{'market_average': lambda df: df['price x volume sum']/df['volume sum']})
          .sort_index(ascending=True))

# Align on the date instead of copying values positionally: a positional copy raises
# (length mismatch) as soon as the company does not have a quote on every market date.
company_data['market'] = company_data['date'].map(market['market_average'])
company_data.head()

In [0]:
# Both figures cover the first 120 trading days of the selected company.
first_days = company_data.iloc[:120]

# Distance of the daily average from its 20- and 5-day EMAs.
fig, ax = plt.subplots(figsize=(10,5))
for column in ('dist_EMA20', 'dist_EMA5'):
    ax.plot(first_days.index, first_days[column], label=column)
ax.set_xlabel('day')
ax.set_ylabel('distance from moving mean')
ax.legend(loc='best')

# Opening and closing prices.
fig, ax = plt.subplots(figsize=(10,5))
for column in ('open', 'close'):
    ax.plot(first_days.index, first_days[column], label=column)
ax.set_xlabel('day')
ax.set_ylabel(company_name)
ax.legend(loc='best')

In [0]:
from sklearn import preprocessing

def normalize_data(df):
    """Return a copy of ``df`` with the price and volume columns min-max scaled.

    The columns ``open``, ``high``, ``low``, ``volume`` and ``close`` are each
    scaled independently with ``MinMaxScaler`` over all rows of the frame. The
    input frame is left untouched: scaling it in place would silently change
    every later cell that reads the same DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the five columns replaced by their scaled values.
    """
    normalized = df.copy()
    for column in ('open', 'high', 'low', 'volume', 'close'):
        min_max_scaler = preprocessing.MinMaxScaler()
        # MinMaxScaler works on 2-D input; ravel() turns the (n, 1) result back into a column.
        scaled = min_max_scaler.fit_transform(normalized[column].values.reshape(-1, 1))
        normalized[column] = scaled.ravel()
    return normalized
# Min-max scaled version of the split-adjusted prices, plotted column by column.
df = normalize_data(df=prices_split_data)
df.plot(figsize=(23, 10))


# Risk Management

We've done some baseline analysis, let's go ahead and dive a little deeper. We're now going to analyze the risk of the stock. In order to do so we'll need to take a closer look at the daily changes of the stock, and not just its absolute value, using the *pct_change* function, which will return the percentage change between the current and a prior element.

In [0]:
def risk_computation(company_name):
    """Plot the daily returns of a company and their distribution.

    The daily return is the percentage change of the ``close`` column between
    consecutive rows of ``prices_split_data`` for ``company_name``. The first
    values are printed, the series is drawn as a line chart, and a histogram
    with a KDE curve is drawn on a second figure.

    Parameters
    ----------
    company_name : str
        Ticker looked up in the ``symbol`` column of ``prices_split_data``.
    """
    closes = prices_split_data.loc[prices_split_data["symbol"] == company_name, "close"]
    # Named 'Daily Return' so the legend and printout describe what is shown instead
    # of the source column name.
    risks = closes.pct_change().rename('Daily Return')
    print(risks.head())
    risks.plot(legend=True, linestyle='--', marker='o')
    # Histogram & KDE plot on its own figure.
    # dropna() is required: the first pct_change value is NaN and seaborn cannot read it.
    plt.figure(figsize=(12, 12))

    sns.distplot(risks.dropna(), bins=100, color='purple')
    # The x axis carries the returns; the y axis of a histogram/KDE is a density.
    plt.xlabel('Daily Return')
    plt.ylabel('Density')
    plt.title(company_name)

In [0]:
# Daily-return analysis for the selected company.
risk_computation(company_name=company_name)

# Train our Model & Display our predictions

In [0]:
# %tensorflow_version 2.x
# import tensorflow as tf
# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0':
#   raise SystemError('GPU device not found')
# print('Found GPU at: {}'.format(device_name))

In [0]:
# %tensorflow_version 2.x
# import tensorflow as tf
# import timeit

# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0':
#   print(
#       '\n\nThis error most likely means that this notebook is not '
#       'configured to use a GPU.  Change this in Notebook Settings via the '
#       'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
#   raise SystemError('GPU device not found')

# def cpu():
#   with tf.device('/cpu:0'):
#     random_image_cpu = tf.random.normal((100, 100, 100, 3))
#     net_cpu = tf.keras.layers.Conv2D(32, 7)(random_image_cpu)
#     return tf.math.reduce_sum(net_cpu)

# def gpu():
#   with tf.device('/device:GPU:0'):
#     random_image_gpu = tf.random.normal((100, 100, 100, 3))
#     net_gpu = tf.keras.layers.Conv2D(32, 7)(random_image_gpu)
#     return tf.math.reduce_sum(net_gpu)
  
# # We run each op once to warm up; see: https://stackoverflow.com/a/45067900
# cpu()
# gpu()

# # Run the op several times.
# print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
#       '(batch x height x width x channel). Sum of ten runs.')
# print('CPU (s):')
# cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
# print(cpu_time)
# print('GPU (s):')
# gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
# print(gpu_time)
# print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

In [0]:
# Check the frame the model is trained on.
prices_split_data.head(n=5)

In [0]:
# ! pip install sklearn

In [0]:
oof = prices_split_data

# Quotes of the selected company in chronological order (the windows below assume it).
df = oof.loc[prices_split_data["symbol"] == company_name].sort_values('date')

plt.figure(figsize=(16,8))
plt.title('Close Price History')
plt.plot(df['date'],df['close'])
plt.xlabel('date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.show()

# Number of previous closes fed to the model for each prediction.
WINDOW = 60


def _make_windows(series, window):
    """Return the sliding windows of ``series`` as an array of shape (n, window, 1).

    Row ``k`` holds ``series[k:k + window, 0]``, i.e. the ``window`` values that
    precede target index ``k + window``.
    """
    windows = [series[i - window:i, 0] for i in range(window, len(series))]
    return np.array(windows).reshape(-1, window, 1)


#Create a new dataframe with only the 'Close column
data = df.filter(['close'])
#Convert the dataframe to a numpy array
dataset = data.values
#Get the number of rows to train the model on (first 80% of the series)
training_data_len = int(np.ceil( len(dataset) * .8 ))

#Scale the data
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training rows only; fitting it on the whole series would
# leak the min/max of the test period into training. Test values can therefore fall
# outside [0, 1].
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(dataset[:training_data_len])
scaled_data = scaler.transform(dataset)

#Create the scaled training data set
train_data = scaled_data[0:int(training_data_len), :]
#Split the data into x_train (windows of WINDOW closes) and y_train (the close that follows)
x_train = _make_windows(train_data, WINDOW)
y_train = train_data[WINDOW:, 0]

from keras.models import Sequential
from keras.layers import Dense, LSTM

#Build the LSTM model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape= (x_train.shape[1], 1)))
model.add(LSTM(50, return_sequences= False))
model.add(Dense(25))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

#Train the model
model.fit(x_train, y_train, batch_size=1, epochs=1)

#Create the testing data set
#It starts WINDOW rows before the split so the first test target has a full window
test_data = scaled_data[training_data_len - WINDOW: , :]
#Create the data sets x_test and y_test (y_test holds the unscaled closes)
x_test = _make_windows(test_data, WINDOW)
y_test = dataset[training_data_len:, :]

# Get the models predicted price values, converted back to the original price scale
predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)

In [0]:
# Root mean squared error (RMSE) between the predicted and the actual closing prices
squared_errors = np.square(predictions - y_test)
rmse = np.sqrt(squared_errors.mean())
rmse

In [0]:
# Split the close prices at the training boundary and attach the predictions
train = data[:training_data_len]
# .copy(): 'valid' becomes an independent frame, so adding a column neither touches
# 'data' nor triggers pandas' chained-assignment warning.
valid = data[training_data_len:].copy()
# predictions has shape (n, 1); ravel() gives the 1-D values expected for one column.
valid['Predictions'] = np.ravel(predictions)
valid.head()

In [0]:
# Training closes, validation closes and model predictions on one chart
plt.figure(figsize=(16,8))
plt.plot(train['close'])
plt.plot(valid[['close', 'Predictions']])
plt.title(company_name)
plt.xlabel('Date', fontsize=18)
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
plt.show()