In [1]:
import numpy as np
import pandas as pd 

from pandas_datareader.data import DataReader


from datetime import datetime

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 10)

In [3]:
# Load the MSFT price CSV from the S&P 500 `individual_stocks_5yr` dataset.
msft_csv_path = '../input/sandp500/individual_stocks_5yr/individual_stocks_5yr/MSFT_data.csv'
df = pd.read_csv(msft_csv_path)
print(df.shape)
df.head()

**Adding a new Year column**

In [4]:
# Calendar year of each row, parsed from the `date` column.
df['year'] = pd.to_datetime(df['date']).dt.year


**Visualizing the Yearly Maximum Open, Close, High and Low Prices**

In [5]:
# Yearly maximum of each price column, matching the section heading ("MAX").
# Summing daily prices mostly reflects how many rows a year has, and
# aggregating the whole frame also drags the non-price columns along, so the
# groupby is restricted to the four price columns.
price_columns = ['open', 'close', 'high', 'low']
tempdf = df.groupby('year', as_index=False)[price_columns].max()
tempdf.plot(
    x='year',
    y=price_columns,
    kind='bar',
    title='Yearly maximum open, close, high and low prices',
)

**Visualizing the volume of shares traded per year**

In [6]:
# Total traded volume per year, largest first. Only `volume` is aggregated:
# summing the whole frame would also process every other column just to
# throw the results away. A descriptive name replaces `x`, which a later cell
# rebinds to an unrelated dict.
volume_by_year = (
    df.groupby('year')[['volume']]
    .sum()
    .sort_values('volume', ascending=False)
)
volume_by_year.plot.bar(y='volume', rot=0, title='Total traded volume per year')

**Visualizing the number of records per year**

In [7]:
# Pie chart of how many rows each year contributes.
df.groupby('year')['year'].count().plot.pie(title='Year')

In [8]:
# Rows for 2013 only. The summed volume on the next line is computed but not
# displayed, because only the cell's final expression is shown.
newDf = df[df['year'] == 2013]
newDf['volume'].sum()

# Row count per year, converted to a {year: count} dict.
s = df['year'].value_counts()
x = s.to_dict()
x

In [9]:
def yearlyAllocatedData(years_dict,totalSize):
    """Return each entry's share of ``totalSize`` as a percentage.

    Args:
        years_dict: Mapping whose values are counts (the keys are not used
            in the result).
        totalSize: Denominator the counts are divided by.

    Returns:
        A list of ``100 * count / totalSize`` values, in the mapping's
        iteration order.
    """
    return [100 * count / totalSize for count in years_dict.values()]

In [10]:
# Percentage of all rows that falls in each year.
yearly_share = yearlyAllocatedData(x, len(df))
print(yearly_share)

**Filtering out the 2018 data, since we have very few records for that year**

In [11]:
# Drop the 2018 rows. `.copy()` makes `df` an independent frame rather than a
# slice of the original, so later column assignments on it do not raise
# SettingWithCopyWarning.
df = df[df['year'] != 2018].copy()

# Show only the last rows (enough to confirm where the data now ends) instead
# of dumping the whole frame.
df.tail()

**Graphical Visualization of Open, Close, High and Low Prices over Days**

In [12]:
# One panel per price column on a 2x2 grid. The x-axis is df's row index,
# labelled "Days". A single loop replaces four copy-pasted panel blocks, and
# the first title's spelling ("Openning") is corrected.
fig, axes = plt.subplots(2, 2, figsize=(25, 20))

# (column, panel title, y-axis label) in row-major panel order.
price_panels = [
    ('open', 'Opening Price', 'Opening Price USD ($)'),
    ('close', 'Close Price', 'Closing Price USD ($)'),
    ('high', 'High Price', 'High Price USD ($)'),
    ('low', 'Low Price', 'Low Price USD ($)'),
]

for ax, (column, panel_title, y_label) in zip(axes.flat, price_panels):
    ax.plot(df[column])
    ax.set_title(panel_title)
    ax.set_xlabel('Days')
    ax.set_ylabel(y_label)

plt.show()

**Graphical Visualization of Open, Close, High and Low Prices by Date**

In [13]:
# All four price columns as lines against the `date` column.
df.plot.line(x='date', y=['open', 'close', 'low', 'high'])

**Graphical Visualization of Open, Close, High and Low Prices by Date (*Zoomed In for Clearer Insight*)**

In [14]:
# Same plot restricted to rows at positions 10 through 49.
df.iloc[10:50].plot.line(x='date', y=['open', 'close', 'low', 'high'])

**Graphical Visualization of Monthly Open, Close, High and Low Prices by Year**

In [15]:
# Calendar month of each row, parsed from `date`. `assign` builds a new frame
# instead of writing a column into `df` in place: `df` was produced by a
# boolean filter, and in-place assignment on such a slice can raise
# SettingWithCopyWarning.
df = df.assign(months=pd.DatetimeIndex(df['date']).month)
df.head()

In [16]:
# One pivot per price column: rows are months, columns are years, and cells
# use pivot_table's default aggregation.
yearly_open_Pivot, yearly_close_Pivot, yearly_high_Pivot, yearly_low_Pivot = (
    pd.pivot_table(df, values=price_column, columns="year", index="months")
    for price_column in ("open", "close", "high", "low")
)

In [17]:
# Title the figure so it can be told apart from the close/high/low charts.
yearly_open_Pivot.plot(title='Monthly mean open price by year')

In [18]:
# One panel per year; the title identifies which price column is shown.
yearly_open_Pivot.plot(
    subplots=True,
    figsize=(15, 10),
    layout=(3, 3),
    title='Monthly mean open price, one panel per year',
)

In [19]:
# Title the figure so it can be told apart from the open/high/low charts.
yearly_close_Pivot.plot(title='Monthly mean close price by year')

In [20]:
# One panel per year; the title identifies which price column is shown.
yearly_close_Pivot.plot(
    subplots=True,
    figsize=(15, 10),
    layout=(3, 3),
    title='Monthly mean close price, one panel per year',
)

In [21]:
# Title the figure so it can be told apart from the open/close/low charts.
yearly_high_Pivot.plot(title='Monthly mean high price by year')

In [22]:
# One panel per year; the title identifies which price column is shown.
yearly_high_Pivot.plot(
    subplots=True,
    figsize=(20, 10),
    layout=(3, 3),
    title='Monthly mean high price, one panel per year',
)

In [23]:
# Title the figure so it can be told apart from the open/close/high charts.
yearly_low_Pivot.plot(title='Monthly mean low price by year')

In [24]:
# One panel per year; the title identifies which price column is shown.
yearly_low_Pivot.plot(
    subplots=True,
    figsize=(20, 10),
    layout=(3, 3),
    title='Monthly mean low price, one panel per year',
)

# Predicting the closing stock price

In [25]:
# Preview the first five rows of the frame before modelling.
df.head(5)

In [26]:
# Closing price plotted against df's row index.
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title('Close Price History')
ax.plot(df['close'])
ax.set_ylabel('Close Price USD ($)', fontsize=18)
plt.show()

In [27]:

# Single-column frame holding only the closing price.
data = df.filter(items=['close'])
# The same values as a 2-D numpy array (one column).
dataset = data.to_numpy()

# Number of leading rows used for training: 95% of the rows, rounded up.
train_fraction = .95
training_data_len = int(np.ceil(train_fraction * len(dataset)))

training_data_len

In [28]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))

# Fit the scaler on the training rows only. Fitting on the full series would
# let the test period's min/max leak into the features the model trains on.
# As a consequence, scaled values for the held-out rows may fall outside
# [0, 1] when those prices exceed the training range.
scaler.fit(dataset[:training_data_len])
scaled_data = scaler.transform(dataset)

# Show only the first few scaled rows rather than the whole array.
scaled_data[:5]

In [29]:
# Create the training data set
LOOKBACK = 60  # number of preceding scaled values that make up one sample

train_data = scaled_data[:training_data_len, :]

# Without at least one full window plus a target there is nothing to train
# on; fail with a clear message instead of an opaque reshape/index error.
if len(train_data) <= LOOKBACK:
    raise ValueError(
        f"Need more than {LOOKBACK} training rows to build a window, "
        f"got {len(train_data)}"
    )

# Each sample is the LOOKBACK values immediately before position i; its
# target is the value at position i. (The debug prints of the first windows
# were removed.)
x_train = np.array(
    [train_data[i - LOOKBACK:i, 0] for i in range(LOOKBACK, len(train_data))]
)
y_train = train_data[LOOKBACK:, 0].copy()

# Add a trailing axis of size 1: (samples, LOOKBACK) -> (samples, LOOKBACK, 1).
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))


In [30]:
from keras.models import Sequential
from keras.layers import Dense, LSTM

# Two stacked LSTM layers (128 then 64 units) followed by two dense layers,
# ending in a single output value.
window_length = x_train.shape[1]
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(window_length, 1)),
    LSTM(64, return_sequences=False),
    Dense(25),
    Dense(1),
])

model.compile(loss='mean_squared_error', optimizer='adam')

# Keep the fit history so the per-epoch loss can be plotted afterwards.
history = model.fit(x_train, y_train, epochs=100, batch_size=8)

In [31]:
# Loss recorded by model.fit for each epoch; the labels let the figure stand
# on its own.
plt.plot(history.history['loss'])
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

In [32]:
# Build the test windows. The slice starts 60 rows before the split point so
# the first held-out row has a full 60-value history.
test_data = scaled_data[training_data_len - 60: , :]

# Unscaled values for the held-out rows, used as the ground truth.
y_test = dataset[training_data_len:, :]

x_test = np.array(
    [test_data[end - 60:end, 0] for end in range(60, len(test_data))]
)

# Add a trailing axis of size 1.
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

# Predict, then map the outputs back through the scaler's inverse transform.
predictions = scaler.inverse_transform(model.predict(x_test))

# Root mean squared error between predictions and y_test.
rmse = np.sqrt(((predictions - y_test) ** 2).mean())
rmse

In [33]:
# Plot the data
train = data[:training_data_len]

# `.copy()` makes `valid` an independent frame, so adding a column to it does
# not raise SettingWithCopyWarning on a slice of `data`. `np.ravel` flattens
# the predictions to 1-D before they are stored as a single column.
valid = data[training_data_len:].copy()
valid['Predictions'] = np.ravel(predictions)

# Visualize the data
plt.figure(figsize=(16,6))
plt.title('Model')
plt.ylabel('Close Price USD ($)', fontsize=18)
plt.plot(train['close'])
plt.plot(valid[['close', 'Predictions']])
plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
plt.show()

In [34]:
# Display the held-out close values next to the model's predictions.
valid