In [33]:
# Group members
# Hanne Austad s375093
# Jaspreet Kaur s375164
# Maryam Alam s375091
# Masooma Zahrah Azhar Khanum s374971

# TASK: Make a prediction algorithm which predicts the price of this stock on a specific date. Input will be date and output should be price of that stock (close value in the data file). You should also show the prediction percentage score.

# 1) We have chosen the Tesla "Predict stock market price for TESLA" use case
# 2 + 3) Given the different nature of regression and classification algorithms, we have chosen to use a regression algorithm in this case. 
# This is because regression algorithms are used for predicting continuous numerical values, while classification algorithms are used for predicting categorical labels. 
# Since the assignment asks us to predict the price, and not whether it increases or decreases (which would be a categorical prediction), a regression algorithm seems to be the logical choice. 

In [34]:
# Importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [35]:
# Read the Tesla price history from disk and turn the 'Date' column into
# real datetime values in the same step, so later cells can compare dates.
df = (
    pd.read_csv('data2/TSLA.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

In [36]:
# Keep only rows dated from 2010-01-01 up to (but not including) 2021-01-01.
# The 2021 rows are held back on purpose: the model is later asked about a
# 2021 date, and the known close for that date is what the prediction is
# compared against, so the model must not see that year.
onOrAfterStart = df['Date'].ge('2010-01-01')
beforeCutoff = df['Date'].lt('2021-01-01')
filterDf = df.loc[onOrAfterStart & beforeCutoff].copy()

# Overview: (rows, columns) followed by the first five rows.
print(filterDf.shape, filterDf.head(5), sep='\n')

(2647, 7)
        Date   Open   High    Low  Close  Adj Close    Volume
0 2010-06-29  3.800  5.000  3.508  4.778      4.778  93831500
1 2010-06-30  5.158  6.084  4.660  4.766      4.766  85935500
2 2010-07-01  5.000  5.184  4.054  4.392      4.392  41094000
3 2010-07-02  4.600  4.620  3.742  3.840      3.840  25699000
4 2010-07-06  4.000  4.000  3.166  3.222      3.222  34334500


In [37]:
# Count missing values per column of the full dataset to decide whether any
# cleaning is needed before modelling.
df.isna().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [38]:
# Choose the model inputs and the target from the filtered data.
# Features: Open / High / Low. Target: Close.
# 'Adj Close' and 'Volume' are deliberately left out, since the task does not
# prescribe which columns must be used.
featureColumns = ['Open', 'High', 'Low']
x = filterDf.loc[:, featureColumns]
y = filterDf.loc[:, 'Close']

In [39]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
# NOTE(review): train_test_split shuffles rows by default, so the test rows are
# interleaved in time with the training rows - confirm this is the intended
# evaluation for a model that is later applied to future dates.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.3,
)

# 4 + 5) Train the model and make a prediction
# Create the linear regression estimator and fit it on the training rows.
model = LinearRegression().fit(x_train, y_train)

In [40]:
# Predict the closing price for a date the model was not trained on (the
# filtered data stops before 2021) and report how the model scores on the
# held-out test set. 2021-08-04 is used here.
dato = pd.to_datetime('2021-08-04')

# Look the date up once in the full dataset; the same row supplies both the
# model inputs and the actual closing price.
dayRow = df[df['Date'] == dato]
if dayRow.empty:
    # Without this guard a missing date (weekend, holiday, out of range) only
    # surfaces as an obscure "0 sample(s)" error from scikit-learn.
    raise ValueError(
        f"No row for {dato.date()} in the dataset - choose a date that exists in the file."
    )

# Make the prediction from that day's Open/High/Low
values = dayRow[['Open', 'High', 'Low']]
predictedPrice = model.predict(values)

# Actual closing price for the same date
realValue = dayRow['Close'].values

# Score the model on the test set. The test predictions are computed once and
# reused for both metrics. R^2 is scaled by 100 so it prints as a percentage.
testPredictions = model.predict(x_test)
r2 = r2_score(y_test, testPredictions) * 100
mae = mean_absolute_error(y_test, testPredictions)

# Results: predicted vs. actual price, then the prediction score and MAE.
print(f"Predicted stock price for Tesla on {dato.date()}: ${predictedPrice[0]:.2f}")
print(f"Real Tesla stock price on {dato.date()}: ${realValue[0]:.2f}")
print(f"Prediction score: {r2:.2f}%")
print(f"Mean absolute error, MAE: {mae:.2f}")


Predicted stock price for Tesla on 2021-08-04: $719.75
Real Tesla stock price on 2021-08-04: $710.92
Prediction score: 99.94%
Mean absolute error, MAE: 0.64
