In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV, SelectFromModel, SelectKBest
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
%matplotlib inline

# Loading the dataset

This dataset contains Apple's (AAPL) stock data for the last 10 years (from 2010). Insights from this data can be used to build useful price forecasting algorithms to aid investment.


Data is collected and aggregated from 25 exchanges.

Data is updated weekly

This dataset contains 1-minute, 5-minute, 30-minute and 1-hour bars (open/high/low/close/volume) for AAPL (Apple).

Out-of-hours trades are included. Prices are adjusted for splits and dividends.
To access this dataset directly, see the links below:

https://finance.yahoo.com/quote/AAPL/history/ 

or 


https://www.kaggle.com/datasets/tarunpaparaju/apple-aapl-historical-stock-data

In [None]:
# Load the Apple price history; `index_col=0` makes the first CSV column the row index.
Stock = pd.read_csv('AppleDataset.csv', index_col=0)
# `rename` returns a new frame, so the raw `Stock` frame is left untouched.
# Labels that are not present are ignored, so this is a no-op when the file
# has no 'Close(t)' column.
df_Stock = Stock.rename(columns={'Close(t)': 'Close'})
# Preview the first rows (5 by default).
df_Stock.head()

# Exploration of the dataset

1. Understanding the features and determining the target:

To illustrate this regression approach we need suitable predictors for our target. The dataset records values that are tied directly to the stock: five
values describing the movement of the price over one unit of time (usually one day, but it can also be one week or one month). These are key trading indicators. They are as follows:

• Open: The starting price for a given trading day


• Close: The final price on that day


• High: The highest price at which the stock traded on that day


• Low: The lowest price at which the stock traded on that day


• Volume: The total number of shares traded before the market closed on
that day

We will be focusing on historical prices (the 5 recorded values) and performance to predict future prices.

2. Inspecting the data types and converting the price columns to float

In [None]:
# The price columns are read as strings such as "$123.45". Strip the dollar
# sign and cast to float, storing the result under a clean column name.
# `regex=False` is passed explicitly: "$" is a regex anchor and the default of
# `regex` differs between pandas versions, so only a literal replacement is
# guaranteed to remove the symbol everywhere.
PRICE_COLUMNS = {' Close/Last': 'Close', ' Open': 'Open', ' High': 'High', ' Low': 'Low'}
converted = {
    clean_name: df_Stock[raw_name].str.replace('$', '', regex=False).astype(float)
    for raw_name, clean_name in PRICE_COLUMNS.items()
}
# Volume is copied under its clean name without any type conversion.
converted['Volume'] = df_Stock[' Volume']

# With the converted columns in place, drop the raw columns they came from.
drop = [' Close/Last', ' Open', ' High', ' Low', ' Volume']
df_Stock = df_Stock.assign(**converted).drop(columns=drop)
# Display the cleaned dataset.
df_Stock.head()

In [None]:
def create_train_test_set(df_Stock, train_frac=0.7, val_frac=0.2):
    """Split a price frame into train / validation / test sets by row order.

    The 'Close' column is the regression target; every other column is used
    as a predictor. Rows are split by position, without shuffling: the first
    ``train_frac`` of the rows form the training set, the next ``val_frac``
    the validation set, and all remaining rows the test set.

    Parameters
    ----------
    df_Stock : pandas.DataFrame
        Frame that contains a 'Close' column plus the predictor columns.
    train_frac : float, default 0.7
        Fraction of the rows assigned to the training set.
    val_frac : float, default 0.2
        Fraction of the rows assigned to the validation set.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, Y_train, Y_val, Y_test)``.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
        raise ValueError('train_frac and val_frac must be non-negative and sum to at most 1')

    # Predictors are every column except the target.
    features = df_Stock.drop(columns=['Close'])
    target = df_Stock['Close']

    data_len = df_Stock.shape[0]
    print('Historical Stock Data length is - ', str(data_len))

    # Positional cut points; the test set takes whatever rows remain, so it
    # also absorbs the rounding remainder of the two int() truncations.
    train_split = int(data_len * train_frac)
    val_split = train_split + int(data_len * val_frac)

    X_train = features.iloc[:train_split]
    X_val = features.iloc[train_split:val_split]
    X_test = features.iloc[val_split:]
    Y_train = target.iloc[:train_split]
    Y_val = target.iloc[train_split:val_split]
    Y_test = target.iloc[val_split:]

    # Report the lengths of the actual slices rather than re-deriving them
    # from the fractions, which under-reports the test set when rounding
    # leaves extra rows.
    print('Training Set length - ', str(len(X_train)))
    print('Validation Set length - ', str(len(X_val)))
    print('Test Set length - ', str(len(X_test)))

    # Shapes of the resulting samples.
    print(X_train.shape, X_val.shape, X_test.shape)
    print(Y_train.shape, Y_val.shape, Y_test.shape)

    return X_train, X_val, X_test, Y_train, Y_val, Y_test

In [None]:
# Split the cleaned Apple frame into train / validation / test samples.
(X_train, X_val, X_test,
 Y_train, Y_val, Y_test) = create_train_test_set(df_Stock)

In [None]:
# Ordinary least-squares regression of the closing price on the training features.
lr = LinearRegression()
# `fit` takes the sample data (X_train) and, for supervised models, the labels
# (Y_train). It is left as the last expression so the fitted estimator is
# displayed as the cell output.
lr.fit(X_train, Y_train)

In [None]:
# Show the fitted parameters of the linear model: coefficients, then intercept.
for label, value in (
    ('LR Coefficients: \n', lr.coef_),
    ('LR Intercept: \n', lr.intercept_),
):
    print(label, value)

In [None]:
# R^2 of the fitted model, measured on the same data it was trained on.
train_r2 = lr.score(X_train, Y_train)
print("Performance (R^2): ", train_r2)

In [None]:
# Regression error metric used when evaluating the model's performance.
def get_mape(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both arguments are converted to NumPy arrays. The error of each sample is
    measured relative to the corresponding true value, so a zero in ``y_true``
    produces a non-finite result.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return relative_errors.mean() * 100

5.1. Training the model 

In [None]:
# Predict closing prices with the fitted linear regression for each split,
# in the order train, validation, test.
Y_train_pred, Y_val_pred, Y_test_pred = (
    lr.predict(split_features) for split_features in (X_train, X_val, X_test)
)

Test Performance On Google Stock
    https://www.kaggle.com/datasets/varpit94/google-stock-data

In [None]:
#Reading the dataset using "read_csv" a prebuild function in the "pandas" package, the "index_col" contains the column to use as the row labels of the DataFrame
GoogleStock = pd.read_csv('GOOGL.csv',  index_col=0)
df_GoogleStock = GoogleStock
#"rename" prebuild function that enables us to change the name of a column in a Dataframe
df_GoogleStock = df_GoogleStock.rename(columns={'Close(t)':'Close'})
#Display using Head function that returns the dataframe or series with the first few rows (by default 5)
df_GoogleStock.head()

In [None]:
# Select the predictors in exactly the order the model was fitted on.
# A linear model applies its coefficients to columns by position, and
# scikit-learn validates that feature names match those seen during `fit`, in
# the same order. Deriving the list from X_train avoids a hand-typed order
# (previously Open, Low, High, Volume) drifting from the training frame.
X_Google = df_GoogleStock[list(X_train.columns)]
# Target: the actual closing price.
Y_Google = df_GoogleStock['Close']

In [None]:
# Apply the model fitted on the Apple data to the Google predictors.
Y_Google_pred = lr.predict(X=X_Google)


# The MSE 

In [None]:
# R^2 of the model's predictions on the Google data.
r2 = metrics.r2_score(Y_Google, Y_Google_pred)
print('R-squared score:', r2)

# Mean squared error on the Apple validation split, rounded to 2 decimals.
mse_validation = round(metrics.mean_squared_error(Y_val, Y_val_pred), 2)
print('Validation Mean Squared Error for The Original Dataset :', mse_validation)

# Mean squared error on the Google data, rounded to 2 decimals.
mse_google = round(metrics.mean_squared_error(Y_Google, Y_Google_pred), 2)
print('Validation Mean Squared Error For Google Dataset :', mse_google)


In [None]:
# Put the actual closing price next to the predicted one, keyed by the
# original index, then turn that index back into an ordinary column.
df_pred = pd.DataFrame(
    {'close Actual': Y_Google.values, 'Close Predicted': Y_Google_pred},
    index=Y_Google.index,
).reset_index()
df_pred
