In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
import numpy as np
import seaborn as sns
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV, SelectFromModel, SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
%matplotlib inline

In [None]:
# Load the dataset with pandas' read_csv; index_col=0 uses the first CSV column
# as the row labels of the DataFrame.
Stock = pd.read_csv('AppleDataset.csv', index_col=0)
# rename() returns a new DataFrame, so `Stock` keeps the data exactly as loaded.
# A 'Close(t)' column, when present, becomes 'Close' (labels that are missing
# are ignored by rename's default settings).
df_Stock = Stock.rename(columns={'Close(t)': 'Close'})
# head() shows the first rows of the frame (5 by default).
df_Stock.head()

In [None]:
# Convert the price columns from text (dtype=object) to float, stripping the "$"
# symbol (replaced by an empty string) before the conversion.
# regex=False makes '$' a literal character: as a regular expression '$' is the
# end-of-string anchor, and the default value of `regex` is not the same in all
# pandas versions, so being explicit keeps the result correct everywhere.
price_columns = {' Close/Last': 'Close', ' Open': 'Open', ' High': 'High', ' Low': 'Low'}
for raw_name, clean_name in price_columns.items():
    df_Stock[clean_name] = df_Stock[raw_name].str.replace('$', '', regex=False).astype(float)

# Volume is copied unchanged; it only gets a name without the leading space.
df_Stock['Volume'] = df_Stock[' Volume']
# The newly named columns hold the converted data, so drop the previous columns.
drop = [' Close/Last', ' Open', ' High', ' Low', ' Volume']
df_Stock = df_Stock.drop(drop, axis=1)
# Display the first rows of the cleaned dataset.
df_Stock.head()

In [None]:
def create_train_test_set(df_Stock, train_frac=0.7, val_frac=0.2):
    """Split the stock frame into train / validation / test sets by row order.

    Every column except 'Close' is used as a feature and 'Close' is the
    target. Rows are never shuffled: the first ``train_frac`` of the rows form
    the training set, the next ``val_frac`` the validation set and all
    remaining rows the test set. The split is therefore chronological only if
    the rows of ``df_Stock`` are already ordered by date (this function does
    not sort them).

    Parameters
    ----------
    df_Stock : pandas.DataFrame
        Frame containing a 'Close' column plus the feature columns.
    train_frac : float, default 0.7
        Fraction of the rows assigned to the training set.
    val_frac : float, default 0.2
        Fraction of the rows assigned to the validation set.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, Y_train, Y_val, Y_test)``.

    Raises
    ------
    ValueError
        If a fraction is outside [0, 1] or the two fractions sum to more
        than 1.
    """
    fractions_ok = (
        0 <= train_frac <= 1
        and 0 <= val_frac <= 1
        and train_frac + val_frac <= 1 + 1e-9  # tolerate float rounding in the sum
    )
    if not fractions_ok:
        raise ValueError(
            'train_frac and val_frac must each be in [0, 1] and sum to at most 1'
        )

    # Predictors: every column except the target.
    features = df_Stock.drop(columns=['Close'])
    # Target: the closing price.
    target = df_Stock['Close']

    data_len = df_Stock.shape[0]
    print('Historical Stock Data length is - ', str(data_len))

    # Row positions at which the training and validation sets end.
    train_split = int(data_len * train_frac)
    val_split = min(train_split + int(data_len * val_frac), data_len)

    # .iloc slices by position regardless of the kind of index the frame has.
    X_train = features.iloc[:train_split]
    X_val = features.iloc[train_split:val_split]
    X_test = features.iloc[val_split:]
    Y_train = target.iloc[:train_split]
    Y_val = target.iloc[train_split:val_split]
    Y_test = target.iloc[val_split:]

    # Report the real size of each slice. The test set receives every row left
    # over after the two int() truncations, so its size can be larger than
    # int(data_len * 0.1).
    print('Training Set length - ', str(len(X_train)))
    print('Validation Set length - ', str(len(X_val)))
    print('Test Set length - ', str(len(X_test)))

    # print shape of samples
    print(X_train.shape, X_val.shape, X_test.shape)
    print(Y_train.shape, Y_val.shape, Y_test.shape)

    return X_train, X_val, X_test, Y_train, Y_val, Y_test

In [None]:
# Build the train / validation / test feature and target sets from the cleaned frame.
(
    X_train, X_val, X_test,
    Y_train, Y_val, Y_test,
) = create_train_test_set(df_Stock)

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import LinearRegression #uploading the linear regression model 
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,PowerTransformer, QuantileTransformer, Normalizer
from sklearn.model_selection import cross_val_score

# Candidate scaling / normalization techniques to compare.
# Each entry is a (name, transformer) pair: the name is the transformer's class
# name and the transformer is created with its default settings.
normalizers = [
    (scaler_cls.__name__, scaler_cls())
    for scaler_cls in (
        StandardScaler,
        MinMaxScaler,
        RobustScaler,
        PowerTransformer,
        QuantileTransformer,
        Normalizer,
    )
]
# # create a pipeline that applies each normalization technique and the linear regression model
# pipeline = Pipeline(normalizers + [('LinearRegression', LinearRegression())])

# # evaluate the performance of the pipeline using cross-validation
# cv_scores = cross_val_score(pipeline, X_train, Y_train, cv=10, scoring='neg_mean_squared_error')

# # compute the mean and standard deviation of the cross-validation scores
# mean_score = np.mean(-cv_scores)
# std_score = np.std(-cv_scores)



In [None]:
# Create pipelines for each normalization technique
from sklearn.pipeline import make_pipeline

pipelines = []
for name, normalizer in normalizers:
    # One pipeline per technique: the transformer followed by a linear regression.
    pipelines.append((name, make_pipeline(normalizer, LinearRegression())))

# Fit every pipeline on the training set, then report R2 on the test set and
# the mean squared error on the validation set.
for name, pipeline in pipelines:
    pipeline.fit(X_train, Y_train)
    y_pred = pipeline.predict(X_test)
    Y_val_pred = pipeline.predict(X_val)
    r2 = metrics.r2_score(Y_test, y_pred)
    val_mse = metrics.mean_squared_error(Y_val, Y_val_pred)
    print(f'{name}: R2 score = {r2:.3f}')
    print('Validation Mean Squared Error for :', name, round(val_mse, 2))


In [None]:
# # print the results
# print('Mean score:', cv_scores)

# print('Mean score:', mean_score)
# print('Standard deviation:', std_score)

In [None]:
# Regression error metric used when evaluating the models' performance.
def get_mape(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values, with as many elements as ``y_true`` (a single value
        is also accepted and compared against every actual value).

    Returns
    -------
    float
        The MAPE as a percentage. Division by zero is not guarded: if
        ``y_true`` contains a zero the result is ``inf`` or ``nan``.

    Raises
    ------
    ValueError
        If the two inputs hold a different number of elements and neither
        holds exactly one.
    """
    # Flatten both inputs so that a column vector of shape (n, 1) is compared
    # element by element with a flat (n,) array instead of being broadcast to
    # an (n, n) matrix, which would silently give a wrong average.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size != predicted.size and 1 not in (actual.size, predicted.size):
        raise ValueError(
            'y_true and y_pred must contain the same number of elements '
            f'(got {actual.size} and {predicted.size})'
        )

    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [None]:
# Table comparing the actual closing price on each validation row with the
# price predicted by every fitted pipeline (one prediction column per technique).
df_pred = pd.DataFrame({'close Actual': Y_val.values}, index=Y_val.index)
for name, pipeline in pipelines:
    y_pred = pipeline.predict(X_val)
    df_pred[f'Close Predicted_{name}'] = y_pred
# Turn the row labels back into a regular column.
df_pred = df_pred.reset_index()
df_pred
