In [1]:
#description: Forecasting the adjusted closing price of Facebook stock (FB) 30 days ahead using linear regression
#and support vector regression (RBF kernel) models, with data retrieved from Quandl

In [2]:
#import dependencies 
import quandl
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR 
from sklearn.model_selection import train_test_split

In [3]:
#download the Facebook (FB) price table from Quandl's WIKI database (network call)
df = quandl.get("WIKI/FB")
#preview the first five rows
df.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2012-05-18,42.05,45.0,38.0,38.2318,573576400.0,0.0,1.0,42.05,45.0,38.0,38.2318,573576400.0
2012-05-21,36.53,36.66,33.0,34.03,168192700.0,0.0,1.0,36.53,36.66,33.0,34.03,168192700.0
2012-05-22,32.61,33.59,30.94,31.0,101786600.0,0.0,1.0,32.61,33.59,30.94,31.0,101786600.0
2012-05-23,31.37,32.5,31.36,32.0,73600000.0,0.0,1.0,31.37,32.5,31.36,32.0,73600000.0
2012-05-24,32.95,33.21,31.77,33.03,50237200.0,0.0,1.0,32.95,33.21,31.77,33.03,50237200.0


In [4]:
#narrow the frame to the single column the models use: the adjusted closing price
df = df.loc[:, ["Adj. Close"]]
df.head(n=5)

Unnamed: 0_level_0,Adj. Close
Date,Unnamed: 1_level_1
2012-05-18,38.2318
2012-05-21,34.03
2012-05-22,31.0
2012-05-23,32.0
2012-05-24,33.03


In [5]:
#number of rows (trading days) into the future that the target column looks
forecast_out = 1
#create the target/dependent variable: "Adj. Close" shifted up by forecast_out rows,
#so each row holds the price forecast_out rows later (the last forecast_out rows become NaN)
#the shift is driven by forecast_out rather than a hard-coded -1 so the two cannot drift apart
df["Prediction"] = df["Adj. Close"].shift(-forecast_out)
#show the first rows of the new data set
df.head()


Unnamed: 0_level_0,Adj. Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2012-05-18,38.2318,34.03
2012-05-21,34.03,31.0
2012-05-22,31.0,32.0
2012-05-23,32.0,33.03
2012-05-24,33.03,31.91


In [6]:
#inspect the end of the frame: the final row has no target yet (NaN) - that is the value to forecast
df.tail(n=5)

Unnamed: 0_level_0,Adj. Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-03-21,169.39,164.89
2018-03-22,164.89,159.39
2018-03-23,159.39,160.06
2018-03-26,160.06,152.19
2018-03-27,152.19,


In [7]:
#rebuild the target column by shifting Adj. Close up by forecast_out rows
df["Prediction"] = df["Adj. Close"].shift(periods=-forecast_out)
df.tail(n=5)

Unnamed: 0_level_0,Adj. Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-03-21,169.39,164.89
2018-03-22,164.89,159.39
2018-03-23,159.39,160.06
2018-03-26,160.06,152.19
2018-03-27,152.19,


In [8]:
#look 30 rows ahead: each row's target is the adjusted close 30 rows later
forecast_out = 30
df["Prediction"] = df["Adj. Close"].shift(periods=-forecast_out)
#the final forecast_out rows have no later price available, so their target is NaN
df.tail(n=5)


Unnamed: 0_level_0,Adj. Close,Prediction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-03-21,169.39,
2018-03-22,164.89,
2018-03-23,159.39,
2018-03-26,160.06,
2018-03-27,152.19,


In [9]:
#create the independent data set (X)
#convert the dataframe (minus the target column) to a numpy array
#drop(columns=...) replaces drop([...], 1): pandas 2.0 no longer accepts axis positionally
X = np.array(df.drop(columns=["Prediction"]))
#remove the last forecast_out rows (their target is NaN, so they cannot be used for training)
X = X[:-forecast_out]
print(X)

[[ 38.2318]
 [ 34.03  ]
 [ 31.    ]
 ...
 [171.5499]
 [175.98  ]
 [176.41  ]]


In [10]:
#create the dependent data set (y)
#take the target column as a numpy array and, in the same step, cut off the trailing
#forecast_out entries (the rows whose target is NaN)
y = np.array(df["Prediction"])[:-forecast_out]
print(y)

[ 30.771  31.2    31.47  ... 159.39  160.06  152.19 ]


In [11]:
#split data into 80% training and 20% testing
#random_state pins the shuffle so the split (and every score computed from it) is reproducible on re-run
#NOTE(review): train_test_split shuffles by default, so test rows are interleaved in time with
#training rows; for time-series data this can make the scores look optimistic - consider shuffle=False
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
#support vector regressor with an RBF kernel; hyperparameters gathered in one place
svr_params = {"kernel": "rbf", "C": 1e3, "gamma": 0.1}
svr_rbf = SVR(**svr_params)
#fit on the training split
svr_rbf.fit(x_train, y_train)

SVR(C=1000.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [13]:
#evaluate the SVR on the held-out split
#score() returns the coefficient of determination (R squared); 1.0 is the best possible value
svm_confidence = svr_rbf.score(X=x_test, y=y_test)
print("svm confidence: ", svm_confidence)

svm confidence:  0.9833580060203231


In [14]:
#linear regression model with default settings
linreg = LinearRegression()
#fit on the same training split used for the SVR
linreg.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [15]:
#evaluate the linear regression on the held-out split (score() is R squared, as for the SVR)
lr_confidence = linreg.score(X=x_test, y=y_test)
print("lr confidence: ", lr_confidence)

lr confidence:  0.9831413979917554


In [16]:
#set x_forecast equal to the last forecast_out rows of the Adj. Close column
#(these are the rows whose "Prediction" target is NaN, i.e. the ones still to be forecast)
#drop(columns=...) replaces drop([...], 1): pandas 2.0 no longer accepts axis positionally
x_forecast = np.array(df.drop(columns=["Prediction"]))[-forecast_out:]
print(x_forecast)

[[173.15]
 [179.52]
 [179.96]
 [177.36]
 [176.01]
 [177.91]
 [178.99]
 [183.29]
 [184.93]
 [181.46]
 [178.32]
 [175.94]
 [176.62]
 [180.4 ]
 [179.78]
 [183.71]
 [182.34]
 [185.23]
 [184.76]
 [181.88]
 [184.19]
 [183.86]
 [185.09]
 [172.56]
 [168.15]
 [169.39]
 [164.89]
 [159.39]
 [160.06]
 [152.19]]


In [17]:
#run both fitted models over the held-back rows in x_forecast
linreg_prediction = linreg.predict(x_forecast)
svm_prediction = svr_rbf.predict(x_forecast)

#show the linear regression forecast first, then the support vector regression forecast
for forecast in (linreg_prediction, svm_prediction):
    print(forecast)

[177.0811149  183.52322329 183.96820409 181.3387721  179.9734901
 181.8949981  182.9872237  187.33589969 188.99446449 185.48518409
 182.3096393  179.9026977  180.5903953  184.41318489 183.78616649
 187.76065409 186.37514569 189.29786049 188.82254009 185.90993849
 188.24608769 187.91235209 189.15627569 176.4844361  172.02451491
 173.27855171 168.72761171 163.16535172 163.84293612 155.88384773]
[174.58979861 181.00683243 179.49707862 180.6998415  174.9409073
 182.80775708 182.80003603 179.00735856 178.46582269 179.49912756
 183.54622993 174.7306981  177.23911396 178.58153031 180.07056057
 178.10061819 180.52660705 179.26292046 178.10464859 180.23762873
 177.60022765 177.86321509 178.8704008  175.74732258 171.09633285
 172.29655473 172.25514133 167.36561759 166.07848255 161.23427781]
