# Data Science Internship at Widhya

## Mission: Stock Market Prediction using LR

#### Importing Required Libraries

In [1]:
import quandl
import pandas as pd
import numpy as np 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

#### Fetching the stock data

In [2]:
df = quandl.get("WIKI/AAPL")

#### Preview of data

In [3]:
print(df.head())

             Open   High    Low  Close     Volume  Ex-Dividend  Split Ratio  \
Date                                                                          
1980-12-12  28.75  28.87  28.75  28.75  2093900.0          0.0          1.0   
1980-12-15  27.38  27.38  27.25  27.25   785200.0          0.0          1.0   
1980-12-16  25.37  25.37  25.25  25.25   472000.0          0.0          1.0   
1980-12-17  25.87  26.00  25.87  25.87   385900.0          0.0          1.0   
1980-12-18  26.63  26.75  26.63  26.63   327900.0          0.0          1.0   

            Adj. Open  Adj. High  Adj. Low  Adj. Close  Adj. Volume  
Date                                                                 
1980-12-12   0.422706   0.424470  0.422706    0.422706  117258400.0  
1980-12-15   0.402563   0.402563  0.400652    0.400652   43971200.0  
1980-12-16   0.373010   0.373010  0.371246    0.371246   26432000.0  
1980-12-17   0.380362   0.382273  0.380362    0.380362   21610400.0  
1980-12-18   0.391536   0.

In [4]:
print(df.tail())

              Open    High     Low    Close      Volume  Ex-Dividend  \
Date                                                                   
2018-03-21  175.04  175.09  171.26  171.270  35247358.0          0.0   
2018-03-22  170.00  172.68  168.60  168.845  41051076.0          0.0   
2018-03-23  168.39  169.92  164.94  164.940  40248954.0          0.0   
2018-03-26  168.07  173.10  166.44  172.770  36272617.0          0.0   
2018-03-27  173.68  175.15  166.92  168.340  38962839.0          0.0   

            Split Ratio  Adj. Open  Adj. High  Adj. Low  Adj. Close  \
Date                                                                  
2018-03-21          1.0     175.04     175.09    171.26     171.270   
2018-03-22          1.0     170.00     172.68    168.60     168.845   
2018-03-23          1.0     168.39     169.92    164.94     164.940   
2018-03-26          1.0     168.07     173.10    166.44     172.770   
2018-03-27          1.0     173.68     175.15    166.92     168.340  

### Shape of data

In [5]:
df.shape

(9400, 12)

#### Calculating new fields High-Low Percentage (HL_PCT) and Percentage Change (PCT_Cchange)

In [6]:
df['HL_PCT'] = (df['Adj. High']-df['Adj. Low'])/df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close']-df['Adj. Open'])/df['Adj. Open'] * 100.0

#### Preview of the dataset after adding fields

In [7]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume,HL_PCT,PCT_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1980-12-12,28.75,28.87,28.75,28.75,2093900.0,0.0,1.0,0.422706,0.42447,0.422706,0.422706,117258400.0,0.417391,0.0
1980-12-15,27.38,27.38,27.25,27.25,785200.0,0.0,1.0,0.402563,0.402563,0.400652,0.400652,43971200.0,0.477064,-0.474799
1980-12-16,25.37,25.37,25.25,25.25,472000.0,0.0,1.0,0.37301,0.37301,0.371246,0.371246,26432000.0,0.475248,-0.473
1980-12-17,25.87,26.0,25.87,25.87,385900.0,0.0,1.0,0.380362,0.382273,0.380362,0.380362,21610400.0,0.502513,0.0
1980-12-18,26.63,26.75,26.63,26.63,327900.0,0.0,1.0,0.391536,0.3933,0.391536,0.391536,18362400.0,0.45062,0.0


#### Subseting required features

In [8]:
df = df[['Adj. Close','PCT_change','HL_PCT']] 
# Take a look at the new data 
print(df.head())

            Adj. Close  PCT_change    HL_PCT
Date                                        
1980-12-12    0.422706    0.000000  0.417391
1980-12-15    0.400652   -0.474799  0.477064
1980-12-16    0.371246   -0.473000  0.475248
1980-12-17    0.380362    0.000000  0.502513
1980-12-18    0.391536    0.000000  0.450620


#### Identifying Null values

In [9]:
df.isna().sum()

Adj. Close    0
PCT_change    0
HL_PCT        0
dtype: int64

#### A variable for predicting no of days out into the future.`

In [10]:
forecast_out = 10 #'n=10' days

#### Creating another column (i.e. target variable) shifted 'n' units up

In [11]:
df['Prediction'] = df[['Adj. Close']].shift(-forecast_out)

#### Preview of dataset

In [12]:
print(df.tail())

            Adj. Close  PCT_change    HL_PCT  Prediction
Date                                                    
2018-03-21     171.270   -2.153793  2.236235         NaN
2018-03-22     168.845   -0.679412  2.416417         NaN
2018-03-23     164.940   -2.048815  3.019280         NaN
2018-03-26     172.770    2.796454  3.854836         NaN
2018-03-27     168.340   -3.074620  4.888915         NaN


#### Creating independent data set (X)

#### Converting the dataframe to a numpy array

In [13]:
X = np.array(df.drop(['Prediction'],1))

#### Removing the last '10' rows

In [14]:
X = X[:-forecast_out]
print(X)

[[  0.42270592   0.           0.4173913 ]
 [  0.40065169  -0.47479912   0.47706422]
 [  0.37124607  -0.47299961   0.47524752]
 ...
 [179.98         1.13508654   1.45016113]
 [181.72         0.79316656   1.19964781]
 [179.97        -1.43490881   2.36706118]]


#### Creating the dependent data set (y)

#### Convert the dataframe to a numpy array

In [15]:
y = np.array(df['Prediction'])

#### Geting all of the y values except the last '10' rows

In [16]:
y = y[:-forecast_out]
print(y)

[  0.52930132   0.51636284   0.50180706 ... 164.94       172.77
 168.34      ]


#### Split the dataset into 80% training and 20% testing

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### Model Building: Linear Regression

In [18]:
lr = LinearRegression()

#### Model Training

In [19]:
lr.fit(x_train, y_train)

LinearRegression()

#### Model Testing: Score returns the coefficient of determination R^2 of the prediction. 

In [20]:
print('Training Score: ', round(lr.score(x_train,y_train)*100,2),'%')

# The best possible score is 1.0
print('lr confidence (Testing Score): ', round(lr.score(x_test,y_test)*100,2))

Training Score:  99.71 %
lr confidence (Testing Score):  99.72


#### Model Predictions

In [21]:
y_pred = lr.predict(x_test)
print(y_pred)

[ 1.15890017  1.26868497 77.38783798 ...  1.92974878  0.46175881
  1.23629931]


#### Model performance

In [22]:
from sklearn import metrics
acc=metrics.r2_score(y_test,y_pred)
print("Accuracy Score of Model: ",round(acc*100,2),'%')

Accuracy Score of Model:  99.72 %


#### Actuval Price v/s Predicted Price

In [23]:
prediction_df = pd.DataFrame({ 'Actual Price': y_test, 'Predicted Price': y_pred})  
prediction_df.head()

Unnamed: 0,Actual Price,Predicted Price
0,1.028335,1.1589
1,1.193252,1.268685
2,69.62019,77.387838
3,1.035787,1.130374
4,0.97451,1.10757


#### Model Evaluation

In [24]:
from sklearn import metrics
print('Mean Absolute Error:',round(metrics.mean_absolute_error(y_test,y_pred),2))
print('Mean Squared Error:',round(metrics.mean_squared_error(y_test,y_pred),2))
print('Root Mean Squared Error:',round(np.sqrt(metrics.mean_squared_error(y_test,y_pred)),2))
print('Explained Variance Score:',metrics.explained_variance_score(y_test,y_pred))

Mean Absolute Error: 0.87
Mean Squared Error: 4.09
Root Mean Squared Error: 2.02
Explained Variance Score: 0.9972434734077428


#### Forecasting

In [25]:
# Set x_forecast equal to the last 10 rows of the original data set from Adj. Close column
x_forecast = np.array(df.drop(['Prediction'],1))[-forecast_out:]
print(x_forecast)

[[ 1.78440000e+02 -1.04259095e+00  1.51871778e+00]
 [ 1.78650000e+02  8.40336134e-02  1.21460957e+00]
 [ 1.78020000e+02 -3.52644836e-01  8.42601955e-01]
 [ 1.75300000e+02 -1.13918340e+00  2.17341700e+00]
 [ 1.75240000e+02  0.00000000e+00  1.06140151e+00]
 [ 1.71270000e+02 -2.15379342e+00  2.23623518e+00]
 [ 1.68845000e+02 -6.79411765e-01  2.41641742e+00]
 [ 1.64940000e+02 -2.04881525e+00  3.01927974e+00]
 [ 1.72770000e+02  2.79645386e+00  3.85483591e+00]
 [ 1.68340000e+02 -3.07461999e+00  4.88891529e+00]]


In [26]:
# Printing lR model predictions for the next '10' days
lr_prediction = lr.predict(x_forecast)
print(lr_prediction)

[179.66811987 179.88028552 179.25120724 176.50075491 176.44993127
 172.44608633 170.00004039 166.06609699 173.92718696 169.47017206]


#### Creating Final function pricepredictor for price prediction

In [27]:
def pricepredictor(forecast_days):
    
    
    # A variable for predicting 'n' days out into the future
    forecast_out = forecast_days
    df['Prediction'] = df[['Adj. Close']].shift(-forecast_out)
    
    # Convert the dataframe to a numpy array
    X = np.array(df.drop(['Prediction'],1))
    
    #Remove the last '10' rows
    X = X[:-forecast_out]
    
    # Convert the dataframe to a numpy array 
    y = np.array(df['Prediction'])
    
    # Get all of the y values except the last '10' rows
    y = y[:-forecast_out]

    # Split the data into 80% training and 20% testing
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Create and train the Linear Regression  Model
    lr = LinearRegression()
    
    # Train the model
    lr.fit(x_train, y_train)

    # The best possible score is 1.0
    lr_confidence = lr.score(x_test, y_test)
    print('The Confidence {}\n'.format(lr_confidence))
    
    # Set x_forecast equal to the last 10 rows of the original data set from Adj. Close column
    x_forecast = np.array(df.drop(['Prediction'],1))[-forecast_out:]

    
    # Print linear regression model predictions for the next '10' days
    lr_prediction = lr.predict(x_forecast)
    print('Prediction for next {} day price:\n {}'.format(forecast_days,lr_prediction))
    

#### Predictiing next 1 day price

In [28]:
pricepredictor(1)

The Confidence 0.9996756572002168

Prediction for next 1 day price:
 [168.53264492]


#### Predicting next 15 days price

In [29]:
pricepredictor(15)

The Confidence 0.9955821011083461

Prediction for next 15 day price:
 [176.98303239 178.90378285 181.96618241 183.73228667 181.93436959
 180.41257623 180.63140347 180.00665537 177.22096509 177.19066156
 173.14932704 170.69088834 166.73042324 174.60743463 170.11295375]


#### Thank You :)