In [1]:
import pandas as pd
import math
import numpy as np
from sklearn import preprocessing, svm, model_selection
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('../Machine-Learning/Datasets/tesla-stock.csv', index_col = 0)

In [3]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6/29/2010,19.0,25.0,17.540001,23.889999,18766300,23.889999
6/30/2010,25.790001,30.42,23.299999,23.83,17187100,23.83
7/1/2010,25.0,25.92,20.27,21.959999,8218800,21.959999
7/2/2010,23.0,23.1,18.709999,19.200001,5139800,19.200001
7/6/2010,20.0,20.0,15.83,16.110001,6866900,16.110001


# Labels and Features

### Features - it's the set of attributes which are actually input and on which our model will be trained.

### Labels - It's the set of attributes which are actually output, i.e values on which data will be tested and results will be predicted.

#### Adding new and more usefull features in the DataFrame - 

In [4]:
df['HL_PCT'] = (df['High'] - df['Low']) / df['Low'] * 100
df['PCT_Change'] = (df['Close'] - df['Open']) / df['Open'] * 100

In [5]:
df = df[['HL_PCT', 'PCT_Change', 'Close', 'Volume']]

In [6]:
df.head()

Unnamed: 0_level_0,HL_PCT,PCT_Change,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6/29/2010,42.531349,25.736837,23.889999,18766300
6/30/2010,30.557946,-7.599848,23.83,17187100
7/1/2010,27.873705,-12.160004,21.959999,8218800
7/2/2010,23.463395,-16.521735,19.200001,5139800
7/6/2010,26.342388,-19.449995,16.110001,6866900


forcast_out = math.ceil(0.01*len(df))
#### This was done so that we can actually decide that for how many values / days / rows in future the value has to be predicted.

In [7]:
forecast_col = 'Close'
forecast_out = math.ceil(0.01*len(df))
df.fillna(-99999, inplace=True)
print(forecast_out)

17


In [26]:
df.iloc[forecast_out:, 3]

Date
7/23/2010      653600
7/26/2010      922200
7/27/2010      619700
7/28/2010      467200
7/29/2010      616000
               ...   
12/27/2016    5915700
12/28/2016    3782500
12/29/2016    4045000
12/30/2016    4642600
1/3/2017      5923300
Name: Volume, Length: 1624, dtype: int64

#### .shift(+/- value) - it shifts the column or DataFrame up & down respectively by the no of rows mentioned(values)
#### above iloc is used to demostrate the shift taking place in the DataFrame which you can see below.

In [29]:
df['label'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True) # Removing all NaN(s) as they can't be used in Training / Testing of Model

In [30]:
df

Unnamed: 0_level_0,HL_PCT,PCT_Change,Close,Volume,label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6/29/2010,42.531349,25.736837,23.889999,18766300,21.290001
6/30/2010,30.557946,-7.599848,23.830000,17187100,20.950001
7/1/2010,27.873705,-12.160004,21.959999,8218800,20.549999
7/2/2010,23.463395,-16.521735,19.200001,5139800,20.719999
7/6/2010,26.342388,-19.449995,16.110001,6866900,20.350000
...,...,...,...,...,...
12/1/2016,4.160220,-3.383795,181.880005,5126400,219.529999
12/2/2016,2.711114,-0.771000,181.470001,4042300,219.740005
12/5/2016,3.495701,2.350561,186.800003,4072200,214.679993
12/6/2016,2.134886,0.177879,185.850006,3391600,213.690002


# Training and Testing

#### scikit-learn cannot directly work with pandas DataFrames instead, it's made fundamentally to work with NumPy Arrays. So we convert the Cleaned Data frames to Numpy Arrays.

#### Here, we use X to denote all the features and y for all the labels

In [31]:
X = np.array(df.drop(['label'], axis =1))
y = np.array(df['label'])
print(len(X), len(y))

1624 1624


#### Scaling is the process of cleaning & bringing all the data in the range of 1 and -1. It helps in increasing the computation time. In mathematical words, all the data is NORMALIZED. This function returns a new Dataset.

In [33]:
X = preprocessing.scale(X)
print(len(X))

1624


#### cross_validation is outdated, now model_selection is used. Out of which we used "test_train_split" to split data into 2 parts
1. Training Data - on which model will be fitted / trained.
2. Tested Data - on which we'll find the output and calculate accuracy & score aka confidence.

#### test_size = 0.2 means, we'll train on 80% data and test on 20%

In [13]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size =0.2)

## Classifier is the Algorithm used.
### Linear Regression

In [14]:
clf = LinearRegression(n_jobs =-1)

## Training

####  .fit( X, y ) -  it's the training of the model.
#### The model learns trend for what values of different attributes in X_train, values in y_train exist.

In [15]:
clf.fit(X_train, y_train)

LinearRegression(n_jobs=-1)

## Testing
#### .score( X, y ) - it's the testing of model on the given data and calculation of accuracy of our model.
#### Model after training, predicts the values of attributes in y_test after analysing the values of attributes in X_test.

In [16]:
accuracy = clf.score(X_test, y_test)
accuracy

0.9659885187879884

### Training & Testing Using Support Vector Regression from Support Vector Machines (SVM)

In [17]:
for k in ['linear', 'poly', 'rbf', 'sigmoid']:
    clf = svm.SVR(kernel=k)
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print(k, confidence)

linear 0.9655422784326977
poly 0.5455350982854816
rbf 0.9247996102303602
sigmoid 0.9335850678347932
