# Internal data without time courses

This notebook imports the data produced by the data preparation step for the ticker AAPL. Training and test sets are generated from this data, and several scikit-learn classifiers are then fitted and evaluated on them.

# Content
 1. Import dependencies
 2. Load data
 3. Splitting data in training and testing
 4. Support Vector Machine
 5. Linear Discriminant Analysis
 6. Gradient Boosting
 7. Random Forest
 8. KNN

<hr>

# 1. Import dependencies

In [1]:
import datetime

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import model_selection
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

%matplotlib inline

# 2. Load Data

In [2]:
# The file is comma-separated and its numbers use '.' as the decimal mark
# (see the preview below), so no `decimal` override is passed. With
# decimal=',' the '.'-formatted values are not recognised as floats and the
# feature columns would be left as strings.
merged = pd.read_csv('prepared data/Data_Preperation_one_ticker_internal.csv', sep=',')
merged.head()

Unnamed: 0,AAPL,aaplopen,aaplclose,aapllow,aaplhigh,aaplvolume
0,-1,-0.8439133708661497,-0.8491356788444249,-0.8505482714349095,-0.8476315604538179,-0.3788670257818751
1,-1,-0.8489391768685292,-0.8490192685242434,-0.8503652918037392,-0.8513293217198822,-0.5214024609069974
2,0,-0.8540951882514695,-0.8682653430692955,-0.8684014541000062,-0.8569854903450415,-0.1524493953531429
3,0,-0.8673758415424508,-0.8713824836781627,-0.8797197895367793,-0.8690966312786422,0.2180639638648587
4,0,-0.868821086498885,-0.8796474264520685,-0.8789748184883415,-0.8705654281968385,-0.1115821567711611


# 3. Splitting data in training and testing

In [3]:
# Features: the five AAPL price/volume columns. Target: the 'AAPL' column.
feature_columns = ['aaplopen', 'aaplclose', 'aapllow', 'aaplhigh', 'aaplvolume']
x = merged[feature_columns]
y = merged['AAPL']

In [4]:
# Dimensions of the feature matrix and of the target vector.
(x.shape, y.shape)

((2518, 5), (2518,))

In [5]:
# Use 70 % of the rows for training and the rest for testing; the fixed seed
# makes the random split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=42,
)



### Training is carried out with the following five methods:
- Support Vector Machine
- Linear Discriminant Analysis
- Gradient Boosting
- Random Forest
- KNN

Different parameters were tested for each method; the configuration that gave the best result is used below. In addition, each method is used to make a prediction for the ticker on the same data point, and the predictions differ between methods.

# 4. Support Vector Machine

In [6]:
# Linear-kernel support vector classifier. It is built from the `SVC` class
# directly rather than via `svm.SVC`: this assignment rebinds the name `svm`
# from the sklearn module to the estimator, so `svm.SVC(...)` would raise an
# AttributeError if the cell were executed a second time.
svm = SVC(kernel='linear', C=1, max_iter=100000000)

In [7]:
# Train the linear SVM on the training split.
svm.fit(X=x_train, y=y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=100000000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
# Score of the fitted SVM on the held-out test split.
svm.score(X=x_test, y=y_test)

0.6984126984126984

In [9]:
# Classify one hand-written sample; values are given in the feature column
# order (open, close, low, high, volume).
sample_svm = [[-0.84, -0.849, -0.85, -0.84, -0.4]]
res_svm = svm.predict(sample_svm)
res_svm[0]

0

# 5. Linear Discriminant Analysis

In [10]:
# Linear Discriminant Analysis classifier, created with its default settings.
lda = LinearDiscriminantAnalysis()

In [11]:
# Train the LDA classifier on the training split.
lda.fit(X=x_train, y=y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [12]:
# Score of the fitted LDA classifier on the held-out test split.
lda.score(X=x_test, y=y_test)

0.7208994708994709

In [13]:
# Classify one hand-written sample; values are given in the feature column
# order (open, close, low, high, volume).
sample_lda = [[-0.84, -0.849, -0.85, -0.84, -0.4]]
res_lda = lda.predict(sample_lda)
res_lda[0]

0

# 6. Gradient Boosting

In [14]:
# Min-max scaler with default settings; it becomes the first stage of the
# gradient-boosting pipeline assembled below.
scaler = MinMaxScaler()

In [15]:
num_trees = 10
# Ten folds taken in row order (no shuffling). `random_state` is deliberately
# not passed: it has no effect unless shuffle=True, and recent scikit-learn
# releases reject that combination with a ValueError.
kfold = model_selection.KFold(n_splits=10)
model = GradientBoostingClassifier(n_estimators=num_trees, random_state=42)
# Fit on the unscaled training split; the single-sample prediction further
# down calls this fitted model directly.
model.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)

In [16]:
# Pipeline stages: min-max scaling first, then the gradient-boosting model.
steps = [
    ('scale', scaler),
    ('GB', model),
]

In [17]:
# Chain the scaler and the classifier into a single estimator.
pipeline = Pipeline(steps=steps)

In [18]:
# Cross-validate the scale-then-boost pipeline on the training split using the
# folds defined above, then report the mean of the per-fold scores.
gb = model_selection.cross_val_score(
    estimator=pipeline,
    X=x_train,
    y=y_train,
    cv=kfold,
)
print(gb.mean())

0.7712859527478171


In [19]:
# Classify one hand-written sample with the gradient-boosting model itself
# (not the scaling pipeline); values follow the feature column order
# (open, close, low, high, volume).
sample_gb = [[-0.84, -0.849, -0.85, -0.84, -0.4]]
res_gb = model.predict(sample_gb)
res_gb[0]

0

# 7. Random Forest

In [20]:
# Random forest with 100 trees and a fixed seed.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [21]:
# Train the random forest on the training split.
rf.fit(X=x_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [22]:
# Score of the fitted random forest on the held-out test split.
rf.score(X=x_test, y=y_test)

0.798941798941799

In [23]:
# Per-feature importances of the fitted forest, one value per input column
# (same order as the columns of x_train).
rf.feature_importances_

array([0.18646095, 0.19483849, 0.18137445, 0.21626596, 0.22106015])

In [24]:
# Classify one hand-written sample; values are given in the feature column
# order (open, close, low, high, volume).
sample_rf = [[-0.84, -0.849, -0.85, -0.84, -0.4]]
res_rf = rf.predict(sample_rf)
res_rf[0]

1

# 8. KNN

In [25]:
# k-nearest-neighbours classifier, created with its default settings.
knn = KNeighborsClassifier()

In [26]:
# Train the k-nearest-neighbours classifier on the training split.
knn.fit(X=x_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [27]:
# Score of the fitted k-nearest-neighbours classifier on the held-out test split.
knn.score(X=x_test, y=y_test)

0.8029100529100529

In [28]:
# Classify one hand-written sample; values are given in the feature column
# order (open, close, low, high, volume).
sample_knn = [[-0.84, -0.849, -0.85, -0.84, -0.4]]
res_knn = knn.predict(sample_knn)
res_knn[0]

0