# HW3 - Stock Movement Prediction

作業檔案：
- hw3.ipynb

資料：
https://www.sharecast.com/index/SP_500/prices/download

- train.csv: S&P 500 訓練資料(2009-2017)
- test.csv: S&P 500 測試資料(2018)


In [2]:
# Read data

import pandas as pd
import numpy as np

train_data_path = './train.csv'
test_data_path = './test.csv'

train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

print(train_df.shape)
print(train_df.head())
print(test_df.shape)
print(test_df.head())

(2264, 6)
          Date  Open Price  Close Price  High Price  Low Price      Volume
0  02-Jan-2009      902.99       931.80      934.73     899.35  4048270080
1  05-Jan-2009      929.17       927.45      936.63     919.53  5413910016
2  06-Jan-2009      931.17       934.70      943.85     927.28  5392620032
3  07-Jan-2009      927.45       906.65      927.45     902.37  4704940032
4  08-Jan-2009      905.73       909.73      910.00     896.81  4991549952
(252, 6)
          Date  Open Price  Close Price  High Price  Low Price      Volume
0  02-Jan-2018     2683.73      2695.81     2695.89    2682.36  1846463232
1  03-Jan-2018     2697.85      2713.06     2714.37    2697.77  2090595328
2  04-Jan-2018     2719.31      2723.99     2729.29    2719.07  2100767744
3  05-Jan-2018     2731.33      2743.15     2743.45    2727.92  1918869120
4  08-Jan-2018     2742.67      2747.71     2748.51    2737.60  1894823936


In [3]:
# Drop unnecessary columns

drop_col_names = ['Date'] # !--- or you can modify it to drop the columns you don't want ---!

train_df.drop(columns=drop_col_names, inplace=True)
test_df.drop(columns=drop_col_names, inplace=True)

print(train_df.shape)
print(train_df.head())
print(test_df.shape)
print(test_df.head())

(2264, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0      902.99       931.80      934.73     899.35  4048270080
1      929.17       927.45      936.63     919.53  5413910016
2      931.17       934.70      943.85     927.28  5392620032
3      927.45       906.65      927.45     902.37  4704940032
4      905.73       909.73      910.00     896.81  4991549952
(252, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0     2683.73      2695.81     2695.89    2682.36  1846463232
1     2697.85      2713.06     2714.37    2697.77  2090595328
2     2719.31      2723.99     2729.29    2719.07  2100767744
3     2731.33      2743.15     2743.45    2727.92  1918869120
4     2742.67      2747.71     2748.51    2737.60  1894823936


In [4]:
# Add the column `Tomorrow Movement` by comparing the `Close Price` with the previous days as the training target

train_df['Tomorrow Movement'] = np.where(train_df['Close Price'].diff() >= 0, 1, 0)
test_df['Tomorrow Movement'] = np.where(test_df['Close Price'].diff() >= 0, 1, 0)

train_df['Tomorrow Movement'] = train_df['Tomorrow Movement'].shift(-1)
test_df['Tomorrow Movement'] = test_df['Tomorrow Movement'].shift(-1)

print(train_df.head())
print(train_df.tail())

   Open Price  Close Price  High Price  Low Price      Volume  \
0      902.99       931.80      934.73     899.35  4048270080   
1      929.17       927.45      936.63     919.53  5413910016   
2      931.17       934.70      943.85     927.28  5392620032   
3      927.45       906.65      927.45     902.37  4704940032   
4      905.73       909.73      910.00     896.81  4991549952   

   Tomorrow Movement  
0                0.0  
1                1.0  
2                0.0  
3                1.0  
4                0.0  
      Open Price  Close Price  High Price  Low Price      Volume  \
2259     2684.22      2683.34     2685.35    2678.13  1383888512   
2260     2679.09      2680.50     2682.74    2677.96  1103808384   
2261     2682.10      2682.62     2685.64    2678.91  1149108352   
2262     2686.10      2687.54     2687.66    2682.69  1126089856   
2263     2689.15      2673.61     2692.12    2673.61  1332374016   

      Tomorrow Movement  
2259                0.0  
2260      

In [5]:
# !--- You can add your own data preprocessing here ---!

In [6]:
# Drop rows with NaN values

train_df = train_df.dropna()
test_df = test_df.dropna()

print(train_df.shape)

(2263, 6)


In [7]:
# Divide x and y data

train_x_df = train_df.drop(columns=['Tomorrow Movement'])
train_y_df = train_df['Tomorrow Movement']

test_x_df = test_df.drop(columns=['Tomorrow Movement'])
test_y_df = test_df['Tomorrow Movement']

print(train_x_df.shape)
print(train_x_df.head())
print(train_y_df.shape)
print(train_y_df.head())
print('-----')
print(test_x_df.shape)
print(test_x_df.head())
print(test_y_df.shape)
print(test_y_df.head())

(2263, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0      902.99       931.80      934.73     899.35  4048270080
1      929.17       927.45      936.63     919.53  5413910016
2      931.17       934.70      943.85     927.28  5392620032
3      927.45       906.65      927.45     902.37  4704940032
4      905.73       909.73      910.00     896.81  4991549952
(2263,)
0    0.0
1    1.0
2    0.0
3    1.0
4    0.0
Name: Tomorrow Movement, dtype: float64
-----
(251, 5)
   Open Price  Close Price  High Price  Low Price      Volume
0     2683.73      2695.81     2695.89    2682.36  1846463232
1     2697.85      2713.06     2714.37    2697.77  2090595328
2     2719.31      2723.99     2729.29    2719.07  2100767744
3     2731.33      2743.15     2743.45    2727.92  1918869120
4     2742.67      2747.71     2748.51    2737.60  1894823936
(251,)
0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: Tomorrow Movement, dtype: float64


In [8]:
# Normalize data

# !--- Modify here if you want ---!

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train_x_df)

normalized_train_x_df = scaler.transform(train_x_df)
normalized_train_x_df = np.transpose(normalized_train_x_df)

normalized_train_x_df = pd.DataFrame({
    'Open Price': normalized_train_x_df[0],
    'Close Price': normalized_train_x_df[1],
    'High Price': normalized_train_x_df[2],
    'Low Price': normalized_train_x_df[3],
    'Volume': normalized_train_x_df[4],
})

normalized_test_x_df = scaler.transform(test_x_df)
normalized_test_x_df = np.transpose(normalized_test_x_df)
normalized_test_x_df = pd.DataFrame({
    'Open Price': normalized_test_x_df[0],
    'Close Price': normalized_test_x_df[1],
    'High Price': normalized_test_x_df[2],
    'Low Price': normalized_test_x_df[3],
    'Volume': normalized_test_x_df[4],
})

print(normalized_train_x_df.head())
print(train_y_df[:5])

  return self.partial_fit(X, y)
  # Remove the CWD from sys.path while we load stuff.


   Open Price  Close Price  High Price  Low Price    Volume
0   -1.552572    -1.494607   -1.505683  -1.541181  0.813175
1   -1.498571    -1.503581   -1.501760  -1.499581  1.823826
2   -1.494446    -1.488625   -1.486853  -1.483605  1.808070
3   -1.502119    -1.546489   -1.520714  -1.534956  1.299148
4   -1.546921    -1.540136   -1.556744  -1.546417  1.511255
0    0.0
1    1.0
2    0.0
3    1.0
4    0.0
Name: Tomorrow Movement, dtype: float64


# Logistic Regression

In [9]:
# Train & Predict using Logistic Regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_model =  LogisticRegression()
lr_model.fit(normalized_train_x_df, train_y_df)

print('training accuracy:')
lr_training_pred = lr_model.predict(normalized_train_x_df) # !-- Predict training target & print the training accuracy here --!
lr_training_acc = np.mean(lr_training_pred == train_y_df)
print(lr_training_acc)

print('\ntesting accuracy:')
# !-- Predict testing target & print the testing accuracy here --!
lr_predict_test_result = lr_model.predict(normalized_test_x_df)
lr_testing_acc = np.mean(lr_predict_test_result == test_y_df)
print(lr_testing_acc)

print('\npredicted testing labels:')
print(lr_predict_test_result)



training accuracy:
0.5475033141847105

testing accuracy:
0.5258964143426295

predicted testing labels:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1.]


In [10]:
# Print precision, recall, fbeta-score and confusion matrix

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(test_y_df, lr_predict_test_result, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(test_y_df, lr_predict_test_result).ravel()
print((tn, fp, fn, tp))

precision, recall, fbeta-score:
(0.5137282196515144, 0.5258964143426295, 0.3694764768608263, None)

confusion matrix(tn, fp, fn, tp):
(1, 118, 1, 131)


# SVM

In [12]:
# Train & Predict with SVC

from sklearn.svm import SVC

svc_model =  SVC(gamma='scale')
svc_model.fit(normalized_train_x_df, train_y_df) # !-- Fill the training data here --!

print('training accuracy:')
lr_training_pred = svc_model.predict(normalized_train_x_df)
svc_training_acc = np.mean(lr_training_pred == train_y_df)
print(svc_training_acc)

print('\ntesting accuracy:')
# !-- Predict testing target & print the testing accuracy here --!
svc_predict_test_result = svc_model.predict(normalized_test_x_df)
svc_testing_acc = np.mean(svc_predict_test_result == test_y_df)
print(svc_testing_acc)

print('\npredicted testing labels:')
print(svc_predict_test_result)

training accuracy:
0.5483870967741935

testing accuracy:
0.5258964143426295

predicted testing labels:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [13]:
# Print precision, recall, fbeta-score and confusion matrix

print('precision, recall, fbeta-score:')
print(precision_recall_fscore_support(test_y_df, svc_predict_test_result, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(test_y_df, svc_predict_test_result).ravel()
print((tn, fp, fn, tp))

precision, recall, fbeta-score:


  'precision', 'predicted', average, warn_for)


(0.2765670386184346, 0.5258964143426295, 0.3624977895207681, None)

confusion matrix(tn, fp, fn, tp):
(0, 119, 0, 132)


# Neural Network

In [14]:
# Define NN output groundtruth

falling_prob = pd.DataFrame(data=np.where(train_y_df == 0, 1, 0)[:])
train_y_df = pd.DataFrame(data=np.where(train_y_df == 0, 0, 1)[:])
train_y_df = pd.concat( [ falling_prob, train_y_df ], axis=1, ignore_index=True )

falling_prob = pd.DataFrame(data=np.where(test_y_df == 0, 1, 0)[:])
test_y_df = pd.DataFrame(data=np.where(test_y_df == 0, 0, 1)[:])
test_y_df = pd.concat( [ falling_prob, test_y_df ], axis=1, ignore_index=True )

print(train_y_df.shape)
print(train_y_df.head())

(2263, 2)
   0  1
0  1  0
1  0  1
2  1  0
3  0  1
4  1  0


In [40]:
# Define NN structure

import torch
import torch.nn.functional as F

# !--- You can modify the NN structure here ---!
class M_NN(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(M_NN, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        h = self.linear1(x)
        acti_out = F.relu(h)
        y_pred = self.linear2(h)
        return y_pred


# N = batch size, D_in = input size, H = hidden size, D_out = output size
N, D_in, H, D_out = 300, 5, 100, 2  # !--- You can modify here ---!

model = M_NN(D_in, H, D_out)
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum') # !--- You can modify here ---!
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) # !--- You can modify here ---!


# Train NN
# !--- You can modify here ---!

for t in range(1000):
    for batch_num in range(N, len(normalized_train_x_df), N): 
        X = torch.Tensor(normalized_train_x_df.to_numpy())[batch_num-N:batch_num]
        Y = torch.Tensor(train_y_df.to_numpy())[batch_num-N:batch_num]
        y_pred = model(X) # !-- Fill the training batch data here --!
        loss = criterion(y_pred, Y) # !-- Fill the prediction & groundtruth here to calculate loss --!
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (t%100 == 0):
        print('epoch:%d - loss:%.10f' % (t, loss.item()))

epoch:0 - loss:526.0357055664
epoch:100 - loss:416.2016296387
epoch:200 - loss:415.3622131348
epoch:300 - loss:415.1055908203
epoch:400 - loss:414.9973144531
epoch:500 - loss:414.9463500977
epoch:600 - loss:414.9208679199
epoch:700 - loss:414.9069213867
epoch:800 - loss:414.8977050781
epoch:900 - loss:414.8901367188


In [43]:
# Predict

nn_predict_train_y = model(torch.Tensor(normalized_train_x_df.to_numpy()))
result_train = np.where(nn_predict_train_y[:, 0] > nn_predict_train_y[:, 1], 1, 0) # !-- You can modify here --!
print('training accuracy:')
print(accuracy_score(train_y_df[0], result_train))

nn_predict_test_y = model(torch.Tensor(normalized_test_x_df.to_numpy()))
result_test = np.where(nn_predict_test_y[:, 0] > nn_predict_test_y[:, 1], 1, 0) # !-- You can modify here --!
print('\ntesting accuracy:')
print(accuracy_score(test_y_df[0], result_test))

print('\npredicted testing prob:')
print(nn_predict_test_y)
print('\npredicted testing labels:')
print(result_test)

training accuracy:
0.5497127706584181

testing accuracy:
0.5219123505976095

predicted testing prob:
tensor([[ 3.1489e-02, -3.1790e-02],
        [ 4.6007e-02, -4.7118e-02],
        [ 2.2245e-02, -2.4210e-02],
        [ 4.2855e-02, -4.2700e-02],
        [ 2.8167e-02, -2.7563e-02],
        [ 1.4437e-02, -1.6183e-02],
        [ 2.6864e-02, -2.5561e-02],
        [ 5.4298e-02, -5.4996e-02],
        [ 6.3450e-02, -6.4568e-02],
        [-2.2035e-02,  2.1339e-02],
        [ 7.6339e-02, -7.6734e-02],
        [ 2.1561e-02, -2.1525e-02],
        [ 5.7131e-02, -5.6638e-02],
        [ 9.1495e-02, -9.2223e-02],
        [ 4.8534e-02, -4.8801e-02],
        [ 2.5732e-02, -2.4936e-02],
        [ 2.7160e-02, -2.6068e-02],
        [ 1.0494e-01, -1.0564e-01],
        [ 6.5963e-03, -7.3728e-03],
        [ 1.3529e-02, -1.4268e-02],
        [ 2.5977e-02, -2.5552e-02],
        [ 4.7432e-02, -5.0999e-02],
        [-7.9345e-02,  7.9732e-02],
        [-2.0523e-01,  2.0194e-01],
        [ 2.3404e-01, -2.3243e-01],

        [-7.3027e-02,  7.0189e-02]], grad_fn=<AddmmBackward>)

predicted testing labels:
[1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1 1 1 0 0 0 1
 1 0 0 0 1 1 1 1 1 1 1 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 1 1 0 0 1 0 1 0 1 1 1
 0 0 0 0 1 1 0 0 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 0 1 0 1 0 0 1 1 1 0 1 1 1 1 1 0 0 0 1
 1 0 0 0 0 1 1 1 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0]


In [44]:
# Print precision, recall, fbeta-score and confusion matrix

print('\nprecision, recall, fbeta-score:')
print(precision_recall_fscore_support(test_y_df[0], result_test, average='weighted'))
print('\nconfusion matrix(tn, fp, fn, tp):')
tn, fp, fn, tp = confusion_matrix(test_y_df[0], result_test).ravel()
print((tn, fp, fn, tp))


precision, recall, fbeta-score:
(0.5390833287570039, 0.5219123505976095, 0.5046506732366034, None)

confusion matrix(tn, fp, fn, tp):
(45, 87, 33, 86)


# Discussion