In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, svm
from sklearn.metrics import accuracy_score, confusion_matrix

df = pd.read_csv('stock_apple.csv')
print(df.head())

         Date        Open        High          Low        Close    Adj Close  \
0  2018-04-30  1030.01001  1037.00000  1016.849976  1017.330017  1017.330017   
1  2018-04-27  1046.00000  1049.50000  1025.589966  1030.050049  1030.050049   
2  2018-04-26  1029.51001  1047.97998  1018.190002  1040.040039  1040.040039   
3  2018-04-25  1025.52002  1032.48999  1015.309998  1021.179993  1021.179993   
4  2018-04-24  1052.00000  1057.00000  1010.590027  1019.979980  1019.979980   

    Volume  
0  1664113  
1  1611500  
2  2079500  
3  2391100  
4  4760300  


In [2]:
# Define the fields for yesterday's closing price and yesterday's trading volume
df['Yes Adj Close'] = df['Adj Close'].shift(-1)
df['Yes Volume'] = df['Volume'].shift(-1)

# Define the gain field (today's closing price - yesterday's closing price)"、
df['Gain'] = df['Adj Close']- df['Yes Adj Close']

# Define the rise/fall category field. If today's closing price - yesterday's closing price > 0, set it to 1; otherwise, set it to 0
df['Up'] = df['Gain'].apply(lambda x:1 if x > 0 else 0)

print(df.head())

         Date        Open        High          Low        Close    Adj Close  \
0  2018-04-30  1030.01001  1037.00000  1016.849976  1017.330017  1017.330017   
1  2018-04-27  1046.00000  1049.50000  1025.589966  1030.050049  1030.050049   
2  2018-04-26  1029.51001  1047.97998  1018.190002  1040.040039  1040.040039   
3  2018-04-25  1025.52002  1032.48999  1015.309998  1021.179993  1021.179993   
4  2018-04-24  1052.00000  1057.00000  1010.590027  1019.979980  1019.979980   

    Volume  Yes Adj Close  Yes Volume       Gain  Up  
0  1664113    1030.050049   1611500.0 -12.720032   0  
1  1611500    1040.040039   2079500.0  -9.989990   0  
2  2079500    1021.179993   2391100.0  18.860046   1  
3  2391100    1019.979980   4760300.0   1.200013   1  
4  4760300    1067.449951   2341300.0 -47.469971   0  


In [3]:
# Handle missing values
df = df.dropna()

print(df.head())

         Date        Open        High          Low        Close    Adj Close  \
0  2018-04-30  1030.01001  1037.00000  1016.849976  1017.330017  1017.330017   
1  2018-04-27  1046.00000  1049.50000  1025.589966  1030.050049  1030.050049   
2  2018-04-26  1029.51001  1047.97998  1018.190002  1040.040039  1040.040039   
3  2018-04-25  1025.52002  1032.48999  1015.309998  1021.179993  1021.179993   
4  2018-04-24  1052.00000  1057.00000  1010.590027  1019.979980  1019.979980   

    Volume  Yes Adj Close  Yes Volume       Gain  Up  
0  1664113    1030.050049   1611500.0 -12.720032   0  
1  1611500    1040.040039   2079500.0  -9.989990   0  
2  2079500    1021.179993   2391100.0  18.860046   1  
3  2391100    1019.979980   4760300.0   1.200013   1  
4  4760300    1067.449951   2341300.0 -47.469971   0  


In [4]:
df_normal = pd.DataFrame()
df_normal['volume_ratio'] = df['Volume'].apply(pd.to_numeric) / df['Yes Volume'].apply(pd.to_numeric) 
df_normal['close_price_ratio'] = df['Adj Close'].apply(pd.to_numeric) / df['Yes Adj Close'].apply(pd.to_numeric)

df_normal['target'] = df['Up'].shift(1)

df_normal = df_normal.dropna()
df_normal = df_normal.reset_index(drop=True)
print(df_normal.head())

   volume_ratio  close_price_ratio  target
0      0.774946           0.990395     0.0
1      0.869683           1.018469     0.0
2      0.502300           1.001177     1.0
3      2.033187           0.955530     1.0
4      1.238980           0.994865     0.0


In [5]:
X=df_normal[['volume_ratio', 'close_price_ratio']]
y=df_normal['target']

In [6]:
# Standardize data
standardizer = preprocessing.StandardScaler()
X = standardizer.fit_transform(X)

# Split dataset
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=1)


# Build a model
model = svm.SVC()
classifier = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('confusion matrix: {}'.format(con_matrix))

number of correct sample: 62
accuracy: 0.6138613861386139
confusion matrix: [[ 1 38]
 [ 1 61]]
