In [1]:
# libraries!
import numpy as np      # numpy is Python's "array" library
import pandas as pd     # Pandas is Python's "data" library ("dataframe" == spreadsheet)

In [2]:
# let's read in our stock data...
# 
# 
filename = 'stock_prices.csv'
df = pd.read_csv(filename)        # encoding="utf-8" et al.
print(f"{filename} : file read into a pandas dataframe.")

stock_prices.csv : file read into a pandas dataframe.


In [3]:
#
# a dataframe is a "spreadsheet in Python"   (seems to have an extra column!)
#
# let's view it!
df.head()

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.00073
1,20170104_1332,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,,False,0.012324
2,20170104_1333,2017-01-04,1333,3150.0,3210.0,3140.0,3210.0,270800,1.0,,False,0.006154
3,20170104_1376,2017-01-04,1376,1510.0,1550.0,1510.0,1550.0,11300,1.0,,False,0.011053
4,20170104_1377,2017-01-04,1377,3270.0,3350.0,3270.0,3330.0,150800,1.0,,False,0.003026


In [4]:
#
# let's look at the dataframe's "info":
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2332531 entries, 0 to 2332530
Data columns (total 12 columns):
RowId               object
Date                object
SecuritiesCode      int64
Open                float64
High                float64
Low                 float64
Close               float64
Volume              int64
AdjustmentFactor    float64
ExpectedDividend    float64
SupervisionFlag     bool
Target              float64
dtypes: bool(1), float64(7), int64(2), object(2)
memory usage: 198.0+ MB


In [5]:
# Let's look at the dataframe's columns:
df.columns

Index(['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag',
       'Target'],
      dtype='object')

In [6]:
# we can drop a series of data (a row or a column)
# they're indicated by numeric value, row~0, col~1, but let's use readable names instead:
ROW = 0
COLUMN = 1

df_clean1 = df.drop(['Date', 'RowId', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag'], axis=COLUMN)
df_clean1.head()

# df_clean1 is a new dataframe, without that unwanted column

Unnamed: 0,SecuritiesCode,Open,High,Low,Close,Volume,Target
0,1301,2734.0,2755.0,2730.0,2742.0,31400,0.00073
1,1332,568.0,576.0,563.0,571.0,2798500,0.012324
2,1333,3150.0,3210.0,3140.0,3210.0,270800,0.006154
3,1376,1510.0,1550.0,1510.0,1550.0,11300,0.011053
4,1377,3270.0,3350.0,3270.0,3330.0,150800,0.003026


In [7]:
#
# let's drop _all_ rows with data that is missing/NaN (not-a-number)
df_clean2 = df_clean1.dropna()
df_clean2.info()  # print the info

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2324923 entries, 0 to 2332530
Data columns (total 7 columns):
SecuritiesCode    int64
Open              float64
High              float64
Low               float64
Close             float64
Volume            int64
Target            float64
dtypes: float64(5), int64(2)
memory usage: 141.9 MB


In [8]:
# Covert target to two category
# Positive and Negative

def convert_rate(rate):
    """ return the positive and nagative  """

    if rate > 0:
        return 1
    else: return 0

In [9]:
#
# we can "apply" to a whole column and create a new column
#   it may give a warning, but this is ok...
#

df_clean3 = df_clean2.copy()  # copy everything AND...

# add a new column, 'irisnum'
df_clean3['twoclass'] = df_clean2['Target'].apply(convert_rate)

# let's see...
df_clean3.head()

Unnamed: 0,SecuritiesCode,Open,High,Low,Close,Volume,Target,twoclass
0,1301,2734.0,2755.0,2730.0,2742.0,31400,0.00073,1
1,1332,568.0,576.0,563.0,571.0,2798500,0.012324,1
2,1333,3150.0,3210.0,3140.0,3210.0,270800,0.006154,1
3,1376,1510.0,1550.0,1510.0,1550.0,11300,0.011053,1
4,1377,3270.0,3350.0,3270.0,3330.0,150800,0.003026,1


In [10]:
# Covert target to ten category

def convert_rate_multiclass(rate):
    """ return the classes  """

    if rate < -0.025:
        return 0
    elif rate >= -0.025 and rate < -0.015:
        return 1
    elif rate >= -0.015 and rate < -0.01:
        return 2
    elif rate >= -0.01 and rate < -0.005:
        return 3
    elif rate >= -0.005 and rate <= 0:
        return 4
    elif rate > 0 and rate < 0.005:
        return 5
    elif rate >= 0.005 and rate < 0.01:
        return 6
    elif rate >= 0.01 and rate < 0.015:
        return 7
    elif rate >= 0.015 and rate < 0.025:
        return 8
    else: return 9

In [11]:
#
# we can "apply" to a whole column and create a new column
#   it may give a warning, but this is ok...
#

df_clean4 = df_clean3.copy()  # copy everything AND...

# add a new column, 'irisnum'
df_clean4['multiclasses'] = df_clean3['Target'].apply(convert_rate_multiclass)

# let's see...
df_clean4.head()

Unnamed: 0,SecuritiesCode,Open,High,Low,Close,Volume,Target,twoclass,multiclasses
0,1301,2734.0,2755.0,2730.0,2742.0,31400,0.00073,1,5
1,1332,568.0,576.0,563.0,571.0,2798500,0.012324,1,7
2,1333,3150.0,3210.0,3140.0,3210.0,270800,0.006154,1,6
3,1376,1510.0,1550.0,1510.0,1550.0,11300,0.011053,1,7
4,1377,3270.0,3350.0,3270.0,3330.0,150800,0.003026,1,5


In [13]:
# Get the code list

stock_list = df_clean4.groupby('SecuritiesCode').count().index

In [14]:
# Collect data from continue ten days as input
# Target rate in the 10th day as output

X_all = []
y_all = []
for stockID in stock_list:
    A = df_clean4[df_clean4.SecuritiesCode == stockID]
    B = A.drop(['Target','SecuritiesCode', 'multiclasses', 'twoclass'], axis=1 ).to_numpy().astype('float64') 
    for j in range(len(B) - 9):
        X_all.append(B[j: j+10].reshape(-1))
        y_all.append(A.iloc[j: j+1, 7])

In [15]:
len(X_all)

2306923

In [16]:
#
# We next separate into test data and training data ... 
#    + We will train on the training data...
#    + We will _not_ look at the testing data to build the model
#
# Then, afterward, we will test on the testing data -- and see how well we do!
#

#
# a common convention:  train on 80%, test on 20%    Let's define the TEST_PERCENT
#

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(np.array(X_all), np.array(y_all).ravel(), test_size=0.2, random_state=42)

print(f"training with {len(y_train)} rows;  testing with {len(y_test)} rows\n" )

print(f"Held-out data... (testing data: {len(y_test)})")
print(f"y_test: {y_test}\n")
print(f"X_test (few rows): {X_test[0:5,:]}")  # 5 rows
print()
print(f"Data used for modeling... (training data: {len(y_train)})")
print(f"y_train: {y_train}\n")
print(f"X_train (few rows): {X_train[0:5,:]}")  # 5 rows

training with 922769 rows;  testing with 1384154 rows

Held-out data... (testing data: 1384154)
y_test: [1 1 0 ... 1 0 0]

X_test (few rows): [[2.400e+03 2.406e+03 2.371e+03 2.399e+03 9.400e+03 2.399e+03 2.400e+03
  2.365e+03 2.378e+03 4.500e+03 2.395e+03 2.440e+03 2.391e+03 2.438e+03
  4.900e+03 2.400e+03 2.412e+03 2.365e+03 2.366e+03 4.600e+03 2.353e+03
  2.437e+03 2.353e+03 2.437e+03 7.900e+03 2.430e+03 2.430e+03 2.380e+03
  2.389e+03 3.000e+03 2.351e+03 2.351e+03 2.319e+03 2.319e+03 2.900e+03
  2.319e+03 2.336e+03 2.260e+03 2.266e+03 4.600e+03 2.289e+03 2.300e+03
  2.262e+03 2.272e+03 3.400e+03 2.270e+03 2.274e+03 2.241e+03 2.244e+03
  3.000e+03]
 [5.140e+02 5.160e+02 5.060e+02 5.080e+02 5.350e+05 5.080e+02 5.100e+02
  5.020e+02 5.090e+02 5.120e+05 5.130e+02 5.220e+02 5.110e+02 5.130e+02
  3.090e+05 5.140e+02 5.260e+02 5.090e+02 5.200e+02 4.550e+05 5.280e+02
  5.340e+02 5.230e+02 5.250e+02 3.560e+05 5.340e+02 5.360e+02 5.280e+02
  5.340e+02 3.630e+05 5.420e+02 5.460e+02 5.380e+02 5

In [None]:
#
# we also use "cross validation"
#

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
#
# cross-validation splits the training set into two pieces:
#   + model-building and model-validation. We'll use "build" and "validate"
#
best_k = 84  # Not correct!
best_accuracy = 0.0  # also not correct...

# Note that we are cross-validating using only our TEST data!
for k in range(1,10):
    knn_cv_model = KNeighborsClassifier(n_neighbors=k)   # build knn_model for every k!
    cv_scores = cross_val_score( knn_cv_model, X_train, y_train, cv=5 )  # cv=5 means 80/20
    print(cv_scores)  # just to see the five scores... 
    average_cv_accuracy = cv_scores.mean()  # mean() is numpy's built-in average function 
    print(f"k: {k:2d}  cv accuracy: {average_cv_accuracy:7.4f}")

    
# assign best value of k to best_k
    if average_cv_accuracy > best_accuracy:
        best_accuracy = average_cv_accuracy
        best_k = k      # at the moment this is incorrect   
# you'll need to use the loop above to find and remember the real best_k

print(f"best_k = {best_k}   yields the highest average cv accuracy.")  # print the best one

In [None]:
# Our computer can not even run a KNN 
# It takes too much time, so we give up this idea