<a href="https://colab.research.google.com/github/Haren2006/DS340W-Project-by-Haren-Anand-and-Hyun-Woo-Jang/blob/main/DS340W_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import numpy as np
import pandas as pd
from pandas_datareader import data

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.metrics import classification_report

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [22]:
# Load 2019 percent price variation for all tech stocks
PVAR = pd.read_csv('Example_2019_price_var.csv', index_col=0)

# Load dataset with all financial indicators (referring to end of 2018)
DATA = pd.read_csv('Example_DATASET.csv', index_col=0)

In [23]:
# Divide data in train and testing splits
train_split, test_split = train_test_split(DATA, test_size=0.2, random_state=1, stratify=DATA['class'])
X_train = train_split.iloc[:, :-1].values
y_train = train_split.iloc[:, -1].values
X_test = test_split.iloc[:, :-1].values
y_test = test_split.iloc[:, -1].values

print(f'Total number of samples: {DATA.shape[0]}')
print()
print(f'Number of training samples: {X_train.shape[0]}')
print()
print(f'Number of testing samples: {X_test.shape[0]}')
print()
print(f'Number of features: {X_train.shape[1]}')

Total number of samples: 638

Number of training samples: 510

Number of testing samples: 128

Number of features: 107


In [24]:
# Standardize input data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [25]:
# Parameter grid to be tuned
tuned_parameters = [{'kernel': ['rbf', 'linear'], 'gamma': [1e-3, 1e-4], 'C': [0.01, 0.1, 1, 10, 100]}]

clf1 = GridSearchCV(SVC(random_state=1),
                    tuned_parameters,
                    n_jobs=4,
                    scoring='precision_weighted',
                    cv=5)

clf1.fit(X_train, y_train)

print('Best score and parameters found on development set:')
print()
print('%0.3f for %r' % (clf1.best_score_, clf1.best_params_))
print()

Best score and parameters found on development set:

0.713 for {'C': 0.01, 'gamma': 0.001, 'kernel': 'linear'}



In [26]:
# Parameter grid to be tuned
tuned_parameters = {'n_estimators': [1024, 4096],
                    'max_features': ['auto', 'sqrt'],
                    'max_depth': [4, 6, 8],
                    'criterion': ['gini', 'entropy']}

clf2 = GridSearchCV(RandomForestClassifier(random_state=1),
                    tuned_parameters,
                    n_jobs=4,
                    scoring='precision_weighted',
                    cv=5)

clf2.fit(X_train, y_train)

print('Best score and parameters found on development set:')
print()
print('%0.3f for %r' % (clf2.best_score_, clf2.best_params_))
print()

Best score and parameters found on development set:

0.713 for {'criterion': 'entropy', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 1024}



In [27]:
# Parameter grid to be tuned
tuned_parameters = {'learning_rate': [0.01, 0.001],
                    'max_depth': [4, 6, 8],
                    'n_estimators': [512, 1024]}

clf3 = GridSearchCV(xgb.XGBClassifier(random_state=1),
                   tuned_parameters,
                   n_jobs=4,
                   scoring='precision_weighted',
                   cv=5)

clf3.fit(X_train, y_train)

print('Best score and parameters found on development set:')
print()
print('%0.3f for %r' % (clf3.best_score_, clf3.best_params_))
print()

Best score and parameters found on development set:

0.700 for {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 1024}



In [28]:
# Parameter grid to be tuned
tuned_parameters = {'hidden_layer_sizes': [(32,), (64,), (32, 64, 32)],
                    'activation': ['tanh', 'relu'],
                    'solver': ['lbfgs', 'adam']}

clf4 = GridSearchCV(MLPClassifier(random_state=1, batch_size=4, early_stopping=True),
                    tuned_parameters,
                    n_jobs=4,
                    scoring='precision_weighted',
                    cv=5)

clf4.fit(X_train, y_train)

print('Best score, and parameters, found on development set:')
print()
print('%0.3f for %r' % (clf4.best_score_, clf4.best_params_))
print()

Best score, and parameters, found on development set:

0.730 for {'activation': 'relu', 'hidden_layer_sizes': (32, 64, 32), 'solver': 'adam'}



In [29]:
# Get 2019 price variations ONLY for the stocks in testing split
pvar_test = PVAR.loc[test_split.index.values, :]

In [30]:
# Initial investment can be $100 for each stock whose predicted class = 1
buy_amount = 100

# In new dataframe df1, store all the information regarding each model's predicted class and relative gain/loss in $USD
df1 = pd.DataFrame(y_test, index=test_split.index.values, columns=['ACTUAL']) # first column is the true class (BUY/INGORE)

df1['SVM'] = clf1.predict(X_test) # predict class for testing dataset
df1['VALUE START SVM [$]'] = df1['SVM'] * buy_amount # if class = 1 --> buy $100 of that stock
df1['VAR SVM [$]'] = (pvar_test['2019 PRICE VAR [%]'].values / 100) * df1['VALUE START SVM [$]'] # compute price variation in $
df1['VALUE END SVM [$]'] = df1['VALUE START SVM [$]'] + df1['VAR SVM [$]'] # compute final value

df1['RF'] = clf2.predict(X_test)
df1['VALUE START RF [$]'] = df1['RF'] * buy_amount
df1['VAR RF [$]'] = (pvar_test['2019 PRICE VAR [%]'].values / 100) * df1['VALUE START RF [$]']
df1['VALUE END RF [$]'] = df1['VALUE START RF [$]'] + df1['VAR RF [$]']

df1['XGB'] = clf3.predict(X_test)
df1['VALUE START XGB [$]'] = df1['XGB'] * buy_amount
df1['VAR XGB [$]'] = (pvar_test['2019 PRICE VAR [%]'].values / 100) * df1['VALUE START XGB [$]']
df1['VALUE END XGB [$]'] = df1['VALUE START XGB [$]'] + df1['VAR XGB [$]']

df1['MLP'] = clf4.predict(X_test)
df1['VALUE START MLP [$]'] = df1['MLP'] * buy_amount
df1['VAR MLP [$]'] = (pvar_test['2019 PRICE VAR [%]'].values / 100) * df1['VALUE START MLP [$]']
df1['VALUE END MLP [$]'] = df1['VALUE START MLP [$]'] + df1['VAR MLP [$]']

# Show dataframe df1
df1.head()

Unnamed: 0,ACTUAL,SVM,VALUE START SVM [$],VAR SVM [$],VALUE END SVM [$],RF,VALUE START RF [$],VAR RF [$],VALUE END RF [$],XGB,VALUE START XGB [$],VAR XGB [$],VALUE END XGB [$],MLP,VALUE START MLP [$],VAR MLP [$],VALUE END MLP [$]
SFUN,0,1,100,-59.432627,40.567373,1,100,-59.432627,40.567373,1,100,-59.432627,40.567373,1,100,-59.432627,40.567373
GRUB,0,1,100,-35.542009,64.457991,1,100,-35.542009,64.457991,1,100,-35.542009,64.457991,1,100,-35.542009,64.457991
PLXS,1,1,100,48.705064,148.705064,1,100,48.705064,148.705064,1,100,48.705064,148.705064,1,100,48.705064,148.705064
DSPG,1,1,100,38.070178,138.070178,0,0,0.0,0.0,0,0,0.0,0.0,1,100,38.070178,138.070178
NH,1,0,0,0.0,0.0,0,0,0.0,0.0,1,100,80.38529,180.38529,1,100,80.38529,180.38529


In [31]:
import yfinance as yf
import pandas as pd

def get_price_var(symbol, start="2019-01-01", end="2019-12-31"):
    """
    Yahoo Finance via yfinance
    Calculate the profit ratio annually
    """
    df = yf.download(symbol, start=start, end=end, progress=False, auto_adjust=False)

    if df is None or df.empty:
        raise ValueError(f"No data for {symbol} between {start} and {end}")

    start_price = float(df["Close"].iloc[0])
    end_price   = float(df["Close"].iloc[-1])
    return float((end_price / start_price - 1) * 100)


In [32]:
# Create a new, compact, dataframe in order to show gain/loss for each model
start_value_svm = df1['VALUE START SVM [$]'].sum()
final_value_svm = df1['VALUE END SVM [$]'].sum()
net_gain_svm = final_value_svm - start_value_svm
percent_gain_svm = (net_gain_svm / start_value_svm) * 100

start_value_rf = df1['VALUE START RF [$]'].sum()
final_value_rf = df1['VALUE END RF [$]'].sum()
net_gain_rf = final_value_rf - start_value_rf
percent_gain_rf = (net_gain_rf / start_value_rf) * 100

start_value_xgb = df1['VALUE START XGB [$]'].sum()
final_value_xgb = df1['VALUE END XGB [$]'].sum()
net_gain_xgb = final_value_xgb - start_value_xgb
percent_gain_xgb = (net_gain_xgb / start_value_xgb) * 100

start_value_mlp = df1['VALUE START MLP [$]'].sum()
final_value_mlp = df1['VALUE END MLP [$]'].sum()
net_gain_mlp = final_value_mlp - start_value_mlp
percent_gain_mlp = (net_gain_mlp / start_value_mlp) * 100

percent_gain_sp500 = float(get_price_var('^GSPC', start="2019-01-02", end="2019-12-31"))
percent_gain_dj    = float(get_price_var('^DJI',  start="2019-01-02", end="2019-12-31"))
percent_gain_sector = float(PVAR['2019 PRICE VAR [%]'].mean())

MODELS_COMPARISON = pd.DataFrame([start_value_svm, final_value_svm, net_gain_svm, percent_gain_svm],
                    index=['INITIAL COST [USD]', 'FINAL VALUE [USD]', '[USD] GAIN/LOSS', 'ROI'], columns=['SVM'])
MODELS_COMPARISON['RF'] = [start_value_rf, final_value_rf, net_gain_rf, percent_gain_rf]
MODELS_COMPARISON['XGB'] = [start_value_xgb, final_value_xgb, net_gain_xgb, percent_gain_xgb]
MODELS_COMPARISON['MLP'] = [start_value_mlp, final_value_mlp, net_gain_mlp, percent_gain_mlp]
MODELS_COMPARISON['S&P 500'] = ['', '', '', percent_gain_sp500]
MODELS_COMPARISON['DOW JONES'] = ['', '', '', percent_gain_dj]
MODELS_COMPARISON['TECH SECTOR'] = ['', '', '', percent_gain_sector]

# Show the dataframe
MODELS_COMPARISON

Unnamed: 0,SVM,RF,XGB,MLP,S&P 500,DOW JONES,TECH SECTOR
INITIAL COST [USD],12300.0,7800.0,9500.0,10000.0,,,
FINAL VALUE [USD],15701.241963,10392.292754,12635.912546,12834.504406,,,
[USD] GAIN/LOSS,3401.241963,2592.292754,3135.912546,2834.504406,,,
ROI,27.652374,33.234522,33.009606,28.345044,28.336713,21.913166,28.340367


In [33]:
print(53 * '=')
print(15 * ' ' + 'SUPPORT VECTOR MACHINE')
print(53 * '-')
print(classification_report(y_test, clf1.predict(X_test), target_names=['IGNORE', 'BUY']))
print(53 * '-')
print(53 * '=')
print(20 * ' ' + 'RANDOM FOREST')
print(53 * '-')
print(classification_report(y_test, clf2.predict(X_test), target_names=['IGNORE', 'BUY']))
print(53 * '-')
print(53 * '=')
print(14 * ' ' + 'EXTREME GRADIENT BOOSTING')
print(53 * '-')
print(classification_report(y_test, clf3.predict(X_test), target_names=['IGNORE', 'BUY']))
print(53 * '-')
print(53 * '=')
print(15 * ' ' + 'MULTI-LAYER PERCEPTRON')
print(53 * '-')
print(classification_report(y_test, clf4.predict(X_test), target_names=['IGNORE', 'BUY']))
print(53 * '-')

               SUPPORT VECTOR MACHINE
-----------------------------------------------------
              precision    recall  f1-score   support

      IGNORE       0.40      0.05      0.09        38
         BUY       0.71      0.97      0.82        90

    accuracy                           0.70       128
   macro avg       0.55      0.51      0.45       128
weighted avg       0.62      0.70      0.60       128

-----------------------------------------------------
                    RANDOM FOREST
-----------------------------------------------------
              precision    recall  f1-score   support

      IGNORE       0.46      0.61      0.52        38
         BUY       0.81      0.70      0.75        90

    accuracy                           0.67       128
   macro avg       0.63      0.65      0.64       128
weighted avg       0.70      0.67      0.68       128

-----------------------------------------------------
              EXTREME GRADIENT BOOSTING
------------------