In [1]:
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
from functools import partial
import numpy as np
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
#from sklearn.metrics import accuracy_score

#import necessary libraries 
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols


#from joblib import Parallel, delayed, Model 
#from collections import Counter <--????
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

## Indexes, Features (the possible causes), Targets (the desired effects), Throw-Aways

### Inference: 
We believe that location of a stock's company matters when determining a stock's volume weighted price change. This is the first run of the model. We are using region and country to see how strong they are as features. We compare this model against the model containing the additional features: employee_count, revenue, sector. 

### Indexes/Primary Key: 

- Concatinate ticker and date to yield ticker_and_date

#### Features are:
- REGION 
- COUNTRY CODE
- EMPLOYEE COUNT (added)                                                    
- REVENUE (added)
- SECTOR (added)
- PERCENT_CHANGE_VOLUME (calculated) 
- PERCENT_CHANGE_VOLUME_WEIGHT (calculated)

#### Target is:
- PERCENT_CHANGE_VOLUME_WEIGHT (calculated)

#### Throw-aways for modeling:
- TICKER 
- DATE
- CITY NAME
- STATE NAME
- COMPANY NAME
- COMPANY URL
- LATITUDE
- LONGITUDE
- OPEN 
- HIGH 
- LOW
- CLOSE
- VOLUME
- VOLUME WEIGHT
- NUMBER OF TRANSACTIONS
- PERCENT CHANGE (% change from close to open) <-- can't be used, all values are represented as positive values

### Results:

To get the best scores I could come up with, I used these parameters with the gradient boosting function: 
max_depth=10,
n_estimators=2500,
learning_rate=.001

Other settings: 
begin_date = '2022-03-08'
end_date = '2022-03-10'
interval controls: day_range_of_iter = 3


r2_score(y_test, y_pred) => 0.3746194189404347 (Best possible score is 1.0.)

mean_absolute_error(y_test, y_pred) => 0.912472612607337 (negatively-oriented, lower values are better)

mean_squared_error(y_test, y_pred) => 1.5577164280921987 (which ever is lower is better, generally)

### Team Notes: 
*r2_score:* Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a score of 0.0.

*mean_squared_error:* The smaller the mean squared error, the closer you are to finding the line of best fit. Depending on your data, it may be impossible to get a very small value for the mean squared error. For example, the above data is scattered wildly around the regression line, so 6.08 is as good as it gets (and is in fact, the line of best fit). 

What value of RMSE is acceptable?
Based on a rule of thumb, it can be said that RMSE values between 0.2 and 0.5 shows that the model can relatively

predict the data accurately. In addition, Adjusted R-squared more than 0.75 is a very good value for showing the

accuracy. In some cases, Adjusted R-squared of 0.4 or more is acceptable as well.

Thus RMSE can be very sensitive to outliers; in general we call this form of statistic not "robust". 
Robust statistics is a field interested in algorithms that are NOT sensitive to outliers.
Since the errors are squared before they are averaged, the RMSE gives a relatively high weight to large errors. 
This means the RMSE is most useful when large errors are particularly desirable. Both the MAE and RMSE can range
from 0 to ∞. They are negatively-oriented scores: Lower values are better.

In [2]:
# GET Tabled input

# creating database engine
db_name = 'Company_Stock_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);

#sort the dataframe by ticker column
stock_df.sort_values(by=['ticker'])

# Print the DataFrame
stock_df.tail()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
50869,ZS,2022-03-06,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,228.37,229.97,204.36,204.37,4379337.0,210.5799,72096.0,10.509261
50870,ZS,2022-03-07,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,203.5,203.92,190.13,198.63,4389634.0,196.9284,71180.0,2.39312
50871,ZS,2022-03-08,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,203.84,213.57,199.12,212.35,3050554.0,209.3268,45960.0,4.174843
50872,ZS,2022-03-09,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,212.13,213.51,204.87,208.41,2305091.0,208.7971,40754.0,1.753642
50873,ZS,2022-03-10,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,211.02,211.67,200.5,201.14,1893573.0,202.9376,37307.0,4.682021


In [3]:
# drop throw-aways 
# stock_df.drop(['open_val', 'high_val', 'low_val', 'close_val', 'number_of_transactions', 'percent_change', 
#                'city_name', 'state_name', 'longitude', 'latitude', 'company_name',
#                'company_url'], axis=1, inplace=True)
stock_df.drop(['number_of_transactions', 'percent_change', 
               'city_name', 'state_name', 'longitude', 'latitude', 'company_name',
               'company_url'], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,date_val,employee_count,revenue,sector,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight
0,AMD,2020-03-12,5k-10k,over-1b,Technology,W,US,42.20,43.91,39.60,43.90,86689681.0,41.6701
1,AMD,2020-03-15,5k-10k,over-1b,Technology,W,US,39.08,43.37,38.51,38.71,84545868.0,41.0812
2,AMD,2020-03-16,5k-10k,over-1b,Technology,W,US,40.19,42.88,38.30,41.88,92741881.0,41.1240
3,AMD,2020-03-17,5k-10k,over-1b,Technology,W,US,39.54,41.95,36.75,39.12,106949287.0,39.6363
4,AMD,2020-03-18,5k-10k,over-1b,Technology,W,US,39.56,41.70,37.69,39.82,88939024.0,40.2337
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50869,ZS,2022-03-06,1k-5k,100m-200m,Technology,W,US,228.37,229.97,204.36,204.37,4379337.0,210.5799
50870,ZS,2022-03-07,1k-5k,100m-200m,Technology,W,US,203.50,203.92,190.13,198.63,4389634.0,196.9284
50871,ZS,2022-03-08,1k-5k,100m-200m,Technology,W,US,203.84,213.57,199.12,212.35,3050554.0,209.3268
50872,ZS,2022-03-09,1k-5k,100m-200m,Technology,W,US,212.13,213.51,204.87,208.41,2305091.0,208.7971


In [4]:
# check dtypes
stock_df.dtypes

ticker             object
date_val           object
employee_count     object
revenue            object
sector             object
region             object
country_code       object
open_val          float64
high_val          float64
low_val           float64
close_val         float64
volume            float64
volume_weight     float64
dtype: object

In [5]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-01'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-03'
# interval controls
day_range_of_iter = 2

# convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop date_val
stock_df.drop(['date_val'], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,date
496,AMD,5k-10k,over-1b,Technology,W,US,115.47,119.48,113.3100,118.28,118054509.0,116.7418,2022-03-01
497,AMD,5k-10k,over-1b,Technology,W,US,118.13,118.26,111.0500,111.98,123853721.0,113.6286,2022-03-02
498,AMD,5k-10k,over-1b,Technology,W,US,112.00,113.00,106.8100,108.41,100671339.0,109.0619,2022-03-03
1001,ADBE,over-10k,1m-10m,Technology,W,US,471.94,477.49,466.0400,471.18,2948450.0,471.9849,2022-03-01
1002,ADBE,over-10k,1m-10m,Technology,W,US,475.83,476.05,455.4450,459.08,2906318.0,462.3042,2022-03-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50363,ZM,1k-5k,100m-200m,Technology,W,US,122.00,122.06,111.9500,113.11,6211452.0,114.9198,2022-03-02
50364,ZM,1k-5k,100m-200m,Technology,W,US,114.10,116.19,108.4300,108.94,6227090.0,110.5694,2022-03-03
50866,ZS,1k-5k,100m-200m,Technology,W,US,253.26,257.54,242.1100,254.41,2968394.0,250.6195,2022-03-01
50867,ZS,1k-5k,100m-200m,Technology,W,US,252.98,256.41,238.3801,242.03,2799374.0,245.7130,2022-03-02


In [6]:
# check dtypes
stock_df.dtypes

ticker             object
employee_count     object
revenue            object
sector             object
region             object
country_code       object
open_val          float64
high_val          float64
low_val           float64
close_val         float64
volume            float64
volume_weight     float64
date               object
dtype: object

In [7]:
# unique days in df
unique_days = len(pd.unique(stock_df['date']))
print("unique number of days(number of days in df):", unique_days)

# unique stocks in df
unique_stocks = len(pd.unique(stock_df['ticker']))
print(unique_stocks)

# interation sets
iteration_sets = (unique_days - day_range_of_iter + 1)
print("iteration_sets: ", iteration_sets)

# total records captured
length_of_df = len(stock_df)
print(length_of_df)

unique number of days(number of days in df): 3
102
iteration_sets:  2
306


In [8]:
# sort dataframe by date
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# get beginning dataframe records
b = 0 
# ending record for beginning df
ending_records = iteration_sets * unique_stocks

# starting record for end
x = (unique_days - iteration_sets) * unique_stocks
max_records = unique_days * unique_stocks

begin_df = pd.DataFrame()
end_df = pd.DataFrame()
 
# create two dataframes, one containing the beginning interval records and one containing the ending interval records
for rec in sort_date_stock_df.iterrows():
    
    if b < ending_records:
        new_begin_df = sort_date_stock_df.iloc[b]
        begin_df = begin_df.append(new_begin_df,ignore_index=False)
    
    if x < max_records: 
        new_end_df = next_date_stock_df.iloc[x]
        end_df = end_df.append(new_end_df,ignore_index=False)
    b=b+1
    x=x+1
   
begin_df.reset_index(drop=True,inplace=True)
begin_df 

Unnamed: 0,close_val,country_code,date,employee_count,high_val,low_val,open_val,region,revenue,sector,ticker,volume,volume_weight
0,166.56,US,2022-03-01,over-10k,167.360,162.9500,164.390,W,over-1b,Technology,AAPL,79724750.0,165.8095
1,157.53,US,2022-03-01,5k-10k,158.080,151.8900,152.480,W,200m-1b,Technology,ABNB,5028250.0,155.9842
2,471.18,US,2022-03-01,over-10k,477.490,466.0400,471.940,W,1m-10m,Technology,ADBE,2948450.0,471.9849
3,159.82,US,2022-03-01,over-10k,161.530,157.1100,157.470,SE,over-1b,Technology,ADI,2578681.0,159.8088
4,206.29,US,2022-03-01,over-10k,207.325,202.2200,202.950,NE,over-1b,Consumer Discretionary,ADP,1674593.0,206.1608
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,46.72,US,2022-03-02,over-10k,47.010,46.1900,46.755,MW,over-1b,Healthcare,WBA,7234332.0,46.7042
200,245.37,US,2022-03-02,over-10k,250.000,241.7700,250.000,W,over-1b,Technology,WDAY,2213408.0,245.0289
201,69.13,US,2022-03-02,over-10k,69.280,67.5700,67.860,MW,over-1b,Utilities,XEL,2822121.0,68.9333
202,113.11,US,2022-03-02,1k-5k,122.060,111.9500,122.000,W,100m-200m,Technology,ZM,6211452.0,114.9198


In [9]:
end_df.reset_index(drop=True,inplace=True)
end_df

Unnamed: 0,close_val,country_code,date,employee_count,high_val,low_val,open_val,region,revenue,sector,ticker,volume,volume_weight
0,166.23,US,2022-03-02,over-10k,168.910,165.550,168.47,W,over-1b,Technology,AAPL,76678441.0,166.9180
1,151.69,US,2022-03-02,5k-10k,159.220,150.990,158.33,W,200m-1b,Technology,ABNB,4045757.0,153.1107
2,459.08,US,2022-03-02,over-10k,476.050,455.445,475.83,W,1m-10m,Technology,ADBE,2906318.0,462.3042
3,159.28,US,2022-03-02,over-10k,161.700,156.970,161.46,SE,over-1b,Technology,ADI,2656102.0,159.5458
4,204.70,US,2022-03-02,over-10k,209.610,203.970,208.86,NE,over-1b,Consumer Discretionary,ADP,2192173.0,206.0616
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,47.72,US,2022-03-03,over-10k,47.790,46.050,46.11,MW,over-1b,Healthcare,WBA,9602408.0,47.2177
200,240.21,US,2022-03-03,over-10k,248.500,237.640,245.74,W,over-1b,Technology,WDAY,2640109.0,240.2588
201,71.26,US,2022-03-03,over-10k,71.315,68.640,68.95,MW,over-1b,Utilities,XEL,3515340.0,70.6307
202,108.94,US,2022-03-03,1k-5k,116.190,108.430,114.10,W,100m-200m,Technology,ZM,6227090.0,110.5694


In [10]:
# merge together the beginning and ending dataframes
merged_df = pd.merge(begin_df, end_df, left_index=True, right_index=True)
merged_df

Unnamed: 0,close_val_x,country_code_x,date_x,employee_count_x,high_val_x,low_val_x,open_val_x,region_x,revenue_x,sector_x,...,employee_count_y,high_val_y,low_val_y,open_val_y,region_y,revenue_y,sector_y,ticker_y,volume_y,volume_weight_y
0,166.56,US,2022-03-01,over-10k,167.360,162.9500,164.390,W,over-1b,Technology,...,over-10k,168.910,165.550,168.47,W,over-1b,Technology,AAPL,76678441.0,166.9180
1,157.53,US,2022-03-01,5k-10k,158.080,151.8900,152.480,W,200m-1b,Technology,...,5k-10k,159.220,150.990,158.33,W,200m-1b,Technology,ABNB,4045757.0,153.1107
2,471.18,US,2022-03-01,over-10k,477.490,466.0400,471.940,W,1m-10m,Technology,...,over-10k,476.050,455.445,475.83,W,1m-10m,Technology,ADBE,2906318.0,462.3042
3,159.82,US,2022-03-01,over-10k,161.530,157.1100,157.470,SE,over-1b,Technology,...,over-10k,161.700,156.970,161.46,SE,over-1b,Technology,ADI,2656102.0,159.5458
4,206.29,US,2022-03-01,over-10k,207.325,202.2200,202.950,NE,over-1b,Consumer Discretionary,...,over-10k,209.610,203.970,208.86,NE,over-1b,Consumer Discretionary,ADP,2192173.0,206.0616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,46.72,US,2022-03-02,over-10k,47.010,46.1900,46.755,MW,over-1b,Healthcare,...,over-10k,47.790,46.050,46.11,MW,over-1b,Healthcare,WBA,9602408.0,47.2177
200,245.37,US,2022-03-02,over-10k,250.000,241.7700,250.000,W,over-1b,Technology,...,over-10k,248.500,237.640,245.74,W,over-1b,Technology,WDAY,2640109.0,240.2588
201,69.13,US,2022-03-02,over-10k,69.280,67.5700,67.860,MW,over-1b,Utilities,...,over-10k,71.315,68.640,68.95,MW,over-1b,Utilities,XEL,3515340.0,70.6307
202,113.11,US,2022-03-02,1k-5k,122.060,111.9500,122.000,W,100m-200m,Technology,...,1k-5k,116.190,108.430,114.10,W,100m-200m,Technology,ZM,6227090.0,110.5694


In [11]:
# drop *_y fields not needed 
merged_df.drop(columns = ['date_x', 'region_y', 'ticker_y', 'country_code_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
merged_df

Unnamed: 0,close_val_x,country_code_x,employee_count_x,high_val_x,low_val_x,open_val_x,region_x,revenue_x,sector_x,ticker_x,...,close_val_y,date_y,employee_count_y,high_val_y,low_val_y,open_val_y,revenue_y,sector_y,volume_y,volume_weight_y
0,166.56,US,over-10k,167.36,162.95,164.39,W,over-1b,Technology,AAPL,...,166.23,2022-03-02,over-10k,168.91,165.55,168.47,over-1b,Technology,76678441.0,166.918
1,157.53,US,5k-10k,158.08,151.89,152.48,W,200m-1b,Technology,ABNB,...,151.69,2022-03-02,5k-10k,159.22,150.99,158.33,200m-1b,Technology,4045757.0,153.1107
2,471.18,US,over-10k,477.49,466.04,471.94,W,1m-10m,Technology,ADBE,...,459.08,2022-03-02,over-10k,476.05,455.445,475.83,1m-10m,Technology,2906318.0,462.3042
3,159.82,US,over-10k,161.53,157.11,157.47,SE,over-1b,Technology,ADI,...,159.28,2022-03-02,over-10k,161.7,156.97,161.46,over-1b,Technology,2656102.0,159.5458
4,206.29,US,over-10k,207.325,202.22,202.95,NE,over-1b,Consumer Discretionary,ADP,...,204.7,2022-03-02,over-10k,209.61,203.97,208.86,over-1b,Consumer Discretionary,2192173.0,206.0616
5,216.51,US,over-10k,218.16,209.52,216.34,W,over-1b,Healthcare,ADSK,...,210.0,2022-03-02,over-10k,219.45,208.56,219.45,over-1b,Healthcare,1730045.0,211.8108
6,91.24,US,over-10k,91.49,89.9,89.9,MW,over-1b,Energy,AEP,...,93.98,2022-03-02,over-10k,94.69,91.42,91.42,over-1b,Energy,4173651.0,93.6477
7,496.14,US,over-10k,505.2,481.82,505.17,SW,200m-1b,Technology,ALGN,...,477.45,2022-03-02,over-10k,502.24,475.37,501.22,200m-1b,Technology,655912.0,481.2937
8,133.18,US,over-10k,134.47,129.65,130.0,W,over-1b,Technology,AMAT,...,130.64,2022-03-02,over-10k,135.117,129.57,135.117,over-1b,Technology,5449674.0,131.0939
9,118.28,US,5k-10k,119.48,113.31,115.47,W,over-1b,Technology,AMD,...,111.98,2022-03-02,5k-10k,118.26,111.05,118.13,over-1b,Technology,123853721.0,113.6286


In [12]:
# calculate the percentage change between the beginning (*_x) and the ending (*_y) volume and volume_weights
merged_df['percent_change_volume_weight'] = 100 - merged_df['volume_weight_y']/merged_df['volume_weight_x'] * 100
merged_df['percent_change_volume'] = 100 - merged_df['volume_y']/merged_df['volume_x'] * 100
merged_df

Unnamed: 0,close_val_x,country_code_x,employee_count_x,high_val_x,low_val_x,open_val_x,region_x,revenue_x,sector_x,ticker_x,...,employee_count_y,high_val_y,low_val_y,open_val_y,revenue_y,sector_y,volume_y,volume_weight_y,percent_change_volume_weight,percent_change_volume
0,166.56,US,over-10k,167.36,162.95,164.39,W,over-1b,Technology,AAPL,...,over-10k,168.91,165.55,168.47,over-1b,Technology,76678441.0,166.918,-0.668538,3.821033
1,157.53,US,5k-10k,158.08,151.89,152.48,W,200m-1b,Technology,ABNB,...,5k-10k,159.22,150.99,158.33,200m-1b,Technology,4045757.0,153.1107,1.842174,19.539462
2,471.18,US,over-10k,477.49,466.04,471.94,W,1m-10m,Technology,ADBE,...,over-10k,476.05,455.445,475.83,1m-10m,Technology,2906318.0,462.3042,2.051061,1.428954
3,159.82,US,over-10k,161.53,157.11,157.47,SE,over-1b,Technology,ADI,...,over-10k,161.7,156.97,161.46,over-1b,Technology,2656102.0,159.5458,0.164572,-3.002349
4,206.29,US,over-10k,207.325,202.22,202.95,NE,over-1b,Consumer Discretionary,ADP,...,over-10k,209.61,203.97,208.86,over-1b,Consumer Discretionary,2192173.0,206.0616,0.048118,-30.907809
5,216.51,US,over-10k,218.16,209.52,216.34,W,over-1b,Healthcare,ADSK,...,over-10k,219.45,208.56,219.45,over-1b,Healthcare,1730045.0,211.8108,1.640587,-3.148942
6,91.24,US,over-10k,91.49,89.9,89.9,MW,over-1b,Energy,AEP,...,over-10k,94.69,91.42,91.42,over-1b,Energy,4173651.0,93.6477,-2.957645,-32.643692
7,496.14,US,over-10k,505.2,481.82,505.17,SW,200m-1b,Technology,ALGN,...,over-10k,502.24,475.37,501.22,200m-1b,Technology,655912.0,481.2937,2.737491,-12.824112
8,133.18,US,over-10k,134.47,129.65,130.0,W,over-1b,Technology,AMAT,...,over-10k,135.117,129.57,135.117,over-1b,Technology,5449674.0,131.0939,1.041117,22.244453
9,118.28,US,5k-10k,119.48,113.31,115.47,W,over-1b,Technology,AMD,...,5k-10k,118.26,111.05,118.13,over-1b,Technology,123853721.0,113.6286,2.66674,-4.912317


In [13]:
# drop volume and weight columns
merged_df.drop(columns = ['volume_x', 'volume_weight_x', 'volume_y', 'volume_weight_y', 'date_y', 
                         'sector_y', 'revenue_y', 'employee_count_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
merged_df

Unnamed: 0,close_val_x,country_code_x,employee_count_x,high_val_x,low_val_x,open_val_x,region_x,revenue_x,sector_x,ticker_x,close_val_y,high_val_y,low_val_y,open_val_y,percent_change_volume_weight,percent_change_volume
0,166.56,US,over-10k,167.36,162.95,164.39,W,over-1b,Technology,AAPL,166.23,168.91,165.55,168.47,-0.668538,3.821033
1,157.53,US,5k-10k,158.08,151.89,152.48,W,200m-1b,Technology,ABNB,151.69,159.22,150.99,158.33,1.842174,19.539462
2,471.18,US,over-10k,477.49,466.04,471.94,W,1m-10m,Technology,ADBE,459.08,476.05,455.445,475.83,2.051061,1.428954
3,159.82,US,over-10k,161.53,157.11,157.47,SE,over-1b,Technology,ADI,159.28,161.7,156.97,161.46,0.164572,-3.002349
4,206.29,US,over-10k,207.325,202.22,202.95,NE,over-1b,Consumer Discretionary,ADP,204.7,209.61,203.97,208.86,0.048118,-30.907809
5,216.51,US,over-10k,218.16,209.52,216.34,W,over-1b,Healthcare,ADSK,210.0,219.45,208.56,219.45,1.640587,-3.148942
6,91.24,US,over-10k,91.49,89.9,89.9,MW,over-1b,Energy,AEP,93.98,94.69,91.42,91.42,-2.957645,-32.643692
7,496.14,US,over-10k,505.2,481.82,505.17,SW,200m-1b,Technology,ALGN,477.45,502.24,475.37,501.22,2.737491,-12.824112
8,133.18,US,over-10k,134.47,129.65,130.0,W,over-1b,Technology,AMAT,130.64,135.117,129.57,135.117,1.041117,22.244453
9,118.28,US,5k-10k,119.48,113.31,115.47,W,over-1b,Technology,AMD,111.98,118.26,111.05,118.13,2.66674,-4.912317


In [14]:
merged_df.dtypes

close_val_x                     float64
country_code_x                   object
employee_count_x                 object
high_val_x                      float64
low_val_x                       float64
open_val_x                      float64
region_x                         object
revenue_x                        object
sector_x                         object
ticker_x                         object
close_val_y                     float64
high_val_y                      float64
low_val_y                       float64
open_val_y                      float64
percent_change_volume_weight    float64
percent_change_volume           float64
dtype: object

In [15]:
# unique values for each column (getting to know your data)
merged_df.nunique()

close_val_x                     203
country_code_x                    8
employee_count_x                  4
high_val_x                      203
low_val_x                       204
open_val_x                      204
region_x                         11
revenue_x                         6
sector_x                          9
ticker_x                        102
close_val_y                     202
high_val_y                      203
low_val_y                       203
open_val_y                      204
percent_change_volume_weight    204
percent_change_volume           204
dtype: int64

In [16]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = merged_df.dtypes[merged_df.dtypes == "object"].index.tolist()
stock_categories

['country_code_x',
 'employee_count_x',
 'region_x',
 'revenue_x',
 'sector_x',
 'ticker_x']

In [17]:
# Checking the number of unique values in each column
merged_df[stock_categories].nunique()

country_code_x        8
employee_count_x      4
region_x             11
revenue_x             6
sector_x              9
ticker_x            102
dtype: int64

In [18]:
# merge objects having 'set' categories
merged_df['country_code_x'] = merged_df['country_code_x'].astype('category').cat.codes
merged_df['region_x'] = merged_df['region_x'].astype('category').cat.codes
merged_df['sector_x'] = merged_df['sector_x'].astype('category').cat.codes
merged_df['employee_count_x'] = merged_df['employee_count_x'].astype('category').cat.codes
merged_df['revenue_x'] = merged_df['revenue_x'].astype('category').cat.codes

In [19]:
merged_df

Unnamed: 0,close_val_x,country_code_x,employee_count_x,high_val_x,low_val_x,open_val_x,region_x,revenue_x,sector_x,ticker_x,close_val_y,high_val_y,low_val_y,open_val_y,percent_change_volume_weight,percent_change_volume
0,166.56,7,3,167.36,162.95,164.39,10,5,7,AAPL,166.23,168.91,165.55,168.47,-0.668538,3.821033
1,157.53,7,2,158.08,151.89,152.48,10,3,7,ABNB,151.69,159.22,150.99,158.33,1.842174,19.539462
2,471.18,7,3,477.49,466.04,471.94,10,2,7,ADBE,459.08,476.05,455.445,475.83,2.051061,1.428954
3,159.82,7,3,161.53,157.11,157.47,8,5,7,ADI,159.28,161.7,156.97,161.46,0.164572,-3.002349
4,206.29,7,3,207.325,202.22,202.95,5,5,1,ADP,204.7,209.61,203.97,208.86,0.048118,-30.907809
5,216.51,7,3,218.16,209.52,216.34,10,5,5,ADSK,210.0,219.45,208.56,219.45,1.640587,-3.148942
6,91.24,7,3,91.49,89.9,89.9,4,5,3,AEP,93.98,94.69,91.42,91.42,-2.957645,-32.643692
7,496.14,7,3,505.2,481.82,505.17,9,3,7,ALGN,477.45,502.24,475.37,501.22,2.737491,-12.824112
8,133.18,7,3,134.47,129.65,130.0,10,5,7,AMAT,130.64,135.117,129.57,135.117,1.041117,22.244453
9,118.28,7,2,119.48,113.31,115.47,10,5,7,AMD,111.98,118.26,111.05,118.13,2.66674,-4.912317


In [20]:
# remvoe ticker
merged_df.drop(columns = ['ticker_x', 'high_val_x', 'low_val_x', 'open_val_x', 'close_val_x'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
merged_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,close_val_y,high_val_y,low_val_y,open_val_y,percent_change_volume_weight,percent_change_volume
0,7,3,10,5,7,166.23,168.91,165.55,168.47,-0.668538,3.821033
1,7,2,10,3,7,151.69,159.22,150.99,158.33,1.842174,19.539462
2,7,3,10,2,7,459.08,476.05,455.445,475.83,2.051061,1.428954
3,7,3,8,5,7,159.28,161.7,156.97,161.46,0.164572,-3.002349
4,7,3,5,5,1,204.7,209.61,203.97,208.86,0.048118,-30.907809
5,7,3,10,5,5,210.0,219.45,208.56,219.45,1.640587,-3.148942
6,7,3,4,5,3,93.98,94.69,91.42,91.42,-2.957645,-32.643692
7,7,3,9,3,7,477.45,502.24,475.37,501.22,2.737491,-12.824112
8,7,3,10,5,7,130.64,135.117,129.57,135.117,1.041117,22.244453
9,7,2,10,5,7,111.98,118.26,111.05,118.13,2.66674,-4.912317


In [21]:
# create features array
X = merged_df.drop(columns=["percent_change_volume_weight"]).values
    
# create target
y = merged_df["percent_change_volume_weight"].values

# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

# max_depth refers to the number of leaves of each tree 
# n_estimators refers to the total number of trees in the ensemble
# learning_rate hyperparameter scales the contribution of each tree NOTE: If you set it to a low value, 
# you will need more trees in the ensemble to fit the training set, but the overall variance will be lower.

# best way to tune the model: https://neptune.ai/blog/lightgbm-parameters-guide
    
regressor = GradientBoostingRegressor(
max_depth=17,
n_estimators=200,
learning_rate=.01,
criterion='mse' # ‘mse’, ‘mae’}
)
regressor.fit(X_train, y_train)

# Use staged_predict() method to measures the validation error at each stage of training 
# (i.e. with one tree, with two trees…) to find the optimal number of trees.
errors = [mean_squared_error(y_test, y_pred) for y_pred in 
           regressor.staged_predict(X_test)]


In [22]:
best_n_estimators = np.argmin(errors)+1

print(best_n_estimators)

109


In [23]:
# build and fit our model using the optimal number of trees
best_regressor = GradientBoostingRegressor(
     max_depth=15,
     n_estimators=best_n_estimators,
     learning_rate=.01
)

best_regressor.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, max_depth=15, n_estimators=109)

In [24]:
# NOTES: 

# mean_squared_error: 
# The smaller the mean squared error, the closer you are to finding the line of best fit. Depending on your data, 
# it may be impossible to get a very small value for the mean squared error. For example, the above data is scattered 
# wildly around the regression line, so 6.08 is as good as it gets (and is in fact, the line of best fit). 

# What value of RMSE is acceptable?
# Based on a rule of thumb, it can be said that RMSE values between 0.2 and 0.5 shows that the model can relatively
#                                               -------------------------------   
# predict the data accurately. In addition, Adjusted R-squared more than 0.75 is a very good value for showing the
#                                                    ------------------------
# accuracy. In some cases, Adjusted R-squared of 0.4 or more is acceptable as well.
#                          -------------------------   
# Thus RMSE can be very sensitive to outliers; in general we call this form of statistic not "robust". 
# Robust statistics is a field interested in algorithms that are NOT sensitive to outliers.
#
# Since the errors are squared before they are averaged, the RMSE gives a relatively high weight to large errors. 
# This means the RMSE is most useful when large errors are particularly undesirable. Both the MAE and RMSE can range
# from 0 to ∞. They are negatively-oriented scores: Lower values are better.
#                                                   ----------------------- 

In [25]:

# # Sklearn provides numerous metrics to evaluate 
# # the performance of our machine learning models.
# # They categorize the each metric according 
# # to the problem domain which they’re applicable. 
# # https://scikit-learn.org/stable/modules/model_evaluation.html <-- GO TO THIS SITE TO SEE WHICH METRICS YOU WILL USE.

# # We use the mean absolute error 
# # which can be interpreted as 
# # the average distance from 
# # our predictions and the actual values

# # this will give you the value of the stocks for the next period of time
y_pred = best_regressor.predict(X_test)
print(y_pred)
print(X_test)
print(X_train)
print(y_train)


[ 2.4936741   5.04304276  0.98440529  2.68190725 -3.19434633  1.39166812
 -2.10589078  0.34520752  0.65576756  0.15897886  2.08609881 -0.12157854
  0.03988714  3.94940905  0.91991177  0.75151294  1.51655167  0.37189905
  2.2721405   0.04803876  1.11277381  0.11325433  0.98313916  1.57282605
  3.93251452  0.51719145  3.09694911  2.15433256  3.91683132  0.68593794
 -3.36375565  2.98406944  1.61410508 -0.39896676  1.25787036  1.36273711
  0.74818249  2.22548301  1.90684369  0.07803259  3.18903379  0.34520752
  4.07771628  1.24080311 -0.06451605  0.98119288  2.27438688  1.62601527
  3.04948191  0.06339844  0.61140828]
[[ 7.00000000e+00  2.00000000e+00  8.00000000e+00  5.00000000e+00
   7.00000000e+00  1.18410000e+02  1.21020000e+02  1.17840000e+02
   1.20440000e+02 -1.09477136e+01]
 [ 7.00000000e+00  0.00000000e+00  1.00000000e+01  0.00000000e+00
   7.00000000e+00  2.25500000e+02  2.44000000e+02  2.23040000e+02
   2.42360000e+02 -2.91248329e+01]
 [ 7.00000000e+00  2.00000000e+00  8.0000000

In [26]:
# how well the model performed (looking for smallest error)
print("r2_score: ", r2_score(y_test, y_pred))
print("mean_absolute_error",mean_absolute_error(y_test, y_pred))
print("mean_squared_error: ", min(errors))
# r2_score: Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model 
# that always predicts the expected value of y, disregarding the input features, would get a score of 0.0.

r2_score:  0.2429279173448946
mean_absolute_error 1.7156303355028637
mean_squared_error:  6.123965876118614


In [27]:

# #     # Tomas: correlation analysis to see how your features are correlated to each other
    
# #     # as with any regression you need to minimize the mean square error.
#                                                         ------------------
# #     examples are at : 
# # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
# #     from sklearn.metrics import mean_squared_error
    
# # EMPTY PROCESS DATAFRAME   
    
# #     # accrossed all stocks, what is the average score.
# #     # what is the mean?
# #     # what is the median?
# #     # do we have any outliers that we need to note
# #     # does this work better for same sectors?