In [1]:
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
import os
from functools import partial
import numpy as np
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
#from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

#import necessary libraries 
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
%matplotlib inline

import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

## Indexes, Features (the possible causes), Targets (the desired effects), Throw-Aways

### Inference: 
We believe that location of a stock's company matters when determining a stock's volume weighted price change. This is the first run of the model. We are using region and country to see how strong they are as features. We compare this model against the model containing the additional features: employee_count, revenue, sector. 

### Indexes/Primary Key: 

- Concatinate ticker and date to yield ticker_and_date

#### Features are:
- REGION 
- COUNTRY CODE
- EMPLOYEE COUNT (added)                                                    
- REVENUE (added)
- SECTOR (added)
- PERCENT_CHANGE_VOLUME (calculated) 
- PERCENT_CHANGE_VOLUME_WEIGHT (calculated)

#### Target is:
- PERCENT_CHANGE_VOLUME_WEIGHT (calculated)

#### Throw-aways for modeling:
- TICKER 
- DATE
- CITY NAME
- STATE NAME
- COMPANY NAME
- COMPANY URL
- LATITUDE
- LONGITUDE
- OPEN 
- HIGH 
- LOW
- CLOSE
- VOLUME
- VOLUME WEIGHT
- NUMBER OF TRANSACTIONS
- PERCENT CHANGE (% change from close to open) <-- can't be used, all values are represented as positive values

### Results:

To get the best scores I could come up with, I used these parameters with the gradient boosting function: 
max_depth=10,
n_estimators=2500,
learning_rate=.001

Other settings: 
begin_date = '2022-03-08'
end_date = '2022-03-10'
interval controls: day_range_of_iter = 3


r2_score(y_test, y_pred) => 0.3746194189404347 (Best possible score is 1.0.)

mean_absolute_error(y_test, y_pred) => 0.912472612607337 (negatively-oriented, lower values are better)

mean_squared_error(y_test, y_pred) => 1.5577164280921987 (which ever is lower is better, generally)

### Team Notes: 
*r2_score:* Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a score of 0.0.

*mean_squared_error:* The smaller the mean squared error, the closer you are to finding the line of best fit. Depending on your data, it may be impossible to get a very small value for the mean squared error. For example, the above data is scattered wildly around the regression line, so 6.08 is as good as it gets (and is in fact, the line of best fit). 

What value of RMSE is acceptable?
Based on a rule of thumb, it can be said that RMSE values between 0.2 and 0.5 shows that the model can relatively

predict the data accurately. In addition, Adjusted R-squared more than 0.75 is a very good value for showing the

accuracy. In some cases, Adjusted R-squared of 0.4 or more is acceptable as well.

Thus RMSE can be very sensitive to outliers; in general we call this form of statistic not "robust". 
Robust statistics is a field interested in algorithms that are NOT sensitive to outliers.
Since the errors are squared before they are averaged, the RMSE gives a relatively high weight to large errors. 
This means the RMSE is most useful when large errors are particularly desirable. Both the MAE and RMSE can range
from 0 to ∞. They are negatively-oriented scores: Lower values are better.

In [2]:
# GET Tabled input

# creating database engine
db_name = 'Company_Stock_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);

#sort the dataframe by ticker column
stock_df.sort_values(by=['ticker'])

# Print the DataFrame
stock_df.tail()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
50869,ZS,2022-03-06,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,228.37,229.97,204.36,204.37,4379337.0,210.5799,72096.0,10.509261
50870,ZS,2022-03-07,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,203.5,203.92,190.13,198.63,4389634.0,196.9284,71180.0,2.39312
50871,ZS,2022-03-08,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,203.84,213.57,199.12,212.35,3050554.0,209.3268,45960.0,4.174843
50872,ZS,2022-03-09,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,212.13,213.51,204.87,208.41,2305091.0,208.7971,40754.0,1.753642
50873,ZS,2022-03-10,Zscaler Inc,zscaler.com,1k-5k,100m-200m,Technology,San Jose,CA,W,...,37.336191,-121.890583,211.02,211.67,200.5,201.14,1893573.0,202.9376,37307.0,4.682021


In [3]:
# drop throw-aways 
# stock_df.drop(['open_val', 'high_val', 'low_val', 'close_val', 'number_of_transactions', 'percent_change', 
#                'city_name', 'state_name', 'longitude', 'latitude', 'company_name',
#                'company_url'], axis=1, inplace=True)
stock_df.drop(['number_of_transactions', 'percent_change', 
               'city_name', 'state_name', 'longitude', 'latitude', 'company_name',
               'company_url'], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,date_val,employee_count,revenue,sector,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight
0,AMD,2020-03-12,5k-10k,over-1b,Technology,W,US,42.20,43.91,39.60,43.90,86689681.0,41.6701
1,AMD,2020-03-15,5k-10k,over-1b,Technology,W,US,39.08,43.37,38.51,38.71,84545868.0,41.0812
2,AMD,2020-03-16,5k-10k,over-1b,Technology,W,US,40.19,42.88,38.30,41.88,92741881.0,41.1240
3,AMD,2020-03-17,5k-10k,over-1b,Technology,W,US,39.54,41.95,36.75,39.12,106949287.0,39.6363
4,AMD,2020-03-18,5k-10k,over-1b,Technology,W,US,39.56,41.70,37.69,39.82,88939024.0,40.2337
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50869,ZS,2022-03-06,1k-5k,100m-200m,Technology,W,US,228.37,229.97,204.36,204.37,4379337.0,210.5799
50870,ZS,2022-03-07,1k-5k,100m-200m,Technology,W,US,203.50,203.92,190.13,198.63,4389634.0,196.9284
50871,ZS,2022-03-08,1k-5k,100m-200m,Technology,W,US,203.84,213.57,199.12,212.35,3050554.0,209.3268
50872,ZS,2022-03-09,1k-5k,100m-200m,Technology,W,US,212.13,213.51,204.87,208.41,2305091.0,208.7971


In [4]:
# check dtypes
stock_df.dtypes

ticker             object
date_val           object
employee_count     object
revenue            object
sector             object
region             object
country_code       object
open_val          float64
high_val          float64
low_val           float64
close_val         float64
volume            float64
volume_weight     float64
dtype: object

In [5]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-08'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'
# interval controls
day_range_of_iter = 2

# convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop date_val
stock_df.drop(['date_val'], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,date
501,AMD,5k-10k,over-1b,Technology,W,US,108.410,111.71,106.850,111.05,102310329.0,109.6319,2022-03-08
502,AMD,5k-10k,over-1b,Technology,W,US,108.890,109.07,103.070,106.46,102557375.0,105.3382,2022-03-09
503,AMD,5k-10k,over-1b,Technology,W,US,108.130,108.19,104.080,104.29,87584432.0,105.9691,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,W,US,443.800,453.11,438.930,450.87,2905656.0,447.8637,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,W,US,444.680,447.65,433.010,438.95,2686310.0,437.7568,2022-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50368,ZM,1k-5k,100m-200m,Technology,W,US,105.835,106.90,101.055,103.33,5030777.0,103.3206,2022-03-09
50369,ZM,1k-5k,100m-200m,Technology,W,US,103.480,103.49,97.900,98.12,6454629.0,99.6973,2022-03-10
50871,ZS,1k-5k,100m-200m,Technology,W,US,203.840,213.57,199.120,212.35,3050554.0,209.3268,2022-03-08
50872,ZS,1k-5k,100m-200m,Technology,W,US,212.130,213.51,204.870,208.41,2305091.0,208.7971,2022-03-09


In [6]:
# check dtypes
stock_df.dtypes

ticker             object
employee_count     object
revenue            object
sector             object
region             object
country_code       object
open_val          float64
high_val          float64
low_val           float64
close_val         float64
volume            float64
volume_weight     float64
date               object
dtype: object

In [7]:
# unique days in df
unique_days = len(pd.unique(stock_df['date']))
print("unique number of days(number of days in df):", unique_days)

# unique stocks in df
unique_stocks = len(pd.unique(stock_df['ticker']))
print(unique_stocks)

# interation sets
iteration_sets = (unique_days - day_range_of_iter + 1)
print("iteration_sets: ", iteration_sets)

# total records captured
length_of_df = len(stock_df)
print(length_of_df)

unique number of days(number of days in df): 3
102
iteration_sets:  2
306


In [8]:
# sort dataframe by date
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# get beginning dataframe records
b = 0 
# ending record for beginning df
ending_records = iteration_sets * unique_stocks

# starting record for end
x = (unique_days - iteration_sets) * unique_stocks
max_records = unique_days * unique_stocks

begin_df = pd.DataFrame()
end_df = pd.DataFrame()
 
# create two dataframes, one containing the beginning interval records and one containing the ending interval records
for rec in sort_date_stock_df.iterrows():
    
    if b < ending_records:
        new_begin_df = sort_date_stock_df.iloc[b]
        begin_df = begin_df.append(new_begin_df,ignore_index=False)
    
    if x < max_records: 
        new_end_df = next_date_stock_df.iloc[x]
        end_df = end_df.append(new_end_df,ignore_index=False)
    b=b+1
    x=x+1
   
begin_df.reset_index(drop=True,inplace=True)
begin_df 

Unnamed: 0,close_val,country_code,date,employee_count,high_val,low_val,open_val,region,revenue,sector,ticker,volume,volume_weight
0,162.95,US,2022-03-08,over-10k,163.410,159.410,161.475,W,over-1b,Technology,AAPL,91445405.0,161.9446
1,148.31,US,2022-03-08,5k-10k,150.990,145.330,145.430,W,200m-1b,Technology,ABNB,7023908.0,148.5454
2,450.87,US,2022-03-08,over-10k,453.110,438.930,443.800,W,1m-10m,Technology,ADBE,2905656.0,447.8637
3,153.62,US,2022-03-08,over-10k,155.060,151.300,153.090,SE,over-1b,Technology,ADI,3046254.0,153.5888
4,209.70,US,2022-03-08,over-10k,211.135,206.070,210.330,NE,over-1b,Consumer Discretionary,ADP,1791687.0,209.3495
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,48.07,US,2022-03-09,over-10k,48.195,47.100,47.652,MW,over-1b,Healthcare,WBA,4999822.0,47.7198
200,228.45,US,2022-03-09,over-10k,232.060,226.620,228.290,W,over-1b,Technology,WDAY,1478994.0,229.0807
201,70.16,US,2022-03-09,over-10k,70.390,69.250,69.330,MW,over-1b,Utilities,XEL,1945242.0,69.9773
202,103.33,US,2022-03-09,1k-5k,106.900,101.055,105.835,W,100m-200m,Technology,ZM,5030777.0,103.3206


In [9]:
end_df.reset_index(drop=True,inplace=True)
end_df

Unnamed: 0,close_val,country_code,date,employee_count,high_val,low_val,open_val,region,revenue,sector,ticker,volume,volume_weight
0,158.52,US,2022-03-09,over-10k,160.390,155.9800,160.20,W,over-1b,Technology,AAPL,105342033.0,158.0284
1,151.80,US,2022-03-09,5k-10k,152.890,144.4092,145.22,W,200m-1b,Technology,ABNB,5302511.0,149.8916
2,438.95,US,2022-03-09,over-10k,447.650,433.0100,444.68,W,1m-10m,Technology,ADBE,2686310.0,437.7568
3,150.56,US,2022-03-09,over-10k,151.790,147.8500,150.85,SE,over-1b,Technology,ADI,4035714.0,150.1401
4,209.43,US,2022-03-09,over-10k,210.115,205.0100,206.51,NE,over-1b,Consumer Discretionary,ADP,2399303.0,207.7212
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,47.44,US,2022-03-10,over-10k,48.970,47.3800,48.42,MW,over-1b,Healthcare,WBA,5737993.0,47.9174
200,224.91,US,2022-03-10,over-10k,232.290,224.6900,231.38,W,over-1b,Technology,WDAY,1789032.0,226.1950
201,70.37,US,2022-03-10,over-10k,70.960,69.9900,70.95,MW,over-1b,Utilities,XEL,3031491.0,70.4602
202,98.12,US,2022-03-10,1k-5k,103.490,97.9000,103.48,W,100m-200m,Technology,ZM,6454629.0,99.6973


In [10]:
# merge together the beginning and ending dataframes
merged_df = pd.merge(begin_df, end_df, left_index=True, right_index=True)
merged_df

Unnamed: 0,close_val_x,country_code_x,date_x,employee_count_x,high_val_x,low_val_x,open_val_x,region_x,revenue_x,sector_x,...,employee_count_y,high_val_y,low_val_y,open_val_y,region_y,revenue_y,sector_y,ticker_y,volume_y,volume_weight_y
0,162.95,US,2022-03-08,over-10k,163.410,159.410,161.475,W,over-1b,Technology,...,over-10k,160.390,155.9800,160.20,W,over-1b,Technology,AAPL,105342033.0,158.0284
1,148.31,US,2022-03-08,5k-10k,150.990,145.330,145.430,W,200m-1b,Technology,...,5k-10k,152.890,144.4092,145.22,W,200m-1b,Technology,ABNB,5302511.0,149.8916
2,450.87,US,2022-03-08,over-10k,453.110,438.930,443.800,W,1m-10m,Technology,...,over-10k,447.650,433.0100,444.68,W,1m-10m,Technology,ADBE,2686310.0,437.7568
3,153.62,US,2022-03-08,over-10k,155.060,151.300,153.090,SE,over-1b,Technology,...,over-10k,151.790,147.8500,150.85,SE,over-1b,Technology,ADI,4035714.0,150.1401
4,209.70,US,2022-03-08,over-10k,211.135,206.070,210.330,NE,over-1b,Consumer Discretionary,...,over-10k,210.115,205.0100,206.51,NE,over-1b,Consumer Discretionary,ADP,2399303.0,207.7212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,48.07,US,2022-03-09,over-10k,48.195,47.100,47.652,MW,over-1b,Healthcare,...,over-10k,48.970,47.3800,48.42,MW,over-1b,Healthcare,WBA,5737993.0,47.9174
200,228.45,US,2022-03-09,over-10k,232.060,226.620,228.290,W,over-1b,Technology,...,over-10k,232.290,224.6900,231.38,W,over-1b,Technology,WDAY,1789032.0,226.1950
201,70.16,US,2022-03-09,over-10k,70.390,69.250,69.330,MW,over-1b,Utilities,...,over-10k,70.960,69.9900,70.95,MW,over-1b,Utilities,XEL,3031491.0,70.4602
202,103.33,US,2022-03-09,1k-5k,106.900,101.055,105.835,W,100m-200m,Technology,...,1k-5k,103.490,97.9000,103.48,W,100m-200m,Technology,ZM,6454629.0,99.6973


In [11]:
# drop *_y fields not needed 
merged_df.drop(columns = ['date_x', 'region_y', 'ticker_y', 'country_code_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
merged_df

Unnamed: 0,close_val_x,country_code_x,employee_count_x,high_val_x,low_val_x,open_val_x,region_x,revenue_x,sector_x,ticker_x,...,close_val_y,date_y,employee_count_y,high_val_y,low_val_y,open_val_y,revenue_y,sector_y,volume_y,volume_weight_y
0,162.95,US,over-10k,163.41,159.41,161.475,W,over-1b,Technology,AAPL,...,158.52,2022-03-09,over-10k,160.39,155.98,160.2,over-1b,Technology,105342033.0,158.0284
1,148.31,US,5k-10k,150.99,145.33,145.43,W,200m-1b,Technology,ABNB,...,151.8,2022-03-09,5k-10k,152.89,144.4092,145.22,200m-1b,Technology,5302511.0,149.8916
2,450.87,US,over-10k,453.11,438.93,443.8,W,1m-10m,Technology,ADBE,...,438.95,2022-03-09,over-10k,447.65,433.01,444.68,1m-10m,Technology,2686310.0,437.7568
3,153.62,US,over-10k,155.06,151.3,153.09,SE,over-1b,Technology,ADI,...,150.56,2022-03-09,over-10k,151.79,147.85,150.85,over-1b,Technology,4035714.0,150.1401
4,209.7,US,over-10k,211.135,206.07,210.33,NE,over-1b,Consumer Discretionary,ADP,...,209.43,2022-03-09,over-10k,210.115,205.01,206.51,over-1b,Consumer Discretionary,2399303.0,207.7212
5,204.88,US,over-10k,207.755,200.98,202.71,W,over-1b,Healthcare,ADSK,...,199.79,2022-03-09,over-10k,203.07,198.04,199.88,over-1b,Healthcare,1458925.0,199.9849
6,95.01,US,over-10k,95.9,94.2,95.9,MW,over-1b,Energy,AEP,...,95.99,2022-03-09,over-10k,96.22,93.99,94.61,over-1b,Energy,2897953.0,95.5087
7,442.05,US,over-10k,444.84,426.31,433.22,SW,200m-1b,Technology,ALGN,...,430.09,2022-03-09,over-10k,434.81,417.0,428.27,200m-1b,Technology,532709.0,427.7159
8,128.62,US,over-10k,129.84,126.2,127.7,W,over-1b,Technology,AMAT,...,124.97,2022-03-09,over-10k,126.0,122.68,125.72,over-1b,Technology,5604387.0,124.5674
9,111.05,US,5k-10k,111.71,106.85,108.41,W,over-1b,Technology,AMD,...,106.46,2022-03-09,5k-10k,109.07,103.07,108.89,over-1b,Technology,102557375.0,105.3382


In [12]:
# calculate the percentage change between the beginning (*_x) and the ending (*_y) volume and volume_weights
merged_df['percent_change_volume_weight'] = 100 - merged_df['volume_weight_y']/merged_df['volume_weight_x'] * 100
merged_df['percent_change_volume'] = 100 - merged_df['volume_y']/merged_df['volume_x'] * 100
merged_df

Unnamed: 0,close_val_x,country_code_x,employee_count_x,high_val_x,low_val_x,open_val_x,region_x,revenue_x,sector_x,ticker_x,...,employee_count_y,high_val_y,low_val_y,open_val_y,revenue_y,sector_y,volume_y,volume_weight_y,percent_change_volume_weight,percent_change_volume
0,162.95,US,over-10k,163.41,159.41,161.475,W,over-1b,Technology,AAPL,...,over-10k,160.39,155.98,160.2,over-1b,Technology,105342033.0,158.0284,2.418234,-15.196639
1,148.31,US,5k-10k,150.99,145.33,145.43,W,200m-1b,Technology,ABNB,...,5k-10k,152.89,144.4092,145.22,200m-1b,Technology,5302511.0,149.8916,-0.906255,24.507681
2,450.87,US,over-10k,453.11,438.93,443.8,W,1m-10m,Technology,ADBE,...,over-10k,447.65,433.01,444.68,1m-10m,Technology,2686310.0,437.7568,2.256691,7.548932
3,153.62,US,over-10k,155.06,151.3,153.09,SE,over-1b,Technology,ADI,...,over-10k,151.79,147.85,150.85,over-1b,Technology,4035714.0,150.1401,2.245411,-32.481205
4,209.7,US,over-10k,211.135,206.07,210.33,NE,over-1b,Consumer Discretionary,ADP,...,over-10k,210.115,205.01,206.51,over-1b,Consumer Discretionary,2399303.0,207.7212,0.77779,-33.913066
5,204.88,US,over-10k,207.755,200.98,202.71,W,over-1b,Healthcare,ADSK,...,over-10k,203.07,198.04,199.88,over-1b,Healthcare,1458925.0,199.9849,2.542327,21.140383
6,95.01,US,over-10k,95.9,94.2,95.9,MW,over-1b,Energy,AEP,...,over-10k,96.22,93.99,94.61,over-1b,Energy,2897953.0,95.5087,-0.463669,3.602652
7,442.05,US,over-10k,444.84,426.31,433.22,SW,200m-1b,Technology,ALGN,...,over-10k,434.81,417.0,428.27,200m-1b,Technology,532709.0,427.7159,2.552386,23.280354
8,128.62,US,over-10k,129.84,126.2,127.7,W,over-1b,Technology,AMAT,...,over-10k,126.0,122.68,125.72,over-1b,Technology,5604387.0,124.5674,2.829524,26.482247
9,111.05,US,5k-10k,111.71,106.85,108.41,W,over-1b,Technology,AMD,...,5k-10k,109.07,103.07,108.89,over-1b,Technology,102557375.0,105.3382,3.91647,-0.241467


In [13]:
# drop volume and weight columns
merged_df.drop(columns = ['volume_x', 'volume_weight_x', 'volume_y', 'volume_weight_y', 'date_y', 
                         'sector_y', 'revenue_y', 'employee_count_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
merged_df

Unnamed: 0,close_val_x,country_code_x,employee_count_x,high_val_x,low_val_x,open_val_x,region_x,revenue_x,sector_x,ticker_x,close_val_y,high_val_y,low_val_y,open_val_y,percent_change_volume_weight,percent_change_volume
0,162.95,US,over-10k,163.41,159.41,161.475,W,over-1b,Technology,AAPL,158.52,160.39,155.98,160.2,2.418234,-15.196639
1,148.31,US,5k-10k,150.99,145.33,145.43,W,200m-1b,Technology,ABNB,151.8,152.89,144.4092,145.22,-0.906255,24.507681
2,450.87,US,over-10k,453.11,438.93,443.8,W,1m-10m,Technology,ADBE,438.95,447.65,433.01,444.68,2.256691,7.548932
3,153.62,US,over-10k,155.06,151.3,153.09,SE,over-1b,Technology,ADI,150.56,151.79,147.85,150.85,2.245411,-32.481205
4,209.7,US,over-10k,211.135,206.07,210.33,NE,over-1b,Consumer Discretionary,ADP,209.43,210.115,205.01,206.51,0.77779,-33.913066
5,204.88,US,over-10k,207.755,200.98,202.71,W,over-1b,Healthcare,ADSK,199.79,203.07,198.04,199.88,2.542327,21.140383
6,95.01,US,over-10k,95.9,94.2,95.9,MW,over-1b,Energy,AEP,95.99,96.22,93.99,94.61,-0.463669,3.602652
7,442.05,US,over-10k,444.84,426.31,433.22,SW,200m-1b,Technology,ALGN,430.09,434.81,417.0,428.27,2.552386,23.280354
8,128.62,US,over-10k,129.84,126.2,127.7,W,over-1b,Technology,AMAT,124.97,126.0,122.68,125.72,2.829524,26.482247
9,111.05,US,5k-10k,111.71,106.85,108.41,W,over-1b,Technology,AMD,106.46,109.07,103.07,108.89,3.91647,-0.241467


In [14]:
merged_df.dtypes

close_val_x                     float64
country_code_x                   object
employee_count_x                 object
high_val_x                      float64
low_val_x                       float64
open_val_x                      float64
region_x                         object
revenue_x                        object
sector_x                         object
ticker_x                         object
close_val_y                     float64
high_val_y                      float64
low_val_y                       float64
open_val_y                      float64
percent_change_volume_weight    float64
percent_change_volume           float64
dtype: object

In [15]:
# unique values for each column (getting to know your data)
merged_df.nunique()

close_val_x                     203
country_code_x                    8
employee_count_x                  4
high_val_x                      203
low_val_x                       204
open_val_x                      204
region_x                         11
revenue_x                         6
sector_x                          9
ticker_x                        102
close_val_y                     204
high_val_y                      204
low_val_y                       204
open_val_y                      203
percent_change_volume_weight    204
percent_change_volume           204
dtype: int64

In [16]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = merged_df.dtypes[merged_df.dtypes == "object"].index.tolist()
stock_categories

['country_code_x',
 'employee_count_x',
 'region_x',
 'revenue_x',
 'sector_x',
 'ticker_x']

In [17]:
# Checking the number of unique values in each column
merged_df[stock_categories].nunique()

country_code_x        8
employee_count_x      4
region_x             11
revenue_x             6
sector_x              9
ticker_x            102
dtype: int64

In [18]:
# merge objects having 'set' categories
merged_df['country_code_x'] = merged_df['country_code_x'].astype('category').cat.codes
merged_df['region_x'] = merged_df['region_x'].astype('category').cat.codes
merged_df['sector_x'] = merged_df['sector_x'].astype('category').cat.codes
merged_df['employee_count_x'] = merged_df['employee_count_x'].astype('category').cat.codes
merged_df['revenue_x'] = merged_df['revenue_x'].astype('category').cat.codes

In [19]:
merged_df

Unnamed: 0,close_val_x,country_code_x,employee_count_x,high_val_x,low_val_x,open_val_x,region_x,revenue_x,sector_x,ticker_x,close_val_y,high_val_y,low_val_y,open_val_y,percent_change_volume_weight,percent_change_volume
0,162.95,7,3,163.41,159.41,161.475,10,5,7,AAPL,158.52,160.39,155.98,160.2,2.418234,-15.196639
1,148.31,7,2,150.99,145.33,145.43,10,3,7,ABNB,151.8,152.89,144.4092,145.22,-0.906255,24.507681
2,450.87,7,3,453.11,438.93,443.8,10,2,7,ADBE,438.95,447.65,433.01,444.68,2.256691,7.548932
3,153.62,7,3,155.06,151.3,153.09,8,5,7,ADI,150.56,151.79,147.85,150.85,2.245411,-32.481205
4,209.7,7,3,211.135,206.07,210.33,5,5,1,ADP,209.43,210.115,205.01,206.51,0.77779,-33.913066
5,204.88,7,3,207.755,200.98,202.71,10,5,5,ADSK,199.79,203.07,198.04,199.88,2.542327,21.140383
6,95.01,7,3,95.9,94.2,95.9,4,5,3,AEP,95.99,96.22,93.99,94.61,-0.463669,3.602652
7,442.05,7,3,444.84,426.31,433.22,9,3,7,ALGN,430.09,434.81,417.0,428.27,2.552386,23.280354
8,128.62,7,3,129.84,126.2,127.7,10,5,7,AMAT,124.97,126.0,122.68,125.72,2.829524,26.482247
9,111.05,7,2,111.71,106.85,108.41,10,5,7,AMD,106.46,109.07,103.07,108.89,3.91647,-0.241467


In [20]:
# remvoe ticker
merged_df.drop(columns = ['ticker_x', 'high_val_x', 'low_val_x', 'open_val_x', 'close_val_x'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
merged_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,close_val_y,high_val_y,low_val_y,open_val_y,percent_change_volume_weight,percent_change_volume
0,7,3,10,5,7,158.52,160.39,155.98,160.2,2.418234,-15.196639
1,7,2,10,3,7,151.8,152.89,144.4092,145.22,-0.906255,24.507681
2,7,3,10,2,7,438.95,447.65,433.01,444.68,2.256691,7.548932
3,7,3,8,5,7,150.56,151.79,147.85,150.85,2.245411,-32.481205
4,7,3,5,5,1,209.43,210.115,205.01,206.51,0.77779,-33.913066
5,7,3,10,5,5,199.79,203.07,198.04,199.88,2.542327,21.140383
6,7,3,4,5,3,95.99,96.22,93.99,94.61,-0.463669,3.602652
7,7,3,9,3,7,430.09,434.81,417.0,428.27,2.552386,23.280354
8,7,3,10,5,7,124.97,126.0,122.68,125.72,2.829524,26.482247
9,7,2,10,5,7,106.46,109.07,103.07,108.89,3.91647,-0.241467


In [3]:
np.random.seed(7)

[ 0.13232762  4.32559514  0.88577575  4.32559514  0.44215557 -0.15460869
  0.17886681  0.71105993  0.94399592  0.79026959  0.60405945  0.33373285
 12.1560252   2.95063721  0.36098073  1.54049782  1.51196933  3.33512574
 -1.58543982  0.45727974  1.67239253 -0.15739135  1.94905724  4.32559514
  1.66767858  0.55327868  0.35390134  1.75447038  2.39725012  5.10563006
  1.21608484  0.95579931  1.67115621 -0.18614615  0.29729031  0.33220664
  0.93752424  1.92877761  0.36098073  5.10563006  5.10563006  1.02462319
  5.63052078  1.86986449  2.84077425  2.59360114  1.26844416  0.24474286
  0.2094833   0.41194701  1.38589008]
[[ 7.00000000e+00  2.00000000e+00  8.00000000e+00  5.00000000e+00
   7.00000000e+00  1.15520000e+02  1.17620000e+02  1.14720000e+02
   1.17620000e+02  2.96822175e+00]
 [ 7.00000000e+00  0.00000000e+00  1.00000000e+01  0.00000000e+00
   7.00000000e+00  2.01140000e+02  2.11670000e+02  2.00500000e+02
   2.11020000e+02  1.78525707e+01]
 [ 7.00000000e+00  2.00000000e+00  8.0000000

In [24]:
# how well the model performed (looking for smallest error)
print ("begin_date: ", begin_date)
print ("end_date: ", end_date)
print ("day_range_of_iter: ", day_range_of_iter)

print(f"r2 Score: {r2_score(y_test, y_pred)}")

print(f"mean absolute error: {mean_absolute_error(y_test, y_pred)}")

print(f"mean squared error: {mean_squared_error(y_test, y_pred)}")

# NOTE: squared=False is the RMSE
print(f"root mean squared error: {mean_squared_error(y_test, y_pred, squared=False)}")

# r2_score: Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model 
# that always predicts the expected value of y, disregarding the input features, would get a score of 0.0.

begin_date:  2022-03-08
end_date:  2022-03-10
day_range_of_iter:  2
r2 Score: 0.5621076283344766
mean absolute error: 1.3745720200815474
mean squared error: 3.6250561799138965
root mean squared error: 1.9039580299769994


In [25]:

# #     # Tomas: correlation analysis to see how your features are correlated to each other
    
# #     # as with any regression you need to minimize the mean square error.
#                                                         ------------------
# #     examples are at : 
# # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
# #     from sklearn.metrics import mean_squared_error
    
# # EMPTY PROCESS DATAFRAME   
    
# #     # accrossed all stocks, what is the average score.
# #     # what is the mean?
# #     # what is the median?
# #     # do we have any outliers that we need to note
# #     # does this work better for same sectors?