In [1]:
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
from functools import partial
import numpy as np
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
#from sklearn.metrics import accuracy_score

#import necessary libraries 
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols


#from joblib import Parallel, delayed, Model 
#from collections import Counter <--????
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

## Indexes, Features (the possible causes), Targets (the desired effects), Throw-Aways

### Inference: 
We believe that location of a stock's company matters when determining a stock's volume weighted price change. This is the first run of the model. We are using region and country to see how strong they are as features. We compare this model against the model containing the additional features: employee_count, revenue, sector. 

### Indexes/Primary Key: 

- Concatinate ticker and date to yield ticker_and_date

#### Features are:
- REGION 
- COUNTRY CODE
- EMPLOYEE COUNT (added)                                                    
- REVENUE (added)
- SECTOR (added)
- PERCENT_CHANGE_VOLUME (calculated) 
- PERCENT_CHANGE_VOLUME_WEIGHT (calculated)

#### Target is:
- PERCENT_CHANGE_VOLUME_WEIGHT (calculated)

#### Throw-aways for modeling:
- TICKER 
- DATE
- CITY NAME
- STATE NAME
- COMPANY NAME
- COMPANY URL
- LATITUDE
- LONGITUDE
- OPEN 
- HIGH 
- LOW
- CLOSE
- VOLUME
- VOLUME WEIGHT
- NUMBER OF TRANSACTIONS
- PERCENT CHANGE (% change from close to open) <-- can't be used, all values are represented as positive values

### Results:

To get the best scores I could come up with, I used these parameters with the gradient boosting function: 
max_depth=10,
n_estimators=2500,
learning_rate=.001

Other settings: 
begin_date = '2022-03-08'
end_date = '2022-03-10'
interval controls: day_range_of_iter = 3


r2_score(y_test, y_pred) => 0.3746194189404347 (Best possible score is 1.0.)

mean_absolute_error(y_test, y_pred) => 0.912472612607337 (negatively-oriented, lower values are better)

mean_squared_error(y_test, y_pred) => 1.5577164280921987 (which ever is lower is better, generally)

### Team Notes: 
*r2_score:* Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a score of 0.0.

*mean_squared_error:* The smaller the mean squared error, the closer you are to finding the line of best fit. Depending on your data, it may be impossible to get a very small value for the mean squared error. For example, the above data is scattered wildly around the regression line, so 6.08 is as good as it gets (and is in fact, the line of best fit). 

What value of RMSE is acceptable?
Based on a rule of thumb, it can be said that RMSE values between 0.2 and 0.5 shows that the model can relatively

predict the data accurately. In addition, Adjusted R-squared more than 0.75 is a very good value for showing the

accuracy. In some cases, Adjusted R-squared of 0.4 or more is acceptable as well.

Thus RMSE can be very sensitive to outliers; in general we call this form of statistic not "robust". 
Robust statistics is a field interested in algorithms that are NOT sensitive to outliers.
Since the errors are squared before they are averaged, the RMSE gives a relatively high weight to large errors. 
This means the RMSE is most useful when large errors are particularly desirable. Both the MAE and RMSE can range
from 0 to ∞. They are negatively-oriented scores: Lower values are better.

In [2]:
# GET Tabled input

# creating database engine
db_name = 'Company_Stock_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);

#sort the dataframe by ticker column
stock_df.sort_values(by=['ticker'])

# Print the DataFrame
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [3]:
# drop throw-aways 
stock_df.drop(['open_val', 'high_val', 'low_val', 'close_val', 'number_of_transactions', 'percent_change', 
               'city_name', 'state_name', 'longitude', 'latitude', 'company_name',
               'company_url'], axis=1, inplace=True)
stock_df

Unnamed: 0,ticker,date_val,employee_count,revenue,sector,region,country_code,volume,volume_weight
0,AMD,2020-03-12,5k-10k,over-1b,Technology,W,US,86689681.0,41.6701
1,AMD,2020-03-15,5k-10k,over-1b,Technology,W,US,84545868.0,41.0812
2,AMD,2020-03-16,5k-10k,over-1b,Technology,W,US,92741881.0,41.1240
3,AMD,2020-03-17,5k-10k,over-1b,Technology,W,US,106949287.0,39.6363
4,AMD,2020-03-18,5k-10k,over-1b,Technology,W,US,88939024.0,40.2337
...,...,...,...,...,...,...,...,...,...
50869,ZS,2022-03-06,1k-5k,100m-200m,Technology,W,US,4379337.0,210.5799
50870,ZS,2022-03-07,1k-5k,100m-200m,Technology,W,US,4389634.0,196.9284
50871,ZS,2022-03-08,1k-5k,100m-200m,Technology,W,US,3050554.0,209.3268
50872,ZS,2022-03-09,1k-5k,100m-200m,Technology,W,US,2305091.0,208.7971


In [4]:
# check dtypes
stock_df.dtypes

ticker             object
date_val           object
employee_count     object
revenue            object
sector             object
region             object
country_code       object
volume            float64
volume_weight     float64
dtype: object

In [5]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-08'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'
# interval controls
day_range_of_iter = 2

# convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop date_val
stock_df.drop(['date_val'], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,volume,volume_weight,date
501,AMD,5k-10k,over-1b,Technology,W,US,102310329.0,109.6319,2022-03-08
502,AMD,5k-10k,over-1b,Technology,W,US,102557375.0,105.3382,2022-03-09
503,AMD,5k-10k,over-1b,Technology,W,US,87584432.0,105.9691,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,W,US,2905656.0,447.8637,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,W,US,2686310.0,437.7568,2022-03-09
...,...,...,...,...,...,...,...,...,...
50368,ZM,1k-5k,100m-200m,Technology,W,US,5030777.0,103.3206,2022-03-09
50369,ZM,1k-5k,100m-200m,Technology,W,US,6454629.0,99.6973,2022-03-10
50871,ZS,1k-5k,100m-200m,Technology,W,US,3050554.0,209.3268,2022-03-08
50872,ZS,1k-5k,100m-200m,Technology,W,US,2305091.0,208.7971,2022-03-09


In [6]:
# check dtypes
stock_df.dtypes

ticker             object
employee_count     object
revenue            object
sector             object
region             object
country_code       object
volume            float64
volume_weight     float64
date               object
dtype: object

In [7]:
# unique days in df
unique_days = len(pd.unique(stock_df['date']))
print("unique number of days(number of days in df):", unique_days)

# unique stocks in df
unique_stocks = len(pd.unique(stock_df['ticker']))
print(unique_stocks)

# interation sets
iteration_sets = (unique_days - day_range_of_iter + 1)
print("iteration_sets: ", iteration_sets)

# total records captured
length_of_df = len(stock_df)
print(length_of_df)

unique number of days(number of days in df): 3
102
iteration_sets:  2
306


In [8]:
# sort dataframe by date
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# get beginning dataframe records
b = 0 
# ending record for beginning df
ending_records = iteration_sets * unique_stocks

# starting record for end
x = (unique_days - iteration_sets) * unique_stocks
max_records = unique_days * unique_stocks

begin_df = pd.DataFrame()
end_df = pd.DataFrame()
 
# create two dataframes, one containing the beginning interval records and one containing the ending interval records
for rec in sort_date_stock_df.iterrows():
    
    if b < ending_records:
        new_begin_df = sort_date_stock_df.iloc[b]
        begin_df = begin_df.append(new_begin_df,ignore_index=False)
    
    if x < max_records: 
        new_end_df = next_date_stock_df.iloc[x]
        end_df = end_df.append(new_end_df,ignore_index=False)
    b=b+1
    x=x+1
   
begin_df.reset_index(drop=True,inplace=True)
begin_df 

Unnamed: 0,country_code,date,employee_count,region,revenue,sector,ticker,volume,volume_weight
0,US,2022-03-08,over-10k,W,over-1b,Technology,AAPL,91445405.0,161.9446
1,US,2022-03-08,5k-10k,W,200m-1b,Technology,ABNB,7023908.0,148.5454
2,US,2022-03-08,over-10k,W,1m-10m,Technology,ADBE,2905656.0,447.8637
3,US,2022-03-08,over-10k,SE,over-1b,Technology,ADI,3046254.0,153.5888
4,US,2022-03-08,over-10k,NE,over-1b,Consumer Discretionary,ADP,1791687.0,209.3495
...,...,...,...,...,...,...,...,...,...
199,US,2022-03-09,over-10k,MW,over-1b,Healthcare,WBA,4999822.0,47.7198
200,US,2022-03-09,over-10k,W,over-1b,Technology,WDAY,1478994.0,229.0807
201,US,2022-03-09,over-10k,MW,over-1b,Utilities,XEL,1945242.0,69.9773
202,US,2022-03-09,1k-5k,W,100m-200m,Technology,ZM,5030777.0,103.3206


In [9]:
end_df.reset_index(drop=True,inplace=True)
end_df

Unnamed: 0,country_code,date,employee_count,region,revenue,sector,ticker,volume,volume_weight
0,US,2022-03-09,over-10k,W,over-1b,Technology,AAPL,105342033.0,158.0284
1,US,2022-03-09,5k-10k,W,200m-1b,Technology,ABNB,5302511.0,149.8916
2,US,2022-03-09,over-10k,W,1m-10m,Technology,ADBE,2686310.0,437.7568
3,US,2022-03-09,over-10k,SE,over-1b,Technology,ADI,4035714.0,150.1401
4,US,2022-03-09,over-10k,NE,over-1b,Consumer Discretionary,ADP,2399303.0,207.7212
...,...,...,...,...,...,...,...,...,...
199,US,2022-03-10,over-10k,MW,over-1b,Healthcare,WBA,5737993.0,47.9174
200,US,2022-03-10,over-10k,W,over-1b,Technology,WDAY,1789032.0,226.1950
201,US,2022-03-10,over-10k,MW,over-1b,Utilities,XEL,3031491.0,70.4602
202,US,2022-03-10,1k-5k,W,100m-200m,Technology,ZM,6454629.0,99.6973


In [10]:
# merge together the beginning and ending dataframes
merged_df = pd.merge(begin_df, end_df, left_index=True, right_index=True)
merged_df

Unnamed: 0,country_code_x,date_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,volume_x,volume_weight_x,country_code_y,date_y,employee_count_y,region_y,revenue_y,sector_y,ticker_y,volume_y,volume_weight_y
0,US,2022-03-08,over-10k,W,over-1b,Technology,AAPL,91445405.0,161.9446,US,2022-03-09,over-10k,W,over-1b,Technology,AAPL,105342033.0,158.0284
1,US,2022-03-08,5k-10k,W,200m-1b,Technology,ABNB,7023908.0,148.5454,US,2022-03-09,5k-10k,W,200m-1b,Technology,ABNB,5302511.0,149.8916
2,US,2022-03-08,over-10k,W,1m-10m,Technology,ADBE,2905656.0,447.8637,US,2022-03-09,over-10k,W,1m-10m,Technology,ADBE,2686310.0,437.7568
3,US,2022-03-08,over-10k,SE,over-1b,Technology,ADI,3046254.0,153.5888,US,2022-03-09,over-10k,SE,over-1b,Technology,ADI,4035714.0,150.1401
4,US,2022-03-08,over-10k,NE,over-1b,Consumer Discretionary,ADP,1791687.0,209.3495,US,2022-03-09,over-10k,NE,over-1b,Consumer Discretionary,ADP,2399303.0,207.7212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,US,2022-03-09,over-10k,MW,over-1b,Healthcare,WBA,4999822.0,47.7198,US,2022-03-10,over-10k,MW,over-1b,Healthcare,WBA,5737993.0,47.9174
200,US,2022-03-09,over-10k,W,over-1b,Technology,WDAY,1478994.0,229.0807,US,2022-03-10,over-10k,W,over-1b,Technology,WDAY,1789032.0,226.1950
201,US,2022-03-09,over-10k,MW,over-1b,Utilities,XEL,1945242.0,69.9773,US,2022-03-10,over-10k,MW,over-1b,Utilities,XEL,3031491.0,70.4602
202,US,2022-03-09,1k-5k,W,100m-200m,Technology,ZM,5030777.0,103.3206,US,2022-03-10,1k-5k,W,100m-200m,Technology,ZM,6454629.0,99.6973


In [11]:
# drop *_y fields not needed 
merged_df.drop(columns = ['date_x', 'region_y', 'ticker_y', 'country_code_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
merged_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,volume_x,volume_weight_x,date_y,employee_count_y,revenue_y,sector_y,volume_y,volume_weight_y
0,US,over-10k,W,over-1b,Technology,AAPL,91445405.0,161.9446,2022-03-09,over-10k,over-1b,Technology,105342033.0,158.0284
1,US,5k-10k,W,200m-1b,Technology,ABNB,7023908.0,148.5454,2022-03-09,5k-10k,200m-1b,Technology,5302511.0,149.8916
2,US,over-10k,W,1m-10m,Technology,ADBE,2905656.0,447.8637,2022-03-09,over-10k,1m-10m,Technology,2686310.0,437.7568
3,US,over-10k,SE,over-1b,Technology,ADI,3046254.0,153.5888,2022-03-09,over-10k,over-1b,Technology,4035714.0,150.1401
4,US,over-10k,NE,over-1b,Consumer Discretionary,ADP,1791687.0,209.3495,2022-03-09,over-10k,over-1b,Consumer Discretionary,2399303.0,207.7212
5,US,over-10k,W,over-1b,Healthcare,ADSK,1850028.0,205.2018,2022-03-09,over-10k,over-1b,Healthcare,1458925.0,199.9849
6,US,over-10k,MW,over-1b,Energy,AEP,3006258.0,95.0679,2022-03-09,over-10k,over-1b,Energy,2897953.0,95.5087
7,US,over-10k,SW,200m-1b,Technology,ALGN,694358.0,438.9188,2022-03-09,over-10k,200m-1b,Technology,532709.0,427.7159
8,US,over-10k,W,over-1b,Technology,AMAT,7623175.0,128.1947,2022-03-09,over-10k,over-1b,Technology,5604387.0,124.5674
9,US,5k-10k,W,over-1b,Technology,AMD,102310329.0,109.6319,2022-03-09,5k-10k,over-1b,Technology,102557375.0,105.3382


In [12]:
# calculate the percentage change between the beginning (*_x) and the ending (*_y) volume and volume_weights
merged_df['percent_change_volume_weight'] = 100 - merged_df['volume_weight_y']/merged_df['volume_weight_x'] * 100
merged_df['percent_change_volume'] = 100 - merged_df['volume_y']/merged_df['volume_x'] * 100
merged_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,volume_x,volume_weight_x,date_y,employee_count_y,revenue_y,sector_y,volume_y,volume_weight_y,percent_change_volume_weight,percent_change_volume
0,US,over-10k,W,over-1b,Technology,AAPL,91445405.0,161.9446,2022-03-09,over-10k,over-1b,Technology,105342033.0,158.0284,2.418234,-15.196639
1,US,5k-10k,W,200m-1b,Technology,ABNB,7023908.0,148.5454,2022-03-09,5k-10k,200m-1b,Technology,5302511.0,149.8916,-0.906255,24.507681
2,US,over-10k,W,1m-10m,Technology,ADBE,2905656.0,447.8637,2022-03-09,over-10k,1m-10m,Technology,2686310.0,437.7568,2.256691,7.548932
3,US,over-10k,SE,over-1b,Technology,ADI,3046254.0,153.5888,2022-03-09,over-10k,over-1b,Technology,4035714.0,150.1401,2.245411,-32.481205
4,US,over-10k,NE,over-1b,Consumer Discretionary,ADP,1791687.0,209.3495,2022-03-09,over-10k,over-1b,Consumer Discretionary,2399303.0,207.7212,0.77779,-33.913066
5,US,over-10k,W,over-1b,Healthcare,ADSK,1850028.0,205.2018,2022-03-09,over-10k,over-1b,Healthcare,1458925.0,199.9849,2.542327,21.140383
6,US,over-10k,MW,over-1b,Energy,AEP,3006258.0,95.0679,2022-03-09,over-10k,over-1b,Energy,2897953.0,95.5087,-0.463669,3.602652
7,US,over-10k,SW,200m-1b,Technology,ALGN,694358.0,438.9188,2022-03-09,over-10k,200m-1b,Technology,532709.0,427.7159,2.552386,23.280354
8,US,over-10k,W,over-1b,Technology,AMAT,7623175.0,128.1947,2022-03-09,over-10k,over-1b,Technology,5604387.0,124.5674,2.829524,26.482247
9,US,5k-10k,W,over-1b,Technology,AMD,102310329.0,109.6319,2022-03-09,5k-10k,over-1b,Technology,102557375.0,105.3382,3.91647,-0.241467


In [13]:
# drop volume and weight columns
merged_df.drop(columns = ['volume_x', 'volume_weight_x', 'volume_y', 'volume_weight_y', 'date_y', 
                         'sector_y', 'revenue_y', 'employee_count_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
merged_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,percent_change_volume_weight,percent_change_volume
0,US,over-10k,W,over-1b,Technology,AAPL,2.418234,-15.196639
1,US,5k-10k,W,200m-1b,Technology,ABNB,-0.906255,24.507681
2,US,over-10k,W,1m-10m,Technology,ADBE,2.256691,7.548932
3,US,over-10k,SE,over-1b,Technology,ADI,2.245411,-32.481205
4,US,over-10k,NE,over-1b,Consumer Discretionary,ADP,0.77779,-33.913066
5,US,over-10k,W,over-1b,Healthcare,ADSK,2.542327,21.140383
6,US,over-10k,MW,over-1b,Energy,AEP,-0.463669,3.602652
7,US,over-10k,SW,200m-1b,Technology,ALGN,2.552386,23.280354
8,US,over-10k,W,over-1b,Technology,AMAT,2.829524,26.482247
9,US,5k-10k,W,over-1b,Technology,AMD,3.91647,-0.241467


In [14]:
merged_df.dtypes

country_code_x                   object
employee_count_x                 object
region_x                         object
revenue_x                        object
sector_x                         object
ticker_x                         object
percent_change_volume_weight    float64
percent_change_volume           float64
dtype: object

In [15]:
# unique values for each column (getting to know your data)
merged_df.nunique()

country_code_x                    8
employee_count_x                  4
region_x                         11
revenue_x                         6
sector_x                          9
ticker_x                        102
percent_change_volume_weight    204
percent_change_volume           204
dtype: int64

In [16]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = merged_df.dtypes[merged_df.dtypes == "object"].index.tolist()
stock_categories

['country_code_x',
 'employee_count_x',
 'region_x',
 'revenue_x',
 'sector_x',
 'ticker_x']

In [17]:
# Checking the number of unique values in each column
merged_df[stock_categories].nunique()

country_code_x        8
employee_count_x      4
region_x             11
revenue_x             6
sector_x              9
ticker_x            102
dtype: int64

In [18]:
# merge objects having 'set' categories
merged_df['country_code_x'] = merged_df['country_code_x'].astype('category').cat.codes
merged_df['region_x'] = merged_df['region_x'].astype('category').cat.codes
merged_df['sector_x'] = merged_df['sector_x'].astype('category').cat.codes
merged_df['employee_count_x'] = merged_df['employee_count_x'].astype('category').cat.codes
merged_df['revenue_x'] = merged_df['revenue_x'].astype('category').cat.codes


In [19]:
# replace stock's region string with integer
merged_df.loc[(merged_df['region_x'] == 'W'), 'region_x'] = 11
merged_df.loc[(merged_df['region_x'] == 'MW'), 'region_x'] = 9
merged_df.loc[(merged_df['region_x'] == 'SW'), 'region_x'] = 5
merged_df.loc[(merged_df['region_x'] == 'NW'), 'region_x'] = 8
merged_df.loc[(merged_df['region_x'] == 'SE'), 'region_x'] = 10
merged_df.loc[(merged_df['region_x'] == 'NL'), 'region_x'] = 4
merged_df.loc[(merged_df['region_x'] == 'AU'), 'region_x'] = 1
merged_df.loc[(merged_df['region_x'] == 'NE'), 'region_x'] = 7
merged_df.loc[(merged_df['region_x'] == 'GB'), 'region_x'] = 3
merged_df.loc[(merged_df['region_x'] == 'CH'), 'region_x'] = 6
merged_df.loc[(merged_df['region_x'] == 'CA'), 'region_x'] = 2

# replace stock's sector string with integer
merged_df.loc[(merged_df['sector_x'] == 'Technology'), 'sector_x'] = 9
merged_df.loc[(merged_df['sector_x'] == 'Energy'), 'sector_x'] = 2
merged_df.loc[(merged_df['sector_x'] == 'Healthcare'), 'sector_x'] = 8
merged_df.loc[(merged_df['sector_x'] == 'Consumer Discretionary'), 'sector_x'] = 7
merged_df.loc[(merged_df['sector_x'] == 'Industrials'), 'sector_x'] = 6
merged_df.loc[(merged_df['sector_x'] == 'Consumer Staples'), 'sector_x'] = 5
merged_df.loc[(merged_df['sector_x'] == 'Communication Services'), 'sector_x'] = 4
merged_df.loc[(merged_df['sector_x'] == 'Financials'), 'sector_x'] = 3
merged_df.loc[(merged_df['sector_x'] == 'Utilities'), 'sector_x'] = 1

# replace stock's country code string with integer (Note: China was CN and CH for some reason)
merged_df.loc[(merged_df['country_code_x'] == 'US'), 'country_code_x'] = 7
merged_df.loc[(merged_df['country_code_x'] == 'Netherlands'), 'country_code_x'] = 6
merged_df.loc[(merged_df['country_code_x'] == 'Australia'), 'country_code_x'] = 1
merged_df.loc[(merged_df['country_code_x'] == 'UK'), 'country_code_x'] = 4
merged_df.loc[(merged_df['country_code_x'] == 'CH'), 'country_code_x'] = 5
merged_df.loc[(merged_df['country_code_x'] == 'CN'), 'country_code_x'] = 5
merged_df.loc[(merged_df['country_code_x'] == 'CA'), 'country_code_x'] = 3
merged_df.loc[(merged_df['country_code_x'] == 'Argentina'), 'country_code_x'] = 2

In [20]:
merged_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,percent_change_volume_weight,percent_change_volume
0,7,3,10,5,7,AAPL,2.418234,-15.196639
1,7,2,10,3,7,ABNB,-0.906255,24.507681
2,7,3,10,2,7,ADBE,2.256691,7.548932
3,7,3,8,5,7,ADI,2.245411,-32.481205
4,7,3,5,5,1,ADP,0.77779,-33.913066
5,7,3,10,5,5,ADSK,2.542327,21.140383
6,7,3,4,5,3,AEP,-0.463669,3.602652
7,7,3,9,3,7,ALGN,2.552386,23.280354
8,7,3,10,5,7,AMAT,2.829524,26.482247
9,7,2,10,5,7,AMD,3.91647,-0.241467


In [21]:
# unique_percent_change_vw = pd.unique(merged_df['percent_change_volume_weight'])
# unique_percent_change_vw.sort()
# unique_percent_change_vw

In [22]:


#discover the bucket you need to create 
# unique_percent_change_v = pd.unique(merged_df['percent_change_volume'])
# unique_percent_change_v.sort()
# unique_percent_change_v

In [23]:
# based on the number of volume weight percentage changes and volume percent changes, I create buckets

# # volume_weight_percent_changes
# merged_df.loc[(merged_df['percent_change_volume_weight'] < 0), 'percent_change_volume'] = 6
# merged_df.loc[(merged_df['percent_change_volume_weight'] >= 0) & (merged_df['percent_change_volume_weight'] < 1), 'percent_change_volume_weight'] = 7
# merged_df.loc[(merged_df['percent_change_volume_weight'] >= 1) & (merged_df['percent_change_volume_weight'] < 2), 'percent_change_volume_weight'] = 5
# merged_df.loc[(merged_df['percent_change_volume_weight'] >= 2) & (merged_df['percent_change_volume_weight'] < 3), 'percent_change_volume_weight'] = 2
# merged_df.loc[(merged_df['percent_change_volume_weight'] >= 3) & (merged_df['percent_change_volume_weight'] < 4), 'percent_change_volume_weight'] = 4
# merged_df.loc[(merged_df['percent_change_volume_weight'] >= 4) & (merged_df['percent_change_volume_weight'] < 5), 'percent_change_volume_weight'] = 1
# merged_df.loc[(merged_df['percent_change_volume_weight'] >= 5), 'percent_change_volume_weight'] = 3
# merged_df

# # volume_weight_percent_changes
# merged_df.loc[(merged_df['percent_change_volume'] < 0), 'percent_change_volume'] = 5
# merged_df.loc[(merged_df['percent_change_volume'] > 0) & (merged_df['percent_change_volume'] <= 10), 'percent_change_volume'] = 5
# merged_df.loc[(merged_df['percent_change_volume'] > 10) & (merged_df['percent_change_volume'] <= 20), 'percent_change_volume'] = 4
# merged_df.loc[(merged_df['percent_change_volume'] > 20) & (merged_df['percent_change_volume'] <= 30), 'percent_change_volume'] = 2
# merged_df.loc[(merged_df['percent_change_volume'] > 30) & (merged_df['percent_change_volume'] <= 40), 'percent_change_volume'] = 3
# merged_df.loc[(merged_df['percent_change_volume'] > 40), 'percent_change_volume'] = 1
# merged_df

In [24]:
# remvoe ticker
merged_df.drop(columns = ['ticker_x'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
merged_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,percent_change_volume_weight,percent_change_volume
0,7,3,10,5,7,2.418234,-15.196639
1,7,2,10,3,7,-0.906255,24.507681
2,7,3,10,2,7,2.256691,7.548932
3,7,3,8,5,7,2.245411,-32.481205
4,7,3,5,5,1,0.77779,-33.913066
5,7,3,10,5,5,2.542327,21.140383
6,7,3,4,5,3,-0.463669,3.602652
7,7,3,9,3,7,2.552386,23.280354
8,7,3,10,5,7,2.829524,26.482247
9,7,2,10,5,7,3.91647,-0.241467


In [25]:
# create features array
X = merged_df.drop(columns=["percent_change_volume_weight"]).values
    
# create target
y = merged_df["percent_change_volume_weight"].values

In [26]:
# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [32]:
# max_depth refers to the number of leaves of each tree 
# n_estimators refers to the total number of trees in the ensemble
# learning_rate hyperparameter scales the contribution of each tree NOTE: If you set it to a low value, 
# you will need more trees in the ensemble to fit the training set, but the overall variance will be lower.

# best way to tune the model: https://neptune.ai/blog/lightgbm-parameters-guide
    
regressor = GradientBoostingRegressor(
max_depth=15,
n_estimators=1000,
learning_rate=.01,
criterion='mse' # ‘mse’, ‘mae’}
)
regressor.fit(X_train, y_train)


GradientBoostingRegressor(criterion='mse', learning_rate=0.01, max_depth=15,
                          n_estimators=1000)

In [33]:
# Use staged_predict() method to measures the validation error at each stage of training 
# (i.e. with one tree, with two trees…) to find the optimal number of trees.
errors = [mean_squared_error(y_test, y_pred) for y_pred in 
           regressor.staged_predict(X_test)]
print(errors)

# mean_squared_error: 
# The smaller the mean squared error, the closer you are to finding the line of best fit. Depending on your data, 
# it may be impossible to get a very small value for the mean squared error. For example, the above data is scattered 
# wildly around the regression line, so 6.08 is as good as it gets (and is in fact, the line of best fit). 

# What value of RMSE is acceptable?
# Based on a rule of thumb, it can be said that RMSE values between 0.2 and 0.5 shows that the model can relatively
#                                               -------------------------------   
# predict the data accurately. In addition, Adjusted R-squared more than 0.75 is a very good value for showing the
#                                                    ------------------------
# accuracy. In some cases, Adjusted R-squared of 0.4 or more is acceptable as well.
#                          -------------------------   
# Thus RMSE can be very sensitive to outliers; in general we call this form of statistic not "robust". 
# Robust statistics is a field interested in algorithms that are NOT sensitive to outliers.
#
# Since the errors are squared before they are averaged, the RMSE gives a relatively high weight to large errors. 
# This means the RMSE is most useful when large errors are particularly undesirable. Both the MAE and RMSE can range
# from 0 to ∞. They are negatively-oriented scores: Lower values are better.
#                                                   ----------------------- 

[8.319603541422081, 8.220431566266186, 8.112835651368048, 8.02110444486011, 7.929932593393308, 7.842518819019518, 7.746796845193776, 7.666578159513992, 7.583579005152276, 7.497022328607229, 7.412792849111883, 7.3410524135818225, 7.2592429429404195, 7.187453083217685, 7.116291735997531, 7.046999791040748, 6.98622749736153, 6.92395096868725, 6.861118468270999, 6.802501541429544, 6.7387629956862884, 6.679444727750033, 6.62620630907361, 6.568848754391147, 6.511543725226725, 6.456605324714168, 6.410262151870565, 6.359437686823853, 6.310176342255769, 6.265111226386739, 6.223100742094148, 6.176368024924156, 6.132346706223451, 6.094488746804998, 6.04957357783801, 6.007987283503817, 5.972284420462145, 5.93632838869633, 5.89891976062479, 5.865763256564471, 5.831017064615231, 5.802998030199642, 5.767481469118155, 5.740267844593335, 5.708078312573402, 5.6772229132194445, 5.649235423025101, 5.620199596896796, 5.590677358447146, 5.569073365722594, 5.550533666408812, 5.525855542268476, 5.504033454231

In [None]:
best_n_estimators = np.argmin(errors)

print(best_n_estimators)

In [None]:
# build and fit our model using the optimal number of trees
best_regressor = GradientBoostingRegressor(
     max_depth=15,
     n_estimators=best_n_estimators,
     learning_rate=.01
)

best_regressor.fit(X_train, y_train)

# # Sklearn provides numerous metrics to evaluate 
# # the performance of our machine learning models.
# # They categorize the each metric according 
# # to the problem domain which they’re applicable. 
# # https://scikit-learn.org/stable/modules/model_evaluation.html <-- GO TO THIS SITE TO SEE WHICH METRICS YOU WILL USE.

# # We use the mean absolute error 
# # which can be interpreted as 
# # the average distance from 
# # our predictions and the actual values

# # this will give you the value of the stocks for the next period of time
y_pred = best_regressor.predict(X_test)
print(y_pred)
print(X_test)
print(X_train)
print(y_train)


In [None]:
# should we be using r2_score?
# how do you do residual plots?

In [None]:
# how well the model performed (looking for smallest error)
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

# r2_score: Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model 
# that always predicts the expected value of y, disregarding the input features, would get a score of 0.0.

In [None]:

# #     # Tomas: correlation analysis to see how your features are correlated to each other
    
# #     # as with any regression you need to minimize the mean square error.
#                                                         ------------------
# #     examples are at : 
# # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
# #     from sklearn.metrics import mean_squared_error
    
# # EMPTY PROCESS DATAFRAME   
    
# #     # accrossed all stocks, what is the average score.
# #     # what is the mean?
# #     # what is the median?
# #     # do we have any outliers that we need to note
# #     # does this work better for same sectors?