In [1]:
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
from functools import partial
import numpy as np
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

#import necessary libraries 
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols


#from joblib import Parallel, delayed, Model 
#from collections import Counter <--????
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

In [2]:
# GET Tabled input

# creating database engine
db_name = 'Company_Stock_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);

#sort the dataframe by ticker column
stock_df.sort_values(by=['ticker'])

# Print the DataFrame
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [3]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-08'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'
# iteration controls
day_range_of_iter = 3

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop throw-aways 
stock_df.drop(["longitude", "latitude", "company_name", "company_url","date_val"], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,date
501,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.410,111.71,106.850,111.05,102310329.0,109.6319,602679.0,2.435200,2022-03-08
502,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.890,109.07,103.070,106.46,102557375.0,105.3382,639388.0,2.231610,2022-03-09
503,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.130,108.19,104.080,104.29,87584432.0,105.9691,542478.0,3.551281,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,443.800,453.11,438.930,450.87,2905656.0,447.8637,67082.0,1.593060,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,San Jose,CA,W,US,444.680,447.65,433.010,438.95,2686310.0,437.7568,66371.0,1.288567,2022-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50368,ZM,1k-5k,100m-200m,Technology,San Jose,CA,W,US,105.835,106.90,101.055,103.33,5030777.0,103.3206,88819.0,2.366892,2022-03-09
50369,ZM,1k-5k,100m-200m,Technology,San Jose,CA,W,US,103.480,103.49,97.900,98.12,6454629.0,99.6973,104681.0,5.179745,2022-03-10
50871,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.840,213.57,199.120,212.35,3050554.0,209.3268,45960.0,4.174843,2022-03-08
50872,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,212.130,213.51,204.870,208.41,2305091.0,208.7971,40754.0,1.753642,2022-03-09


In [4]:
# drop fields that will not be used to represent a period of time
stock_df.drop(columns = ['open_val', 'high_val', 'low_val', 'close_val', 'number_of_transactions', 'city_name', 'state_name', 'number_of_transactions', 'percent_change'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,volume,volume_weight,date
501,AMD,5k-10k,over-1b,Technology,W,US,102310329.0,109.6319,2022-03-08
502,AMD,5k-10k,over-1b,Technology,W,US,102557375.0,105.3382,2022-03-09
503,AMD,5k-10k,over-1b,Technology,W,US,87584432.0,105.9691,2022-03-10
1006,ADBE,over-10k,1m-10m,Technology,W,US,2905656.0,447.8637,2022-03-08
1007,ADBE,over-10k,1m-10m,Technology,W,US,2686310.0,437.7568,2022-03-09
1008,ADBE,over-10k,1m-10m,Technology,W,US,4434498.0,422.5279,2022-03-10
1321,ABNB,5k-10k,200m-1b,Technology,W,US,7023908.0,148.5454,2022-03-08
1322,ABNB,5k-10k,200m-1b,Technology,W,US,5302511.0,149.8916,2022-03-09
1323,ABNB,5k-10k,200m-1b,Technology,W,US,4577255.0,147.8527,2022-03-10
1825,ALGN,over-10k,200m-1b,Technology,SW,US,694358.0,438.9188,2022-03-08


In [5]:
# unique days in df
unique_days = len(pd.unique(stock_df['date']))
print("unique number of days(number of days in df):", unique_days)

# unique stocks in df
unique_stocks = len(pd.unique(stock_df['ticker']))
print(unique_stocks)

# interation sets
iteration_sets = (unique_days - day_range_of_iter + 1)
print("iteration_sets: ", iteration_sets)

# total records captured
length_of_df = len(stock_df)
print(length_of_df)


unique number of days(number of days in df): 3
102
iteration_sets:  1
306


In [6]:
# sort dataframe by date
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# get beginning dataframe records
b = 0 
# ending record for beginning df
ending_records = iteration_sets * unique_stocks

# starting record for end
x = (unique_days - iteration_sets) * unique_stocks
max_records = unique_days * unique_stocks

begin_df = pd.DataFrame()
end_df = pd.DataFrame()
               
for rec in sort_date_stock_df.iterrows():
    
    if b < ending_records:
        new_begin_df = sort_date_stock_df.iloc[b]
        begin_df = begin_df.append(new_begin_df,ignore_index=False)
    
    if x < max_records: 
        new_end_df = next_date_stock_df.iloc[x]
        end_df = end_df.append(new_end_df,ignore_index=False)
    b=b+1
    x=x+1
   
begin_df.reset_index(drop=True,inplace=True)
begin_df 


Unnamed: 0,country_code,date,employee_count,region,revenue,sector,ticker,volume,volume_weight
0,US,2022-03-08,over-10k,W,over-1b,Technology,AAPL,91445405.0,161.9446
1,US,2022-03-08,5k-10k,W,200m-1b,Technology,ABNB,7023908.0,148.5454
2,US,2022-03-08,over-10k,W,1m-10m,Technology,ADBE,2905656.0,447.8637
3,US,2022-03-08,over-10k,SE,over-1b,Technology,ADI,3046254.0,153.5888
4,US,2022-03-08,over-10k,NE,over-1b,Consumer Discretionary,ADP,1791687.0,209.3495
5,US,2022-03-08,over-10k,W,over-1b,Healthcare,ADSK,1850028.0,205.2018
6,US,2022-03-08,over-10k,MW,over-1b,Energy,AEP,3006258.0,95.0679
7,US,2022-03-08,over-10k,SW,200m-1b,Technology,ALGN,694358.0,438.9188
8,US,2022-03-08,over-10k,W,over-1b,Technology,AMAT,7623175.0,128.1947
9,US,2022-03-08,5k-10k,W,over-1b,Technology,AMD,102310329.0,109.6319


In [7]:
# # sort dataframe by date
# sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
# sort_date_stock_df

end_df.reset_index(drop=True,inplace=True)
end_df

Unnamed: 0,country_code,date,employee_count,region,revenue,sector,ticker,volume,volume_weight
0,US,2022-03-10,over-10k,W,over-1b,Technology,AAPL,96917302.0,156.1598
1,US,2022-03-10,5k-10k,W,200m-1b,Technology,ABNB,4577255.0,147.8527
2,US,2022-03-10,over-10k,W,1m-10m,Technology,ADBE,4434498.0,422.5279
3,US,2022-03-10,over-10k,SE,over-1b,Technology,ADI,3964265.0,149.4415
4,US,2022-03-10,over-10k,NE,over-1b,Consumer Discretionary,ADP,1664796.0,208.5401
5,US,2022-03-10,over-10k,W,over-1b,Healthcare,ADSK,1987062.0,195.0114
6,US,2022-03-10,over-10k,MW,over-1b,Energy,AEP,2254945.0,95.8241
7,US,2022-03-10,over-10k,SW,200m-1b,Technology,ALGN,957114.0,409.9722
8,US,2022-03-10,over-10k,W,over-1b,Technology,AMAT,6015969.0,125.9169
9,US,2022-03-10,5k-10k,W,over-1b,Technology,AMD,87584432.0,105.9691


In [8]:
vwa_df = pd.merge(begin_df, end_df, left_index=True, right_index=True)
vwa_df

Unnamed: 0,country_code_x,date_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,volume_x,volume_weight_x,country_code_y,date_y,employee_count_y,region_y,revenue_y,sector_y,ticker_y,volume_y,volume_weight_y
0,US,2022-03-08,over-10k,W,over-1b,Technology,AAPL,91445405.0,161.9446,US,2022-03-10,over-10k,W,over-1b,Technology,AAPL,96917302.0,156.1598
1,US,2022-03-08,5k-10k,W,200m-1b,Technology,ABNB,7023908.0,148.5454,US,2022-03-10,5k-10k,W,200m-1b,Technology,ABNB,4577255.0,147.8527
2,US,2022-03-08,over-10k,W,1m-10m,Technology,ADBE,2905656.0,447.8637,US,2022-03-10,over-10k,W,1m-10m,Technology,ADBE,4434498.0,422.5279
3,US,2022-03-08,over-10k,SE,over-1b,Technology,ADI,3046254.0,153.5888,US,2022-03-10,over-10k,SE,over-1b,Technology,ADI,3964265.0,149.4415
4,US,2022-03-08,over-10k,NE,over-1b,Consumer Discretionary,ADP,1791687.0,209.3495,US,2022-03-10,over-10k,NE,over-1b,Consumer Discretionary,ADP,1664796.0,208.5401
5,US,2022-03-08,over-10k,W,over-1b,Healthcare,ADSK,1850028.0,205.2018,US,2022-03-10,over-10k,W,over-1b,Healthcare,ADSK,1987062.0,195.0114
6,US,2022-03-08,over-10k,MW,over-1b,Energy,AEP,3006258.0,95.0679,US,2022-03-10,over-10k,MW,over-1b,Energy,AEP,2254945.0,95.8241
7,US,2022-03-08,over-10k,SW,200m-1b,Technology,ALGN,694358.0,438.9188,US,2022-03-10,over-10k,SW,200m-1b,Technology,ALGN,957114.0,409.9722
8,US,2022-03-08,over-10k,W,over-1b,Technology,AMAT,7623175.0,128.1947,US,2022-03-10,over-10k,W,over-1b,Technology,AMAT,6015969.0,125.9169
9,US,2022-03-08,5k-10k,W,over-1b,Technology,AMD,102310329.0,109.6319,US,2022-03-10,5k-10k,W,over-1b,Technology,AMD,87584432.0,105.9691


In [9]:
# drop fields that will not be used to represent a period of time
vwa_df.drop(columns = ['date_x', 'employee_count_y', 'region_y', 'revenue_y', 'sector_y', 'ticker_y', 'country_code_y', 'date_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,volume_x,volume_weight_x,volume_y,volume_weight_y
0,US,over-10k,W,over-1b,Technology,AAPL,91445405.0,161.9446,96917302.0,156.1598
1,US,5k-10k,W,200m-1b,Technology,ABNB,7023908.0,148.5454,4577255.0,147.8527
2,US,over-10k,W,1m-10m,Technology,ADBE,2905656.0,447.8637,4434498.0,422.5279
3,US,over-10k,SE,over-1b,Technology,ADI,3046254.0,153.5888,3964265.0,149.4415
4,US,over-10k,NE,over-1b,Consumer Discretionary,ADP,1791687.0,209.3495,1664796.0,208.5401
5,US,over-10k,W,over-1b,Healthcare,ADSK,1850028.0,205.2018,1987062.0,195.0114
6,US,over-10k,MW,over-1b,Energy,AEP,3006258.0,95.0679,2254945.0,95.8241
7,US,over-10k,SW,200m-1b,Technology,ALGN,694358.0,438.9188,957114.0,409.9722
8,US,over-10k,W,over-1b,Technology,AMAT,7623175.0,128.1947,6015969.0,125.9169
9,US,5k-10k,W,over-1b,Technology,AMD,102310329.0,109.6319,87584432.0,105.9691


In [10]:
vwa_df['vwa'] = 100 - vwa_df['volume_weight_y']/vwa_df['volume_weight_x'] * 100
vwa_df['va'] = 100 - vwa_df['volume_y']/vwa_df['volume_x'] * 100

In [11]:
vwa_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,volume_x,volume_weight_x,volume_y,volume_weight_y,vwa,va
0,US,over-10k,W,over-1b,Technology,AAPL,91445405.0,161.9446,96917302.0,156.1598,3.572086,-5.983786
1,US,5k-10k,W,200m-1b,Technology,ABNB,7023908.0,148.5454,4577255.0,147.8527,0.466322,34.833215
2,US,over-10k,W,1m-10m,Technology,ADBE,2905656.0,447.8637,4434498.0,422.5279,5.657034,-52.61607
3,US,over-10k,SE,over-1b,Technology,ADI,3046254.0,153.5888,3964265.0,149.4415,2.700262,-30.135734
4,US,over-10k,NE,over-1b,Consumer Discretionary,ADP,1791687.0,209.3495,1664796.0,208.5401,0.386626,7.082208
5,US,over-10k,W,over-1b,Healthcare,ADSK,1850028.0,205.2018,1987062.0,195.0114,4.966038,-7.407131
6,US,over-10k,MW,over-1b,Energy,AEP,3006258.0,95.0679,2254945.0,95.8241,-0.795431,24.991634
7,US,over-10k,SW,200m-1b,Technology,ALGN,694358.0,438.9188,957114.0,409.9722,6.594978,-37.841575
8,US,over-10k,W,over-1b,Technology,AMAT,7623175.0,128.1947,6015969.0,125.9169,1.776829,21.083158
9,US,5k-10k,W,over-1b,Technology,AMD,102310329.0,109.6319,87584432.0,105.9691,3.340998,14.393363


In [12]:
# unique values for each column (getting to know your data)
vwa_df.nunique()

country_code_x        8
employee_count_x      4
region_x             11
revenue_x             6
sector_x              9
ticker_x            102
volume_x            102
volume_weight_x     102
volume_y            102
volume_weight_y     102
vwa                 102
va                  102
dtype: int64

In [13]:
vwa_df.drop(columns = ['ticker_x', 'volume_x', 'volume_weight_x', 'volume_y', 'volume_weight_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,vwa,va
0,US,over-10k,W,over-1b,Technology,3.572086,-5.983786
1,US,5k-10k,W,200m-1b,Technology,0.466322,34.833215
2,US,over-10k,W,1m-10m,Technology,5.657034,-52.61607
3,US,over-10k,SE,over-1b,Technology,2.700262,-30.135734
4,US,over-10k,NE,over-1b,Consumer Discretionary,0.386626,7.082208
5,US,over-10k,W,over-1b,Healthcare,4.966038,-7.407131
6,US,over-10k,MW,over-1b,Energy,-0.795431,24.991634
7,US,over-10k,SW,200m-1b,Technology,6.594978,-37.841575
8,US,over-10k,W,over-1b,Technology,1.776829,21.083158
9,US,5k-10k,W,over-1b,Technology,3.340998,14.393363


In [14]:
stock_df.dtypes

ticker             object
employee_count     object
revenue            object
sector             object
region             object
country_code       object
volume            float64
volume_weight     float64
date               object
dtype: object

## Indexes, Features (the possible causes), Targets (the desired effects), Throw-Aways

### NOTE: we have to keep our ticker columns (so all this must called within the gradient_boosting_decision_tree_model)

#### Indexes/Primary Key: 

- Concatinate ticker and date to yield ticker_and_date

#### Features are:
- TICKER, 
- DATE
- EMPLOYEE COUNT
- REVENUE
- SECTOR
- COUNTRY CODE
- VOLUME 
- VOLUME WEIGHT 
- AVERAGE_VOLUME (calculate average using begin_volumn/end_value) 
- AVERAGE_VOLUME_WEIGHT (calculate average using begin_date/end_date)
- PERCENT CHANGE (% change from close to open)

#### Target is:
- PERCENT CHANGE (and/or) Volume Weight (???)(I think the percent change matters more because percent change yields better 

#### Throw-aways for modeling:
- COMPANY NAME
- COMPANY URL
- CITY NAME
- STATE NAME
- LATITUDE
- LONGITUDE
- OPEN 
- HIGH 
- LOW
- CLOSE
- VOLUME
- VOLUME WEIGHT
- NUMBER OF TRANSACTIONS


In [15]:
stock_df = vwa_df
# drop stock ticker
# filtered_df = filtered_df.drop(columns = ['city_name'])
# filtered_df.head()
# stock_df.drop(columns="ticker", inplace=True)
# stock_df.drop(columns="city_name", inplace=True)
# stock_df.drop(columns="state_name", inplace=True)

In [16]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = stock_df.dtypes[stock_df.dtypes == "object"].index.tolist()
stock_categories

['country_code_x', 'employee_count_x', 'region_x', 'revenue_x', 'sector_x']

In [17]:
# Checking the number of unique values in each column
stock_df[stock_categories].nunique()
# there needs to be only 10 at most in each categorie, how are we going to make this smaller...by sector ???

country_code_x       8
employee_count_x     4
region_x            11
revenue_x            6
sector_x             9
dtype: int64

In [19]:
#stock_df['employee_count'] = stock_df['employee_count'].astype('category').cat.codes
stock_df['revenue'] = stock_df['revenue'].astype('category').cat.codes
stock_df['sector'] = stock_df['sector'].astype('category').cat.codes
#stock_df['city_name'] = stock_df['city_name'].astype('category').cat.codes
# stock_df['state_name'] = stock_df['state_name'].astype('category').cat.codes
stock_df['country_code'] = stock_df['country_code'].astype('category').cat.codes
stock_df['region'] = stock_df['region'].astype('category').cat.codes

stock_df

KeyError: 'revenue'

In [20]:
# I am catagorizing my own shiza from the tiza
# replace stock's employee count string with integer
stock_df.loc[(stock_df['employee_count_x'] == '5k-10k'), 'employee_count_x'] = 0
stock_df.loc[(stock_df['employee_count_x'] == 'over-10k'), 'employee_count_x'] = 1
stock_df.loc[(stock_df['employee_count_x'] == '1k-5k'), 'employee_count_x'] = 2
stock_df.loc[(stock_df['employee_count_x'] == '500-1k'), 'employee_count_x'] = 3

# replace stock's revenue string with integer
stock_df.loc[(stock_df['revenue_x'] == '1m-10m'), 'revenue_x'] = 0
stock_df.loc[(stock_df['revenue_x'] == '10m-50m'), 'revenue_x'] = 1
stock_df.loc[(stock_df['revenue_x'] == '50m-100m'), 'revenue_x'] = 2
stock_df.loc[(stock_df['revenue_x'] == '100m-200m'), 'revenue_x'] = 3
stock_df.loc[(stock_df['revenue_x'] == '200m-1b'), 'revenue_x'] = 4
stock_df.loc[(stock_df['revenue_x'] == 'over-1b'), 'revenue_x'] = 5

# replace stock's sector string with integer
stock_df.loc[(stock_df['sector_x'] == 'Technology'), 'sector_x'] = 0
stock_df.loc[(stock_df['sector_x'] == 'Energy'), 'sector_x'] = 1
stock_df.loc[(stock_df['sector_x'] == 'Healthcare'), 'sector_x'] = 2
stock_df.loc[(stock_df['sector_x'] == 'Consumer Discretionary'), 'sector_x'] = 3
stock_df.loc[(stock_df['sector_x'] == 'Industrials'), 'sector_x'] = 4
stock_df.loc[(stock_df['sector_x'] == 'Consumer Staples'), 'sector_x'] = 5
stock_df.loc[(stock_df['sector_x'] == 'Communication Services'), 'sector_x'] = 6
stock_df.loc[(stock_df['sector_x'] == 'Financials'), 'sector_x'] = 7
stock_df.loc[(stock_df['sector_x'] == 'Utilities'), 'sector_x'] = 8

# replace stock's country code string with integer (Note: China was CN and CH for some reason)
stock_df.loc[(stock_df['country_code_x'] == 'US'), 'country_code_x'] = 0
stock_df.loc[(stock_df['country_code_x'] == 'Netherlands'), 'country_code_x'] = 1
stock_df.loc[(stock_df['country_code_x'] == 'Australia'), 'country_code_x'] = 2
stock_df.loc[(stock_df['country_code_x'] == 'UK'), 'country_code_x'] = 3
stock_df.loc[(stock_df['country_code_x'] == 'CH'), 'country_code_x'] = 4
stock_df.loc[(stock_df['country_code_x'] == 'CN'), 'country_code_x'] = 4
stock_df.loc[(stock_df['country_code_x'] == 'CA'), 'country_code_x'] = 5
stock_df.loc[(stock_df['country_code_x'] == 'Argentina'), 'country_code_x'] = 6

# replace stock's region string with integer 
stock_df.loc[(stock_df['region_x'] == 'W'), 'region_x'] = 0
stock_df.loc[(stock_df['region_x'] == 'MW'), 'region_x'] = 1
stock_df.loc[(stock_df['region_x'] == 'SW'), 'region_x'] = 2
stock_df.loc[(stock_df['region_x'] == 'NW'), 'region_x'] = 3
stock_df.loc[(stock_df['region_x'] == 'SE'), 'region_x'] = 4
stock_df.loc[(stock_df['region_x'] == 'NL'), 'region_x'] = 5
stock_df.loc[(stock_df['region_x'] == 'AU'), 'region_x'] = 6
stock_df.loc[(stock_df['region_x'] == 'NE'), 'region_x'] = 7
stock_df.loc[(stock_df['region_x'] == 'GB'), 'region_x'] = 8
stock_df.loc[(stock_df['region_x'] == 'CH'), 'region_x'] = 9
stock_df.loc[(stock_df['region_x'] == 'CA'), 'region_x'] = 10


#create buckets for vwa
stock_df.loc[(stock_df['vwa'] < 0), 'vwa'] = 0
stock_df.loc[(stock_df['vwa'] > 0) & (stock_df['vwa'] <= 1), 'vwa'] = 1
stock_df.loc[(stock_df['vwa'] > 1) & (stock_df['vwa'] <= 2), 'vwa'] = 2
stock_df.loc[(stock_df['vwa'] > 2) & (stock_df['vwa'] <= 3), 'vwa'] = 3
stock_df.loc[(stock_df['vwa'] > 3) & (stock_df['vwa'] <= 4), 'vwa'] = 4
stock_df.loc[(stock_df['vwa'] > 4) & (stock_df['vwa'] <= 5), 'vwa'] = 5
stock_df.loc[(stock_df['vwa'] > 5) & (stock_df['vwa'] <= 6), 'vwa'] = 6
# stock_df.loc[(stock_df['vwa'] > 6) & (stock_df['vwa'] <= 7), 'vwa'] = 7
# stock_df.loc[(stock_df['vwa'] > 7) & (stock_df['vwa'] <= 8), 'vwa'] = 8
# stock_df.loc[(stock_df['vwa'] > 8) & (stock_df['vwa'] <= 9), 'vwa'] = 9
# stock_df.loc[(stock_df['vwa'] > 9) & (stock_df['vwa'] <= 10), 'vwa'] = 10
stock_df.loc[(stock_df['vwa'] > 6), 'vwa'] = 7

#create buckets for va
stock_df.loc[(stock_df['va'] < 0), 'vwa'] = 0
stock_df.loc[(stock_df['va'] > 0) & (stock_df['va'] <= 1), 'va'] = 1
stock_df.loc[(stock_df['va'] > 1) & (stock_df['va'] <= 2), 'va'] = 2
stock_df.loc[(stock_df['va'] > 2) & (stock_df['va'] <= 3), 'va'] = 3
stock_df.loc[(stock_df['va'] > 3) & (stock_df['va'] <= 4), 'va'] = 4
stock_df.loc[(stock_df['va'] > 4) & (stock_df['va'] <= 5), 'va'] = 5
stock_df.loc[(stock_df['va'] > 5) & (stock_df['va'] <= 6), 'va'] = 6
# stock_df.loc[(stock_df['va'] > 6) & (stock_df['va'] <= 7), 'va'] = 7
# stock_df.loc[(stock_df['va'] > 7) & (stock_df['va'] <= 8), 'va'] = 8
# stock_df.loc[(stock_df['va'] > 8) & (stock_df['va'] <= 9), 'va'] = 9
# stock_df.loc[(stock_df['va'] > 9) & (stock_df['va'] <= 10), 'va'] = 10
stock_df.loc[(stock_df['va'] > 6), 'va'] = 7
stock_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,vwa,va
0,0,1,0,5,0,0.0,-5.983786
1,0,0,0,4,0,1.0,7.0
2,0,1,0,0,0,0.0,-52.61607
3,0,1,4,5,0,0.0,-30.135734
4,0,1,7,5,3,1.0,7.0
5,0,1,0,5,2,0.0,-7.407131
6,0,1,1,5,1,0.0,7.0
7,0,1,2,4,0,0.0,-37.841575
8,0,1,0,5,0,2.0,7.0
9,0,0,0,5,0,4.0,7.0


In [21]:
# Check volumne weight average buckets
vwa_counts = stock_df['vwa'].value_counts()
vwa_counts

0.0    44
1.0    16
2.0    15
4.0    10
3.0     9
7.0     3
5.0     3
6.0     2
Name: vwa, dtype: int64

In [22]:
# create features array
X = stock_df.drop(columns=["vwa"]).values
    
# create target
y = stock_df["vwa"].values

In [23]:
# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [24]:
# max_depth refers to the number of leaves of each tree 
# n_estimators refers to the total number of trees in the ensemble
# learning_rate hyperparameter scales the contribution of each tree NOTE: If you set it to a low value, 
# you will need more trees in the ensemble to fit the training set, but the overall variance will be lower.

# best way to tune the model: https://neptune.ai/blog/lightgbm-parameters-guide
    
regressor = GradientBoostingRegressor(
max_depth=16,
n_estimators=100,
learning_rate=.01
)
regressor.fit(X_train, y_train)


GradientBoostingRegressor(learning_rate=0.01, max_depth=16)

In [25]:
# Use staged_predict() method to measures the validation error at each stage of training 
# (i.e. with one tree, with two trees…) to find the optimal number of trees.
errors = [mean_squared_error(y_test, y_pred) for y_pred in 
           regressor.staged_predict(X_test)]
print(errors)

# mean_squared_error: 
# The smaller the mean squared error, the closer you are to finding the line of best fit. Depending on your data, 
# it may be impossible to get a very small value for the mean squared error. For example, the above data is scattered 
# wildly around the regression line, so 6.08 is as good as it gets (and is in fact, the line of best fit). It is 
# bucketting the VWA that works. 


[4.077731818000213, 4.054869885996097, 4.033337851738025, 4.008260015966011, 3.9881344816565667, 3.9690762100485566, 3.947229314468227, 3.9297005997856624, 3.9128219464605367, 3.8935782104325907, 3.875199026836447, 3.861149322302102, 3.8475196372988902, 3.8346114055435008, 3.819353302725018, 3.807765180907851, 3.7934666370643555, 3.783206830723066, 3.770677106970614, 3.7616788732735724, 3.7537023974013577, 3.7430880133743614, 3.733029080735233, 3.723108086687607, 3.714204025540199, 3.7088970105238355, 3.7036493150254746, 3.69586044183504, 3.688582297782447, 3.6848726744855, 3.681920765043209, 3.678968587832303, 3.676626867915539, 3.674724018745687, 3.673041996674302, 3.6692908378726767, 3.668581492295253, 3.6654231288963257, 3.663103446718784, 3.663920571605861, 3.664836657389269, 3.665916246435251, 3.6650424845810603, 3.6664806658320357, 3.668507485952518, 3.6709691864458587, 3.673713758566943, 3.6739681714638714, 3.6768127341827475, 3.6776391870182294, 3.681718059289182, 3.6835114665

In [26]:
# best_n_estimators = np.argmin(errors) + 1
best_n_estimators = np.argmin(errors)

print(best_n_estimators)

38


In [27]:
# build and fit our model using the optimal number of trees
best_regressor = GradientBoostingRegressor(
     max_depth=10,
     n_estimators=best_n_estimators,
     learning_rate=.01
)

best_regressor.fit(X_train, y_train)

# # Sklearn provides numerous metrics to evaluate 
# # the performance of our machine learning models.
# # They categorize the each metric according 
# # to the problem domain which they’re applicable. 
# # https://scikit-learn.org/stable/modules/model_evaluation.html <-- GO TO THIS SITE TO SEE WHICH METRICS YOU WILL USE.

# # We use the mean absolute error 
# # which can be interpreted as 
# # the average distance from 
# # our predictions and the actual values

# # this will give you the value of the stocks for the next period of time
y_pred = best_regressor.predict(X_test)

print(X_train)
print(y_train)

print(X_test)
print(y_pred)

# # this is the how well the model performed (looking for smallest error)
mean_absolute_error(y_test, y_pred)

[[0 1 0 5 0 5.0]
 [0 1 0 5 0 -0.9909673158758494]
 [0 1 0 0 0 -52.616070174858976]
 [0 1 0 5 5 7.0]
 [0 1 3 5 0 -15.974561289171007]
 [0 2 0 3 0 7.0]
 [0 1 7 5 2 7.0]
 [0 1 0 5 0 -6.158737760860404]
 [0 1 0 5 0 7.0]
 [0 1 4 5 0 7.0]
 [0 1 3 4 3 2.0]
 [0 1 1 5 8 -5.741780004122958]
 [0 1 1 5 3 7.0]
 [0 3 4 5 0 -247.44474045497788]
 [0 1 1 5 4 7.0]
 [0 1 4 5 6 3.0]
 [0 0 0 4 2 7.0]
 [0 0 0 4 0 -3.4513879482128687]
 [0 1 0 5 5 7.0]
 [4 1 9 5 0 -29.77104437540669]
 [0 1 0 5 0 7.0]
 [0 3 3 4 6 7.0]
 [0 0 0 0 7 7.0]
 [0 1 4 5 0 6.0]
 [0 1 4 5 6 7.0]
 [0 2 4 2 4 7.0]
 [0 2 0 3 0 -0.42196764624293337]
 [0 1 0 5 2 7.0]
 [0 1 2 1 3 7.0]
 [0 1 3 5 3 7.0]
 [0 1 7 5 3 7.0]
 [0 2 0 5 6 7.0]
 [2 2 6 3 0 7.0]
 [0 0 0 5 0 -18.083987792430307]
 [0 0 0 5 0 7.0]
 [0 0 0 5 3 7.0]
 [0 1 4 5 0 -30.13573392107159]
 [0 1 1 5 2 -6.094447470611357]
 [0 1 7 5 4 7.0]
 [0 0 0 5 2 7.0]
 [0 1 3 5 0 7.0]
 [0 1 0 5 0 7.0]
 [0 1 1 5 0 -7.760398594713422]
 [0 1 0 5 0 -5.983785625969944]
 [0 1 4 5 0 -13.290759286183288]
 

1.2106941397457875

In [28]:
# should we be using r2_score?
# how do you do residual plots?

In [29]:
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

0.08332658527352521
1.2106941397457875


In [30]:

# #     # Tomas: correlation analysis to see how your features are correlated to each other
    
# #     # as with any regression you need to minimize the mean square error.
#                                                         ------------------
# #     examples are at : 
# # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
# #     from sklearn.metrics import mean_squared_error
    
# # EMPTY PROCESS DATAFRAME   
    
# #     # accrossed all stocks, what is the average score.
# #     # what is the mean?
# #     # what is the median?
# #     # do we have any outliers that we need to note
# #     # does this work better for same sectors?