In [1]:
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
from functools import partial
import numpy as np
import psycopg2
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

#import necessary libraries 
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols


#from joblib import Parallel, delayed, Model 
#from collections import Counter <--????
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

In [2]:
# GET Tabled input

# creating database engine
db_name = 'Company_Stock_DB'
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/{db_name}"
engine = create_engine(db_string)

# read data from PostgreSQL database table and load into Dataframe instance
stock_df = pd.read_sql("select * from \"company_all_star\"", engine);

#sort the dataframe by ticker column
stock_df.sort_values(by=['ticker'])

# Print the DataFrame
stock_df.head()

Unnamed: 0,ticker,date_val,company_name,company_url,employee_count,revenue,sector,city_name,state_name,region,...,latitude,longitude,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change
0,AMD,2020-03-12,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,42.2,43.91,39.6,43.9,86689681.0,41.6701,381223.0,4.028436
1,AMD,2020-03-15,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.08,43.37,38.51,38.71,84545868.0,41.0812,374962.0,0.946776
2,AMD,2020-03-16,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,40.19,42.88,38.3,41.88,92741881.0,41.124,434519.0,4.205026
3,AMD,2020-03-17,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.54,41.95,36.75,39.12,106949287.0,39.6363,591862.0,1.062215
4,AMD,2020-03-18,Advanced Micro Devices Inc,amd.com,5k-10k,over-1b,Technology,Santa Clara,CA,W,...,37.233325,-121.684635,39.56,41.7,37.69,39.82,88939024.0,40.2337,396388.0,0.65723


In [3]:
# preserve date column as type object
stock_df['date'] = stock_df['date_val']

# have the user enter beginning date as yyyy-mm-dd
begin_date = '2022-03-03'
# have the user enter ending date as yyyy-mm-dd
end_date = '2022-03-10'
# iteration controls
day_range_of_iter = 2

# Convert the date to datetime64
stock_df['date_val'] = pd.to_datetime(stock_df['date_val'], format='%Y-%m-%d')

stock_df = stock_df.loc[(stock_df['date_val'] >= begin_date)
                     & (stock_df['date_val'] <= end_date)]

# drop throw-aways 
stock_df.drop(["longitude", "latitude", "company_name", "company_url","date_val"], axis=1, inplace=True)

stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,city_name,state_name,region,country_code,open_val,high_val,low_val,close_val,volume,volume_weight,number_of_transactions,percent_change,date
498,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,112.00,113.00,106.81,108.41,100671339.0,109.0619,674554.0,3.205357,2022-03-03
499,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.53,109.53,102.82,102.95,92599972.0,105.5087,659639.0,5.141436,2022-03-06
500,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,102.81,109.90,100.08,105.53,135348316.0,105.1526,845843.0,2.645657,2022-03-07
501,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.41,111.71,106.85,111.05,102310329.0,109.6319,602679.0,2.435200,2022-03-08
502,AMD,5k-10k,over-1b,Technology,Santa Clara,CA,W,US,108.89,109.07,103.07,106.46,102557375.0,105.3382,639388.0,2.231610,2022-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50869,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,228.37,229.97,204.36,204.37,4379337.0,210.5799,72096.0,10.509261,2022-03-06
50870,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.50,203.92,190.13,198.63,4389634.0,196.9284,71180.0,2.393120,2022-03-07
50871,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,203.84,213.57,199.12,212.35,3050554.0,209.3268,45960.0,4.174843,2022-03-08
50872,ZS,1k-5k,100m-200m,Technology,San Jose,CA,W,US,212.13,213.51,204.87,208.41,2305091.0,208.7971,40754.0,1.753642,2022-03-09


In [4]:
# drop fields that will not be used to represent a period of time
stock_df.drop(columns = ['open_val', 'high_val', 'low_val', 'close_val', 'number_of_transactions', 'city_name', 'state_name', 'number_of_transactions', 'percent_change'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
stock_df

Unnamed: 0,ticker,employee_count,revenue,sector,region,country_code,volume,volume_weight,date
498,AMD,5k-10k,over-1b,Technology,W,US,100671339.0,109.0619,2022-03-03
499,AMD,5k-10k,over-1b,Technology,W,US,92599972.0,105.5087,2022-03-06
500,AMD,5k-10k,over-1b,Technology,W,US,135348316.0,105.1526,2022-03-07
501,AMD,5k-10k,over-1b,Technology,W,US,102310329.0,109.6319,2022-03-08
502,AMD,5k-10k,over-1b,Technology,W,US,102557375.0,105.3382,2022-03-09
503,AMD,5k-10k,over-1b,Technology,W,US,87584432.0,105.9691,2022-03-10
1003,ADBE,over-10k,1m-10m,Technology,W,US,2752177.0,452.735,2022-03-03
1004,ADBE,over-10k,1m-10m,Technology,W,US,3676072.0,444.4155,2022-03-06
1005,ADBE,over-10k,1m-10m,Technology,W,US,3734842.0,432.4705,2022-03-07
1006,ADBE,over-10k,1m-10m,Technology,W,US,2905656.0,447.8637,2022-03-08


In [5]:
# unique days in df
unique_days = len(pd.unique(stock_df['date']))
print("unique number of days(number of days in df):", unique_days)

# unique stocks in df
unique_stocks = len(pd.unique(stock_df['ticker']))
print(unique_stocks)

# interation sets
iteration_sets = (unique_days - day_range_of_iter + 1)
print("iteration_sets: ", iteration_sets)

# total records captured
length_of_df = len(stock_df)
print(length_of_df)


unique number of days(number of days in df): 6
102
iteration_sets:  5
612


In [6]:
# sort dataframe by date
sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
next_date_stock_df = sort_date_stock_df

# get beginning dataframe records
b = 0 
# ending record for beginning df
ending_records = iteration_sets * unique_stocks

# starting record for end
x = (unique_days - iteration_sets) * unique_stocks
max_records = unique_days * unique_stocks

begin_df = pd.DataFrame()
end_df = pd.DataFrame()
               
for rec in sort_date_stock_df.iterrows():
    
    if b < ending_records:
        new_begin_df = sort_date_stock_df.iloc[b]
        begin_df = begin_df.append(new_begin_df,ignore_index=False)
    
    if x < max_records: 
        new_end_df = next_date_stock_df.iloc[x]
        end_df = end_df.append(new_end_df,ignore_index=False)
    b=b+1
    x=x+1
   
begin_df.reset_index(drop=True,inplace=True)
begin_df 


Unnamed: 0,country_code,date,employee_count,region,revenue,sector,ticker,volume,volume_weight
0,US,2022-03-03,over-10k,W,over-1b,Technology,AAPL,83819592.0,163.398
1,US,2022-03-03,5k-10k,W,200m-1b,Technology,ABNB,8397063.0,143.411
2,US,2022-03-03,over-10k,W,1m-10m,Technology,ADBE,2752177.0,452.735
3,US,2022-03-03,over-10k,SE,over-1b,Technology,ADI,2778580.0,157.0311
4,US,2022-03-03,over-10k,NE,over-1b,Consumer Discretionary,ADP,1492278.0,206.8779
5,US,2022-03-03,over-10k,W,over-1b,Healthcare,ADSK,1964126.0,208.8198
6,US,2022-03-03,over-10k,MW,over-1b,Energy,AEP,4675287.0,95.5682
7,US,2022-03-03,over-10k,SW,200m-1b,Technology,ALGN,755752.0,461.5372
8,US,2022-03-03,over-10k,W,over-1b,Technology,AMAT,7098065.0,126.1637
9,US,2022-03-03,5k-10k,W,over-1b,Technology,AMD,100671339.0,109.0619


In [7]:
# # sort dataframe by date
# sort_date_stock_df = stock_df.sort_values(by=['date', 'ticker'])
# sort_date_stock_df

end_df.reset_index(drop=True,inplace=True)
end_df

Unnamed: 0,country_code,date,employee_count,region,revenue,sector,ticker,volume,volume_weight
0,US,2022-03-06,over-10k,W,over-1b,Technology,AAPL,96418845.0,161.4026
1,US,2022-03-06,5k-10k,W,200m-1b,Technology,ABNB,9177270.0,136.4099
2,US,2022-03-06,over-10k,W,1m-10m,Technology,ADBE,3676072.0,444.4155
3,US,2022-03-06,over-10k,SE,over-1b,Technology,ADI,5701277.0,148.8865
4,US,2022-03-06,over-10k,NE,over-1b,Consumer Discretionary,ADP,2535505.0,209.5547
5,US,2022-03-06,over-10k,W,over-1b,Healthcare,ADSK,1930463.0,202.5602
6,US,2022-03-06,over-10k,MW,over-1b,Energy,AEP,4111340.0,97.2618
7,US,2022-03-06,over-10k,SW,200m-1b,Technology,ALGN,1075983.0,445.6873
8,US,2022-03-06,over-10k,W,over-1b,Technology,AMAT,9207137.0,121.1384
9,US,2022-03-06,5k-10k,W,over-1b,Technology,AMD,92599972.0,105.5087


In [8]:
vwa_df = pd.merge(begin_df, end_df, left_index=True, right_index=True)

In [9]:
# drop fields that will not be used to represent a period of time
vwa_df.drop(columns = ['date_x', 'employee_count_y', 'region_y', 'revenue_y', 'sector_y', 'ticker_y', 'country_code_y', 'date_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,volume_x,volume_weight_x,volume_y,volume_weight_y
0,US,over-10k,W,over-1b,Technology,AAPL,83819592.0,163.398,96418845.0,161.4026
1,US,5k-10k,W,200m-1b,Technology,ABNB,8397063.0,143.411,9177270.0,136.4099
2,US,over-10k,W,1m-10m,Technology,ADBE,2752177.0,452.735,3676072.0,444.4155
3,US,over-10k,SE,over-1b,Technology,ADI,2778580.0,157.0311,5701277.0,148.8865
4,US,over-10k,NE,over-1b,Consumer Discretionary,ADP,1492278.0,206.8779,2535505.0,209.5547
5,US,over-10k,W,over-1b,Healthcare,ADSK,1964126.0,208.8198,1930463.0,202.5602
6,US,over-10k,MW,over-1b,Energy,AEP,4675287.0,95.5682,4111340.0,97.2618
7,US,over-10k,SW,200m-1b,Technology,ALGN,755752.0,461.5372,1075983.0,445.6873
8,US,over-10k,W,over-1b,Technology,AMAT,7098065.0,126.1637,9207137.0,121.1384
9,US,5k-10k,W,over-1b,Technology,AMD,100671339.0,109.0619,92599972.0,105.5087


In [10]:
vwa_df['vwa'] = 100 - vwa_df['volume_weight_y']/vwa_df['volume_weight_x'] * 100
vwa_df['va'] = 100 - vwa_df['volume_y']/vwa_df['volume_x'] * 100

In [11]:
vwa_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,ticker_x,volume_x,volume_weight_x,volume_y,volume_weight_y,vwa,va
0,US,over-10k,W,over-1b,Technology,AAPL,83819592.0,163.398,96418845.0,161.4026,1.22119,-15.031394
1,US,5k-10k,W,200m-1b,Technology,ABNB,8397063.0,143.411,9177270.0,136.4099,4.881843,-9.291427
2,US,over-10k,W,1m-10m,Technology,ADBE,2752177.0,452.735,3676072.0,444.4155,1.837609,-33.569607
3,US,over-10k,SE,over-1b,Technology,ADI,2778580.0,157.0311,5701277.0,148.8865,5.186616,-105.186714
4,US,over-10k,NE,over-1b,Consumer Discretionary,ADP,1492278.0,206.8779,2535505.0,209.5547,-1.293903,-69.908355
5,US,over-10k,W,over-1b,Healthcare,ADSK,1964126.0,208.8198,1930463.0,202.5602,2.997608,1.713892
6,US,over-10k,MW,over-1b,Energy,AEP,4675287.0,95.5682,4111340.0,97.2618,-1.772138,12.062297
7,US,over-10k,SW,200m-1b,Technology,ALGN,755752.0,461.5372,1075983.0,445.6873,3.434154,-42.372498
8,US,over-10k,W,over-1b,Technology,AMAT,7098065.0,126.1637,9207137.0,121.1384,3.983158,-29.713337
9,US,5k-10k,W,over-1b,Technology,AMD,100671339.0,109.0619,92599972.0,105.5087,3.257966,8.017542


In [12]:
# unique values for each column (getting to know your data)
vwa_df.nunique()

country_code_x        8
employee_count_x      4
region_x             11
revenue_x             6
sector_x              9
ticker_x            102
volume_x            510
volume_weight_x     510
volume_y            510
volume_weight_y     510
vwa                 510
va                  510
dtype: int64

In [13]:
vwa_df.drop(columns = ['ticker_x', 'volume_x', 'volume_weight_x', 'volume_y', 'volume_weight_y'], axis=1, inplace=True)
pd.set_option('display.max_rows', None)
vwa_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,vwa,va
0,US,over-10k,W,over-1b,Technology,1.22119,-15.031394
1,US,5k-10k,W,200m-1b,Technology,4.881843,-9.291427
2,US,over-10k,W,1m-10m,Technology,1.837609,-33.569607
3,US,over-10k,SE,over-1b,Technology,5.186616,-105.186714
4,US,over-10k,NE,over-1b,Consumer Discretionary,-1.293903,-69.908355
5,US,over-10k,W,over-1b,Healthcare,2.997608,1.713892
6,US,over-10k,MW,over-1b,Energy,-1.772138,12.062297
7,US,over-10k,SW,200m-1b,Technology,3.434154,-42.372498
8,US,over-10k,W,over-1b,Technology,3.983158,-29.713337
9,US,5k-10k,W,over-1b,Technology,3.257966,8.017542


In [14]:
stock_df.dtypes

ticker             object
employee_count     object
revenue            object
sector             object
region             object
country_code       object
volume            float64
volume_weight     float64
date               object
dtype: object

## Indexes, Features (the possible causes), Targets (the desired effects), Throw-Aways

### NOTE: we have to keep our ticker columns (so all this must called within the gradient_boosting_decision_tree_model)

#### Indexes/Primary Key: 

- Concatinate ticker and date to yield ticker_and_date

#### Features are:
- TICKER, 
- DATE
- EMPLOYEE COUNT
- REVENUE
- SECTOR
- COUNTRY CODE
- VOLUME 
- VOLUME WEIGHT 
- AVERAGE_VOLUME (calculate average using begin_volumn/end_value) 
- AVERAGE_VOLUME_WEIGHT (calculate average using begin_date/end_date)
- PERCENT CHANGE (% change from close to open)

#### Target is:
- PERCENT CHANGE (and/or) Volume Weight (???)(I think the percent change matters more because percent change yields better 

#### Throw-aways for modeling:
- COMPANY NAME
- COMPANY URL
- CITY NAME
- STATE NAME
- LATITUDE
- LONGITUDE
- OPEN 
- HIGH 
- LOW
- CLOSE
- VOLUME
- VOLUME WEIGHT
- NUMBER OF TRANSACTIONS


In [15]:
stock_df = vwa_df
# drop stock ticker
# filtered_df = filtered_df.drop(columns = ['city_name'])
# filtered_df.head()
# stock_df.drop(columns="ticker", inplace=True)
# stock_df.drop(columns="city_name", inplace=True)
# stock_df.drop(columns="state_name", inplace=True)

In [16]:
# generate our categorical variable list
# categorical preprocessing can be done easiest using Dataframe.dtypes == 'object'
stock_categories = stock_df.dtypes[stock_df.dtypes == "object"].index.tolist()
stock_categories

['country_code_x', 'employee_count_x', 'region_x', 'revenue_x', 'sector_x']

In [17]:
# Checking the number of unique values in each column
stock_df[stock_categories].nunique()
# there needs to be only 10 at most in each categorie, how are we going to make this smaller...by sector ???

country_code_x       8
employee_count_x     4
region_x            11
revenue_x            6
sector_x             9
dtype: int64

In [18]:
# I am catagorizing my own shiza from the tiza
# replace stock's employee count string with integer
stock_df.loc[(stock_df['employee_count_x'] == '5k-10k'), 'employee_count_x'] = 0
stock_df.loc[(stock_df['employee_count_x'] == 'over-10k'), 'employee_count_x'] = 1
stock_df.loc[(stock_df['employee_count_x'] == '1k-5k'), 'employee_count_x'] = 2
stock_df.loc[(stock_df['employee_count_x'] == '500-1k'), 'employee_count_x'] = 3

# replace stock's revenue string with integer
stock_df.loc[(stock_df['revenue_x'] == '1m-10m'), 'revenue_x'] = 0
stock_df.loc[(stock_df['revenue_x'] == '10m-50m'), 'revenue_x'] = 1
stock_df.loc[(stock_df['revenue_x'] == '50m-100m'), 'revenue_x'] = 2
stock_df.loc[(stock_df['revenue_x'] == '100m-200m'), 'revenue_x'] = 3
stock_df.loc[(stock_df['revenue_x'] == '200m-1b'), 'revenue_x'] = 4
stock_df.loc[(stock_df['revenue_x'] == 'over-1b'), 'revenue_x'] = 5

# replace stock's sector string with integer
stock_df.loc[(stock_df['sector_x'] == 'Technology'), 'sector_x'] = 0
stock_df.loc[(stock_df['sector_x'] == 'Energy'), 'sector_x'] = 1
stock_df.loc[(stock_df['sector_x'] == 'Healthcare'), 'sector_x'] = 2
stock_df.loc[(stock_df['sector_x'] == 'Consumer Discretionary'), 'sector_x'] = 3
stock_df.loc[(stock_df['sector_x'] == 'Industrials'), 'sector_x'] = 4
stock_df.loc[(stock_df['sector_x'] == 'Consumer Staples'), 'sector_x'] = 5
stock_df.loc[(stock_df['sector_x'] == 'Communication Services'), 'sector_x'] = 6
stock_df.loc[(stock_df['sector_x'] == 'Financials'), 'sector_x'] = 7
stock_df.loc[(stock_df['sector_x'] == 'Utilities'), 'sector_x'] = 8

# replace stock's country code string with integer (Note: China was CN and CH for some reason)
stock_df.loc[(stock_df['country_code_x'] == 'US'), 'country_code_x'] = 0
stock_df.loc[(stock_df['country_code_x'] == 'Netherlands'), 'country_code_x'] = 1
stock_df.loc[(stock_df['country_code_x'] == 'Australia'), 'country_code_x'] = 2
stock_df.loc[(stock_df['country_code_x'] == 'UK'), 'country_code_x'] = 3
stock_df.loc[(stock_df['country_code_x'] == 'CH'), 'country_code_x'] = 4
stock_df.loc[(stock_df['country_code_x'] == 'CN'), 'country_code_x'] = 4
stock_df.loc[(stock_df['country_code_x'] == 'CA'), 'country_code_x'] = 5
stock_df.loc[(stock_df['country_code_x'] == 'Argentina'), 'country_code_x'] = 6

# replace stock's region string with integer 
stock_df.loc[(stock_df['region_x'] == 'W'), 'region_x'] = 0
stock_df.loc[(stock_df['region_x'] == 'MW'), 'region_x'] = 1
stock_df.loc[(stock_df['region_x'] == 'SW'), 'region_x'] = 2
stock_df.loc[(stock_df['region_x'] == 'NW'), 'region_x'] = 3
stock_df.loc[(stock_df['region_x'] == 'SE'), 'region_x'] = 4
stock_df.loc[(stock_df['region_x'] == 'NL'), 'region_x'] = 5
stock_df.loc[(stock_df['region_x'] == 'AU'), 'region_x'] = 6
stock_df.loc[(stock_df['region_x'] == 'NE'), 'region_x'] = 7
stock_df.loc[(stock_df['region_x'] == 'GB'), 'region_x'] = 8
stock_df.loc[(stock_df['region_x'] == 'CH'), 'region_x'] = 9
stock_df.loc[(stock_df['region_x'] == 'CA'), 'region_x'] = 10


#create buckets for vwa
stock_df.loc[(stock_df['vwa'] < 0), 'vwa'] = 0
stock_df.loc[(stock_df['vwa'] > 0) & (stock_df['vwa'] <= 1), 'vwa'] = 1
stock_df.loc[(stock_df['vwa'] > 1) & (stock_df['vwa'] <= 2), 'vwa'] = 2
stock_df.loc[(stock_df['vwa'] > 2) & (stock_df['vwa'] <= 3), 'vwa'] = 3
stock_df.loc[(stock_df['vwa'] > 3) & (stock_df['vwa'] <= 4), 'vwa'] = 4
stock_df.loc[(stock_df['vwa'] > 4) & (stock_df['vwa'] <= 5), 'vwa'] = 5
stock_df.loc[(stock_df['vwa'] > 5) & (stock_df['vwa'] <= 6), 'vwa'] = 6
# stock_df.loc[(stock_df['vwa'] > 6) & (stock_df['vwa'] <= 7), 'vwa'] = 7
# stock_df.loc[(stock_df['vwa'] > 7) & (stock_df['vwa'] <= 8), 'vwa'] = 8
# stock_df.loc[(stock_df['vwa'] > 8) & (stock_df['vwa'] <= 9), 'vwa'] = 9
# stock_df.loc[(stock_df['vwa'] > 9) & (stock_df['vwa'] <= 10), 'vwa'] = 10
stock_df.loc[(stock_df['vwa'] > 6), 'vwa'] = 7

#create buckets for va
stock_df.loc[(stock_df['va'] < 0), 'vwa'] = 0
stock_df.loc[(stock_df['va'] > 0) & (stock_df['va'] <= 1), 'va'] = 1
stock_df.loc[(stock_df['va'] > 1) & (stock_df['va'] <= 2), 'va'] = 2
stock_df.loc[(stock_df['va'] > 2) & (stock_df['va'] <= 3), 'va'] = 3
stock_df.loc[(stock_df['va'] > 3) & (stock_df['va'] <= 4), 'va'] = 4
stock_df.loc[(stock_df['va'] > 4) & (stock_df['va'] <= 5), 'va'] = 5
stock_df.loc[(stock_df['va'] > 5) & (stock_df['va'] <= 6), 'va'] = 6
# stock_df.loc[(stock_df['va'] > 6) & (stock_df['va'] <= 7), 'va'] = 7
# stock_df.loc[(stock_df['va'] > 7) & (stock_df['va'] <= 8), 'va'] = 8
# stock_df.loc[(stock_df['va'] > 8) & (stock_df['va'] <= 9), 'va'] = 9
# stock_df.loc[(stock_df['va'] > 9) & (stock_df['va'] <= 10), 'va'] = 10
stock_df.loc[(stock_df['va'] > 6), 'va'] = 7
stock_df

Unnamed: 0,country_code_x,employee_count_x,region_x,revenue_x,sector_x,vwa,va
0,0,1,0,5,0,0.0,-15.031394
1,0,0,0,4,0,0.0,-9.291427
2,0,1,0,0,0,0.0,-33.569607
3,0,1,4,5,0,0.0,-105.186714
4,0,1,7,5,3,0.0,-69.908355
5,0,1,0,5,2,3.0,2.0
6,0,1,1,5,1,0.0,7.0
7,0,1,2,4,0,0.0,-42.372498
8,0,1,0,5,0,0.0,-29.713337
9,0,0,0,5,0,4.0,7.0


In [19]:
# Check volumne weight average buckets
vwa_counts = stock_df['vwa'].value_counts()
vwa_counts

0.0    345
1.0     55
2.0     51
3.0     36
4.0      8
5.0      7
7.0      4
6.0      4
Name: vwa, dtype: int64

In [20]:
# create features array
X = stock_df.drop(columns=["vwa"]).values
    
# create target
y = stock_df["vwa"].values

In [21]:
# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [22]:
# max_depth refers to the number of leaves of each tree 
# n_estimators refers to the total number of trees in the ensemble
# learning_rate hyperparameter scales the contribution of each tree NOTE: If you set it to a low value, 
# you will need more trees in the ensemble to fit the training set, but the overall variance will be lower.

# best way to tune the model: https://neptune.ai/blog/lightgbm-parameters-guide
    
regressor = GradientBoostingRegressor(
max_depth=16,
n_estimators=100,
learning_rate=.01
)
regressor.fit(X_train, y_train)


GradientBoostingRegressor(learning_rate=0.01, max_depth=16)

In [23]:
# Use staged_predict() method to measures the validation error at each stage of training 
# (i.e. with one tree, with two trees…) to find the optimal number of trees.
errors = [mean_squared_error(y_test, y_pred) for y_pred in 
           regressor.staged_predict(X_test)]
print(errors)

# mean_squared_error: 
# The smaller the mean squared error, the closer you are to finding the line of best fit. Depending on your data, 
# it may be impossible to get a very small value for the mean squared error. For example, the above data is scattered 
# wildly around the regression line, so 6.08 is as good as it gets (and is in fact, the line of best fit). It is 
# bucketting the VWA that works. 


[2.0283586658354467, 2.0178792338325606, 2.0072357649479917, 1.9969676582582292, 1.987016437563987, 1.977459406825627, 1.9680808770882883, 1.959070160669831, 1.9505643417401544, 1.9420738045085004, 1.9340719050275716, 1.9263438891187907, 1.9186248647110729, 1.9111521089421735, 1.903919883270548, 1.8968556698065338, 1.890077718009581, 1.8836812428333536, 1.8775768696634034, 1.8713541489177599, 1.8654228645781021, 1.8599334190009909, 1.8543791382774213, 1.8492838183776552, 1.8441391612420106, 1.839175738262729, 1.8344946264222055, 1.8301220030426855, 1.8256825550482043, 1.821274799643103, 1.8171938963567884, 1.8135232817717397, 1.8100758331390654, 1.8068763712301634, 1.8038263602440836, 1.8007227847948146, 1.7977328949572606, 1.7946354507131663, 1.792005925326702, 1.7895847195735195, 1.7872246425511122, 1.7846223048249186, 1.7822724681569886, 1.7799342789989296, 1.7778140103882525, 1.7758315597094811, 1.7742030662784476, 1.7726093521777078, 1.770804588750702, 1.7692961753179255, 1.768056

In [24]:
# best_n_estimators = np.argmin(errors) + 1
best_n_estimators = np.argmin(errors)

print(best_n_estimators)

72


In [25]:
# build and fit our model using the optimal number of trees
best_regressor = GradientBoostingRegressor(
     max_depth=10,
     n_estimators=best_n_estimators,
     learning_rate=.01
)

best_regressor.fit(X_train, y_train)

# # Sklearn provides numerous metrics to evaluate 
# # the performance of our machine learning models.
# # They categorize the each metric according 
# # to the problem domain which they’re applicable. 
# # https://scikit-learn.org/stable/modules/model_evaluation.html <-- GO TO THIS SITE TO SEE WHICH METRICS YOU WILL USE.

# # We use the mean absolute error 
# # which can be interpreted as 
# # the average distance from 
# # our predictions and the actual values

# # this will give you the value of the stocks for the next period of time
y_pred = best_regressor.predict(X_test)

print(X_train)
print(y_train)

print(X_test)
print(y_pred)

# # this is the how well the model performed (looking for smallest error)
mean_absolute_error(y_test, y_pred)

[[0 1 0 5 0 -20.96276252641998]
 [1 1 5 5 0 -33.890534136151615]
 [0 1 0 5 7 -31.602563682826712]
 ...
 [2 2 6 3 0 7.0]
 [0 2 4 2 0 4.0]
 [0 1 1 5 0 -11.321091523581401]]
[0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 2. 2. 1. 2. 0. 0. 0. 0. 0. 0. 3.
 0. 1. 7. 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 2. 2. 3. 0. 3. 0. 0. 1. 0. 0. 7.
 3. 0. 0. 0. 3. 0. 0. 3. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 3. 0. 0. 0. 2.
 0. 0. 0. 1. 3. 2. 1. 1. 3. 0. 0. 2. 0. 0. 0. 2. 1. 0. 3. 0. 5. 0. 2. 4.
 0. 0. 0. 0. 3. 1. 4. 1. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 1. 2. 0. 0. 1. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 1. 2. 0. 0. 0. 2. 0. 0. 2. 0. 2. 1. 0. 0. 0. 1.
 0. 0. 3. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 2. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 6. 0. 0. 0. 2. 2. 3. 0. 0. 0. 2. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 5. 0. 0. 0. 2. 1. 0. 0. 3. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 2. 0. 0. 4. 0. 1. 0. 0. 0. 0. 0. 3. 1. 2. 0. 0. 0. 1.
 0. 0. 0. 1. 0. 0. 0. 1. 2. 0. 0. 0. 0. 3. 0. 0. 2. 0. 0. 0. 4. 4. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 2

0.9219711744491497

In [26]:
# should we be using r2_score?
# how do you do residual plots?

In [27]:
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

0.12909632552262973
0.9219711744491497


In [28]:

# #     # Tomas: correlation analysis to see how your features are correlated to each other
    
# #     # as with any regression you need to minimize the mean square error.
#                                                         ------------------
# #     examples are at : 
# # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
# #     from sklearn.metrics import mean_squared_error
    
# # EMPTY PROCESS DATAFRAME   
    
# #     # accrossed all stocks, what is the average score.
# #     # what is the mean?
# #     # what is the median?
# #     # do we have any outliers that we need to note
# #     # does this work better for same sectors?